Added better handling for languages and colors for advanced F&R

The change is significant if the search format is not disabled. We try to analyze the pattern string first to get needed features for the search. We try to analyse the searched string and if it does not contain all expected featers (color, language, char style, char decoration) Still some problems though
2024-11-10 20:04:46 +00:00 · 2018-10-05 20:26:44 +02:00 · 2018-10-05 20:26:44 +02:00 · b78bdf80a8
commit b78bdf80a8
parent b1f1b2321b
2 changed files with 202 additions and 16 deletions
--- a/src/lyxfind.cpp
+++ b/src/lyxfind.cpp
@ -52,6 +52,7 @@
 #include "support/lstrings.h"

 #include "support/regex.h"
+#include <map>

 using namespace std;
 using namespace lyx::support;
@ -799,6 +800,7 @@ static docstring buffer_to_latex(Buffer & buffer)
 	runparams.linelen = 80; //lyxrc.plaintext_linelen;
 	// No side effect of file copying and image conversion
 	runparams.dryrun = true;
+	runparams.for_search = true;
 	pit_type const endpit = buffer.paragraphs().size();
 	for (pit_type pit = 0; pit != endpit; ++pit) {
 		TeXOnePar(buffer, buffer.text(), pit, os, runparams);
@ -843,15 +845,167 @@ static size_t identifyLeading(string const & s)
 	// @TODO Support \item[text]
 	// Kornel: Added textsl, textsf, textit, texttt and noun
 	// + allow to seach for colored text too
-	while (regex_replace(t, t, REGEX_BOS "\\\\(emph|noun|text(bf|sl|sf|it|tt|color\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave|subsubsection|subsection|section|subparagraph|paragraph|part)\\*?\\{", "")
+	while (regex_replace(t, t, REGEX_BOS "\\\\(((emph|noun|text(bf|sl|sf|it|tt))|((textcolor|foreignlanguage)\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave)|((sub)?(((sub)?section)|paragraph)|part)\\*?)\\{", "")
 	       || regex_replace(t, t, REGEX_BOS "\\$", "")
 	       || regex_replace(t, t, REGEX_BOS "\\\\\\[ ", "")
 	       || regex_replace(t, t, REGEX_BOS "\\\\item ", "")
 	       || regex_replace(t, t, REGEX_BOS "\\\\begin\\{[a-zA-Z_]*\\*?\\} ", ""))
+	       ;
 	LYXERR(Debug::FIND, "  after removing leading $, \\[ , \\emph{, \\textbf{, etc.: '" << t << "'");
 	return s.find(t);
 }

+typedef map<string, bool> Features;
+
+static Features identifyFeatures(string const & s)
+{
+	static regex const feature("\\\\(([a-z]+(\\{([a-z]+)\\}|\\*)?))\\{");
+	static regex const valid("^(((emph|noun|text(bf|sl|sf|it|tt)|(textcolor|foreignlanguage)\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave)|((sub)?(((sub)?section)|paragraph)|part)\\*?)$");
+	smatch sub;
+	bool displ = true;
+	Features info;
+
+	for (sregex_iterator it(s.begin(), s.end(), feature), end; it != end; ++it) {
+		sub = *it;
+		if (displ) {
+			if (sub.str(1).compare("regexp") == 0) {
+				displ = false;
+				continue;
+			}
+			string token = sub.str(1);
+			smatch sub2;
+			if (regex_match(token, sub2, valid)) {
+				info[token] = true;
+			}
+			else {
+				// ignore
+			}
+		}
+		else {
+			if (sub.str(1).compare("endregexp") == 0) {
+				displ = true;
+				continue;
+			}
+		}
+	}
+	return(info);
+}
+
+static int findclosing(string p, int start, int end)
+{
+	int skip = 0;
+	int depth = 0;
+	for (int i = start; i < end; i += 1 + skip) {
+		char c;
+		c = p[i];
+		skip = 0;
+		if (c == '\\') skip = 1;
+		else if (c == '{') depth++;
+		else if (c == '}') {
+			if (depth == 0) return(i);
+			--depth;
+		}
+	}
+	return(-1);
+}
+
+
+static string correctlanguagesetting(string par, bool from_regex, bool withformat)
+{
+	static string langstart = "\\foreignlanguage{";
+	static int llen = langstart.length();
+	static bool removefirstlang = false;
+	static Features regex_f;
+	static int missed = 0;
+	static bool regex_with_format = false;
+
+	int parlen = par.length();
+	string result = par;
+
+	while ((parlen > 0) && (par[parlen-1] == '\n')) {
+		parlen--;
+	}
+	if (from_regex) {
+		missed = 0;
+		if (withformat) {
+			regex_f = identifyFeatures(par);
+			for (auto it = regex_f.cbegin(); it != regex_f.cend(); ++it) {
+				string a = it->first;
+				regex_with_format = true;
+				// LYXERR0("Identified regex format:" << a);
+			}
+
+		}
+	} else if (regex_with_format) {
+		Features info = identifyFeatures(par);
+		for (auto it = regex_f.cbegin(); it != regex_f.cend(); ++it) {
+			string a = it->first;
+			bool b = it->second;
+			if (b && ! info[a]) {
+				missed++;
+				// LYXERR0("Missed(" << missed << ", srclen = " << parlen );
+				return("");
+			}
+		}
+	}
+	else {
+		// LYXERR0("No regex formats");
+	}
+	if (par.compare(0, llen, langstart) == 0) {
+		if (from_regex) {
+			removefirstlang = false;
+		}
+		int i = findclosing(par, llen, par.length());
+		if (removefirstlang) {
+			if (i < 0)
+				result = "";
+			else {
+				int closepos = findclosing(par, i+2, par.length());
+				if (closepos > 0) {
+					result = par.substr(i+2, closepos-i-2) + par.substr(closepos+1, parlen - closepos-1);
+				}
+				else {
+					result = par.substr(i+2, parlen-i-2);
+				}
+			}
+		}
+		else if (i > 0) {
+			// skip '}{' after the language spec
+			int closepos = findclosing(par, i+2, par.length());
+			size_t insertpos = par.find(langstart, i+2);
+			if (closepos < 0) {
+				if (insertpos == string::npos) {
+					// there are no closing in par, and no next lang spec
+					result = par.substr(0, parlen) + "}";
+				}
+				else {
+					// Add '}' at insertpos only, because closing is missing
+					result = par.substr(0,insertpos) + "}" + par.substr(insertpos, parlen-insertpos);
+				}
+			}
+			else if ((size_t) closepos > insertpos) {
+				// Add '}' at insertpos and remove from closepos if closepos > insertpos
+				result = par.substr(0,insertpos) + "}" + par.substr(insertpos, closepos - insertpos) + par.substr(closepos+1, parlen -closepos-1);
+			}
+		}
+		else {
+			result = par;
+			// For i == 0, it is empty language spec
+			// and for i < 0 it is Error
+		}
+	}
+	else {
+		if (from_regex) {
+			removefirstlang = true;
+		}
+	}
+	// remove possible \inputencoding entries
+	while (regex_replace(result, result, "\\\\inputencoding\\{[^\\}]*}", ""))
+		;
+	// Either not found language spec,or is single and closed spec or empty
+	return(result);
+}
+

 // Remove trailing closure of math, macros and environments, so to catch parts of them.
 static int identifyClosing(string & t)
@ -887,6 +1041,8 @@ MatchStringAdv::MatchStringAdv(lyx::Buffer & buf, FindAndReplaceOptions const &
 	close_wildcards = 0;

 	size_t lead_size = 0;
+	// correct the language settings
+	par_as_string = correctlanguagesetting(par_as_string, true, !opt.ignoreformat);
 	if (opt.ignoreformat) {
 		if (!use_regexp) {
 			// if par_as_string_nolead were emty,
@ -897,6 +1053,7 @@ MatchStringAdv::MatchStringAdv(lyx::Buffer & buf, FindAndReplaceOptions const &
 		}
 	} else {
 		lead_size = identifyLeading(par_as_string);
+		LYXERR(Debug::FIND, "Lead_size: " << lead_size);
 		lead_as_string = par_as_string.substr(0, lead_size);
 		par_as_string_nolead = par_as_string.substr(lead_size, par_as_string.size() - lead_size);
 	}
@ -985,6 +1142,7 @@ int MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_begin) con

 	docstring docstr = stringifyFromForSearch(opt, cur, len);
 	string str = normalize(docstr, true);
+	if (str.empty()) return(-1);
 	LYXERR(Debug::FIND, "Matching against     '" << lyx::to_utf8(docstr) << "'");
 	LYXERR(Debug::FIND, "After normalization: '" << str << "'");

@ -1108,9 +1266,10 @@ string MatchStringAdv::normalize(docstring const & s, bool hack_braces) const
 	// Kornel: Added textsl, textsf, textit, texttt and noun
 	// + allow to seach for colored text too
 	LYXERR(Debug::FIND, "Removing stale empty \\emph{}, \\textbf{}, \\*section{} macros from: " << t);
-	while (regex_replace(t, t, "\\\\(emph|noun|text(bf|sl|sf|it|tt|color\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave|subsubsection|subsection|section|subparagraph|paragraph|part)(\\{\\})+", ""))
+	while (regex_replace(t, t, "\\\\((emph|noun|text(bf|sl|sf|it|tt|color\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave)|((sub)?(((sub)?section)|paragraph)|part)\\*?)(\\{\\})+", ""))
 		LYXERR(Debug::FIND, "  further removing stale empty \\emph{}, \\textbf{} macros from: " << t);

+	while (regex_replace(t, t, "\\\\foreignlanguage\\{[a-z]+\\}(\\{(\\\\item )?\\})+", ""));
 	// FIXME - check what preceeds the brace
 	if (hack_braces) {
 		if (opt.ignoreformat)
@ -1185,6 +1344,7 @@ docstring latexifyFromCursor(DocIterator const & cur, int len)
 	runparams.linelen = 8000; //lyxrc.plaintext_linelen;
 	// No side effect of file copying and image conversion
 	runparams.dryrun = true;
+	runparams.for_search = true;

 	if (cur.inTexted()) {
 		// @TODO what about searching beyond/across paragraph breaks ?
@ -1194,6 +1354,9 @@ docstring latexifyFromCursor(DocIterator const & cur, int len)
 		TeXOnePar(buf, *cur.innerText(), cur.pit(), os, runparams,
 			  string(), cur.pos(), endpos);
 		LYXERR(Debug::FIND, "Latexified text: '" << lyx::to_utf8(ods.str()) << "'");
+		string s = correctlanguagesetting(lyx::to_utf8(ods.str()), false, false);
+		LYXERR(Debug::FIND, "Latexified text: '" << s << "'");
+		return(lyx::from_utf8(s));
 	} else if (cur.inMathed()) {
 		// Retrieve the math environment type, and add '$' or '$[' or others (\begin{equation}) accordingly
 		for (int s = cur.depth() - 1; s >= 0; --s) {
@ -1259,12 +1422,13 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
 	if (cur.pos() + len > cur.lastpos())
 		return 0;
 	LYXERR(Debug::FIND, "verifying unmatch with len = " << len);
-	while (cur.pos() + len <= cur.lastpos() && match(cur, len) == 0) {
+	while (cur.pos() + len <= cur.lastpos() && match(cur, len) <= 0) {
 		++len;
 		LYXERR(Debug::FIND, "verifying unmatch with len = " << len);
 	}
 	// Length of matched text (different from len param)
 	int old_len = match(cur, len);
+	if (old_len < 0) old_len = 0;
 	int new_len;
 	// Greedy behaviour while matching regexps
 	while ((new_len = match(cur, len + 1)) > old_len) {
@ -1281,27 +1445,46 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv & match)
 {
 	if (!cur)
 		return 0;
+	static int max_missed = 0;
 	while (!theApp()->longOperationCancelled() && cur) {
 		LYXERR(Debug::FIND, "findForwardAdv() cur: " << cur);
 		int match_len = match(cur, -1, false);
 		LYXERR(Debug::FIND, "match_len: " << match_len);
-		if (match_len) {
+		if (match_len > 0) {
+			int count = 0;
+			int match_len_zero_count = 0;
 			for (; !theApp()->longOperationCancelled() && cur; cur.forwardPos()) {
 				LYXERR(Debug::FIND, "Advancing cur: " << cur);
 				int match_len2 = match(cur);
-				LYXERR(Debug::FIND, "match_len: " << match_len2);
-				if (match_len2) {
+				LYXERR(Debug::FIND, "match_len2: " << match_len2);
+				if (match_len2 > 0) {
 					// Sometimes in finalize we understand it wasn't a match
 					// and we need to continue the outest loop
 					int len = findAdvFinalize(cur, match);
-					if (len > 0)
+					if (len > 0) {
 						return len;
 					}
 				}
+				if (match_len2 >= 0) {
+					count = 0;
+					if (match_len2 == 0)
+						match_len_zero_count++;
+					else
+						match_len_zero_count = 0;
+				}
+				else {
+					count++;
+					if (count > max_missed) max_missed = count;
+					if (count > 5) {
+						LYXERR(Debug::FIND, "match_len2_zero_count: " << match_len_zero_count << ", match_len was " << match_len);
+						break;
+					}
+				}
+			}
 			if (!cur)
 				return 0;
 		}
-		if (cur.pit() < cur.lastpit()) {
+		if (match_len >= 0 && cur.pit() < cur.lastpit()) {
 			LYXERR(Debug::FIND, "Advancing par: cur=" << cur);
 			cur.forwardPar();
 		} else {
@ -1393,8 +1576,8 @@ int findBackwardsAdv(DocIterator & cur, MatchStringAdv & match)
 docstring stringifyFromForSearch(FindAndReplaceOptions const & opt,
 				 DocIterator const & cur, int len)
 {
-	LASSERT(cur.pos() >= 0 && cur.pos() <= cur.lastpos(),
-	        return docstring());
+	if (cur.pos() < 0 || cur.pos() > cur.lastpos())
+	        return docstring();
 	if (!opt.ignoreformat)
 		return latexifyFromCursor(cur, len);
 	else
--- a/src/output_latex.cpp
+++ b/src/output_latex.cpp
@ -814,7 +814,9 @@ void TeXOnePar(Buffer const & buf,
 			    || (priorpar->getDepth() == par.getDepth()
 				    && priorpar->layout() != par.layout()));
 	Language const * const prev_language =
-		(priorpar && !priorpar->isPassThru())
+		runparams_in.for_search ?
+			languages.getLanguage("ignore")
+		:(priorpar && !priorpar->isPassThru())
 			? (use_prev_env_language ? state->prev_env_language_
 					 	: priorpar->getParLanguage(bparams))
 			: outer_language;
@ -854,7 +856,8 @@ void TeXOnePar(Buffer const & buf,
 		&& runparams.local_font != 0
 		&& outer_language->rightToLeft()
 		&& !par_language->rightToLeft();
-	bool const localswitch = text.inset().forceLocalFontSwitch()
+	bool const localswitch = runparams_in.for_search
+			|| text.inset().forceLocalFontSwitch()
 			|| (using_begin_end && text.inset().forcePlainLayout())
 			|| in_polyglossia_rtl_env;
 	if (localswitch) {