From b78bdf80a8e1954e1075500d896d95cab10beff1 Mon Sep 17 00:00:00 2001
From: Kornel Benko <kornel@lyx.org>
Date: Fri, 5 Oct 2018 20:26:44 +0200
Subject: [PATCH] Added better handling for languages and colors for advanced
 F&R

The change is significant if the search format is not disabled.
We try to analyze the pattern string first to get needed features
for the search.
We try to analyse the searched string and if it does not
contain all expected featers (color, language, char style, char decoration)

Still some problems though
---
 src/lyxfind.cpp      | 205 ++++++++++++++++++++++++++++++++++++++++---
 src/output_latex.cpp |  13 +--
 2 files changed, 202 insertions(+), 16 deletions(-)
diff --git a/src/lyxfind.cpp b/src/lyxfind.cpp
index 936ea2410e..cbb9289c8a 100644
--- a/src/lyxfind.cpp
+++ b/src/lyxfind.cpp
@@ -52,6 +52,7 @@
 #include "support/lstrings.h"
 
 #include "support/regex.h"
+#include <map>
 
 using namespace std;
 using namespace lyx::support;
@@ -799,6 +800,7 @@ static docstring buffer_to_latex(Buffer & buffer)
 	runparams.linelen = 80; //lyxrc.plaintext_linelen;
 	// No side effect of file copying and image conversion
 	runparams.dryrun = true;
+	runparams.for_search = true;
 	pit_type const endpit = buffer.paragraphs().size();
 	for (pit_type pit = 0; pit != endpit; ++pit) {
 		TeXOnePar(buffer, buffer.text(), pit, os, runparams);
@@ -843,15 +845,167 @@ static size_t identifyLeading(string const & s)
 	// @TODO Support \item[text]
 	// Kornel: Added textsl, textsf, textit, texttt and noun
 	// + allow to seach for colored text too
-	while (regex_replace(t, t, REGEX_BOS "\\\\(emph|noun|text(bf|sl|sf|it|tt|color\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave|subsubsection|subsection|section|subparagraph|paragraph|part)\\*?\\{", "")
+	while (regex_replace(t, t, REGEX_BOS "\\\\(((emph|noun|text(bf|sl|sf|it|tt))|((textcolor|foreignlanguage)\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave)|((sub)?(((sub)?section)|paragraph)|part)\\*?)\\{", "")
 	       || regex_replace(t, t, REGEX_BOS "\\$", "")
 	       || regex_replace(t, t, REGEX_BOS "\\\\\\[ ", "")
 	       || regex_replace(t, t, REGEX_BOS "\\\\item ", "")
 	       || regex_replace(t, t, REGEX_BOS "\\\\begin\\{[a-zA-Z_]*\\*?\\} ", ""))
-		LYXERR(Debug::FIND, "  after removing leading $, \\[ , \\emph{, \\textbf{, etc.: '" << t << "'");
+	       ;
+	LYXERR(Debug::FIND, "  after removing leading $, \\[ , \\emph{, \\textbf{, etc.: '" << t << "'");
 	return s.find(t);
 }
 
+typedef map<string, bool> Features;
+
+static Features identifyFeatures(string const & s)
+{
+	static regex const feature("\\\\(([a-z]+(\\{([a-z]+)\\}|\\*)?))\\{");
+	static regex const valid("^(((emph|noun|text(bf|sl|sf|it|tt)|(textcolor|foreignlanguage)\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave)|((sub)?(((sub)?section)|paragraph)|part)\\*?)$");
+	smatch sub;
+	bool displ = true;
+	Features info;
+
+	for (sregex_iterator it(s.begin(), s.end(), feature), end; it != end; ++it) {
+		sub = *it;
+		if (displ) {
+			if (sub.str(1).compare("regexp") == 0) {
+				displ = false;
+				continue;
+			}
+			string token = sub.str(1);
+			smatch sub2;
+			if (regex_match(token, sub2, valid)) {
+				info[token] = true;
+			}
+			else {
+				// ignore
+			}
+		}
+		else {
+			if (sub.str(1).compare("endregexp") == 0) {
+				displ = true;
+				continue;
+			}
+		}
+	}
+	return(info);
+}
+
+static int findclosing(string p, int start, int end)
+{
+	int skip = 0;
+	int depth = 0;
+	for (int i = start; i < end; i += 1 + skip) {
+		char c;
+		c = p[i];
+		skip = 0;
+		if (c == '\\') skip = 1;
+		else if (c == '{') depth++;
+		else if (c == '}') {
+			if (depth == 0) return(i);
+			--depth;
+		}
+	}
+	return(-1);
+}
+
+
+static string correctlanguagesetting(string par, bool from_regex, bool withformat)
+{
+	static string langstart = "\\foreignlanguage{";
+	static int llen = langstart.length();
+	static bool removefirstlang = false;
+	static Features regex_f;
+	static int missed = 0;
+	static bool regex_with_format = false;
+
+	int parlen = par.length();
+	string result = par;
+
+	while ((parlen > 0) && (par[parlen-1] == '\n')) {
+		parlen--;
+	}
+	if (from_regex) {
+		missed = 0;
+		if (withformat) {
+			regex_f = identifyFeatures(par);
+			for (auto it = regex_f.cbegin(); it != regex_f.cend(); ++it) {
+				string a = it->first;
+				regex_with_format = true;
+				// LYXERR0("Identified regex format:" << a);
+			}
+
+		}
+	} else if (regex_with_format) {
+		Features info = identifyFeatures(par);
+		for (auto it = regex_f.cbegin(); it != regex_f.cend(); ++it) {
+			string a = it->first;
+			bool b = it->second;
+			if (b && ! info[a]) {
+				missed++;
+				// LYXERR0("Missed(" << missed << ", srclen = " << parlen );
+				return("");
+			}
+		}
+	}
+	else {
+		// LYXERR0("No regex formats");
+	}
+	if (par.compare(0, llen, langstart) == 0) {
+		if (from_regex) {
+			removefirstlang = false;
+		}
+		int i = findclosing(par, llen, par.length());
+		if (removefirstlang) {
+			if (i < 0)
+				result = "";
+			else {
+				int closepos = findclosing(par, i+2, par.length());
+				if (closepos > 0) {
+					result = par.substr(i+2, closepos-i-2) + par.substr(closepos+1, parlen - closepos-1);
+				}
+				else {
+					result = par.substr(i+2, parlen-i-2);
+				}
+			}
+		}
+		else if (i > 0) {
+			// skip '}{' after the language spec
+			int closepos = findclosing(par, i+2, par.length());
+			size_t insertpos = par.find(langstart, i+2);
+			if (closepos < 0) {
+				if (insertpos == string::npos) {
+					// there are no closing in par, and no next lang spec
+					result = par.substr(0, parlen) + "}";
+				}
+				else {
+					// Add '}' at insertpos only, because closing is missing
+					result = par.substr(0,insertpos) + "}" + par.substr(insertpos, parlen-insertpos);
+				}
+			}
+			else if ((size_t) closepos > insertpos) {
+				// Add '}' at insertpos and remove from closepos if closepos > insertpos
+				result = par.substr(0,insertpos) + "}" + par.substr(insertpos, closepos - insertpos) + par.substr(closepos+1, parlen -closepos-1);
+			}
+		}
+		else {
+			result = par;
+			// For i == 0, it is empty language spec
+			// and for i < 0 it is Error
+		}
+	}
+	else {
+		if (from_regex) {
+			removefirstlang = true;
+		}
+	}
+	// remove possible \inputencoding entries
+	while (regex_replace(result, result, "\\\\inputencoding\\{[^\\}]*}", ""))
+		;
+	// Either not found language spec,or is single and closed spec or empty
+	return(result);
+}
+
 
 // Remove trailing closure of math, macros and environments, so to catch parts of them.
 static int identifyClosing(string & t)
@@ -887,6 +1041,8 @@ MatchStringAdv::MatchStringAdv(lyx::Buffer & buf, FindAndReplaceOptions const &
 	close_wildcards = 0;
 
 	size_t lead_size = 0;
+	// correct the language settings
+	par_as_string = correctlanguagesetting(par_as_string, true, !opt.ignoreformat);
 	if (opt.ignoreformat) {
 		if (!use_regexp) {
 			// if par_as_string_nolead were emty,
@@ -897,6 +1053,7 @@ MatchStringAdv::MatchStringAdv(lyx::Buffer & buf, FindAndReplaceOptions const &
 		}
 	} else {
 		lead_size = identifyLeading(par_as_string);
+		LYXERR(Debug::FIND, "Lead_size: " << lead_size);
 		lead_as_string = par_as_string.substr(0, lead_size);
 		par_as_string_nolead = par_as_string.substr(lead_size, par_as_string.size() - lead_size);
 	}
@@ -985,6 +1142,7 @@ int MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_begin) con
 
 	docstring docstr = stringifyFromForSearch(opt, cur, len);
 	string str = normalize(docstr, true);
+	if (str.empty()) return(-1);
 	LYXERR(Debug::FIND, "Matching against     '" << lyx::to_utf8(docstr) << "'");
 	LYXERR(Debug::FIND, "After normalization: '" << str << "'");
 
@@ -1108,9 +1266,10 @@ string MatchStringAdv::normalize(docstring const & s, bool hack_braces) const
 	// Kornel: Added textsl, textsf, textit, texttt and noun
 	// + allow to seach for colored text too
 	LYXERR(Debug::FIND, "Removing stale empty \\emph{}, \\textbf{}, \\*section{} macros from: " << t);
-	while (regex_replace(t, t, "\\\\(emph|noun|text(bf|sl|sf|it|tt|color\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave|subsubsection|subsection|section|subparagraph|paragraph|part)(\\{\\})+", ""))
+	while (regex_replace(t, t, "\\\\((emph|noun|text(bf|sl|sf|it|tt|color\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave)|((sub)?(((sub)?section)|paragraph)|part)\\*?)(\\{\\})+", ""))
 		LYXERR(Debug::FIND, "  further removing stale empty \\emph{}, \\textbf{} macros from: " << t);
 
+	while (regex_replace(t, t, "\\\\foreignlanguage\\{[a-z]+\\}(\\{(\\\\item )?\\})+", ""));
 	// FIXME - check what preceeds the brace
 	if (hack_braces) {
 		if (opt.ignoreformat)
@@ -1185,6 +1344,7 @@ docstring latexifyFromCursor(DocIterator const & cur, int len)
 	runparams.linelen = 8000; //lyxrc.plaintext_linelen;
 	// No side effect of file copying and image conversion
 	runparams.dryrun = true;
+	runparams.for_search = true;
 
 	if (cur.inTexted()) {
 		// @TODO what about searching beyond/across paragraph breaks ?
@@ -1194,6 +1354,9 @@ docstring latexifyFromCursor(DocIterator const & cur, int len)
 		TeXOnePar(buf, *cur.innerText(), cur.pit(), os, runparams,
 			  string(), cur.pos(), endpos);
 		LYXERR(Debug::FIND, "Latexified text: '" << lyx::to_utf8(ods.str()) << "'");
+		string s = correctlanguagesetting(lyx::to_utf8(ods.str()), false, false);
+		LYXERR(Debug::FIND, "Latexified text: '" << s << "'");
+		return(lyx::from_utf8(s));
 	} else if (cur.inMathed()) {
 		// Retrieve the math environment type, and add '$' or '$[' or others (\begin{equation}) accordingly
 		for (int s = cur.depth() - 1; s >= 0; --s) {
@@ -1259,12 +1422,13 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
 	if (cur.pos() + len > cur.lastpos())
 		return 0;
 	LYXERR(Debug::FIND, "verifying unmatch with len = " << len);
-	while (cur.pos() + len <= cur.lastpos() && match(cur, len) == 0) {
+	while (cur.pos() + len <= cur.lastpos() && match(cur, len) <= 0) {
 		++len;
 		LYXERR(Debug::FIND, "verifying unmatch with len = " << len);
 	}
 	// Length of matched text (different from len param)
 	int old_len = match(cur, len);
+	if (old_len < 0) old_len = 0;
 	int new_len;
 	// Greedy behaviour while matching regexps
 	while ((new_len = match(cur, len + 1)) > old_len) {
@@ -1281,27 +1445,46 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv & match)
 {
 	if (!cur)
 		return 0;
+	static int max_missed = 0;
 	while (!theApp()->longOperationCancelled() && cur) {
 		LYXERR(Debug::FIND, "findForwardAdv() cur: " << cur);
 		int match_len = match(cur, -1, false);
 		LYXERR(Debug::FIND, "match_len: " << match_len);
-		if (match_len) {
+		if (match_len > 0) {
+			int count = 0;
+			int match_len_zero_count = 0;
 			for (; !theApp()->longOperationCancelled() && cur; cur.forwardPos()) {
 				LYXERR(Debug::FIND, "Advancing cur: " << cur);
 				int match_len2 = match(cur);
-				LYXERR(Debug::FIND, "match_len: " << match_len2);
-				if (match_len2) {
+				LYXERR(Debug::FIND, "match_len2: " << match_len2);
+				if (match_len2 > 0) {
 					// Sometimes in finalize we understand it wasn't a match
 					// and we need to continue the outest loop
 					int len = findAdvFinalize(cur, match);
-					if (len > 0)
+					if (len > 0) {
 						return len;
+					}
+				}
+				if (match_len2 >= 0) {
+					count = 0;
+					if (match_len2 == 0)
+						match_len_zero_count++;
+					else
+						match_len_zero_count = 0;
+				}
+				else {
+					count++;
+					if (count > max_missed) max_missed = count;
+					if (count > 5) {
+						LYXERR(Debug::FIND, "match_len2_zero_count: " << match_len_zero_count << ", match_len was " << match_len);
+						break;
+					}
 				}
 			}
 			if (!cur)
 				return 0;
 		}
-		if (cur.pit() < cur.lastpit()) {
+		if (match_len >= 0 && cur.pit() < cur.lastpit()) {
 			LYXERR(Debug::FIND, "Advancing par: cur=" << cur);
 			cur.forwardPar();
 		} else {
@@ -1393,8 +1576,8 @@ int findBackwardsAdv(DocIterator & cur, MatchStringAdv & match)
 docstring stringifyFromForSearch(FindAndReplaceOptions const & opt,
 				 DocIterator const & cur, int len)
 {
-	LASSERT(cur.pos() >= 0 && cur.pos() <= cur.lastpos(),
-	        return docstring());
+	if (cur.pos() < 0 || cur.pos() > cur.lastpos())
+	        return docstring();
 	if (!opt.ignoreformat)
 		return latexifyFromCursor(cur, len);
 	else
diff --git a/src/output_latex.cpp b/src/output_latex.cpp
index f73990de21..96f51df5d3 100644
--- a/src/output_latex.cpp
+++ b/src/output_latex.cpp
@@ -814,10 +814,12 @@ void TeXOnePar(Buffer const & buf,
 			    || (priorpar->getDepth() == par.getDepth()
 				    && priorpar->layout() != par.layout()));
 	Language const * const prev_language =
-		(priorpar && !priorpar->isPassThru())
-		? (use_prev_env_language ? state->prev_env_language_
-					 : priorpar->getParLanguage(bparams))
-		: outer_language;
+		runparams_in.for_search ?
+			languages.getLanguage("ignore")
+		:(priorpar && !priorpar->isPassThru())
+			? (use_prev_env_language ? state->prev_env_language_
+					 	: priorpar->getParLanguage(bparams))
+			: outer_language;
 
 	bool const use_polyglossia = runparams.use_polyglossia;
 	string const par_lang = use_polyglossia ?
@@ -854,7 +856,8 @@ void TeXOnePar(Buffer const & buf,
 		&& runparams.local_font != 0
 		&& outer_language->rightToLeft()
 		&& !par_language->rightToLeft();
-	bool const localswitch = text.inset().forceLocalFontSwitch()
+	bool const localswitch = runparams_in.for_search
+			|| text.inset().forceLocalFontSwitch()
 			|| (using_begin_end && text.inset().forcePlainLayout())
 			|| in_polyglossia_rtl_env;
 	if (localswitch) {