Amend(2) 8c67cb8c: (FindAdv: Try to make regex search with format enabled somehow faster)

Use innermost nesting to start searches.
Some fine tuning to determine correct match.
(If the regex contains '(\S)\1' at the end, then this regex would match '}}',
but this is often the case at and of examined string. We have to disable this invalid match.
)
This commit is contained in:
Kornel Benko 2021-01-04 07:16:59 +01:00
parent db36ec1ccf
commit 6750724717

View File

@ -745,6 +745,7 @@ string escape_for_regex(string s, bool match_latex)
new_pos = s.size(); new_pos = s.size();
string t; string t;
if (new_pos > pos) { if (new_pos > pos) {
// outside regexp
LYXERR(Debug::FIND, "new_pos: " << new_pos); LYXERR(Debug::FIND, "new_pos: " << new_pos);
t = apply_escapes(s.substr(pos, new_pos - pos), get_lyx_unescapes()); t = apply_escapes(s.substr(pos, new_pos - pos), get_lyx_unescapes());
LYXERR(Debug::FIND, "t [lyx]: " << t); LYXERR(Debug::FIND, "t [lyx]: " << t);
@ -875,6 +876,12 @@ static MatchResult::range interpretMatch(MatchResult &oldres, MatchResult &newre
return MatchResult::newIsTooFar; return MatchResult::newIsTooFar;
if ((newres.match_len == oldres.match_len) && (newres.match2end == oldres.match2end)) if ((newres.match_len == oldres.match_len) && (newres.match2end == oldres.match2end))
return MatchResult::newIsBetter; return MatchResult::newIsBetter;
if ((newres.match_len == oldres.match_len) && (newres.match2end -2 == oldres.match2end)) {
// The string contained for instance "\usepackage...fontenc ..."
// and now after moved 9 char forward contains "ge...{fontenc} ..."
// so we accept it as OK
return MatchResult::newIsBetter;
}
return MatchResult::newIsInvalid; return MatchResult::newIsInvalid;
} }
@ -947,6 +954,7 @@ private:
// number of (.*?) subexpressions added at end of search regexp for closing // number of (.*?) subexpressions added at end of search regexp for closing
// environments, math mode, styles, etc... // environments, math mode, styles, etc...
int close_wildcards; int close_wildcards;
public:
// Are we searching with regular expressions ? // Are we searching with regular expressions ?
bool use_regexp; bool use_regexp;
}; };
@ -1415,6 +1423,7 @@ static void buildAccentsMap()
accents["LaTeX"] = getutf8(0xf0012); accents["LaTeX"] = getutf8(0xf0012);
accents["latexe"] = getutf8(0xf0013); accents["latexe"] = getutf8(0xf0013);
accents["LaTeXe"] = getutf8(0xf0013); accents["LaTeXe"] = getutf8(0xf0013);
accents["lyxarrow"] = getutf8(0xf0020);
accents["backslash lyx"] = getutf8(0xf0010); // Used logos inserted with starting \backslash accents["backslash lyx"] = getutf8(0xf0010); // Used logos inserted with starting \backslash
accents["backslash LyX"] = getutf8(0xf0010); accents["backslash LyX"] = getutf8(0xf0010);
accents["backslash tex"] = getutf8(0xf0011); accents["backslash tex"] = getutf8(0xf0011);
@ -1423,6 +1432,7 @@ static void buildAccentsMap()
accents["backslash LaTeX"] = getutf8(0xf0012); accents["backslash LaTeX"] = getutf8(0xf0012);
accents["backslash latexe"] = getutf8(0xf0013); accents["backslash latexe"] = getutf8(0xf0013);
accents["backslash LaTeXe"] = getutf8(0xf0013); accents["backslash LaTeXe"] = getutf8(0xf0013);
accents["backslash lyxarrow"] = getutf8(0xf0020);
accents["ddot{\\imath}"] = "ï"; accents["ddot{\\imath}"] = "ï";
buildaccent("ddot", "aAeEhHiIioOtuUwWxXyY", buildaccent("ddot", "aAeEhHiIioOtuUwWxXyY",
"äÄëËḧḦïÏïöÖẗüÜẅẄẍẌÿŸ"); // umlaut "äÄëËḧḦïÏïöÖẗüÜẅẄẍẌÿŸ"); // umlaut
@ -1488,7 +1498,7 @@ void Intervall::removeAccents()
buildAccentsMap(); buildAccentsMap();
static regex const accre("\\\\(([\\S]|grave|breve|ddot|dot|acute|dacute|mathring|check|hat|bar|tilde|subdot|ogonek|" static regex const accre("\\\\(([\\S]|grave|breve|ddot|dot|acute|dacute|mathring|check|hat|bar|tilde|subdot|ogonek|"
"cedilla|subring|textsubring|subhat|textsubcircum|subtilde|textsubtilde|dgrave|textdoublegrave|rcap|textroundcap|slashed)\\{[^\\{\\}]+\\}" "cedilla|subring|textsubring|subhat|textsubcircum|subtilde|textsubtilde|dgrave|textdoublegrave|rcap|textroundcap|slashed)\\{[^\\{\\}]+\\}"
"|((i|imath|jmath|cdot|[a-z]+space)|((backslash )?([lL]y[xX]|[tT]e[xX]|[lL]a[tT]e[xX]e?)))(?![a-zA-Z]))"); "|((i|imath|jmath|cdot|[a-z]+space)|((backslash )?([lL]y[xX]|[tT]e[xX]|[lL]a[tT]e[xX]e?|lyxarrow)))(?![a-zA-Z]))");
smatch sub; smatch sub;
for (sregex_iterator itacc(par.begin(), par.end(), accre), end; itacc != end; ++itacc) { for (sregex_iterator itacc(par.begin(), par.end(), accre), end; itacc != end; ++itacc) {
sub = *itacc; sub = *itacc;
@ -2024,6 +2034,7 @@ void LatexInfo::buildEntries(bool isPatternString)
if (interval_.par[optend] != '{') { if (interval_.par[optend] != '{') {
closings = 0; closings = 0;
found.parenthesiscount = 0; found.parenthesiscount = 0;
found.head = "\\" + key;
} }
else else
closings = found.parenthesiscount; closings = found.parenthesiscount;
@ -2042,9 +2053,17 @@ void LatexInfo::buildEntries(bool isPatternString)
found._tokensize = found.head.length(); found._tokensize = found.head.length();
found._dataStart = found._tokenstart + found.head.length(); found._dataStart = found._tokenstart + found.head.length();
if (found.keytype == KeyInfo::doRemove) { if (found.keytype == KeyInfo::doRemove) {
int endpar = 2 + interval_.findclosing(found._dataStart, interval_.par.length(), '{', '}', closings); if (closings > 0) {
found._dataStart = endpar; size_t endpar = 2 + interval_.findclosing(found._dataStart, interval_.par.length(), '{', '}', closings);
found._tokensize = found._dataStart - found._tokenstart; if (endpar >= interval_.par.length())
found._dataStart = interval_.par.length();
else
found._dataStart = endpar;
found._tokensize = found._dataStart - found._tokenstart;
}
else {
found._dataStart = found._tokenstart + found._tokensize;
}
closings = 0; closings = 0;
} }
if (interval_.par.substr(found._dataStart-1, 15).compare("\\endarguments{}") == 0) { if (interval_.par.substr(found._dataStart-1, 15).compare("\\endarguments{}") == 0) {
@ -2420,8 +2439,11 @@ int LatexInfo::dispatch(ostringstream &os, int previousStart, KeyInfo &actual)
} }
case KeyInfo::isSize: { case KeyInfo::isSize: {
if (actual.disabled || (interval_.par[actual._dataStart] != '{') || (interval_.par[actual._dataStart-1] == ' ')) { if (actual.disabled || (interval_.par[actual._dataStart] != '{') || (interval_.par[actual._dataStart-1] == ' ')) {
processRegion(actual._dataEnd, actual._dataEnd+1); /* remove possibly following {} */ if (actual.parenthesiscount == 0)
interval_.addIntervall(actual._tokenstart, actual._dataEnd+1); interval_.addIntervall(actual._tokenstart, actual._dataEnd);
else {
interval_.addIntervall(actual._tokenstart, actual._dataEnd+1);
}
nextKeyIdx = getNextKey(); nextKeyIdx = getNextKey();
} else { } else {
// Here _dataStart points to '{', so correct it // Here _dataStart points to '{', so correct it
@ -3050,7 +3072,7 @@ MatchStringAdv::MatchStringAdv(lyx::Buffer & buf, FindAndReplaceOptions const &
} }
} }
#if 0
// Count number of characters in string // Count number of characters in string
// {]} ==> 1 // {]} ==> 1
// \& ==> 1 // \& ==> 1
@ -3110,6 +3132,7 @@ static int computeSize(string s, int len)
} }
return count; return count;
} }
#endif
MatchResult MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_begin) const MatchResult MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_begin) const
{ {
@ -3213,19 +3236,61 @@ MatchResult MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_be
result = 0; result = 0;
#if QTSEARCH #if QTSEARCH
mres.match_prefix = match.capturedEnd(2) - match.capturedStart(2); mres.match_prefix = match.capturedEnd(2) - match.capturedStart(2);
// mres.match_len = computeSize(QStringRef(&qstr, pos+leadingsize,result), result) - mres.match_prefix;
mres.match_len = match.capturedEnd(0) - match.capturedEnd(2); mres.match_len = match.capturedEnd(0) - match.capturedEnd(2);
// mres.match2end = qstr.size() - pos - leadingsize - mres.match_prefix; // because of different number of closing at end of string
mres.match2end = qstr.size() - match.capturedEnd(0); // we have to 'unify' the length of the post-match.
// Done by ignoring closing parenthesis and linefeeds at string end
int matchend = match.capturedEnd(0);
while (mres.match_len > 0) {
QChar c = qstr.at(matchend - 1);
if ((c == '\n') || (c == '}') || (c == '{')) {
mres.match_len--;
matchend--;
}
else
break;
}
size_t strsize = qstr.size();
while (strsize > (size_t) match.capturedEnd(0)) {
QChar c = qstr.at(strsize-1);
if ((c == '\n') || (c == '}')) {
--strsize;
}
else
break;
}
// LYXERR0(qstr.toStdString());
mres.match2end = strsize - matchend;
mres.pos = match.capturedStart(2); mres.pos = match.capturedStart(2);
#else #else
mres.match_prefix = m[2].second - m[2].first; mres.match_prefix = m[2].second - m[2].first;
// mres.match_len = computeSize(str.substr(pos+leadingsize,result), result) - mres.match_prefix;
mres.match_len = m[0].second - m[2].second; mres.match_len = m[0].second - m[2].second;
// mres.match2end = str.size() - pos - leadingsize - mres.match_prefix; // ignore closing parenthesis and linefeeds at string end
mres.match2end = str.size() - m[0].second; size_t strend = m[0].second - m[0].first;
mres.pos = m[2].first; int matchend = strend;
while (mres.match_len > 0) {
char c = str.at(matchend - 1);
if ((c == '\n') || (c == '}') || (c == '{')) {
mres.match_len--;
matchend--;
}
else
break;
}
size_t strsize = str.size();
while (strsize > strend) {
if ((str.at(strsize-1) == '}') || (str.at(strsize-1) == '\n')) {
--strsize;
}
else
break;
}
// LYXERR0(str);
mres.match2end = strsize - matchend;
mres.pos = m[2].first - m[0].first;;
#endif #endif
if (mres.match2end < 0)
mres.match_len = 0;
mres.leadsize = leadingsize; mres.leadsize = leadingsize;
return mres; return mres;
} }
@ -3477,25 +3542,57 @@ docstring latexifyFromCursor(DocIterator const & cur, int len)
return ods.str(); return ods.str();
} }
#if 0
// Debugging output
static void displayMResult(MatchResult &mres, int increment)
{
LYXERR0( "pos: " << mres.pos << " increment " << increment);
LYXERR0( "leadsize: " << mres.leadsize);
LYXERR0( "match_len: " << mres.match_len);
LYXERR0( "match_prefix: " << mres.match_prefix);
LYXERR0( "match2end: " << mres.match2end);
}
#define displayMres(s,i) displayMResult(s,i);
#else
#define displayMres(s,i)
#endif
/** Finalize an advanced find operation, advancing the cursor to the innermost /** Finalize an advanced find operation, advancing the cursor to the innermost
** position that matches, plus computing the length of the matching text to ** position that matches, plus computing the length of the matching text to
** be selected ** be selected
**/ **/
int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match) int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match, int expected_len)
{ {
// Search the foremost position that matches (avoids find of entire math // Search the foremost position that matches (avoids find of entire math
// inset when match at start of it) // inset when match at start of it)
size_t d; size_t d;
DocIterator old_cur(cur.buffer()); DocIterator old_cur(cur.buffer());
MatchResult mres;
do { do {
LYXERR(Debug::FIND, "Forwarding one step (searching for innermost match)"); LYXERR(Debug::FIND, "Forwarding one step (searching for innermost match)");
d = cur.depth(); d = cur.depth();
old_cur = cur; old_cur = cur;
cur.forwardPos(); cur.forwardPos();
} while (cur && cur.depth() > d && match(cur).match_len > 0); if (!cur)
break;
if (cur.depth() > d)
continue;
if (cur.depth() == d)
break;
mres = match(cur);
displayMres(mres, 1);
if (expected_len > 0) {
if (mres.match_len < expected_len)
break;
}
else {
if (mres.match_len <= 0)
break;
}
} while (1);
cur = old_cur; cur = old_cur;
int max_match = match(cur).match_len; /* match valid only if not searching whole words */ mres = match(cur); /* match valid only if not searching whole words */
int max_match = mres.match_len;
if (max_match <= 0) return 0; if (max_match <= 0) return 0;
LYXERR(Debug::FIND, "Ok"); LYXERR(Debug::FIND, "Ok");
@ -3503,7 +3600,9 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
int len = 1; int len = 1;
if (cur.pos() + len > cur.lastpos()) if (cur.pos() + len > cur.lastpos())
return 0; return 0;
if (match.opt.matchword) { // regexp should use \w+, \S+, or \b(some string)\b
// to search for whole words
if (match.opt.matchword && !match.use_regexp) {
LYXERR(Debug::FIND, "verifying unmatch with len = " << len); LYXERR(Debug::FIND, "verifying unmatch with len = " << len);
while (cur.pos() + len <= cur.lastpos() && match(cur, len).match_len <= 0) { while (cur.pos() + len <= cur.lastpos() && match(cur, len).match_len <= 0) {
++len; ++len;
@ -3523,28 +3622,31 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
if (old_match == 0) if (old_match == 0)
len = 0; len = 0;
} }
else { else {
int minl = 1; int minl = 1;
int maxl = cur.lastpos() - cur.pos(); int maxl = cur.lastpos() - cur.pos();
// Greedy behaviour while matching regexps // Greedy behaviour while matching regexps
while (maxl > minl) { while (maxl > minl) {
int actual_match = match(cur, len).match_len; int actual_match = match(cur, len).match_len;
if (actual_match >= max_match) { if (actual_match >= max_match) {
// actual_match > max_match _can_ happen, // actual_match > max_match _can_ happen,
// if the search area splits // if the search area splits
// some following word so that the regex // some following word so that the regex
// (e.g. 'r.*r\b' matches 'r' from the middle of the // (e.g. 'r.*r\b' matches 'r' from the middle of the
// splitted word) // splitted word)
// This means, the len value is too big // This means, the len value is too big
maxl = len; maxl = len;
len = (int)((maxl + minl)/2); if (maxl - minl < 4)
} len = (int)((maxl + minl)/2);
else { else
// (actual_match < max_match) len = (int)(minl + (maxl - minl + 3)/4);
minl = len + 1; }
len = (int)((maxl + minl)/2); else {
} // (actual_match < max_match)
} minl = len + 1;
len = (int)((maxl + minl)/2);
}
}
old_cur = cur; old_cur = cur;
// Search for real start of matched characters // Search for real start of matched characters
while (len > 1) { while (len > 1) {
@ -3582,29 +3684,33 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
return len; return len;
} }
#if 0
static void displayMResult(MatchResult &mres)
{
LYXERR0( "pos: " << mres.pos);
LYXERR0( "leadsize: " << mres.leadsize);
LYXERR0( "match_len: " << mres.match_len);
LYXERR0( "match_prefix: " << mres.match_prefix);
LYXERR0( "match2end: " << mres.match2end);
}
#define displayMres(s) displayMResult(s);
#else
#define displayMres(s)
#endif
/// Finds forward /// Finds forward
int findForwardAdv(DocIterator & cur, MatchStringAdv const & match) int findForwardAdv(DocIterator & cur, MatchStringAdv const & match)
{ {
if (!cur) if (!cur)
return 0; return 0;
while (!theApp()->longOperationCancelled() && cur) { while (!theApp()->longOperationCancelled() && cur) {
{
// forward to
size_t d;
DocIterator old_cur(cur.buffer());
do {
d = cur.depth();
old_cur = cur;
cur.forwardPos();
if (!cur)
break;
if (cur.depth() > d)
continue;
if (cur.depth() == d)
break;
} while (1);
cur = old_cur;
}
LYXERR(Debug::FIND, "findForwardAdv() cur: " << cur); LYXERR(Debug::FIND, "findForwardAdv() cur: " << cur);
MatchResult mres = match(cur, -1, false); MatchResult mres = match(cur, -1, false);
displayMres(mres) displayMres(mres,-1)
int match_len = mres.match_len; int match_len = mres.match_len;
if ((mres.pos > 100000) || (mres.match2end > 100000) || (match_len > 100000)) { if ((mres.pos > 100000) || (mres.match2end > 100000) || (match_len > 100000)) {
LYXERR(Debug::INFO, "BIG LENGTHS: " << mres.pos << ", " << match_len << ", " << mres.match2end); LYXERR(Debug::INFO, "BIG LENGTHS: " << mres.pos << ", " << match_len << ", " << mres.match2end);
@ -3613,11 +3719,11 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv const & match)
if (match_len > 0) { if (match_len > 0) {
// Try to find the begin of searched string // Try to find the begin of searched string
int increment; int increment;
int firstInvalid = 100000; int firstInvalid = 100000;
if (mres.match_prefix + mres.pos - mres.leadsize > 0) if (mres.match_prefix + mres.pos - mres.leadsize > 1)
increment = (mres.match_prefix + mres.pos - mres.leadsize)/2; increment = (mres.match_prefix + mres.pos - mres.leadsize + 1)*3/4;
else else
increment = 10; increment = 10;
LYXERR(Debug::FIND, "Set increment to " << increment); LYXERR(Debug::FIND, "Set increment to " << increment);
while (increment > 0) { while (increment > 0) {
DocIterator old_cur = cur; DocIterator old_cur = cur;
@ -3631,38 +3737,56 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv const & match)
} }
else { else {
MatchResult mres2 = match(cur, -1, false); MatchResult mres2 = match(cur, -1, false);
displayMres(mres2) displayMres(mres2,increment)
switch (interpretMatch(mres, mres2)) { switch (interpretMatch(mres, mres2)) {
case MatchResult::newIsTooFar: case MatchResult::newIsTooFar:
// behind the expected match // behind the expected match
firstInvalid = increment; firstInvalid = increment;
cur = old_cur; cur = old_cur;
increment /= 2; increment /= 2;
break; break;
case MatchResult::newIsBetter: case MatchResult::newIsBetter:
// not reached yet // not reached ye, but cur.pos()+increment is bettert
mres = mres2; mres = mres2;
firstInvalid -= increment; firstInvalid -= increment;
if (increment > firstInvalid/2) if (increment > firstInvalid*3/4)
increment = firstInvalid/2; increment = firstInvalid*3/4;
break; if ((mres2.pos == mres2.leadsize) && (increment >= mres2.match_prefix)) {
default: if (increment >= mres2.match_prefix)
// Handle not like MatchResult::newIsTooFar increment = (mres2.match_prefix+1)*3/4;
LYXERR0( "Something is wrong: Increment = " << increment << " match_prefix = " << mres.match_prefix); }
firstInvalid--; break;
increment = firstInvalid -1; default:
cur = old_cur; // Todo@
break; // Handle not like MatchResult::newIsTooFar
} // LYXERR0( "Something is wrong: Increment = " << increment << " match_prefix = " << mres.match_prefix);
firstInvalid--;
increment = increment*3/4;
cur = old_cur;
break;
}
} }
} }
// LYXERR0("Leaving first loop"); // LYXERR0("Leaving first loop");
{
LYXERR(Debug::FIND, "Finalizing 1");
int len = findAdvFinalize(cur, match, mres.match_len);
if (len > 0)
return len;
else {
// try next possible match
cur.forwardPos();
continue;
}
}
// The following code is newer reached
// but parts of it may be needed in future
int match_len_zero_count = 0; int match_len_zero_count = 0;
MatchResult mres3; MatchResult mres3;
for (int i = 0; !theApp()->longOperationCancelled() && cur; cur.forwardPos()) { for (int i = 0; !theApp()->longOperationCancelled() && cur; cur.forwardPos()) {
if (i++ > 3) { if (i++ > 3) {
mres3 = match(cur, -1, false); mres3 = match(cur, -1, false);
displayMres(mres3) displayMres(mres3, 1)
int remaining_len = mres3.match_len; int remaining_len = mres3.match_len;
if (remaining_len <= 0) { if (remaining_len <= 0) {
// Apparently the searched string is not in the remaining part // Apparently the searched string is not in the remaining part
@ -3674,19 +3798,19 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv const & match)
} }
LYXERR(Debug::FIND, "Advancing cur: " << cur); LYXERR(Debug::FIND, "Advancing cur: " << cur);
mres3 = match(cur, 1); mres3 = match(cur, 1);
displayMres(mres3) displayMres(mres3, 1)
int match_len3 = mres3.match_len; int match_len3 = mres3.match_len;
if (match_len3 < 0) if (match_len3 < 0)
continue; continue;
mres3 = match(cur); mres3 = match(cur);
displayMres(mres3) displayMres(mres3, 1)
int match_len2 = mres3.match_len; int match_len2 = mres3.match_len;
LYXERR(Debug::FIND, "match_len2: " << match_len2); LYXERR(Debug::FIND, "match_len2: " << match_len2);
if (match_len2 > 0) { if (match_len2 > 0) {
// Sometimes in finalize we understand it wasn't a match // Sometimes in finalize we understand it wasn't a match
// and we need to continue the outest loop // and we need to continue the outest loop
LYXERR(Debug::FIND, "Finalizing"); LYXERR(Debug::FIND, "Finalizing 2");
int len = findAdvFinalize(cur, match); int len = findAdvFinalize(cur, match, mres.match_len);
if (len > 0) { if (len > 0) {
return len; return len;
} }
@ -3724,7 +3848,7 @@ int findMostBackwards(DocIterator & cur, MatchStringAdv const & match)
{ {
DocIterator cur_begin = doc_iterator_begin(cur.buffer()); DocIterator cur_begin = doc_iterator_begin(cur.buffer());
DocIterator tmp_cur = cur; DocIterator tmp_cur = cur;
int len = findAdvFinalize(tmp_cur, match); int len = findAdvFinalize(tmp_cur, match, -1);
Inset & inset = cur.inset(); Inset & inset = cur.inset();
for (; cur != cur_begin; cur.backwardPos()) { for (; cur != cur_begin; cur.backwardPos()) {
LYXERR(Debug::FIND, "findMostBackwards(): cur=" << cur); LYXERR(Debug::FIND, "findMostBackwards(): cur=" << cur);
@ -3732,7 +3856,7 @@ int findMostBackwards(DocIterator & cur, MatchStringAdv const & match)
new_cur.backwardPos(); new_cur.backwardPos();
if (new_cur == cur || &new_cur.inset() != &inset || !match(new_cur).match_len) if (new_cur == cur || &new_cur.inset() != &inset || !match(new_cur).match_len)
break; break;
int new_len = findAdvFinalize(new_cur, match); int new_len = findAdvFinalize(new_cur, match, -1);
if (new_len == len) if (new_len == len)
break; break;
len = new_len; len = new_len;