Amend(2) 8c67cb8c: (FindAdv: Try to make regex search with format enabled somehow faster)

Use innermost nesting to start searches.
Some fine tuning to determine correct match.
(If the regex contains '(\S)\1' at the end, then this regex would match '}}',
but this is often the case at and of examined string. We have to disable this invalid match.
)
This commit is contained in:
Kornel Benko 2021-01-04 07:16:59 +01:00
parent db36ec1ccf
commit 6750724717

View File

@ -745,6 +745,7 @@ string escape_for_regex(string s, bool match_latex)
new_pos = s.size();
string t;
if (new_pos > pos) {
// outside regexp
LYXERR(Debug::FIND, "new_pos: " << new_pos);
t = apply_escapes(s.substr(pos, new_pos - pos), get_lyx_unescapes());
LYXERR(Debug::FIND, "t [lyx]: " << t);
@ -875,6 +876,12 @@ static MatchResult::range interpretMatch(MatchResult &oldres, MatchResult &newre
return MatchResult::newIsTooFar;
if ((newres.match_len == oldres.match_len) && (newres.match2end == oldres.match2end))
return MatchResult::newIsBetter;
if ((newres.match_len == oldres.match_len) && (newres.match2end -2 == oldres.match2end)) {
// The string contained for instance "\usepackage...fontenc ..."
// and now after moved 9 char forward contains "ge...{fontenc} ..."
// so we accept it as OK
return MatchResult::newIsBetter;
}
return MatchResult::newIsInvalid;
}
@ -947,6 +954,7 @@ private:
// number of (.*?) subexpressions added at end of search regexp for closing
// environments, math mode, styles, etc...
int close_wildcards;
public:
// Are we searching with regular expressions ?
bool use_regexp;
};
@ -1415,6 +1423,7 @@ static void buildAccentsMap()
accents["LaTeX"] = getutf8(0xf0012);
accents["latexe"] = getutf8(0xf0013);
accents["LaTeXe"] = getutf8(0xf0013);
accents["lyxarrow"] = getutf8(0xf0020);
accents["backslash lyx"] = getutf8(0xf0010); // Used logos inserted with starting \backslash
accents["backslash LyX"] = getutf8(0xf0010);
accents["backslash tex"] = getutf8(0xf0011);
@ -1423,6 +1432,7 @@ static void buildAccentsMap()
accents["backslash LaTeX"] = getutf8(0xf0012);
accents["backslash latexe"] = getutf8(0xf0013);
accents["backslash LaTeXe"] = getutf8(0xf0013);
accents["backslash lyxarrow"] = getutf8(0xf0020);
accents["ddot{\\imath}"] = "ï";
buildaccent("ddot", "aAeEhHiIioOtuUwWxXyY",
"äÄëËḧḦïÏïöÖẗüÜẅẄẍẌÿŸ"); // umlaut
@ -1488,7 +1498,7 @@ void Intervall::removeAccents()
buildAccentsMap();
static regex const accre("\\\\(([\\S]|grave|breve|ddot|dot|acute|dacute|mathring|check|hat|bar|tilde|subdot|ogonek|"
"cedilla|subring|textsubring|subhat|textsubcircum|subtilde|textsubtilde|dgrave|textdoublegrave|rcap|textroundcap|slashed)\\{[^\\{\\}]+\\}"
"|((i|imath|jmath|cdot|[a-z]+space)|((backslash )?([lL]y[xX]|[tT]e[xX]|[lL]a[tT]e[xX]e?)))(?![a-zA-Z]))");
"|((i|imath|jmath|cdot|[a-z]+space)|((backslash )?([lL]y[xX]|[tT]e[xX]|[lL]a[tT]e[xX]e?|lyxarrow)))(?![a-zA-Z]))");
smatch sub;
for (sregex_iterator itacc(par.begin(), par.end(), accre), end; itacc != end; ++itacc) {
sub = *itacc;
@ -2024,6 +2034,7 @@ void LatexInfo::buildEntries(bool isPatternString)
if (interval_.par[optend] != '{') {
closings = 0;
found.parenthesiscount = 0;
found.head = "\\" + key;
}
else
closings = found.parenthesiscount;
@ -2042,9 +2053,17 @@ void LatexInfo::buildEntries(bool isPatternString)
found._tokensize = found.head.length();
found._dataStart = found._tokenstart + found.head.length();
if (found.keytype == KeyInfo::doRemove) {
int endpar = 2 + interval_.findclosing(found._dataStart, interval_.par.length(), '{', '}', closings);
found._dataStart = endpar;
found._tokensize = found._dataStart - found._tokenstart;
if (closings > 0) {
size_t endpar = 2 + interval_.findclosing(found._dataStart, interval_.par.length(), '{', '}', closings);
if (endpar >= interval_.par.length())
found._dataStart = interval_.par.length();
else
found._dataStart = endpar;
found._tokensize = found._dataStart - found._tokenstart;
}
else {
found._dataStart = found._tokenstart + found._tokensize;
}
closings = 0;
}
if (interval_.par.substr(found._dataStart-1, 15).compare("\\endarguments{}") == 0) {
@ -2420,8 +2439,11 @@ int LatexInfo::dispatch(ostringstream &os, int previousStart, KeyInfo &actual)
}
case KeyInfo::isSize: {
if (actual.disabled || (interval_.par[actual._dataStart] != '{') || (interval_.par[actual._dataStart-1] == ' ')) {
processRegion(actual._dataEnd, actual._dataEnd+1); /* remove possibly following {} */
interval_.addIntervall(actual._tokenstart, actual._dataEnd+1);
if (actual.parenthesiscount == 0)
interval_.addIntervall(actual._tokenstart, actual._dataEnd);
else {
interval_.addIntervall(actual._tokenstart, actual._dataEnd+1);
}
nextKeyIdx = getNextKey();
} else {
// Here _dataStart points to '{', so correct it
@ -3050,7 +3072,7 @@ MatchStringAdv::MatchStringAdv(lyx::Buffer & buf, FindAndReplaceOptions const &
}
}
#if 0
// Count number of characters in string
// {]} ==> 1
// \& ==> 1
@ -3110,6 +3132,7 @@ static int computeSize(string s, int len)
}
return count;
}
#endif
MatchResult MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_begin) const
{
@ -3213,19 +3236,61 @@ MatchResult MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_be
result = 0;
#if QTSEARCH
mres.match_prefix = match.capturedEnd(2) - match.capturedStart(2);
// mres.match_len = computeSize(QStringRef(&qstr, pos+leadingsize,result), result) - mres.match_prefix;
mres.match_len = match.capturedEnd(0) - match.capturedEnd(2);
// mres.match2end = qstr.size() - pos - leadingsize - mres.match_prefix;
mres.match2end = qstr.size() - match.capturedEnd(0);
// because of different number of closing at end of string
// we have to 'unify' the length of the post-match.
// Done by ignoring closing parenthesis and linefeeds at string end
int matchend = match.capturedEnd(0);
while (mres.match_len > 0) {
QChar c = qstr.at(matchend - 1);
if ((c == '\n') || (c == '}') || (c == '{')) {
mres.match_len--;
matchend--;
}
else
break;
}
size_t strsize = qstr.size();
while (strsize > (size_t) match.capturedEnd(0)) {
QChar c = qstr.at(strsize-1);
if ((c == '\n') || (c == '}')) {
--strsize;
}
else
break;
}
// LYXERR0(qstr.toStdString());
mres.match2end = strsize - matchend;
mres.pos = match.capturedStart(2);
#else
mres.match_prefix = m[2].second - m[2].first;
// mres.match_len = computeSize(str.substr(pos+leadingsize,result), result) - mres.match_prefix;
mres.match_len = m[0].second - m[2].second;
// mres.match2end = str.size() - pos - leadingsize - mres.match_prefix;
mres.match2end = str.size() - m[0].second;
mres.pos = m[2].first;
// ignore closing parenthesis and linefeeds at string end
size_t strend = m[0].second - m[0].first;
int matchend = strend;
while (mres.match_len > 0) {
char c = str.at(matchend - 1);
if ((c == '\n') || (c == '}') || (c == '{')) {
mres.match_len--;
matchend--;
}
else
break;
}
size_t strsize = str.size();
while (strsize > strend) {
if ((str.at(strsize-1) == '}') || (str.at(strsize-1) == '\n')) {
--strsize;
}
else
break;
}
// LYXERR0(str);
mres.match2end = strsize - matchend;
mres.pos = m[2].first - m[0].first;;
#endif
if (mres.match2end < 0)
mres.match_len = 0;
mres.leadsize = leadingsize;
return mres;
}
@ -3477,25 +3542,57 @@ docstring latexifyFromCursor(DocIterator const & cur, int len)
return ods.str();
}
#if 0
// Debugging output
static void displayMResult(MatchResult &mres, int increment)
{
LYXERR0( "pos: " << mres.pos << " increment " << increment);
LYXERR0( "leadsize: " << mres.leadsize);
LYXERR0( "match_len: " << mres.match_len);
LYXERR0( "match_prefix: " << mres.match_prefix);
LYXERR0( "match2end: " << mres.match2end);
}
#define displayMres(s,i) displayMResult(s,i);
#else
#define displayMres(s,i)
#endif
/** Finalize an advanced find operation, advancing the cursor to the innermost
** position that matches, plus computing the length of the matching text to
** be selected
**/
int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match, int expected_len)
{
// Search the foremost position that matches (avoids find of entire math
// inset when match at start of it)
size_t d;
DocIterator old_cur(cur.buffer());
MatchResult mres;
do {
LYXERR(Debug::FIND, "Forwarding one step (searching for innermost match)");
d = cur.depth();
old_cur = cur;
cur.forwardPos();
} while (cur && cur.depth() > d && match(cur).match_len > 0);
if (!cur)
break;
if (cur.depth() > d)
continue;
if (cur.depth() == d)
break;
mres = match(cur);
displayMres(mres, 1);
if (expected_len > 0) {
if (mres.match_len < expected_len)
break;
}
else {
if (mres.match_len <= 0)
break;
}
} while (1);
cur = old_cur;
int max_match = match(cur).match_len; /* match valid only if not searching whole words */
mres = match(cur); /* match valid only if not searching whole words */
int max_match = mres.match_len;
if (max_match <= 0) return 0;
LYXERR(Debug::FIND, "Ok");
@ -3503,7 +3600,9 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
int len = 1;
if (cur.pos() + len > cur.lastpos())
return 0;
if (match.opt.matchword) {
// regexp should use \w+, \S+, or \b(some string)\b
// to search for whole words
if (match.opt.matchword && !match.use_regexp) {
LYXERR(Debug::FIND, "verifying unmatch with len = " << len);
while (cur.pos() + len <= cur.lastpos() && match(cur, len).match_len <= 0) {
++len;
@ -3523,28 +3622,31 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
if (old_match == 0)
len = 0;
}
else {
int minl = 1;
int maxl = cur.lastpos() - cur.pos();
// Greedy behaviour while matching regexps
while (maxl > minl) {
int actual_match = match(cur, len).match_len;
if (actual_match >= max_match) {
// actual_match > max_match _can_ happen,
// if the search area splits
// some following word so that the regex
// (e.g. 'r.*r\b' matches 'r' from the middle of the
// splitted word)
// This means, the len value is too big
maxl = len;
len = (int)((maxl + minl)/2);
}
else {
// (actual_match < max_match)
minl = len + 1;
len = (int)((maxl + minl)/2);
}
}
else {
int minl = 1;
int maxl = cur.lastpos() - cur.pos();
// Greedy behaviour while matching regexps
while (maxl > minl) {
int actual_match = match(cur, len).match_len;
if (actual_match >= max_match) {
// actual_match > max_match _can_ happen,
// if the search area splits
// some following word so that the regex
// (e.g. 'r.*r\b' matches 'r' from the middle of the
// splitted word)
// This means, the len value is too big
maxl = len;
if (maxl - minl < 4)
len = (int)((maxl + minl)/2);
else
len = (int)(minl + (maxl - minl + 3)/4);
}
else {
// (actual_match < max_match)
minl = len + 1;
len = (int)((maxl + minl)/2);
}
}
old_cur = cur;
// Search for real start of matched characters
while (len > 1) {
@ -3582,29 +3684,33 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match)
return len;
}
#if 0
static void displayMResult(MatchResult &mres)
{
LYXERR0( "pos: " << mres.pos);
LYXERR0( "leadsize: " << mres.leadsize);
LYXERR0( "match_len: " << mres.match_len);
LYXERR0( "match_prefix: " << mres.match_prefix);
LYXERR0( "match2end: " << mres.match2end);
}
#define displayMres(s) displayMResult(s);
#else
#define displayMres(s)
#endif
/// Finds forward
int findForwardAdv(DocIterator & cur, MatchStringAdv const & match)
{
if (!cur)
return 0;
while (!theApp()->longOperationCancelled() && cur) {
{
// forward to
size_t d;
DocIterator old_cur(cur.buffer());
do {
d = cur.depth();
old_cur = cur;
cur.forwardPos();
if (!cur)
break;
if (cur.depth() > d)
continue;
if (cur.depth() == d)
break;
} while (1);
cur = old_cur;
}
LYXERR(Debug::FIND, "findForwardAdv() cur: " << cur);
MatchResult mres = match(cur, -1, false);
displayMres(mres)
displayMres(mres,-1)
int match_len = mres.match_len;
if ((mres.pos > 100000) || (mres.match2end > 100000) || (match_len > 100000)) {
LYXERR(Debug::INFO, "BIG LENGTHS: " << mres.pos << ", " << match_len << ", " << mres.match2end);
@ -3613,11 +3719,11 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv const & match)
if (match_len > 0) {
// Try to find the begin of searched string
int increment;
int firstInvalid = 100000;
if (mres.match_prefix + mres.pos - mres.leadsize > 0)
increment = (mres.match_prefix + mres.pos - mres.leadsize)/2;
else
increment = 10;
int firstInvalid = 100000;
if (mres.match_prefix + mres.pos - mres.leadsize > 1)
increment = (mres.match_prefix + mres.pos - mres.leadsize + 1)*3/4;
else
increment = 10;
LYXERR(Debug::FIND, "Set increment to " << increment);
while (increment > 0) {
DocIterator old_cur = cur;
@ -3631,38 +3737,56 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv const & match)
}
else {
MatchResult mres2 = match(cur, -1, false);
displayMres(mres2)
switch (interpretMatch(mres, mres2)) {
case MatchResult::newIsTooFar:
// behind the expected match
firstInvalid = increment;
cur = old_cur;
increment /= 2;
break;
case MatchResult::newIsBetter:
// not reached yet
mres = mres2;
firstInvalid -= increment;
if (increment > firstInvalid/2)
increment = firstInvalid/2;
break;
default:
// Handle not like MatchResult::newIsTooFar
LYXERR0( "Something is wrong: Increment = " << increment << " match_prefix = " << mres.match_prefix);
firstInvalid--;
increment = firstInvalid -1;
cur = old_cur;
break;
}
displayMres(mres2,increment)
switch (interpretMatch(mres, mres2)) {
case MatchResult::newIsTooFar:
// behind the expected match
firstInvalid = increment;
cur = old_cur;
increment /= 2;
break;
case MatchResult::newIsBetter:
// not reached ye, but cur.pos()+increment is bettert
mres = mres2;
firstInvalid -= increment;
if (increment > firstInvalid*3/4)
increment = firstInvalid*3/4;
if ((mres2.pos == mres2.leadsize) && (increment >= mres2.match_prefix)) {
if (increment >= mres2.match_prefix)
increment = (mres2.match_prefix+1)*3/4;
}
break;
default:
// Todo@
// Handle not like MatchResult::newIsTooFar
// LYXERR0( "Something is wrong: Increment = " << increment << " match_prefix = " << mres.match_prefix);
firstInvalid--;
increment = increment*3/4;
cur = old_cur;
break;
}
}
}
// LYXERR0("Leaving first loop");
// LYXERR0("Leaving first loop");
{
LYXERR(Debug::FIND, "Finalizing 1");
int len = findAdvFinalize(cur, match, mres.match_len);
if (len > 0)
return len;
else {
// try next possible match
cur.forwardPos();
continue;
}
}
// The following code is newer reached
// but parts of it may be needed in future
int match_len_zero_count = 0;
MatchResult mres3;
for (int i = 0; !theApp()->longOperationCancelled() && cur; cur.forwardPos()) {
if (i++ > 3) {
mres3 = match(cur, -1, false);
displayMres(mres3)
displayMres(mres3, 1)
int remaining_len = mres3.match_len;
if (remaining_len <= 0) {
// Apparently the searched string is not in the remaining part
@ -3674,19 +3798,19 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv const & match)
}
LYXERR(Debug::FIND, "Advancing cur: " << cur);
mres3 = match(cur, 1);
displayMres(mres3)
displayMres(mres3, 1)
int match_len3 = mres3.match_len;
if (match_len3 < 0)
continue;
mres3 = match(cur);
displayMres(mres3)
displayMres(mres3, 1)
int match_len2 = mres3.match_len;
LYXERR(Debug::FIND, "match_len2: " << match_len2);
if (match_len2 > 0) {
// Sometimes in finalize we understand it wasn't a match
// and we need to continue the outest loop
LYXERR(Debug::FIND, "Finalizing");
int len = findAdvFinalize(cur, match);
LYXERR(Debug::FIND, "Finalizing 2");
int len = findAdvFinalize(cur, match, mres.match_len);
if (len > 0) {
return len;
}
@ -3724,7 +3848,7 @@ int findMostBackwards(DocIterator & cur, MatchStringAdv const & match)
{
DocIterator cur_begin = doc_iterator_begin(cur.buffer());
DocIterator tmp_cur = cur;
int len = findAdvFinalize(tmp_cur, match);
int len = findAdvFinalize(tmp_cur, match, -1);
Inset & inset = cur.inset();
for (; cur != cur_begin; cur.backwardPos()) {
LYXERR(Debug::FIND, "findMostBackwards(): cur=" << cur);
@ -3732,7 +3856,7 @@ int findMostBackwards(DocIterator & cur, MatchStringAdv const & match)
new_cur.backwardPos();
if (new_cur == cur || &new_cur.inset() != &inset || !match(new_cur).match_len)
break;
int new_len = findAdvFinalize(new_cur, match);
int new_len = findAdvFinalize(new_cur, match, -1);
if (new_len == len)
break;
len = new_len;