Remove performance bottleneck in getAuthors()

The regex is expensive, which is especially noticeable with very long
author lists.

This introduces a case-insensitive variant of subst() (selected via a new case_sens parameter that defaults to true), which is much faster than the regex.

(cherry picked from commit 8ba74fe958)
This commit is contained in:
Juergen Spitzmueller 2024-07-05 14:05:26 +02:00
parent 46d1d6e0d3
commit 0f35e3141b
3 changed files with 22 additions and 14 deletions

View File

@ -279,12 +279,7 @@ vector<docstring> const getAuthors(docstring const & author)
// Then, we temporarily make all " and " strings to ampersands in order // Then, we temporarily make all " and " strings to ampersands in order
// to handle them later on a per-char level. Note that arbitrary casing // to handle them later on a per-char level. Note that arbitrary casing
// ("And", "AND", "aNd", ...) is allowed in bibtex (#10465). // ("And", "AND", "aNd", ...) is allowed in bibtex (#10465).
static regex const and_reg("(.* )([aA][nN][dD])( .*)"); iname = subst(iname, from_ascii(" and "), from_ascii(" & "), false);
smatch sub;
string res = to_utf8(iname);
while (regex_match(res, sub, and_reg))
res = sub.str(1) + "&" + sub.str(3);
iname = from_utf8(res);
// Now we traverse through the string and replace the "&" by the proper // Now we traverse through the string and replace the "&" by the proper
// output in- and outside groups // output in- and outside groups
docstring name; docstring name;

View File

@ -913,17 +913,28 @@ String const subst_string(String const & a,
docstring const subst_string(docstring const & a, docstring const subst_string(docstring const & a,
docstring const & oldstr, docstring const & newstr) docstring const & oldstr, docstring const & newstr,
bool const case_sens)
{ {
LASSERT(!oldstr.empty(), return a); LASSERT(!oldstr.empty(), return a);
docstring lstr = a; docstring lstr = a;
size_t i = 0; size_t i = 0;
size_t const olen = oldstr.length(); size_t const olen = oldstr.length();
if (case_sens)
while ((i = lstr.find(oldstr, i)) != string::npos) { while ((i = lstr.find(oldstr, i)) != string::npos) {
lstr.replace(i, olen, newstr); lstr.replace(i, olen, newstr);
i += newstr.length(); // We need to be sure that we don't i += newstr.length(); // We need to be sure that we don't
// use the same i over and over again. // use the same i over and over again.
} }
else {
docstring lcstr = lowercase(lstr);
while ((i = lcstr.find(oldstr, i)) != string::npos) {
lstr.replace(i, olen, newstr);
i += newstr.length(); // We need to be sure that we don't
// use the same i over and over again.
lcstr = lowercase(lstr);
}
}
return lstr; return lstr;
} }
@ -951,9 +962,10 @@ string const subst(string const & a,
docstring const subst(docstring const & a, docstring const subst(docstring const & a,
docstring const & oldstr, docstring const & newstr) docstring const & oldstr, docstring const & newstr,
bool case_sens)
{ {
return subst_string(a, oldstr, newstr); return subst_string(a, oldstr, newstr, case_sens);
} }

View File

@ -196,7 +196,8 @@ std::string const subst(std::string const & a,
/// substitutes all instances of \a oldstr with \a newstr /// substitutes all instances of \a oldstr with \a newstr
docstring const subst(docstring const & a, docstring const subst(docstring const & a,
docstring const & oldstr, docstring const & newstr); docstring const & oldstr, docstring const & newstr,
bool case_sens = true);
/// Count all occurrences of char \a chr inside \a str /// Count all occurrences of char \a chr inside \a str
int count_char(std::string const & str, char chr); int count_char(std::string const & str, char chr);