Improve BibTeX name parsing #1

Consider groupings of name parts via {...}
This commit is contained in:
Juergen Spitzmueller 2017-03-19 11:41:33 +01:00
parent 82705a0e8a
commit ee26e7fadf

View File

@ -47,33 +47,62 @@ namespace lyx {
namespace { namespace {
// gets the "prename" and "family name" from an author-type string // Remove placeholders from names
pair<docstring, docstring> nameParts(docstring const & name) docstring renormalize(docstring const & input)
{ {
if (name.empty()) docstring res = subst(input, from_ascii("$$space!"), from_ascii(" "));
return subst(res, from_ascii("$$comma!"), from_ascii(","));
}
// gets the "prename" and "family name" from an author-type string
pair<docstring, docstring> nameParts(docstring const & iname)
{
if (iname.empty())
return make_pair(docstring(), docstring()); return make_pair(docstring(), docstring());
// first we look for a comma, and take the last name to be everything // First we check for groupings (via {...}) and replace blanks and
// commas inside groups with temporary placeholders
docstring name;
int gl = 0;
docstring::const_iterator p = iname.begin();
while (p != iname.end()) {
// count grouping level
if (*p == '{')
++gl;
else if (*p == '}')
--gl;
// generate string with probable placeholders
if (*p == ' ' && gl > 0)
name += from_ascii("$$space!");
else if (*p == ',' && gl > 0)
name += from_ascii("$$comma!");
else
name += *p;
++p;
}
// Now we look for a comma, and take the last name to be everything
// preceding the right-most one, so that we also get the "jr" part. // preceding the right-most one, so that we also get the "jr" part.
vector<docstring> pieces = getVectorFromString(name); vector<docstring> pieces = getVectorFromString(name);
if (pieces.size() > 1) if (pieces.size() > 1)
// whether we have a jr. part or not, it's always // whether we have a jr. part or not, it's always
// the first and last item (reversed) // the first and last item (reversed)
return make_pair(pieces.back(), pieces.front()); return make_pair(renormalize(pieces.back()), renormalize(pieces.front()));
// OK, so now we want to look for the last name. We're going to // OK, so now we want to look for the last name. We're going to
// include the "von" part. This isn't perfect. // include the "von" part. This isn't perfect.
// Split on spaces, to get various tokens. // Split on spaces, to get various tokens.
pieces = getVectorFromString(name, from_ascii(" ")); pieces = getVectorFromString(name, from_ascii(" "));
// unusual not to have a space, but could happen // No space: Only a family name given
if (pieces.size() < 2) if (pieces.size() < 2)
return make_pair(from_ascii(""), name); return make_pair(from_ascii(""), renormalize(pieces.back()));
// If we get two, assume the last one is the last name // If we get two pieces, assume the last one is the last name
if (pieces.size() == 2) if (pieces.size() == 2)
return make_pair(pieces.front(), pieces.back()); return make_pair(renormalize(pieces.front()), renormalize(pieces.back()));
// Now we look for the first token that begins with // Three or more pieces: Now we look for the first piece that
// a lower case letter or an opening group {. // begins with a lower case letter (the "von-part").
docstring prename; docstring prename;
vector<docstring>::const_iterator it = pieces.begin(); vector<docstring>::const_iterator it = pieces.begin();
vector<docstring>::const_iterator const en = pieces.end(); vector<docstring>::const_iterator const en = pieces.end();
@ -82,14 +111,16 @@ pair<docstring, docstring> nameParts(docstring const & name)
if ((*it).empty()) if ((*it).empty())
continue; continue;
char_type const c = (*it)[0]; char_type const c = (*it)[0];
if (isLower(c) || c == '{') // If the piece starts with a lower case char, we assume
// this is the "von-part" (family name prefix) and thus part
// of the family name.
if (isLower(c))
break; break;
// if this is the last time through the loop, then // If this is the last piece, then what we now have is
// what we now have is the last name, so we do not want // the family name.
// to add that to the prename.
if (it + 1 == en) if (it + 1 == en)
break; break;
// add this piece to the prename // None of the above applies, so add this piece to the prename
if (!first) if (!first)
prename += " "; prename += " ";
else else
@ -97,8 +128,8 @@ pair<docstring, docstring> nameParts(docstring const & name)
prename += *it; prename += *it;
} }
// reconstruct the family name // Reconstruct the family name.
// note that if we left the loop because it + 1 == en, // Note that if we left the loop because it + 1 == en,
// then this will still do the right thing, i.e., make surname // then this will still do the right thing, i.e., make surname
// just be the last piece. // just be the last piece.
docstring surname; docstring surname;
@ -110,7 +141,7 @@ pair<docstring, docstring> nameParts(docstring const & name)
first = false; first = false;
surname += *it; surname += *it;
} }
return make_pair(prename, surname); return make_pair(renormalize(prename), renormalize(surname));
} }