Fix bug 1826 + A more precise parser for bibtex files based on the description found on this web page: http://artis.imag.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html

Most important improvements:
- it reads the file character by character and treats newlines like space characters.
  Thus the @ does not need to be at the beginning of a line.
- it reads @string entries and replaces the strings in the field values
- it also handles @comment entries
- it recovers from syntax errors at the next entry starting with @
- it ignores everything between entries


git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@17723 a592a061-630c-0410-9148-cb99ea01b6c8
This commit is contained in:
Bernhard Roider 2007-04-04 20:06:34 +00:00
parent 66aea64a21
commit 4c9121df3b
3 changed files with 357 additions and 25 deletions

View File

@ -30,6 +30,7 @@
#include "support/lyxlib.h"
#include "support/os.h"
#include "support/path.h"
#include "support/textutils.h"
#include <boost/tokenizer.hpp>
@ -57,6 +58,7 @@ using support::split;
using support::subst;
using support::tokenPos;
using support::trim;
using support::lowercase;
namespace Alert = frontend::Alert;
namespace os = support::os;
@ -67,6 +69,7 @@ using std::string;
using std::ostream;
using std::pair;
using std::vector;
using std::map;
InsetBibtex::InsetBibtex(InsetCommandParams const & p)
@ -329,6 +332,199 @@ vector<FileName> const InsetBibtex::getFiles(Buffer const & buffer) const
return vec;
}
namespace {
// methods for parsing bibtex files
typedef map<docstring, docstring> VarMap;
/// Consume leading whitespace, then at most one comma, then any
/// whitespace that follows that comma.
///
/// The first character that does not belong to this pattern is put back
/// into the stream so that the caller sees it on the next read.
///
/// @param ifs  stream to read from
/// @return true if a comma was consumed; false if the stream was already
///         unreadable, ran out while skipping the leading whitespace, or
///         the first non-whitespace character was not a comma
///
bool removeWSAndComma(idocfstream & ifs)
{
	if (!ifs)
		return false;

	char_type c;

	// advance to the first non-whitespace character
	for (ifs.get(c); ifs && isSpace(c); ifs.get(c)) {
	}
	if (!ifs)
		return false;

	bool const sawComma = (c == ',');
	if (sawComma) {
		// swallow the whitespace after the comma as well
		for (ifs.get(c); ifs && isSpace(c); ifs.get(c)) {
		}
		// give back the character that ended the whitespace run,
		// unless the stream ran out while skipping
		if (ifs)
			ifs.putback(c);
	} else {
		// not a separator: leave the character for the caller
		ifs.putback(c);
	}
	return sawComma;
}
/// Read one lower-cased token (an entry type, a key or a field name).
///
/// Leading whitespace is skipped, then characters are collected until
/// a whitespace character or one of the characters in delimChars is
/// met. Whitespace after the token is skipped too, and the first
/// character following it is put back into the stream.
///
/// @param val         receives the token, converted with lowercase();
///                    it is cleared first
/// @param ifs         stream to read from
/// @param delimChars  characters that terminate the token
/// @return true if a token of length > 0 could be read.
///
bool readTypeOrKey(docstring & val, idocfstream & ifs, docstring const & delimChars)
{
	val.clear();
	if (!ifs)
		return false;

	char_type c;

	// step over leading whitespace
	for (ifs.get(c); ifs && isSpace(c); ifs.get(c)) {
	}
	if (!ifs)
		return false;

	// collect the token itself
	for (; ifs && !isSpace(c) && delimChars.find(c) == docstring::npos; ifs.get(c))
		val += lowercase(c);

	// step over trailing whitespace
	for (; ifs && isSpace(c); ifs.get(c)) {
	}

	// the character that stopped us belongs to the caller
	if (ifs)
		ifs.putback(c);

	return !val.empty();
}
/// Read a bibtex field value, i.e. one or more parts joined by '#'.
///
/// Each part is one of:
///  - a run of digits, copied verbatim;
///  - text enclosed in "..." or {...}; braces inside the text must be
///    balanced, and the closing delimiter is only recognised at
///    nesting level 0;
///  - a string name, which is lower-cased and replaced by its value
///    from strings; unknown names contribute nothing.
/// All parts are concatenated into val. The first character after the
/// value (and any whitespace following it) is put back into the stream.
///
/// @param val      receives the concatenated value; it is cleared first
/// @param ifs      stream to read from
/// @param strings  known @string definitions, keyed by lower-cased name
/// @return true if reading was successful (all parts were delimited
///         correctly); false on unbalanced braces or if the stream
///         ends before the character following the value could be read
bool readValue(docstring & val, idocfstream & ifs, const VarMap & strings)
{
	val.clear();
	if (!ifs)
		return false;

	char_type c;

	for (;;) {
		// advance to the first character of the next part
		for (ifs.get(c); ifs && isSpace(c); ifs.get(c)) {
		}
		if (!ifs)
			return false;

		if (isDigit(c)) {
			// numeric part: copy the run of digits
			val += c;
			for (ifs.get(c); ifs && isDigit(c); ifs.get(c))
				val += c;
			if (!ifs)
				return false;
		} else if (c == '"' || c == '{') {
			// delimited text: pick the matching closing delimiter
			char_type const closer = (c == '{') ? '}' : '"';

			// the closing delimiter only counts when every brace
			// opened inside the text has been closed again
			int depth = 0;
			for (ifs.get(c); ifs && !(depth <= 0 && c == closer); ifs.get(c)) {
				val += c;
				if (c == '{') {
					++depth;
				} else if (c == '}') {
					--depth;
					// more closing than opening braces
					if (depth < 0)
						return false;
				}
			}
			if (!ifs)
				return false;

			// step over the closing delimiter
			ifs.get(c);
			if (!ifs)
				return false;
		} else {
			// anything else is taken as the name of an @string
			docstring macro;
			for (; ifs && !isSpace(c) && c != '#' && c != ','
				     && c != '}' && c != ')'; ifs.get(c))
				macro += lowercase(c);
			if (!ifs)
				return false;

			// substitute the name by its definition; names without
			// a definition are silently dropped
			if (!macro.empty()) {
				VarMap::const_iterator const hit = strings.find(macro);
				if (hit != strings.end())
					val += hit->second;
			}
		}

		// skip whitespace after the part
		for (; ifs && isSpace(c); ifs.get(c)) {
		}
		if (!ifs)
			return false;

		// a '#' announces another part to concatenate
		if (c != '#')
			break;
	}

	// c is the first character after the value; hand it back
	ifs.putback(c);
	return true;
}
}
// This method returns a comma separated list of Bibtex entries
void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
@ -337,10 +533,25 @@ void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
vector<FileName> const files = getFiles(buffer);
for (vector<FileName>::const_iterator it = files.begin();
it != files.end(); ++ it) {
// This is a _very_ simple parser for Bibtex database
// files. All it does is to look for lines starting
// in @ and not being @preamble and @string entries.
// It does NOT do any syntax checking!
// This bibtex parser is a first step to parse bibtex files
// more precisely.
//
// - it reads the whole bibtex entry and does a syntax check
// (matching delimiters, missing commas, ...)
// - it recovers from errors starting with the next @-character
// - it reads @string definitions and replaces them in the
// field values.
// - it accepts more characters in keys or value names than
// bibtex does.
//
// TODOS:
// - the entries are split into name = value pairs by the
// parser. These have to be merged again because of the
// way lyx treats the entries ( pair<...>(...) ). The citation
// mechanism in lyx should be changed such that it can use
// the split entries.
// - messages on parsing errors can be generated.
//
// Officially bibtex does only support ASCII, but in practice
// you can use the encoding of the main document as long as
@ -352,33 +563,139 @@ void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
idocfstream ifs(it->toFilesystemEncoding().c_str(),
std::ios_base::in,
buffer.params().encoding().iconvName());
docstring linebuf0;
while (getline(ifs, linebuf0)) {
docstring linebuf = trim(linebuf0);
if (linebuf.empty())
char_type ch;
VarMap strings;
while (ifs) {
ifs.get(ch);
if (!ifs)
break;
if (ch != '@')
continue;
if (prefixIs(linebuf, '@')) {
linebuf = subst(linebuf, '{', '(');
docstring tmp;
linebuf = split(linebuf, tmp, '(');
tmp = ascii_lowercase(tmp);
if (!prefixIs(tmp, from_ascii("@string")) &&
!prefixIs(tmp, from_ascii("@preamble"))) {
linebuf = split(linebuf, tmp, ',');
tmp = ltrim(tmp, " \t");
if (!tmp.empty()) {
// FIXME UNICODE
keys.push_back(pair<string, docstring>(
to_utf8(tmp), docstring()));
docstring entryType;
if (!readTypeOrKey(entryType, ifs, from_ascii("{(")) || !ifs)
continue;
if (entryType == from_ascii("comment")) {
ifs.ignore(std::numeric_limits<int>::max(), '\n');
continue;
}
// check entry delimiter
char_type entryDelim;
ifs.get(ch);
if (!ifs)
break;
// Pick the closing delimiter that matches the opening one: an entry
// opened with '(' is closed by ')', one opened with '{' by '}'.
// (The '{' case previously stored ')' by mistake.)
if (ch == '(') {
	entryDelim = ')';
} else if (ch == '{') {
	entryDelim = '}';
} else {
	// invalid entry delimiter: give the character back and resume
	// the search at the next '@'
	ifs.putback(ch);
	continue;
}
// process the entry
if (entryType == from_ascii("string")) {
// read string and add it to the strings map
// (or replace its old value)
docstring name;
docstring value;
if (!readTypeOrKey(name, ifs, from_ascii("#=}),")) || !ifs)
continue;
ifs.get(ch);
if (!ifs || ch != '=')
continue;
if (!readValue(value, ifs, strings))
continue;
strings[name] = value;
} else if (entryType == from_ascii("preamble")) {
// preamble definitions are discarded.
// can they be of any use in lyx?
docstring value;
if (!readValue(value, ifs, strings))
continue;
} else {
// Citation entry. Read the key and all name = value pairs
docstring key;
docstring fields;
docstring name;
docstring value;
docstring commaNewline;
if (!readTypeOrKey(key, ifs, from_ascii(",})")) || !ifs)
continue;
// now we have a key, so we will add an entry
// (even if it's empty, as bibtex does)
//
// all items must be separated by a comma. If
// it is missing the scanning of this entry is
// stopped and the next is searched.
bool readNext = removeWSAndComma(ifs);
while (ifs && readNext) {
// read field name
if (!readTypeOrKey(name, ifs, from_ascii("=}),")) || !ifs)
break;
// next char must be an equal sign
ifs.get(ch);
if (!ifs)
break;
if (ch != '=') {
ifs.putback(ch);
break;
}
// read field value
if (!readValue(value, ifs, strings))
break;
// append field to the total entry string.
//
// TODO: Here is where the fields can be put in
// a more intelligent structure that preserves
// the already known parts.
fields += commaNewline;
fields += name + from_ascii(" = {") + value + '}';
if (!commaNewline.length())
commaNewline = from_ascii(",\n");
readNext = removeWSAndComma(ifs);
}
} else if (!keys.empty())
keys.back().second += linebuf + '\n';
}
}
// add the new entry
keys.push_back(pair<string, docstring>(
to_utf8(key), fields));
}
} //< searching '@'
} //< for loop over files
}
bool InsetBibtex::addDatabase(string const & db)
{
// FIXME UNICODE

View File

@ -71,6 +71,18 @@ bool isPrintableNonspace(char_type c)
}
bool isSpace(char_type c)
{
	// Only characters accepted by is_utf16() are handed to QChar.
	// Everything else is assumed not to be a space; per the original
	// note, values outside the UCS4 range are caught by the same check.
	if (is_utf16(c)) {
		QChar const asQChar = ucs4_to_qchar(c);
		return asQChar.isSpace();
	}
	return false;
}
bool isDigit(char_type c)
{
if (!is_utf16(c))

View File

@ -39,6 +39,9 @@ bool isPrintable(char_type c);
/// return true if the char is printable and not a space
bool isPrintableNonspace(char_type c);
/// return true if a unicode char is a space.
bool isSpace(char_type c);
/// return true if a unicode char is a digit.
bool isDigit(char_type c);