Fix bug 1826 + A more precise parser for bibtex files based on the description found on this web page: http://artis.imag.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html

Most important improvements: - read file character by character and treat newline like space characters. Thus the @ does not need to be at the beginning of a line. - it reads @string entries and replaces the strings in the field values - it also handles @comment entries - it recovers from syntax errors at the next entry starting with @ - it ignores everything between entries git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@17723 a592a061-630c-0410-9148-cb99ea01b6c8
2024-12-12 08:41:46 +00:00 · 2007-04-04 20:06:34 +00:00 · 2007-04-04 20:06:34 +00:00 · 4c9121df3b
commit 4c9121df3b
parent 66aea64a21
3 changed files with 357 additions and 25 deletions
--- a/src/insets/insetbibtex.C
+++ b/src/insets/insetbibtex.C
@ -30,6 +30,7 @@
 #include "support/lyxlib.h"
 #include "support/os.h"
 #include "support/path.h"
 #include "support/textutils.h"
 #include <boost/tokenizer.hpp>
@ -57,6 +58,7 @@ using support::split;
 using support::subst;
 using support::tokenPos;
 using support::trim;
 using support::lowercase;
 namespace Alert = frontend::Alert;
 namespace os = support::os;
@ -67,6 +69,7 @@ using std::string;
 using std::ostream;
 using std::pair;
 using std::vector;
 using std::map;
 InsetBibtex::InsetBibtex(InsetCommandParams const & p)
@ -329,6 +332,199 @@ vector<FileName> const InsetBibtex::getFiles(Buffer const & buffer) const
 	return vec;
 }
 namespace {
 	// methods for parsing bibtex files
 	typedef map<docstring, docstring> VarMap;
 	/// Skip whitespace, then at most one comma, then any whitespace
 	/// that follows that comma.
 	///
 	/// If the first non-whitespace character is not a comma it is put
 	/// back into the stream. After a comma, the first non-whitespace
 	/// character is put back as well, so the caller sees it next.
 	///
 	/// @param ifs stream to read from
 	/// @return true if a comma was consumed; false if the stream was
 	/// already bad, ended before a non-whitespace character was found,
 	/// or that character was not a comma
 	bool removeWSAndComma(idocfstream & ifs) {
 		if (!ifs)
 			return false;
 		char_type c;
 		// consume leading blanks; c ends up as the first non-blank
 		while (ifs.get(c) && isSpace(c))
 			continue;
 		if (!ifs)
 			return false;
 		if (c == ',') {
 			// comma found: also swallow the blanks behind it
 			while (ifs.get(c) && isSpace(c))
 				continue;
 			// the stream may have ended here; only un-read a
 			// character that was really read
 			if (ifs)
 				ifs.putback(c);
 			return true;
 		}
 		// something else than a comma: leave it for the caller
 		ifs.putback(c);
 		return false;
 	}
 	/// Read one token (an entry type, a key or a field name).
 	///
 	/// Leading whitespace is skipped, then characters are collected
 	/// until a whitespace character or one of the characters in
 	/// delimChars is met. Every collected character is passed through
 	/// lowercase(). Whitespace behind the token is skipped too, and the
 	/// first character after it is put back into the stream.
 	///
 	/// @param val receives the token; it is cleared first
 	/// @param ifs stream to read from
 	/// @param delimChars characters that terminate the token
 	/// @return true if a token of length > 0 could be read.
 	///
 	bool readTypeOrKey(docstring & val, idocfstream & ifs, docstring const & delimChars) {
 		val.clear();
 		if (!ifs)
 			return false;
 		char_type c;
 		// skip leading whitespace
 		while (ifs.get(c) && isSpace(c))
 			continue;
 		if (!ifs)
 			return false;
 		// collect the token itself
 		for (; ifs && !(isSpace(c) || delimChars.find(c) != docstring::npos); ifs.get(c))
 			val += lowercase(c);
 		// skip trailing whitespace
 		for (; ifs && isSpace(c); ifs.get(c))
 			continue;
 		// un-read the terminating character unless the stream ended
 		if (ifs)
 			ifs.putback(c);
 		return !val.empty();
 	}
 	/// Read a bibtex field value, i.e. one or more parts joined by the
 	/// #-character, and concatenate the parts into val.
 	///
 	/// A part is either a run of digits, a text delimited by "..." or
 	/// {...} (the delimiters are dropped, braces inside must match), or
 	/// a string name. A string name is lowercased and replaced by its
 	/// entry in strings; unknown names contribute nothing.
 	///
 	/// On success the first character behind the value is put back.
 	///
 	/// @param val receives the concatenated value; it is cleared first
 	/// @param ifs stream to read from
 	/// @param strings the @string definitions known so far
 	/// @return true if reading was successful (all parts were delimited
 	/// correctly and the stream did not end before the character that
 	/// follows the value)
 	bool readValue(docstring & val, idocfstream & ifs, const VarMap & strings) {
 		val.clear();
 		if (!ifs)
 			return false;
 		char_type c;
 		for (;;) {
 			// move to the first non-blank character of this part
 			while (ifs.get(c) && isSpace(c))
 				continue;
 			if (!ifs)
 				return false;
 			if (isDigit(c)) {
 				// numeric part: take all consecutive digits
 				val += c;
 				while (ifs.get(c) && isDigit(c))
 					val += c;
 				if (!ifs)
 					return false;
 			} else if (c == '"' || c == '{') {
 				// delimited text: pick the matching end delimiter
 				char_type const closer = (c == '{') ? '}' : '"';
 				// braces inside the text have to be balanced, so
 				// the end delimiter only counts at depth 0
 				int depth = 0;
 				for (ifs.get(c); ifs && (depth > 0 || c != closer); ifs.get(c)) {
 					val += c;
 					if (c == '{') {
 						++depth;
 					} else if (c == '}') {
 						--depth;
 						// more closing than opening braces
 						if (depth < 0)
 							return false;
 					}
 				}
 				// fail on a missing end delimiter, otherwise fetch
 				// the character that follows it
 				if (!ifs || !ifs.get(c))
 					return false;
 			} else {
 				// string name: runs up to whitespace or one of # , } )
 				docstring macro;
 				for (; ifs && !isSpace(c) && c != '#' && c != ',' && c != '}' && c != ')'; ifs.get(c))
 					macro += lowercase(c);
 				if (!ifs)
 					return false;
 				// substitute the name by its definition; names
 				// without a definition are silently dropped
 				if (!macro.empty()) {
 					VarMap::const_iterator const hit = strings.find(macro);
 					if (hit != strings.end())
 						val += hit->second;
 				}
 			}
 			// skip whitespace behind the part
 			for (; ifs && isSpace(c); ifs.get(c))
 				continue;
 			if (!ifs)
 				return false;
 			// a '#' announces another part to concatenate
 			if (c != '#')
 				break;
 		}
 		// c belongs to whatever follows the value
 		ifs.putback(c);
 		return true;
 	}
 }
 // This method returns a comma separated list of Bibtex entries
 void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
@ -337,10 +533,25 @@ void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
 	vector<FileName> const files = getFiles(buffer);
 	for (vector<FileName>::const_iterator it = files.begin();
 	     it != files.end(); ++ it) {
-		// This is a _very_ simple parser for Bibtex database
+	    // This bibtex parser is a first step to parse bibtex files
-		// files. All it does is to look for lines starting
+		// more precisely. 
-		// in @ and not being @preamble and @string entries.
+		// 
-		// It does NOT do any syntax checking!
+		// - it reads the whole bibtex entry and does a syntax check
 		//   (matching delimiters, missing commas, ...)
 		// - it recovers from errors starting with the next @-character
 		// - it reads @string definitions and replaces them in the 
 		//   field values.
 		// - it accepts more characters in keys or value names than 
 		//   bibtex does.
 		//
 		// TODOS:
 		// - the entries are split into name = value pairs by the 
 		//   parser. These have to be merged again because of the 
 		//   way lyx treats the entries ( pair<...>(...) ). The citation
 		//   mechanism in lyx should be changed such that it can use
 		//   the split entries.
 		// - messages on parsing errors can be generated.
 		//
 		// Officially bibtex does only support ASCII, but in practice
 		// you can use the encoding of the main document as long as
@ -352,33 +563,139 @@ void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
 		idocfstream ifs(it->toFilesystemEncoding().c_str(),
 		                std::ios_base::in,
 		                buffer.params().encoding().iconvName());
-		docstring linebuf0;
+		
-		while (getline(ifs, linebuf0)) {
+		char_type ch;
-			docstring linebuf = trim(linebuf0);
+		VarMap strings;
-			if (linebuf.empty())
+
 		while (ifs) {
 			ifs.get(ch);
 			if (!ifs) 
 				break;
 			if (ch != '@') 
 				continue;
-			if (prefixIs(linebuf, '@')) {
+
-				linebuf = subst(linebuf, '{', '(');
+			docstring entryType;
-				docstring tmp;
+
-				linebuf = split(linebuf, tmp, '(');
+			if (!readTypeOrKey(entryType, ifs, from_ascii("{(")) || !ifs)
-				tmp = ascii_lowercase(tmp);
+				continue;
-				if (!prefixIs(tmp, from_ascii("@string")) &&
+
-				    !prefixIs(tmp, from_ascii("@preamble"))) {
+			if (entryType == from_ascii("comment")) {
-					linebuf = split(linebuf, tmp, ',');
+
-					tmp = ltrim(tmp, " \t");
+				ifs.ignore(std::numeric_limits<int>::max(), '\n');
-					if (!tmp.empty()) {
+				continue;
-						// FIXME UNICODE
+			} 
 			// check entry delimiter
 			char_type entryDelim;
 			ifs.get(ch);
 			if (!ifs) 
 				break;
 			if (ch == '(') entryDelim = ')';
 			else if (ch == '{') entryDelim = ')';
 			else {
 				// invalid entry delimiter
 				ifs.putback(ch);
 				continue;
 			}
 			// process the entry
 			if (entryType == from_ascii("string")) {
 				// read string and add it to the strings map 
 				// (or replace its old value)
 				docstring name;
 				docstring value;
 				if (!readTypeOrKey(name, ifs, from_ascii("#=}),")) || !ifs)
 					continue;
 				ifs.get(ch);
 				if (!ifs || ch != '=')
 					continue;
 				if (!readValue(value, ifs, strings))
 					continue;
 				strings[name] = value;
 			} else if (entryType == from_ascii("preamble")) {
 				// preamble definitions are discarded. 
 				// can they be of any use in lyx?
 				docstring value;
 				if (!readValue(value, ifs, strings))
 					continue;
 			} else {
 				// Citation entry. Read the key and all name = value pairs
 				docstring key;
 				docstring fields;
 				docstring name;
 				docstring value;
 				docstring commaNewline;
 				if (!readTypeOrKey(key, ifs, from_ascii(",})")) || !ifs)
 					continue;
 				// now we have a key, so we will add an entry 
 				// (even if it's empty, as bibtex does)
 				// 
 				// all items must be separated by a comma. If
 				// it is missing the scanning of this entry is
 				// stopped and the next is searched.
 				bool readNext = removeWSAndComma(ifs);
 				while (ifs && readNext) {
 					// read field name
 					if (!readTypeOrKey(name, ifs, from_ascii("=}),")) || !ifs)
 						break;
 					// next char must be an equal sign
 					ifs.get(ch);
 					if (!ifs)
 						break;
 					if (ch != '=') {
 						ifs.putback(ch);
 						break;
 					}
 					// read field value
 					if (!readValue(value, ifs, strings)) 
 						break;
 					// append field to the total entry string.
 					//
 					// TODO: Here is where the fields can be put in 
 					//       a more intelligent structure that preserves
 					//	     the already known parts.
 					fields += commaNewline;
 					fields += name + from_ascii(" = {") + value + '}';
 					if (!commaNewline.length()) 
 						commaNewline = from_ascii(",\n"); 
 					readNext = removeWSAndComma(ifs);
 				}
 				// add the new entry
 				keys.push_back(pair<string, docstring>(
-							to_utf8(tmp), docstring()));
+				to_utf8(key), fields));
 					}
 				}
 			} else if (!keys.empty())
 				keys.back().second += linebuf + '\n';
 		}
 			}
 		} //< searching '@'
 	} //< for loop over files
 }
 bool InsetBibtex::addDatabase(string const & db)
 {
 	// FIXME UNICODE
--- a/src/support/textutils.C
+++ b/src/support/textutils.C
@ -71,6 +71,18 @@ bool isPrintableNonspace(char_type c)
 }
 bool isSpace(char_type c)
 {
 	// Only characters accepted by is_utf16() are converted and
 	// classified. Everything else is assumed not to be a space;
 	// this also catches values outside the UCS4 range.
 	if (is_utf16(c)) {
 		QChar const converted = ucs4_to_qchar(c);
 		return converted.isSpace();
 	}
 	return false;
 }
 bool isDigit(char_type c)
 {
 	if (!is_utf16(c))
--- a/src/support/textutils.h
+++ b/src/support/textutils.h
@ -39,6 +39,9 @@ bool isPrintable(char_type c);
 /// return true if the char is printable and not a space
 bool isPrintableNonspace(char_type c);
 /// return true if a unicode char is a space.
 /// Characters rejected by is_utf16() are never reported as spaces.
 bool isSpace(char_type c);
 /// return true if a unicode char is a digit.
 bool isDigit(char_type c);