split reading .tex from writing .lyx

git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@6111 a592a061-630c-0410-9148-cb99ea01b6c8
2024-11-09 18:31:04 +00:00 · 2003-02-12 07:53:03 +00:00 · 2003-02-12 07:53:03 +00:00 · d8e147d2b8
commit d8e147d2b8
parent c24e9194af
2 changed files with 473 additions and 0 deletions
--- a/src/tex2lyx/texparser.C
+++ b/src/tex2lyx/texparser.C
@ -0,0 +1,327 @@
+#include "parser.h"
+
+using std::cerr;
+using std::endl;
+using std::fill;
+using std::ios;
+using std::istream;
+using std::istringstream;
+using std::ostream;
+using std::string;
+
+
+// 
+// catcodes
+//
+
+/// Translates a mode name into a mode_type.
+/// "mathmode" yields MATH_MODE, "textmode" and "forcetext" yield TEXT_MODE;
+/// for every other string \p oldmode is handed back unchanged.
+mode_type asMode(mode_type oldmode, string const & str)
+{
+	bool const wants_text = (str == "textmode") || (str == "forcetext");
+	if (wants_text)
+		return TEXT_MODE;
+	return str == "mathmode" ? MATH_MODE : oldmode;
+}
+
+
+CatCode theCatcode[256]; // category code for each of the 256 byte values; filled in by catInit(), read by catcode()
+
+
+/// Looks up the category code currently stored for byte \p c.
+CatCode catcode(unsigned char c)
+{
+	CatCode const code = theCatcode[c];
+	return code;
+}
+
+
+/// Fills theCatcode with the default category code of every byte value.
+/// Everything starts out as catOther; ASCII letters become catLetter and
+/// the special characters listed below get their own categories.
+void catInit()
+{
+	fill(theCatcode, theCatcode + 256, catOther);
+	fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
+	fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
+
+	theCatcode['\\'] = catEscape;
+	theCatcode['{']  = catBegin;
+	theCatcode['}']  = catEnd;
+	// '$' is the math shift character, i.e. catMath in the CatCode enum
+	theCatcode['$']  = catMath;
+	theCatcode['&']  = catAlign;
+	theCatcode['\n'] = catNewline;
+	theCatcode['#']  = catParameter;
+	theCatcode['^']  = catSuper;
+	theCatcode['_']  = catSub;
+	// NOTE(review): the character literal on this line was empty (and
+	// therefore ill-formed) in the text as received; presumably it held a
+	// raw DEL byte that got stripped. Spelled as an escape here - confirm.
+	theCatcode['\x7f'] = catIgnore;
+	theCatcode[' ']  = catSpace;
+	theCatcode['\t'] = catSpace;
+	// carriage return counts as a line end, just like '\n'
+	theCatcode['\r'] = catNewline;
+	theCatcode['~']  = catActive;
+	theCatcode['%']  = catComment;
+}
+
+
+//
+// Token
+//
+
+/// Writes a debugging representation of \p t to \p os: a control sequence
+/// is printed as a backslash followed by its name, any other token as
+/// "[character,category]".
+ostream & operator<<(ostream & os, Token const & t)
+{
+	if (t.cs().empty()) {
+		os << '[' << t.character() << ',' << t.cat() << ']';
+		return os;
+	}
+	os << '\\' << t.cs();
+	return os;
+}
+
+
+//
+// Parser
+//
+
+
+/// Starts at line 0, position 0 and immediately tokenizes the contents
+/// of \p is (see tokenize(istream &)).
+Parser::Parser(istream & is)
+	: lineno_(0),
+	  pos_(0)
+{
+	this->tokenize(is);
+}
+
+
+/// Appends a copy of \p t to the end of the token list.
+void Parser::push_back(Token const & t)
+{
+	tokens_.insert(tokens_.end(), t);
+}
+
+
+/// Removes the last token from the token list.
+/// Does nothing when the list is already empty (calling pop_back() on an
+/// empty vector would be undefined behaviour).
+void Parser::pop_back()
+{
+	if (!tokens_.empty())
+		tokens_.pop_back();
+}
+
+
+/// Returns the token just before the read position.
+/// A default-constructed Token is returned when there is no such token,
+/// i.e. at position 0 or when the position lies more than one step past
+/// the end of the token list (pos_ can get there because getChar() and
+/// putback() move it without clamping).
+Token const & Parser::prevToken() const
+{
+	static const Token dummy;
+	bool const in_range = pos_ > 0 && pos_ <= tokens_.size();
+	return in_range ? tokens_[pos_ - 1] : dummy;
+}
+
+
+/// Returns the token at the read position without consuming it, or a
+/// default-constructed Token when no token is left.
+Token const & Parser::nextToken() const
+{
+	static const Token dummy;
+	if (!good())
+		return dummy;
+	return tokens_[pos_];
+}
+
+
+/// Returns the token at the read position and advances past it.
+/// When no token is left a default-constructed Token is returned and the
+/// position stays where it is.
+Token const & Parser::getToken()
+{
+	static const Token dummy;
+	if (!good())
+		return dummy;
+	//cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
+	Token const & current = tokens_[pos_];
+	++pos_;
+	return current;
+}
+
+
+/// Consumes tokens for as long as the next one is a space or a newline.
+void Parser::skipSpaces()
+{
+	for (;;) {
+		CatCode const cat = nextToken().cat();
+		if (cat != catSpace && cat != catNewline)
+			break;
+		getToken();
+	}
+}
+
+
+/// Steps the read position back by one token.
+/// At position 0 there is nothing to put back, so the call is ignored;
+/// decrementing the unsigned position there would wrap around to a huge
+/// value and make every later read fail.
+void Parser::putback()
+{
+	if (pos_ > 0)
+		--pos_;
+}
+
+
+/// True while the read position refers to an existing token.
+bool Parser::good() const
+{
+	return tokens_.size() > pos_;
+}
+
+
+/// Returns the character of the token at the read position and advances.
+///
+/// When no token is left, the problem is reported through error() and
+/// '\0' is returned instead of reading past the end of the token list.
+/// The position is advanced in that case as well, so that a caller which
+/// follows up with putback() (as getArg() does) ends up where it started.
+char Parser::getChar()
+{
+	if (!good()) {
+		error("The input stream is not well...");
+		++pos_;
+		return '\0';
+	}
+	return tokens_[pos_++].character();
+}
+
+
+/// Reads an argument delimited by \p left and \p right.
+///
+/// Leading space and newline tokens are skipped first. If the next
+/// character is not \p left it is put back and an empty string is
+/// returned. Otherwise characters are collected until \p right is read or
+/// the token list runs out; the character read last is not part of the
+/// result in either case.
+string Parser::getArg(char left, char right)
+{
+	skipSpaces();
+
+	string arg;
+	char ch = getChar();
+	if (ch != left) {
+		putback();
+		return arg;
+	}
+
+	for (;;) {
+		ch = getChar();
+		if (ch == right || !good())
+			break;
+		arg += ch;
+	}
+	return arg;
+}
+
+
+/// Skips white space in the raw input.
+/// Starting with \p c, keeps reading from \p is while the character at
+/// hand is a space or a newline (or until reading fails), then puts the
+/// character at hand back into \p is.
+void Parser::skipSpaceTokens(istream & is, char c)
+{
+	for (;;) {
+		CatCode const cat = catcode(c);
+		if (cat != catSpace && cat != catNewline)
+			break;
+		if (!is.get(c))
+			break;
+	}
+	//cerr << "putting back: " << c << "\n";
+	is.putback(c);
+}
+
+
+/// Reads \p is up to and including the next "\end_inset" (or up to the
+/// end of the stream if there is none) and tokenizes everything that
+/// came before that marker.
+void Parser::tokenize(istream & is)
+{
+	string const marker("\\end_inset");
+	string::size_type const marker_len = marker.size();
+
+	// gather the raw text, one character at a time, until it ends
+	// in the marker
+	string raw;
+	char ch;
+	while (is.get(ch)) {
+		raw += ch;
+		if (raw.size() < marker_len)
+			continue;
+		string::size_type const tail = raw.size() - marker_len;
+		if (raw.substr(tail) == marker) {
+			// the marker itself is not part of the text
+			raw.erase(tail);
+			break;
+		}
+	}
+
+	// a single space directly after the marker is swallowed,
+	// any other character goes back into the stream
+	if (is.get(ch) && ch != ' ')
+		is.unget();
+
+	tokenize(raw);
+}
+
+
+/// Splits \p buffer into tokens and appends them to the token list.
+///
+/// The category code table is initialised on the first call. Per category:
+///  - a line break followed by another line break becomes the control
+///    sequence "par", a single line break becomes a newline token;
+///  - a backslash starts a control sequence: either a run of letters
+///    (white space after it is skipped) or one non-letter character;
+///  - '^' and '_' become tokens of their own, white space after them is
+///    skipped;
+///  - ignored characters are dropped with a message on cerr;
+///  - every other character becomes a token carrying its category.
+///
+/// NOTE(review): '\r' and '\n' both count as line breaks, so a "\r\n"
+/// pair is seen as two consecutive breaks and yields "par" - confirm
+/// whether input with such line ends can reach this function.
+void Parser::tokenize(string const & buffer)
+{
+	static bool init_done = false;
+
+	if (!init_done) {
+		catInit();
+		init_done = true;
+	}
+
+	istringstream is(buffer.c_str(), ios::in | ios::binary);
+
+	char c;
+	while (is.get(c)) {
+		//cerr << "reading c: " << c << "\n";
+
+		switch (catcode(c)) {
+			case catNewline: {
+				++lineno_;
+				// Look at the character after the line break. The
+				// read has to be checked: at the end of the input
+				// nothing is stored, and testing a stale character
+				// would turn a final line break into "par".
+				char next;
+				if (!is.get(next)) {
+					push_back(Token('\n', catNewline));
+				} else if (catcode(next) == catNewline) {
+					push_back(Token("par"));
+				} else {
+					push_back(Token('\n', catNewline));
+					is.putback(next);
+				}
+				break;
+			}
+
+/*
+			case catComment: {
+				while (is.get(c) && catcode(c) != catNewline)
+					;
+				++lineno_;
+				break;
+			}
+*/
+
+			case catEscape: {
+				is.get(c);
+				if (!is) {
+					error("unexpected end of input");
+				} else {
+					string s(1, c);
+					if (catcode(c) == catLetter) {
+						// collect letters
+						while (is.get(c) && catcode(c) == catLetter)
+							s += c;
+						skipSpaceTokens(is, c);
+					}
+					push_back(Token(s));
+				}
+				break;
+			}
+
+			case catSuper:
+			case catSub: {
+				push_back(Token(c, catcode(c)));
+				is.get(c);
+				skipSpaceTokens(is, c);
+				break;
+			}
+
+			case catIgnore: {
+				cerr << "ignoring a char: " << int(c) << "\n";
+				break;
+			}
+
+			default:
+				push_back(Token(c, catcode(c)));
+		}
+	}
+
+#ifdef FILEDEBUG
+	dump();
+#endif
+}
+
+
+/// Writes every token to cerr, with " <#> " marking the read position,
+/// followed by the numeric position.
+void Parser::dump() const
+{
+	cerr << "\nTokens: ";
+	unsigned idx = 0;
+	while (idx < tokens_.size()) {
+		if (idx == pos_)
+			cerr << " <#> ";
+		cerr << tokens_[idx];
+		++idx;
+	}
+	cerr << " pos: " << pos_ << "\n";
+}
+
+
+/// Reports \p msg on cerr together with the current line counter and
+/// then dumps the token list. Parsing is not aborted.
+void Parser::error(string const & msg)
+{
+	cerr << "Line ~" << lineno_;
+	cerr << ":  parse error: " << msg << endl;
+	dump();
+	//exit(1);
+}
+
+
+/// Reads a [...] option and returns its content as text.
+///
+/// Returns an empty string without consuming anything when the next token
+/// is not '['. Otherwise tokens are appended via asString() until a ']'
+/// is read or the token list runs out; a {...} group inside the option is
+/// read with verbatimItem() and re-wrapped in braces.
+string Parser::verbatimOption()
+{
+	string res;
+	if (nextToken().character() != '[')
+		return res;
+
+	// swallow the opening bracket
+	getToken();
+
+	Token tok = getToken();
+	while (tok.character() != ']' && good()) {
+		if (tok.cat() == catBegin) {
+			// let verbatimItem() see the opening brace again
+			putback();
+			res += '{' + verbatimItem() + '}';
+		} else
+			res += tok.asString();
+		tok = getToken();
+	}
+	return res;
+}
+
+
+/// Reads a {...} group and returns its content as text, without the
+/// outer braces.
+///
+/// Returns an empty string without consuming anything when the next token
+/// does not open a group. Otherwise tokens are appended via asString()
+/// until the closing brace is read or the token list runs out; nested
+/// groups are handled recursively and keep their braces.
+string Parser::verbatimItem()
+{
+	string res;
+	if (nextToken().cat() != catBegin)
+		return res;
+
+	// swallow the opening brace
+	getToken();
+
+	Token tok = getToken();
+	while (tok.cat() != catEnd && good()) {
+		if (tok.cat() == catBegin) {
+			// let the recursive call see the opening brace again
+			putback();
+			res += '{' + verbatimItem() + '}';
+		} else
+			res += tok.asString();
+		tok = getToken();
+	}
+	return res;
+}
--- a/src/tex2lyx/texparser.h
+++ b/src/tex2lyx/texparser.h
@ -0,0 +1,146 @@
+
+#ifndef PARSER_H
+#define PARSER_H
+
+#include "LString.h"
+
+enum mode_type {UNDECIDED_MODE, TEXT_MODE, MATH_MODE}; // the modes a piece of input can be in; asMode() maps a mode name to one of these
+
+mode_type asMode(mode_type oldmode, string const & str); // "mathmode" -> MATH_MODE, "textmode"/"forcetext" -> TEXT_MODE, anything else -> oldmode
+
+
+// These are TeX's catcodes (category codes). The enumerators are declared
+// in order without explicit values, so each one's value equals the number
+// given in its comment.
+enum CatCode {
+	catEscape,     // 0    backslash, starts a control sequence
+	catBegin,      // 1    {
+	catEnd,        // 2    }
+	catMath,       // 3    $
+	catAlign,      // 4    &
+	catNewline,    // 5    ^^M, end of line
+	catParameter,  // 6    #
+	catSuper,      // 7    ^
+	catSub,        // 8    _
+	catIgnore,     // 9    ignored character
+	catSpace,      // 10   space
+	catLetter,     // 11   a-zA-Z
+	catOther,      // 12   none of the above
+	catActive,     // 13   ~
+	catComment,    // 14   %
+	catInvalid     // 15   <delete>
+};
+
+
+CatCode catcode(unsigned char c); // category code currently assigned to byte c
+
+
+enum { // parse-control flags; every flag is a distinct bit (bit 0 is not used)
+	FLAG_BRACE_LAST = 1 << 1,  //  last closing brace ends the parsing
+	FLAG_RIGHT      = 1 << 2,  //  next \right ends the parsing process
+	FLAG_END        = 1 << 3,  //  next \end ends the parsing process
+	FLAG_BRACK_LAST = 1 << 4,  //  next closing bracket ends the parsing
+	FLAG_TEXTMODE   = 1 << 5,  //  we are in a box
+	FLAG_ITEM       = 1 << 6,  //  read a (possibly braced) token
+	FLAG_LEAVE      = 1 << 7,  //  leave the loop at the end
+	FLAG_SIMPLE     = 1 << 8,  //  next $ leaves the loop
+	FLAG_EQUATION   = 1 << 9,  //  next \] leaves the loop
+	FLAG_SIMPLE2    = 1 << 10, //  next \) leaves the loop
+	FLAG_OPTION     = 1 << 11, //  read [...] style option
+	FLAG_BRACED     = 1 << 12  //  read {...} style argument
+};
+
+
+
+//
+// Helper class for parsing
+//
+
+/// One lexed unit: either a control sequence, whose name is kept in cs_,
+/// or a single character together with its category code.
+class Token {
+public:
+	/// empty token: no name, character 0, category catIgnore
+	Token() : cs_(), char_(0), cat_(catIgnore) {}
+	/// character token \p c of category \p cat
+	Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
+	/// control sequence token named \p cs; character 0, category catIgnore
+	Token(string const & cs) : cs_(cs), char_(0), cat_(catIgnore) {}
+
+	/// name of the control sequence, empty for a character token
+	string const & cs() const
+	{
+		return cs_;
+	}
+	/// category code of the token
+	CatCode cat() const
+	{
+		return cat_;
+	}
+	/// the character, 0 for a control sequence token
+	char character() const
+	{
+		return char_;
+	}
+	/// the control sequence name if there is one (no backslash is added),
+	/// otherwise a string holding just the character
+	string asString() const
+	{
+		if (cs_.empty())
+			return string(1, char_);
+		return cs_;
+	}
+
+private:
+	/// control sequence name
+	string cs_;
+	/// character of a character token
+	char char_;
+	/// category code
+	CatCode cat_;
+};
+
+/// prints a debugging representation of \p t to \p os
+ostream & operator<<(ostream & os, Token const & t);
+
+
+//
+// Actual parser class
+//
+
+class Parser {
+
+public:
+	/// tokenizes the contents of \p is right away, see tokenize(istream &)
+	Parser(istream & is);
+
+	/// value of the line counter kept while tokenizing
+	int lineno() const { return lineno_; }
+	/// steps the read position back by one token
+	void putback();
+	/// writes all tokens and the read position to cerr
+	void dump() const;
+
+	/// skips spaces, then returns the characters between \p left and \p right; empty if the next character is not \p left
+	string getArg(char left, char right);
+	/// returns the character of the next token and advances; calls error() when no token is left
+	char getChar();
+	/// prints \p msg with the line counter to cerr and calls dump(); does not abort
+	void error(string const & msg);
+	/// reads \p is up to the next \end_inset or the end of the stream and tokenizes that text
+	void tokenize(istream & is);
+	/// splits \p s into tokens and appends them to the token list
+	void tokenize(string const & s);
+	/// starting with \p c, reads \p is past spaces and newlines and puts the last character read back
+	void skipSpaceTokens(istream & is, char c);
+	/// appends \p t to the token list
+	void push_back(Token const & t);
+	/// removes the last token from the token list
+	void pop_back();
+	/// token before the read position, or a default Token at position 0
+	Token const & prevToken() const;
+	/// token at the read position (not consumed), or a default Token when none is left
+	Token const & nextToken() const;
+	/// token at the read position, which is consumed; a default Token when none is left
+	Token const & getToken();
+	/// skips space and newline tokens, if any
+	void skipSpaces();
+	/// NOTE(review): no definition shows up next to the other members in texparser.C - confirm this is still needed
+	void lex(string const & s);
+	/// true while the read position refers to an existing token
+	bool good() const;
+	/// content of the {...} group starting at the read position as text, empty if there is none
+	string verbatimItem();
+	/// content of the [...] option starting at the read position as text, empty if there is none
+	string verbatimOption();
+
+//private:
+	/// line counter, incremented by tokenize() when it meets a line break
+	int lineno_;
+	/// the tokens produced by tokenize(), in input order
+	vector<Token> tokens_;
+	/// read position: index into tokens_ of the next token to hand out
+	unsigned pos_;
+};
+
+#endif