diff --git a/src/tex2lyx/texparser.C b/src/tex2lyx/texparser.C new file mode 100644 index 0000000000..a638b93890 --- /dev/null +++ b/src/tex2lyx/texparser.C @@ -0,0 +1,327 @@ +#include "parser.h" + +using std::cerr; +using std::endl; +using std::fill; +using std::ios; +using std::istream; +using std::istringstream; +using std::ostream; +using std::string; + + +// +// catcodes +// + +mode_type asMode(mode_type oldmode, string const & str) +{ + if (str == "mathmode") + return MATH_MODE; + if (str == "textmode" || str == "forcetext") + return TEXT_MODE; + return oldmode; +} + + +CatCode theCatcode[256]; + + +CatCode catcode(unsigned char c) +{ + return theCatcode[c]; +} + + +void catInit() +{ + fill(theCatcode, theCatcode + 256, catOther); + fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter); + fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter); + + theCatcode['\\'] = catEscape; + theCatcode['{'] = catBegin; + theCatcode['}'] = catEnd; + theCatcode['$'] = cat; + theCatcode['&'] = catAlign; + theCatcode['\n'] = catNewline; + theCatcode['#'] = catParameter; + theCatcode['^'] = catSuper; + theCatcode['_'] = catSub; + theCatcode[''] = catIgnore; + theCatcode[' '] = catSpace; + theCatcode['\t'] = catSpace; + theCatcode['\r'] = catNewline; + theCatcode['~'] = catActive; + theCatcode['%'] = catComment; +} + + +// +// Token +// + +ostream & operator<<(ostream & os, Token const & t) +{ + if (t.cs().size()) + os << '\\' << t.cs(); + else + os << '[' << t.character() << ',' << t.cat() << ']'; + return os; +} + + +// +// Parser +// + + +Parser::Parser(istream & is) + : lineno_(0), pos_(0) +{ + tokenize(is); +} + + +void Parser::push_back(Token const & t) +{ + tokens_.push_back(t); +} + + +void Parser::pop_back() +{ + tokens_.pop_back(); +} + + +Token const & Parser::prevToken() const +{ + static const Token dummy; + return pos_ > 0 ? tokens_[pos_ - 1] : dummy; +} + + +Token const & Parser::nextToken() const +{ + static const Token dummy; + return good() ? tokens_[pos_] : dummy; +} + + +Token const & Parser::getToken() +{ + static const Token dummy; + //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n'; + return good() ? tokens_[pos_++] : dummy; +} + + +void Parser::skipSpaces() +{ + while (nextToken().cat() == catSpace || nextToken().cat() == catNewline) + getToken(); +} + + +void Parser::putback() +{ + --pos_; +} + + +bool Parser::good() const +{ + return pos_ < tokens_.size(); +} + + +char Parser::getChar() +{ + if (!good()) + error("The input stream is not well..."); + return tokens_[pos_++].character(); +} + + +string Parser::getArg(char left, char right) +{ + skipSpaces(); + + string result; + char c = getChar(); + + if (c != left) + putback(); + else + while ((c = getChar()) != right && good()) + result += c; + + return result; +} + + +void Parser::skipSpaceTokens(istream & is, char c) +{ + // skip trailing spaces + while (catcode(c) == catSpace || catcode(c) == catNewline) + if (!is.get(c)) + break; + //cerr << "putting back: " << c << "\n"; + is.putback(c); +} + + +void Parser::tokenize(istream & is) +{ + // eat everything up to the next \end_inset or end of stream + // and store it in s for further tokenization + string s; + char c; + while (is.get(c)) { + s += c; + if (s.size() >= 10 && s.substr(s.size() - 10) == "\\end_inset") { + s = s.substr(0, s.size() - 10); + break; + } + } + // Remove the space after \end_inset + if (is.get(c) && c != ' ') + is.unget(); + + // tokenize buffer + tokenize(s); +} + + +void Parser::tokenize(string const & buffer) +{ + static bool init_done = false; + + if (!init_done) { + catInit(); + init_done = true; + } + + istringstream is(buffer.c_str(), ios::in | ios::binary); + + char c; + while (is.get(c)) { + //cerr << "reading c: " << c << "\n"; + + switch (catcode(c)) { + case catNewline: { + ++lineno_; + is.get(c); + if (catcode(c) == catNewline) + push_back(Token("par")); + else { + push_back(Token('\n', catNewline)); + is.putback(c); + } + break; + } + +/* + case catComment: { + while (is.get(c) && catcode(c) != catNewline) + ; + ++lineno_; + break; + } +*/ + + case catEscape: { + is.get(c); + if (!is) { + error("unexpected end of input"); + } else { + string s(1, c); + if (catcode(c) == catLetter) { + // collect letters + while (is.get(c) && catcode(c) == catLetter) + s += c; + skipSpaceTokens(is, c); + } + push_back(Token(s)); + } + break; + } + + case catSuper: + case catSub: { + push_back(Token(c, catcode(c))); + is.get(c); + skipSpaceTokens(is, c); + break; + } + + case catIgnore: { + cerr << "ignoring a char: " << int(c) << "\n"; + break; + } + + default: + push_back(Token(c, catcode(c))); + } + } + +#ifdef FILEDEBUG + dump(); +#endif +} + + +void Parser::dump() const +{ + cerr << "\nTokens: "; + for (unsigned i = 0; i < tokens_.size(); ++i) { + if (i == pos_) + cerr << " <#> "; + cerr << tokens_[i]; + } + cerr << " pos: " << pos_ << "\n"; +} + + +void Parser::error(string const & msg) +{ + cerr << "Line ~" << lineno_ << ": parse error: " << msg << endl; + dump(); + //exit(1); +} + + +string Parser::verbatimOption() +{ + string res; + if (nextToken().character() == '[') { + Token t = getToken(); + for (Token t = getToken(); t.character() != ']' && good(); t = getToken()) { + if (t.cat() == catBegin) { + putback(); + res += '{' + verbatimItem() + '}'; + } else + res += t.asString(); + } + } + return res; +} + + +string Parser::verbatimItem() +{ + string res; + if (nextToken().cat() == catBegin) { + Token t = getToken(); + for (Token t = getToken(); t.cat() != catEnd && good(); t = getToken()) { + if (t.cat() == catBegin) { + putback(); + res += '{' + verbatimItem() + '}'; + } + else + res += t.asString(); + } + } + return res; +} diff --git a/src/tex2lyx/texparser.h b/src/tex2lyx/texparser.h new file mode 100644 index 0000000000..99ccd5db9b --- /dev/null +++ b/src/tex2lyx/texparser.h @@ -0,0 +1,146 @@ + +#ifndef PARSER_H +#define PARSER_H + +#include "LString.h" + +enum mode_type {UNDECIDED_MODE, TEXT_MODE, MATH_MODE}; + +mode_type asMode(mode_type oldmode, string const & str); + + +// These are TeX's catcodes +enum CatCode { + catEscape, // 0 backslash + catBegin, // 1 { + catEnd, // 2 } + catMath, // 3 $ + catAlign, // 4 & + catNewline, // 5 ^^M + catParameter, // 6 # + catSuper, // 7 ^ + catSub, // 8 _ + catIgnore, // 9 + catSpace, // 10 space + catLetter, // 11 a-zA-Z + catOther, // 12 none of the above + catActive, // 13 ~ + catComment, // 14 % + catInvalid // 15 +}; + + +CatCode catcode(unsigned char c); + + +enum { + FLAG_BRACE_LAST = 1 << 1, // last closing brace ends the parsing + FLAG_RIGHT = 1 << 2, // next \\right ends the parsing process + FLAG_END = 1 << 3, // next \\end ends the parsing process + FLAG_BRACK_LAST = 1 << 4, // next closing bracket ends the parsing + FLAG_TEXTMODE = 1 << 5, // we are in a box + FLAG_ITEM = 1 << 6, // read a (possibly braced token) + FLAG_LEAVE = 1 << 7, // leave the loop at the end + FLAG_SIMPLE = 1 << 8, // next $ leaves the loop + FLAG_EQUATION = 1 << 9, // next \] leaves the loop + FLAG_SIMPLE2 = 1 << 10, // next \) leaves the loop + FLAG_OPTION = 1 << 11, // read [...] style option + FLAG_BRACED = 1 << 12 // read {...} style argument +}; + + + +// +// Helper class for parsing +// + +class Token { +public: + /// + Token() : cs_(), char_(0), cat_(catIgnore) {} + /// + Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {} + /// + Token(string const & cs) : cs_(cs), char_(0), cat_(catIgnore) {} + + /// + string const & cs() const { return cs_; } + /// + CatCode cat() const { return cat_; } + /// + char character() const { return char_; } + /// + string asString() const { return cs_.size() ? cs_ : string(1, char_); } + +private: + /// + string cs_; + /// + char char_; + /// + CatCode cat_; +}; + +ostream & operator<<(ostream & os, Token const & t); + + +// +// Actual parser class +// + +class Parser { + +public: + /// + Parser(istream & is); + + /// + int lineno() const { return lineno_; } + /// + void putback(); + /// dump contents to screen + void dump() const; + + /// + string getArg(char left, char right); + /// + char getChar(); + /// + void error(string const & msg); + /// + void tokenize(istream & is); + /// + void tokenize(string const & s); + /// + void skipSpaceTokens(istream & is, char c); + /// + void push_back(Token const & t); + /// + void pop_back(); + /// + Token const & prevToken() const; + /// + Token const & nextToken() const; + /// + Token const & getToken(); + /// skips spaces if any + void skipSpaces(); + /// + void lex(string const & s); + /// + bool good() const; + /// + string verbatimItem(); + /// + string verbatimOption(); + +//private: + /// + int lineno_; + /// + vector tokens_; + /// + unsigned pos_; +}; + +#endif