/** * \file Parser.cpp * This file is part of LyX, the document processor. * Licence details can be found in the file COPYING. * * \author André Pönitz * * Full author contact details are available in file CREDITS. */ #include #include "Encoding.h" #include "Parser.h" #include "support/textutils.h" #include using namespace std; namespace lyx { namespace { CatCode theCatcode[256]; void catInit() { static bool init_done = false; if (init_done) return; init_done = true; fill(theCatcode, theCatcode + 256, catOther); fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter); fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter); theCatcode[int('\\')] = catEscape; theCatcode[int('{')] = catBegin; theCatcode[int('}')] = catEnd; theCatcode[int('$')] = catMath; theCatcode[int('&')] = catAlign; theCatcode[int('\n')] = catNewline; theCatcode[int('#')] = catParameter; theCatcode[int('^')] = catSuper; theCatcode[int('_')] = catSub; theCatcode[0x7f] = catIgnore; theCatcode[int(' ')] = catSpace; theCatcode[int('\t')] = catSpace; theCatcode[int('\r')] = catNewline; theCatcode[int('~')] = catActive; theCatcode[int('%')] = catComment; // This is wrong! theCatcode[int('@')] = catLetter; } /*! * Translate a line ending to '\n'. * \p c must have catcode catNewline, and it must be the last character read * from \p is. */ char_type getNewline(idocstream & is, char_type c) { // we have to handle 3 different line endings: // - UNIX (\n) // - MAC (\r) // - DOS (\r\n) if (c == '\r') { // MAC or DOS char_type wc; if (is.get(wc) && wc != '\n') { // MAC is.putback(wc); } return '\n'; } // UNIX return c; } CatCode catcode(char_type c) { if (c < 256) return theCatcode[(unsigned char)c]; return catOther; } } // // Token // ostream & operator<<(ostream & os, Token const & t) { if (t.cat() == catComment) os << '%' << t.cs() << '\n'; else if (t.cat() == catSpace) os << t.cs(); else if (t.cat() == catEscape) os << '\\' << t.cs() << ' '; else if (t.cat() == catLetter) os << t.cs(); else if (t.cat() == catNewline) os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n"; else os << '[' << t.cs() << ',' << t.cat() << ']'; return os; } string Token::asInput() const { if (cat_ == catComment) return '%' + cs_ + '\n'; if (cat_ == catEscape) return '\\' + cs_; return cs_; } bool Token::isAlnumASCII() const { return cat_ == catLetter || (cat_ == catOther && cs_.length() == 1 && isDigitASCII(cs_[0])); } #ifdef FILEDEBUG void debugToken(std::ostream & os, Token const & t, unsigned int flags) { char sep = ' '; os << "t: " << t << " flags: " << flags; if (flags & FLAG_BRACE_LAST) { os << sep << "BRACE_LAST"; sep = '|'; } if (flags & FLAG_RIGHT ) { os << sep << "RIGHT" ; sep = '|'; } if (flags & FLAG_END ) { os << sep << "END" ; sep = '|'; } if (flags & FLAG_BRACK_LAST) { os << sep << "BRACK_LAST"; sep = '|'; } if (flags & FLAG_TEXTMODE ) { os << sep << "TEXTMODE" ; sep = '|'; } if (flags & FLAG_ITEM ) { os << sep << "ITEM" ; sep = '|'; } if (flags & FLAG_LEAVE ) { os << sep << "LEAVE" ; sep = '|'; } if (flags & FLAG_SIMPLE ) { os << sep << "SIMPLE" ; sep = '|'; } if (flags & FLAG_EQUATION ) { os << sep << "EQUATION" ; sep = '|'; } if (flags & FLAG_SIMPLE2 ) { os << sep << "SIMPLE2" ; sep = '|'; } if (flags & FLAG_OPTION ) { os << sep << "OPTION" ; sep = '|'; } if (flags & FLAG_BRACED ) { os << sep << "BRACED" ; sep = '|'; } if (flags & FLAG_CELL ) { os << sep << "CELL" ; sep = '|'; } if (flags & FLAG_TABBING ) { os << sep << "TABBING" ; sep = '|'; } os << "\n"; } #endif // // Parser // Parser::Parser(idocstream & is) : lineno_(0), pos_(0), iss_(0), is_(is), encoding_iconv_("UTF-8") { } Parser::Parser(string const & s) : lineno_(0), pos_(0), iss_(new idocstringstream(from_utf8(s))), is_(*iss_), encoding_iconv_("UTF-8") { } Parser::~Parser() { delete iss_; } void Parser::setEncoding(std::string const & e, int const & p) { // We may (and need to) use unsafe encodings here: Since the text is // converted to unicode while reading from is_, we never see text in // the original encoding of the parser, but operate on utf8 strings // instead. Therefore, we cannot misparse high bytes as {, } or \\. Encoding const * const enc = encodings.fromLaTeXName(e, p, true); if (!enc) { cerr << "Unknown encoding " << e << ". Ignoring." << std::endl; return; } setEncoding(enc->iconvName()); } void Parser::setEncoding(std::string const & e) { //cerr << "setting encoding to " << e << std::endl; is_ << lyx::setEncoding(e); encoding_iconv_ = e; } void Parser::push_back(Token const & t) { tokens_.push_back(t); } // We return a copy here because the tokens_ vector may get reallocated Token const Parser::prev_token() const { static const Token dummy; return pos_ > 1 ? tokens_[pos_ - 2] : dummy; } // We return a copy here because the tokens_ vector may get reallocated Token const Parser::curr_token() const { static const Token dummy; return pos_ > 0 ? tokens_[pos_ - 1] : dummy; } // We return a copy here because the tokens_ vector may get reallocated Token const Parser::next_token() { static const Token dummy; return good() ? tokens_[pos_] : dummy; } // We return a copy here because the tokens_ vector may get reallocated Token const Parser::next_next_token() { static const Token dummy; // If good() has not been called after the last get_token() we need // to tokenize two more tokens. if (pos_ + 1 >= tokens_.size()) { tokenize_one(); tokenize_one(); } return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy; } // We return a copy here because the tokens_ vector may get reallocated Token const Parser::get_token() { static const Token dummy; //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n'; return good() ? tokens_[pos_++] : dummy; } bool Parser::isParagraph() { // A new paragraph in TeX ist started // - either by a newline, following any amount of whitespace // characters (including zero), and another newline // - or the token \par if (curr_token().cat() == catNewline && (curr_token().cs().size() > 1 || (next_token().cat() == catSpace && next_next_token().cat() == catNewline))) return true; if (curr_token().cat() == catEscape && curr_token().cs() == "par") return true; return false; } bool Parser::skip_spaces(bool skip_comments) { // We just silently return if we have no more tokens. // skip_spaces() should be callable at any time, // the caller must check p::good() anyway. bool skipped = false; while (good()) { get_token(); if (isParagraph()) { putback(); break; } if (curr_token().cat() == catSpace || curr_token().cat() == catNewline) { skipped = true; continue; } if ((curr_token().cat() == catComment && curr_token().cs().empty())) continue; if (skip_comments && curr_token().cat() == catComment) { // If positions_ is not empty we are doing some kind // of look ahead if (!positions_.empty()) cerr << " Ignoring comment: " << curr_token().asInput(); } else { putback(); break; } } return skipped; } void Parser::unskip_spaces(bool skip_comments) { while (pos_ > 0) { if ( curr_token().cat() == catSpace || (curr_token().cat() == catNewline && curr_token().cs().size() == 1)) putback(); else if (skip_comments && curr_token().cat() == catComment) { // TODO: Get rid of this // If positions_ is not empty we are doing some kind // of look ahead if (!positions_.empty()) cerr << "Unignoring comment: " << curr_token().asInput(); putback(); } else break; } } void Parser::putback() { --pos_; } void Parser::pushPosition() { positions_.push_back(pos_); } void Parser::popPosition() { pos_ = positions_.back(); positions_.pop_back(); } bool Parser::good() { if (pos_ < tokens_.size()) return true; tokenize_one(); return pos_ < tokens_.size(); } char Parser::getChar() { if (!good()) error("The input stream is not well..."); return get_token().character(); } bool Parser::hasOpt() { // An optional argument can occur in any of the following forms: // - \foo[bar] // - \foo [bar] // - \foo // [bar] // - \foo %comment // [bar] // remember current position unsigned int oldpos = pos_; // skip spaces and comments while (good()) { get_token(); if (isParagraph()) { putback(); break; } if (curr_token().cat() == catSpace || curr_token().cat() == catNewline || curr_token().cat() == catComment) continue; putback(); break; } bool const retval = (next_token().asInput() == "["); pos_ = oldpos; return retval; } Parser::Arg Parser::getFullArg(char left, char right, bool allow_escaping) { skip_spaces(true); // This is needed if a partial file ends with a command without arguments, // e. g. \medskip if (! good()) return make_pair(false, string()); string result; Token t = get_token(); if (t.cat() == catComment || t.cat() == catEscape || t.character() != left) { putback(); return make_pair(false, string()); } else { for (t = get_token(); good(); t = get_token()) { // Ignore comments if (t.cat() == catComment) { if (!t.cs().empty()) cerr << "Ignoring comment: " << t.asInput(); continue; } if (allow_escaping) { if (t.cat() != catEscape && t.character() == right) break; } else { if (t.character() == right) { if (t.cat() == catEscape) result += '\\'; break; } } result += t.asInput(); } } return make_pair(true, result); } string Parser::getArg(char left, char right, bool allow_escaping) { return getFullArg(left, right, allow_escaping).second; } string Parser::getFullOpt(bool keepws) { Arg arg = getFullArg('[', ']'); if (arg.first) return '[' + arg.second + ']'; if (keepws) unskip_spaces(true); return string(); } string Parser::getOpt(bool keepws) { string const res = getArg('[', ']'); if (res.empty()) { if (keepws) unskip_spaces(true); return string(); } return '[' + res + ']'; } string Parser::getFullParentheseArg() { Arg arg = getFullArg('(', ')'); if (arg.first) return '(' + arg.second + ')'; return string(); } string const Parser::verbatimEnvironment(string const & name) { if (!good()) return string(); ostringstream os; for (Token t = get_token(); good(); t = get_token()) { if (t.cat() == catBegin) { putback(); os << '{' << verbatim_item() << '}'; } else if (t.asInput() == "\\begin") { string const env = getArg('{', '}'); os << "\\begin{" << env << '}' << verbatimEnvironment(env) << "\\end{" << env << '}'; } else if (t.asInput() == "\\end") { string const end = getArg('{', '}'); if (end != name) cerr << "\\end{" << end << "} does not match \\begin{" << name << "}." << endl; return os.str(); } else os << t.asInput(); } cerr << "unexpected end of input" << endl; return os.str(); } string const Parser::plainEnvironment(string const & name) { if (!good()) return string(); ostringstream os; for (Token t = get_token(); good(); t = get_token()) { if (t.asInput() == "\\end") { string const end = getArg('{', '}'); if (end == name) return os.str(); else os << "\\end{" << end << '}'; } else os << t.asInput(); } cerr << "unexpected end of input" << endl; return os.str(); } string const Parser::plainCommand(char left, char right, string const & name) { if (!good()) return string(); // check if first token is really the start character Token tok = get_token(); if (tok.character() != left) { cerr << "first character does not match start character of command \\" << name << endl; return string(); } ostringstream os; for (Token t = get_token(); good(); t = get_token()) { if (t.character() == right) { return os.str(); } else os << t.asInput(); } cerr << "unexpected end of input" << endl; return os.str(); } void Parser::tokenize_one() { catInit(); char_type c; if (!is_.get(c)) return; switch (catcode(c)) { case catSpace: { docstring s(1, c); while (is_.get(c) && catcode(c) == catSpace) s += c; if (catcode(c) != catSpace) is_.putback(c); push_back(Token(s, catSpace)); break; } case catNewline: { ++lineno_; docstring s(1, getNewline(is_, c)); while (is_.get(c) && catcode(c) == catNewline) { ++lineno_; s += getNewline(is_, c); } if (catcode(c) != catNewline) is_.putback(c); push_back(Token(s, catNewline)); break; } case catComment: { // We don't treat "%\n" combinations here specially because // we want to preserve them in the preamble docstring s; while (is_.get(c) && catcode(c) != catNewline) s += c; // handle possible DOS line ending if (catcode(c) == catNewline) c = getNewline(is_, c); // Note: The '%' at the beginning and the '\n' at the end // of the comment are not stored. ++lineno_; push_back(Token(s, catComment)); break; } case catEscape: { is_.get(c); if (!is_) { error("unexpected end of input"); } else { docstring s(1, c); if (catcode(c) == catLetter) { // collect letters while (is_.get(c) && catcode(c) == catLetter) s += c; if (catcode(c) != catLetter) is_.putback(c); } push_back(Token(s, catEscape)); } break; } case catIgnore: { cerr << "ignoring a char: " << c << "\n"; break; } default: push_back(Token(docstring(1, c), catcode(c))); } //cerr << tokens_.back(); } void Parser::dump() const { cerr << "\nTokens: "; for (unsigned i = 0; i < tokens_.size(); ++i) { if (i == pos_) cerr << " <#> "; cerr << tokens_[i]; } cerr << " pos: " << pos_ << "\n"; } void Parser::error(string const & msg) { cerr << "Line ~" << lineno_ << ": parse error: " << msg << endl; dump(); //exit(1); } string Parser::verbatimOption() { string res; if (next_token().character() == '[') { Token t = get_token(); for (t = get_token(); t.character() != ']' && good(); t = get_token()) { if (t.cat() == catBegin) { putback(); res += '{' + verbatim_item() + '}'; } else res += t.cs(); } } return res; } string Parser::verbatim_item() { if (!good()) error("stream bad"); skip_spaces(); if (next_token().cat() == catBegin) { Token t = get_token(); // skip brace string res; for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) { if (t.cat() == catBegin) { putback(); res += '{' + verbatim_item() + '}'; } else res += t.asInput(); } return res; } return get_token().asInput(); } void Parser::reset() { pos_ = 0; } void Parser::setCatCode(char c, CatCode cat) { theCatcode[(unsigned char)c] = cat; } CatCode Parser::getCatCode(char c) const { return theCatcode[(unsigned char)c]; } } // namespace lyx