mirror of
https://git.lyx.org/repos/lyx.git
synced 2024-12-21 05:00:59 +00:00
2eea1590b1
Provide functions for translating to the LyX name of an encoding from either a LaTeX name or an Iconv name, with the possibility to specify the package. This is in anticipation of changing to use the LyX name of the encoding in the .lyx file format and allowing multiple lib/encodings entries to have the same LaTeX name (but different packages!). The tex2lyx parser needs to worry about the iconv name of the input encoding, so store that instead of the latex name.
286 lines
8.5 KiB
C++
286 lines
8.5 KiB
C++
// -*- C++ -*-
/**
 * \file Parser.h
 * This file is part of LyX, the document processor.
 * Licence details can be found in the file COPYING.
 *
 * \author André Pönitz
 *
 * Full author contact details are available in file CREDITS.
 */
|
|
|
|
#ifndef PARSER_H
|
|
#define PARSER_H
|
|
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "support/docstream.h"
|
|
|
|
namespace lyx {
|
|
|
|
|
|
/// Mode values handled by asMode(); enumerators are numbered from 0 in
/// declaration order.
enum mode_type {
	UNDECIDED_MODE,
	TEXT_MODE,
	MATH_MODE,
	MATHTEXT_MODE,
	TABLE_MODE
};
|
|
|
|
/// Maps the previous mode \p oldmode and the string \p str to a mode_type.
/// NOTE(review): the mapping rules live in the implementation file and are
/// not visible from this header -- confirm there before relying on them.
mode_type asMode(mode_type oldmode, std::string const & str);
|
|
|
|
|
|
/// These are TeX's catcodes.
/// The enumerators are numbered implicitly from 0, so each enumerator's
/// value equals the TeX category code given in its comment.
enum CatCode {
	catEscape,     //  0  backslash
	catBegin,      //  1  {
	catEnd,        //  2  }
	catMath,       //  3  $
	catAlign,      //  4  &
	catNewline,    //  5  ^^M
	catParameter,  //  6  #
	catSuper,      //  7  ^
	catSub,        //  8  _
	catIgnore,     //  9
	catSpace,      // 10  space
	catLetter,     // 11  a-zA-Z
	catOther,      // 12  none of the above
	catActive,     // 13  ~
	catComment,    // 14  %
	catInvalid     // 15  <delete>
};
|
|
|
|
|
|
/// Bit flags (each value is a distinct single bit, so they can be combined
/// with bitwise OR). Note that bit 0 is not used by any flag.
enum {
	FLAG_BRACE_LAST = 1 << 1,  // last closing brace ends the parsing
	FLAG_RIGHT      = 1 << 2,  // next \\right ends the parsing process
	FLAG_END        = 1 << 3,  // next \\end ends the parsing process
	FLAG_BRACK_LAST = 1 << 4,  // next closing bracket ends the parsing
	FLAG_TEXTMODE   = 1 << 5,  // we are in a box
	FLAG_ITEM       = 1 << 6,  // read a (possibly braced) token
	FLAG_LEAVE      = 1 << 7,  // leave the loop at the end
	FLAG_SIMPLE     = 1 << 8,  // next $ leaves the loop
	FLAG_EQUATION   = 1 << 9,  // next \] leaves the loop
	FLAG_SIMPLE2    = 1 << 10, // next \) leaves the loop
	FLAG_OPTION     = 1 << 11, // read [...] style option
	FLAG_BRACED     = 1 << 12, // read {...} style argument
	FLAG_CELL       = 1 << 13, // read table cell
	FLAG_TABBING    = 1 << 14  // We are inside a tabbing environment
};
|
|
|
|
|
|
|
|
//
|
|
// Helper class for parsing
|
|
//
|
|
|
|
class Token {
|
|
public:
|
|
///
|
|
Token() : cs_(), cat_(catIgnore) {}
|
|
///
|
|
Token(docstring const & cs, CatCode cat) : cs_(to_utf8(cs)), cat_(cat) {}
|
|
|
|
/// Returns the token as string
|
|
std::string const & cs() const { return cs_; }
|
|
/// Returns the catcode of the token
|
|
CatCode cat() const { return cat_; }
|
|
/** Get the character of tokens that were constructed from a single
|
|
* character input or a two character input and cat_ == catEscape.
|
|
* FIXME: The intended usage is not clear. The Token class in
|
|
* ../mathed/MathParser.cpp (which is the anchestor of this
|
|
* class) uses a separate char member for this method. I
|
|
* believe that the intended usage is to not cover tokens with
|
|
* catEscape or catComment, e.g. \code
|
|
* return (cs_.empty() || cat_ == catEscape || cat_ == catComment) ? 0 : cs_[0];
|
|
* \endcode
|
|
* All usages of this method should be checked. gb 2011-01-05
|
|
*/
|
|
char character() const { return cs_.empty() ? 0 : cs_[0]; }
|
|
/// Returns the token verbatim
|
|
std::string asInput() const;
|
|
/// Is the token an alphanumerical character?
|
|
bool isAlnumASCII() const;
|
|
|
|
private:
|
|
///
|
|
std::string cs_;
|
|
///
|
|
CatCode cat_;
|
|
};
|
|
|
|
/// Stream insertion operator for Token: writes \p t to \p os and returns a
/// stream reference. The exact output format is defined in the
/// implementation file (not visible from this header).
std::ostream & operator<<(std::ostream & os, Token const & t);
|
|
|
|
#ifdef FILEDEBUG
/// Debugging helper, only declared when FILEDEBUG is defined.
/// Takes the output stream \p os, the token \p t and a set of \p flags.
/// NOTE(review): \p flags is presumably a combination of the FLAG_*
/// values -- confirm against the definition.
extern void debugToken(std::ostream & os, Token const & t, unsigned int flags);
#endif
|
|
|
|
|
|
/*!
|
|
* Actual parser class
|
|
*
|
|
* The parser parses every character of the inputstream into a token
|
|
* and classifies the token.
|
|
* The following transformations are done:
|
|
* - Consecutive spaces are combined into one single token with CatCode catSpace
|
|
* - Consecutive newlines are combined into one single token with CatCode catNewline
|
|
* - Comments and %\n combinations are parsed into one token with CatCode catComment
|
|
*/
|
|
|
|
class Parser {
|
|
/// noncopyable
|
|
Parser(Parser const & p);
|
|
Parser & operator=(Parser const & p);
|
|
public:
|
|
///
|
|
Parser(idocstream & is);
|
|
///
|
|
Parser(std::string const & s);
|
|
///
|
|
~Parser();
|
|
|
|
/// change the iconv encoding of the input stream
|
|
/// according to the latex encoding and package
|
|
void setEncoding(std::string const & encoding, int const & package);
|
|
/// change the iconv encoding of the input stream
|
|
void setEncoding(std::string const & encoding);
|
|
/// get the current iconv encoding of the input stream
|
|
std::string getEncoding() const { return encoding_iconv_; }
|
|
|
|
///
|
|
int lineno() const { return lineno_; }
|
|
///
|
|
void putback();
|
|
/// store current position
|
|
void pushPosition();
|
|
/// restore previous position
|
|
void popPosition();
|
|
/// dump contents to screen
|
|
void dump() const;
|
|
|
|
/// Does an optional argument follow after the current token?
|
|
bool hasOpt();
|
|
///
|
|
typedef std::pair<bool, std::string> Arg;
|
|
/*!
|
|
* Get an argument enclosed by \p left and \p right.
|
|
* If \p allow_escaping is true, a right delimiter escaped by a
|
|
* backslash does not count as delimiter, but is included in the
|
|
* argument.
|
|
* \returns wether an argument was found in \p Arg.first and the
|
|
* argument in \p Arg.second. \see getArg().
|
|
*/
|
|
Arg getFullArg(char left, char right, bool allow_escaping = true);
|
|
/*!
|
|
* Get an argument enclosed by \p left and \p right.
|
|
* If \p allow_escaping is true, a right delimiter escaped by a
|
|
* backslash does not count as delimiter, but is included in the
|
|
* argument.
|
|
* \returns the argument (without \p left and \p right) or the empty
|
|
* string if the next non-space token is not \p left. Use
|
|
* getFullArg() if you need to know wether there was an empty
|
|
* argument or no argument at all.
|
|
*/
|
|
std::string getArg(char left, char right, bool allow_escaping = true);
|
|
/*!
|
|
* Like getOpt(), but distinguishes between a missing argument ""
|
|
* and an empty argument "[]".
|
|
*/
|
|
std::string getFullOpt(bool keepws = false);
|
|
/*!
|
|
* \returns getArg('[', ']') including the brackets or the
|
|
* empty string if there is no such argument.
|
|
* No whitespace is eaten if \p keepws is true and no optional
|
|
* argument exists. This is important if an optional argument is
|
|
* parsed that would go after a command in ERT: In this case the
|
|
* whitespace is needed to separate the ERT from the subsequent
|
|
* word. Without it, the ERT and the next word would be concatenated
|
|
* during .tex export, thus creating an invalid command.
|
|
*/
|
|
std::string getOpt(bool keepws = false);
|
|
/*!
|
|
* \returns getFullArg('(', ')') including the parentheses or the
|
|
* empty string if there is no such argument.
|
|
*/
|
|
std::string getFullParentheseArg();
|
|
/*!
|
|
* \returns the contents of the environment \p name.
|
|
* <tt>\begin{name}</tt> must be parsed already, <tt>\end{name}</tt>
|
|
* is parsed but not returned.
|
|
*/
|
|
std::string const verbatimEnvironment(std::string const & name);
|
|
/*
|
|
* The same as verbatimEnvironment(std::string const & name) but
|
|
* \begin and \end commands inside the name environment are not parsed.
|
|
* This function is designed to parse verbatim environments.
|
|
*/
|
|
std::string const plainEnvironment(std::string const & name);
|
|
/*
|
|
* Basically the same as plainEnvironment(std::string const & name) but
|
|
* instead of \begin and \end commands the parsing is started/stopped
|
|
* at given characters.
|
|
* This function is designed to parse verbatim commands.
|
|
*/
|
|
std::string const plainCommand(char left, char right, std::string const & name);
|
|
/*!
|
|
* Returns the character of the current token and increments
|
|
* the token position.
|
|
*/
|
|
char getChar();
|
|
///
|
|
void error(std::string const & msg);
|
|
/// Parses one token from \p is
|
|
void tokenize_one();
|
|
///
|
|
void push_back(Token const & t);
|
|
/// The previous token.
|
|
Token const prev_token() const;
|
|
/// The current token.
|
|
Token const curr_token() const;
|
|
/// The next token.
|
|
Token const next_token();
|
|
/// The next but one token.
|
|
Token const next_next_token();
|
|
/// Make the next token current and return that.
|
|
Token const get_token();
|
|
/// \return whether the current token starts a new paragraph
|
|
bool isParagraph();
|
|
/// skips spaces (and comments if \p skip_comments is true)
|
|
/// \return whether whitespace was skipped (not comments)
|
|
bool skip_spaces(bool skip_comments = false);
|
|
/// puts back spaces (and comments if \p skip_comments is true)
|
|
void unskip_spaces(bool skip_comments = false);
|
|
///
|
|
void lex(std::string const & s);
|
|
///
|
|
bool good();
|
|
///
|
|
std::string verbatim_item();
|
|
///
|
|
std::string verbatimOption();
|
|
/// resets the parser to initial state
|
|
void reset();
|
|
///
|
|
void setCatCode(char c, CatCode cat);
|
|
///
|
|
CatCode getCatCode(char c) const;
|
|
|
|
private:
|
|
///
|
|
int lineno_;
|
|
///
|
|
std::vector<Token> tokens_;
|
|
///
|
|
unsigned pos_;
|
|
///
|
|
std::vector<unsigned> positions_;
|
|
///
|
|
idocstringstream * iss_;
|
|
///
|
|
idocstream & is_;
|
|
/// iconv name of the current encoding
|
|
std::string encoding_iconv_;
|
|
};
|
|
|
|
|
|
|
|
} // namespace lyx
|
|
|
|
#endif
|