lyx_mirror/src/tex2lyx/Parser.cpp
2022-12-29 09:37:00 +01:00

937 lines
20 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* \file Parser.cpp
* This file is part of LyX, the document processor.
* Licence details can be found in the file COPYING.
*
* \author André Pönitz
*
* Full author contact details are available in file CREDITS.
*/
#include <config.h>
#include "Parser.h"
#include "tex2lyx.h"
#include "Encoding.h"
#include "support/convert.h"
#include "support/lstrings.h"
#include "support/textutils.h"
#include <cstdint>
#include <iostream>
using namespace std;
using namespace lyx::support;
namespace lyx {
namespace {
/*!
* Translate a line ending to '\n'.
* \p c must have catcode catNewline, and it must be the last character read
* from \p is.
*/
char_type getNewline(iparserdocstream & is, char_type c)
{
// we have to handle 3 different line endings:
// - UNIX (\n)
// - MAC (\r)
// - DOS (\r\n)
if (c == '\r') {
// MAC or DOS
char_type wc;
if (is.get(wc) && wc != '\n') {
// MAC
is.putback(wc);
}
return '\n';
}
// UNIX
return c;
}
} // namespace
//
// Token
//
ostream & operator<<(ostream & os, Token const & t)
{
if (t.cat() == catComment)
os << '%' << t.cs() << '\n';
else if (t.cat() == catSpace)
os << t.cs();
else if (t.cat() == catEscape)
os << '\\' << t.cs() << ' ';
else if (t.cat() == catLetter)
os << t.cs();
else if (t.cat() == catNewline)
os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
else
os << '[' << t.cs() << ',' << t.cat() << ']';
return os;
}
string Token::asInput() const
{
if (cat_ == catComment)
return '%' + cs_ + '\n';
if (cat_ == catEscape)
return '\\' + cs_;
return cs_;
}
bool Token::isAlnumASCII() const
{
return cat_ == catLetter ||
(cat_ == catOther && cs_.length() == 1 && isDigitASCII(cs_[0]));
}
#ifdef FILEDEBUG
void debugToken(std::ostream & os, Token const & t, unsigned int flags)
{
char sep = ' ';
os << "t: " << t << " flags: " << flags;
if (flags & FLAG_BRACE_LAST) { os << sep << "BRACE_LAST"; sep = '|'; }
if (flags & FLAG_RIGHT ) { os << sep << "RIGHT" ; sep = '|'; }
if (flags & FLAG_END ) { os << sep << "END" ; sep = '|'; }
if (flags & FLAG_BRACK_LAST) { os << sep << "BRACK_LAST"; sep = '|'; }
if (flags & FLAG_TEXTMODE ) { os << sep << "TEXTMODE" ; sep = '|'; }
if (flags & FLAG_ITEM ) { os << sep << "ITEM" ; sep = '|'; }
if (flags & FLAG_LEAVE ) { os << sep << "LEAVE" ; sep = '|'; }
if (flags & FLAG_SIMPLE ) { os << sep << "SIMPLE" ; sep = '|'; }
if (flags & FLAG_EQUATION ) { os << sep << "EQUATION" ; sep = '|'; }
if (flags & FLAG_SIMPLE2 ) { os << sep << "SIMPLE2" ; sep = '|'; }
if (flags & FLAG_OPTION ) { os << sep << "OPTION" ; sep = '|'; }
if (flags & FLAG_BRACED ) { os << sep << "BRACED" ; sep = '|'; }
if (flags & FLAG_CELL ) { os << sep << "CELL" ; sep = '|'; }
if (flags & FLAG_TABBING ) { os << sep << "TABBING" ; sep = '|'; }
os << "\n";
}
#endif
//
// Wrapper
//
void iparserdocstream::setEncoding(std::string const & e)
{
is_ << lyx::setEncoding(e);
}
void iparserdocstream::putback(char_type c)
{
s_ = c + s_;
}
void iparserdocstream::putback(docstring const & s)
{
s_ = s + s_;
}
iparserdocstream & iparserdocstream::get(char_type &c)
{
if (s_.empty())
is_.get(c);
else {
//warning_message("unparsed: " + to_utf8(s_));
c = s_[0];
s_.erase(0,1);
}
return *this;
}
//
// Parser
//
Parser::Parser(idocstream & is, std::string const & fixedenc)
: lineno_(0), pos_(0), iss_(nullptr), is_(is),
encoding_iconv_(fixedenc.empty() ? "UTF-8" : fixedenc),
theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES),
fixed_enc_(!fixedenc.empty())
{
if (fixed_enc_)
is_.setEncoding(fixedenc);
catInit();
}
Parser::Parser(string const & s)
: lineno_(0), pos_(0),
iss_(new idocstringstream(from_utf8(s))), is_(*iss_),
encoding_iconv_("UTF-8"),
theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES),
// An idocstringstream can not change the encoding
fixed_enc_(true)
{
catInit();
}
Parser::~Parser()
{
delete iss_;
}
void Parser::deparse()
{
string s;
for(size_type i = pos_ ; i < tokens_.size() ; ++i) {
s += tokens_[i].asInput();
}
is_.putback(from_utf8(s));
tokens_.erase(tokens_.begin() + pos_, tokens_.end());
// make sure that next token is read
tokenize_one();
}
bool Parser::setEncoding(std::string const & e, int p)
{
// We may (and need to) use unsafe encodings here: Since the text is
// converted to unicode while reading from is_, we never see text in
// the original encoding of the parser, but operate on utf8 strings
// instead. Therefore, we cannot misparse high bytes as {, } or \\.
Encoding const * const enc = encodings.fromLaTeXName(e, p, true);
if (!enc) {
warning_message("Unknown encoding " + e + ". Ignoring.");
return false;
}
return setEncoding(enc->iconvName());
}
void Parser::catInit()
{
if (curr_cat_ == theCatcodesType_)
return;
curr_cat_ = theCatcodesType_;
fill(theCatcode_, theCatcode_ + 256, catOther);
fill(theCatcode_ + 'a', theCatcode_ + 'z' + 1, catLetter);
fill(theCatcode_ + 'A', theCatcode_ + 'Z' + 1, catLetter);
// This is wrong!
theCatcode_[int('@')] = catLetter;
if (theCatcodesType_ == NORMAL_CATCODES) {
theCatcode_[int('\\')] = catEscape;
theCatcode_[int('{')] = catBegin;
theCatcode_[int('}')] = catEnd;
theCatcode_[int('$')] = catMath;
theCatcode_[int('&')] = catAlign;
theCatcode_[int('\n')] = catNewline;
theCatcode_[int('#')] = catParameter;
theCatcode_[int('^')] = catSuper;
theCatcode_[int('_')] = catSub;
theCatcode_[0x7f] = catIgnore;
theCatcode_[int(' ')] = catSpace;
theCatcode_[int('\t')] = catSpace;
theCatcode_[int('\r')] = catNewline;
theCatcode_[int('~')] = catActive;
theCatcode_[int('%')] = catComment;
}
}
CatCode Parser::catcode(char_type c) const
{
if (c < 256)
return theCatcode_[(unsigned char)c];
return catOther;
}
void Parser::setCatcode(char c, CatCode cat)
{
theCatcode_[(unsigned char)c] = cat;
deparse();
}
void Parser::setCatcodes(cat_type t)
{
theCatcodesType_ = t;
deparse();
}
bool Parser::setEncoding(std::string const & e)
{
//warning_message("setting encoding to " + e);
encoding_iconv_ = e;
// If the encoding is fixed, we must not change the stream encoding
// (because the whole input uses that encoding, e.g. if it comes from
// the clipboard). We still need to track the original encoding in
// encoding_iconv_, so that the generated output is correct.
if (!fixed_enc_)
is_.setEncoding(e);
return true;
}
void Parser::push_back(Token const & t)
{
tokens_.push_back(t);
}
// We return a copy here because the tokens_ vector may get reallocated
Token const Parser::prev_token() const
{
static const Token dummy;
return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
}
// We return a copy here because the tokens_ vector may get reallocated
Token const Parser::curr_token() const
{
static const Token dummy;
return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
}
// We return a copy here because the tokens_ vector may get reallocated
Token const Parser::next_token()
{
static const Token dummy;
if (!good())
return dummy;
if (pos_ >= tokens_.size())
tokenize_one();
return pos_ < tokens_.size() ? tokens_[pos_] : dummy;
}
// We return a copy here because the tokens_ vector may get reallocated
Token const Parser::next_next_token()
{
static const Token dummy;
if (!good())
return dummy;
// If tokenize_one() has not been called after the last get_token() we
// need to tokenize two more tokens.
if (pos_ >= tokens_.size())
tokenize_one();
if (pos_ + 1 >= tokens_.size())
tokenize_one();
return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy;
}
// We return a copy here because the tokens_ vector may get reallocated
Token const Parser::get_token()
{
static const Token dummy;
if (!good())
return dummy;
if (pos_ >= tokens_.size()) {
tokenize_one();
if (pos_ >= tokens_.size())
return dummy;
}
// warning_message("looking at token " + tokens_[pos_]
// + " pos: " + pos_ <<);
return tokens_[pos_++];
}
bool Parser::isParagraph()
{
// A new paragraph in TeX is started
// - either by a newline, following any amount of whitespace
// characters (including zero), and another newline
// - or the token \par
if (curr_token().cat() == catNewline &&
(curr_token().cs().size() > 1 ||
(next_token().cat() == catSpace &&
next_next_token().cat() == catNewline)))
return true;
if (curr_token().cat() == catEscape && curr_token().cs() == "par")
return true;
return false;
}
bool Parser::skip_spaces(bool skip_comments)
{
// We just silently return if we have no more tokens.
// skip_spaces() should be callable at any time,
// the caller must check p::good() anyway.
bool skipped = false;
while (good()) {
get_token();
if (isParagraph()) {
putback();
break;
}
if (curr_token().cat() == catSpace ||
curr_token().cat() == catNewline) {
skipped = true;
continue;
}
if ((curr_token().cat() == catComment && curr_token().cs().empty()))
continue;
if (skip_comments && curr_token().cat() == catComment) {
// If positions_ is not empty we are doing some kind
// of look ahead
if (!positions_.empty())
warning_message("Ignoring comment: " + curr_token().asInput());
} else {
putback();
break;
}
}
return skipped;
}
void Parser::unskip_spaces(bool skip_comments)
{
while (pos_ > 0) {
if ( curr_token().cat() == catSpace ||
(curr_token().cat() == catNewline && curr_token().cs().size() == 1))
putback();
else if (skip_comments && curr_token().cat() == catComment) {
// TODO: Get rid of this
// If positions_ is not empty we are doing some kind
// of look ahead
if (!positions_.empty())
warning_message("Unignoring comment: " + curr_token().asInput());
putback();
}
else
break;
}
}
void Parser::putback()
{
--pos_;
}
void Parser::pushPosition()
{
positions_.push_back(pos_);
}
void Parser::popPosition()
{
pos_ = positions_.back();
positions_.pop_back();
deparse();
}
void Parser::dropPosition()
{
positions_.pop_back();
}
bool Parser::good() const
{
if (pos_ < tokens_.size())
return true;
if (!is_.good())
return false;
return is_.peek() != idocstream::traits_type::eof();
}
bool Parser::hasOpt(string const & l)
{
// An optional argument can occur in any of the following forms:
// - \foo[bar]
// - \foo [bar]
// - \foo
// [bar]
// - \foo %comment
// [bar]
// remember current position
unsigned int oldpos = pos_;
// skip spaces and comments
while (good()) {
get_token();
if (isParagraph()) {
putback();
break;
}
if (curr_token().cat() == catSpace ||
curr_token().cat() == catNewline ||
curr_token().cat() == catComment)
continue;
putback();
break;
}
bool const retval = (next_token().asInput() == l);
pos_ = oldpos;
return retval;
}
bool Parser::hasIdxMacros(string const & c, string const & e)
{
// Check for index entry separator (! or @),
// consider escaping via "
// \p e marks a terminating delimiter¸
// remember current position
unsigned int oldpos = pos_;
// skip spaces and comments
bool retval = false;
while (good()) {
get_token();
if (isParagraph()) {
putback();
break;
}
if (curr_token().cat() == catEnd)
break;
if (!e.empty() && curr_token().asInput() == e
&& prev_token().asInput() != "\"")
break;
if (curr_token().asInput() == c
&& prev_token().asInput() != "\"") {
retval = true;
break;
}
continue;
}
pos_ = oldpos;
return retval;
}
Parser::Arg Parser::getFullArg(char left, char right, bool allow_escaping, char e)
{
skip_spaces(true);
// This is needed if a partial file ends with a command without arguments,
// e. g. \medskip
if (! good())
return make_pair(false, string());
int group_level = (left == '{') ? 1 : 0;
string result;
Token t = get_token();
if (left != char()
&& (t.cat() == catComment || t.cat() == catEscape
|| t.character() != left)) {
putback();
return make_pair(false, string());
} else {
while (good()) {
t = get_token();
// honor grouping
if (t.cat() == catBegin) {
++group_level;
if (left != '{')
continue;
}
if (group_level > 0 && t.cat() == catEnd) {
--group_level;
if (left != '{')
continue;
}
// Ignore comments
if (t.cat() == catComment) {
if (!t.cs().empty())
warning_message("Ignoring comment: " + t.asInput());
continue;
}
if (allow_escaping) {
if (t.cat() != catEscape && t.character() == right
&& group_level == 0)
break;
} else if (e != char()) {
if (prev_token().character() != e && t.character() == right
&& group_level == 0)
break;
} else {
if (t.character() == right) {
if (t.cat() == catEscape)
result += '\\';
if (group_level == 0)
break;
}
}
result += t.asInput();
}
}
return make_pair(true, result);
}
string Parser::getArg(char left, char right, bool allow_escaping, char e)
{
return getFullArg(left, right, allow_escaping, e).second;
}
string Parser::getFullOpt(bool keepws, char left, char right)
{
Arg arg = getFullArg(left, right);
if (arg.first)
return left + arg.second + right;
if (keepws)
unskip_spaces(true);
return string();
}
string Parser::getOpt(bool keepws)
{
string const res = getArg('[', ']');
if (res.empty()) {
if (keepws)
unskip_spaces(true);
return string();
}
return '[' + res + ']';
}
string Parser::getFullParentheseArg()
{
Arg arg = getFullArg('(', ')');
if (arg.first)
return '(' + arg.second + ')';
return string();
}
bool Parser::hasListPreamble(string const & itemcmd)
{
// remember current position
unsigned int oldpos = pos_;
// jump over arguments
if (hasOpt())
getOpt();
if (hasOpt("{"))
getArg('{', '}');
// and swallow spaces and comments
skip_spaces(true);
// we have a list preamble if the next thing
// that follows is not the \item command
bool res = next_token().cs() != itemcmd;
// back to orig position
pos_ = oldpos;
return res;
}
string const Parser::ertEnvironment(string const & name)
{
if (!good())
return string();
ostringstream os;
for (Token t = get_token(); good(); t = get_token()) {
if (t.cat() == catBegin) {
putback();
os << '{' << verbatim_item() << '}';
} else if (t.asInput() == "\\begin") {
string const env = getArg('{', '}');
os << "\\begin{" << env << '}'
<< ertEnvironment(env)
<< "\\end{" << env << '}';
} else if (t.asInput() == "\\end") {
string const end = getArg('{', '}');
if (end != name)
warning_message("\\end{" + end
+ "} does not match \\begin{"
+ name + "}.");
return os.str();
} else
os << t.asInput();
}
warning_message("unexpected end of input");
return os.str();
}
string const Parser::plainEnvironment(string const & name)
{
if (!good())
return string();
ostringstream os;
for (Token t = get_token(); good(); t = get_token()) {
if (t.asInput() == "\\end") {
string const end = getArg('{', '}');
if (end == name)
return os.str();
else
os << "\\end{" << end << '}';
} else
os << t.asInput();
}
warning_message("unexpected end of input");
return os.str();
}
string const Parser::plainCommand(char left, char right, string const & name)
{
if (!good())
return string();
// check if first token is really the start character
Token tok = get_token();
if (tok.character() != left) {
warning_message("first character does not match start character of command \\" + name);
return string();
}
ostringstream os;
for (Token t = get_token(); good(); t = get_token()) {
if (t.character() == right) {
return os.str();
} else
os << t.asInput();
}
warning_message("unexpected end of input");
return os.str();
}
string const Parser::getCommandLatexParam()
{
if (!good())
return string();
string res;
size_t offset = 0;
while (true) {
if (pos_ + offset >= tokens_.size())
tokenize_one();
if (pos_ + offset >= tokens_.size())
break;
Token t = tokens_[pos_ + offset];
if (t.cat() == catBegin)
break;
res += t.asInput();
++offset;
}
return res;
}
Parser::Arg Parser::verbatimStuff(string const & end_string, bool const allow_linebreak)
{
if (!good())
return Arg(false, string());
pushPosition();
ostringstream oss;
size_t match_index = 0;
setCatcodes(VERBATIM_CATCODES);
for (Token t = get_token(); good(); t = get_token()) {
// FIXME t.asInput() might be longer than we need ?
if (t.asInput() == end_string.substr(match_index,
t.asInput().length())) {
match_index += t.asInput().length();
if (match_index >= end_string.length())
break;
} else {
if (!allow_linebreak && t.asInput() == "\n") {
warning_message("unexpected end of input");
popPosition();
setCatcodes(NORMAL_CATCODES);
return Arg(false, string());
}
if (match_index) {
oss << end_string.substr(0, match_index)
<< t.asInput();
match_index = 0;
} else
oss << t.asInput();
}
}
if (!good()) {
warning_message("unexpected end of input");
popPosition();
setCatcodes(NORMAL_CATCODES);
return Arg(false, string());
}
setCatcodes(NORMAL_CATCODES);
dropPosition();
return Arg(true, oss.str());
}
string const Parser::verbatimEnvironment(string const & name)
{
//FIXME: do something if endstring is not found
string s = verbatimStuff("\\end{" + name + "}").second;
// ignore one newline at beginning or end of string
if (prefixIs(s, "\n"))
s.erase(0,1);
if (suffixIs(s, "\n"))
s.erase(s.length() - 1,1);
return s;
}
string Parser::verbatimOption()
{
string res;
if (next_token().character() == '[') {
Token t = get_token();
for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
if (t.cat() == catBegin) {
putback();
res += '{' + verbatim_item() + '}';
} else
res += t.asInput();
}
}
return res;
}
string Parser::verbatim_item()
{
if (!good())
error("stream bad");
skip_spaces();
if (next_token().cat() == catBegin) {
Token t = get_token(); // skip brace
string res;
for (t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
if (t.cat() == catBegin) {
putback();
res += '{' + verbatim_item() + '}';
}
else
res += t.asInput();
}
return res;
}
return get_token().asInput();
}
void Parser::tokenize_one()
{
catInit();
char_type c;
if (!is_.get(c))
return;
switch (catcode(c)) {
case catSpace: {
docstring s(1, c);
while (is_.get(c) && catcode(c) == catSpace)
s += c;
if (catcode(c) != catSpace)
is_.putback(c);
push_back(Token(s, catSpace));
break;
}
case catNewline: {
++lineno_;
docstring s(1, getNewline(is_, c));
while (is_.get(c) && catcode(c) == catNewline) {
++lineno_;
s += getNewline(is_, c);
}
if (catcode(c) != catNewline)
is_.putback(c);
push_back(Token(s, catNewline));
break;
}
case catComment: {
// We don't treat "%\n" combinations here specially because
// we want to preserve them in the preamble
docstring s;
while (is_.get(c) && catcode(c) != catNewline)
s += c;
// handle possible DOS line ending
if (catcode(c) == catNewline)
c = getNewline(is_, c);
// Note: The '%' at the beginning and the '\n' at the end
// of the comment are not stored.
++lineno_;
push_back(Token(s, catComment));
break;
}
case catEscape: {
is_.get(c);
if (!is_) {
error("unexpected end of input");
} else {
docstring s(1, c);
if (catcode(c) == catLetter) {
// collect letters
while (is_.get(c) && catcode(c) == catLetter)
s += c;
if (catcode(c) != catLetter)
is_.putback(c);
}
push_back(Token(s, catEscape));
}
break;
}
case catIgnore: {
warning_message("ignoring a char: " + std::to_string(static_cast<uint32_t>(c)));
break;
}
default:
push_back(Token(docstring(1, c), catcode(c)));
}
//warning_message(tokens_.back());
}
void Parser::dump() const
{
cerr << "\nTokens: ";
for (unsigned i = 0; i < tokens_.size(); ++i) {
if (i == pos_)
cerr << " <#> ";
cerr << tokens_[i];
}
cerr << " pos: " << pos_ << "\n";
}
void Parser::error(string const & msg) const
{
error_message("Line ~" + convert<string>(lineno_) + ": parse error: " + msg);
dump();
//exit(1);
}
void Parser::reset()
{
pos_ = 0;
}
} // namespace lyx