mirror of
https://git.lyx.org/repos/lyx.git
synced 2024-11-14 23:12:31 +00:00
28dc8a77ee
Commit 7cfac95
got rid of empty lines that were created by removing \usepackage
statements. However, it added an additional newline in case the \usepackage
was not at the end of the line. This is now fixed.
693 lines
14 KiB
C++
693 lines
14 KiB
C++
/**
|
|
* \file Parser.cpp
|
|
* This file is part of LyX, the document processor.
|
|
* Licence details can be found in the file COPYING.
|
|
*
|
|
* \author André Pönitz
|
|
*
|
|
* Full author contact details are available in file CREDITS.
|
|
*/
|
|
|
|
#include <config.h>
|
|
|
|
#include "Encoding.h"
|
|
#include "Parser.h"
|
|
#include "support/textutils.h"
|
|
|
|
#include <iostream>
|
|
|
|
using namespace std;
|
|
|
|
namespace lyx {
|
|
|
|
namespace {
|
|
|
|
CatCode theCatcode[256];
|
|
|
|
void catInit()
|
|
{
|
|
static bool init_done = false;
|
|
if (init_done)
|
|
return;
|
|
init_done = true;
|
|
|
|
fill(theCatcode, theCatcode + 256, catOther);
|
|
fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
|
|
fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
|
|
|
|
theCatcode[int('\\')] = catEscape;
|
|
theCatcode[int('{')] = catBegin;
|
|
theCatcode[int('}')] = catEnd;
|
|
theCatcode[int('$')] = catMath;
|
|
theCatcode[int('&')] = catAlign;
|
|
theCatcode[int('\n')] = catNewline;
|
|
theCatcode[int('#')] = catParameter;
|
|
theCatcode[int('^')] = catSuper;
|
|
theCatcode[int('_')] = catSub;
|
|
theCatcode[0x7f] = catIgnore;
|
|
theCatcode[int(' ')] = catSpace;
|
|
theCatcode[int('\t')] = catSpace;
|
|
theCatcode[int('\r')] = catNewline;
|
|
theCatcode[int('~')] = catActive;
|
|
theCatcode[int('%')] = catComment;
|
|
|
|
// This is wrong!
|
|
theCatcode[int('@')] = catLetter;
|
|
}
|
|
|
|
/*!
|
|
* Translate a line ending to '\n'.
|
|
* \p c must have catcode catNewline, and it must be the last character read
|
|
* from \p is.
|
|
*/
|
|
char_type getNewline(idocstream & is, char_type c)
|
|
{
|
|
// we have to handle 3 different line endings:
|
|
// - UNIX (\n)
|
|
// - MAC (\r)
|
|
// - DOS (\r\n)
|
|
if (c == '\r') {
|
|
// MAC or DOS
|
|
char_type wc;
|
|
if (is.get(wc) && wc != '\n') {
|
|
// MAC
|
|
is.putback(wc);
|
|
}
|
|
return '\n';
|
|
}
|
|
// UNIX
|
|
return c;
|
|
}
|
|
|
|
/*!
 * Look up the category code of \p c.
 * Characters below 256 are looked up in theCatcode; every other
 * character is reported as catOther.
 */
CatCode catcode(char_type c)
{
	return c < 256 ? theCatcode[(unsigned char)c] : catOther;
}
|
|
|
|
}
|
|
|
|
|
|
//
|
|
// Token
|
|
//
|
|
|
|
/*!
 * Write a debugging representation of \p t to \p os.
 * Comments, spaces, control sequences and letters are written roughly the
 * way they would appear in the input; all other tokens are written as
 * "[contents,catcode]".
 */
ostream & operator<<(ostream & os, Token const & t)
{
	switch (t.cat()) {
	case catComment:
		os << '%' << t.cs() << '\n';
		break;
	case catSpace:
	case catLetter:
		os << t.cs();
		break;
	case catEscape:
		os << '\\' << t.cs() << ' ';
		break;
	case catNewline:
		// show the number of collected newlines instead of the newlines
		os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
		break;
	default:
		os << '[' << t.cs() << ',' << t.cat() << ']';
		break;
	}
	return os;
}
|
|
|
|
|
|
/*!
 * \returns the token as text: a comment gets its leading '%' and trailing
 * '\n' back, a control sequence gets its leading backslash back, and
 * everything else is returned as stored.
 */
string Token::asInput() const
{
	switch (cat_) {
	case catComment:
		return '%' + cs_ + '\n';
	case catEscape:
		return '\\' + cs_;
	default:
		return cs_;
	}
}
|
|
|
|
|
|
bool Token::isAlnumASCII() const
|
|
{
|
|
return cat_ == catLetter ||
|
|
(cat_ == catOther && cs_.length() == 1 && isDigitASCII(cs_[0]));
|
|
}
|
|
|
|
|
|
#ifdef FILEDEBUG
/*!
 * Write the token \p t, the numeric value of \p flags and the names of all
 * flags set in \p flags to \p os, followed by a newline.
 * The first flag name is preceded by a space, every further one by '|'.
 */
void debugToken(std::ostream & os, Token const & t, unsigned int flags)
{
	struct FlagName {
		unsigned int mask;
		char const * label;
	};
	// The order of this table is the order of the output.
	static FlagName const names[] = {
		{ FLAG_BRACE_LAST, "BRACE_LAST" },
		{ FLAG_RIGHT,      "RIGHT"      },
		{ FLAG_END,        "END"        },
		{ FLAG_BRACK_LAST, "BRACK_LAST" },
		{ FLAG_TEXTMODE,   "TEXTMODE"   },
		{ FLAG_ITEM,       "ITEM"       },
		{ FLAG_LEAVE,      "LEAVE"      },
		{ FLAG_SIMPLE,     "SIMPLE"     },
		{ FLAG_EQUATION,   "EQUATION"   },
		{ FLAG_SIMPLE2,    "SIMPLE2"    },
		{ FLAG_OPTION,     "OPTION"     },
		{ FLAG_BRACED,     "BRACED"     },
		{ FLAG_CELL,       "CELL"       },
		{ FLAG_TABBING,    "TABBING"    }
	};

	os << "t: " << t << " flags: " << flags;
	char sep = ' ';
	for (size_t i = 0; i != sizeof(names) / sizeof(names[0]); ++i) {
		if (flags & names[i].mask) {
			os << sep << names[i].label;
			sep = '|';
		}
	}
	os << "\n";
}
#endif
|
|
|
|
|
|
//
|
|
// Parser
|
|
//
|
|
|
|
|
|
/*!
 * Construct a parser that reads from the caller-provided stream \p is.
 * No stream is allocated here (iss_ stays null), so the destructor has
 * nothing to free. The LaTeX encoding name starts out as "utf8".
 */
Parser::Parser(idocstream & is)
	: lineno_(0),
	  pos_(0),
	  iss_(0),
	  is_(is),
	  encoding_latex_("utf8")
{}
|
|
|
|
|
|
/*!
 * Construct a parser that reads from the string \p s.
 * \p s is converted with from_utf8() and wrapped in a newly allocated
 * idocstringstream, which is released again in the destructor.
 * The LaTeX encoding name starts out as "utf8".
 */
Parser::Parser(string const & s)
	: lineno_(0),
	  pos_(0),
	  iss_(new idocstringstream(from_utf8(s))),
	  is_(*iss_),
	  encoding_latex_("utf8")
{}
|
|
|
|
|
|
/// Release the string stream allocated by the string constructor.
/// iss_ is null when the parser was constructed from an external stream.
Parser::~Parser()
{
	delete iss_;
}
|
|
|
|
|
|
void Parser::setEncoding(std::string const & e)
|
|
{
|
|
Encoding const * enc = encodings.fromLaTeXName(e);
|
|
if (!enc) {
|
|
cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
|
|
return;
|
|
}
|
|
//cerr << "setting encoding to " << enc->iconvName() << std::endl;
|
|
is_ << lyx::setEncoding(enc->iconvName());
|
|
encoding_latex_ = e;
|
|
}
|
|
|
|
|
|
/// Append \p t to the end of the token list.
void Parser::push_back(Token const & t)
{
	tokens_.push_back(t);
}
|
|
|
|
|
|
// We return a copy here because the tokens_ vector may get reallocated
|
|
Token const Parser::prev_token() const
|
|
{
|
|
static const Token dummy;
|
|
return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
|
|
}
|
|
|
|
|
|
// We return a copy here because the tokens_ vector may get reallocated
|
|
Token const Parser::curr_token() const
|
|
{
|
|
static const Token dummy;
|
|
return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
|
|
}
|
|
|
|
|
|
// We return a copy here because the tokens_ vector may get reallocated
|
|
Token const Parser::next_token()
|
|
{
|
|
static const Token dummy;
|
|
return good() ? tokens_[pos_] : dummy;
|
|
}
|
|
|
|
|
|
// We return a copy here because the tokens_ vector may get reallocated
|
|
Token const Parser::next_next_token()
|
|
{
|
|
static const Token dummy;
|
|
// If good() has not been called after the last get_token() we need
|
|
// to tokenize two more tokens.
|
|
if (pos_ + 1 >= tokens_.size()) {
|
|
tokenize_one();
|
|
tokenize_one();
|
|
}
|
|
return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy;
|
|
}
|
|
|
|
|
|
// We return a copy here because the tokens_ vector may get reallocated
|
|
Token const Parser::get_token()
|
|
{
|
|
static const Token dummy;
|
|
//cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
|
|
return good() ? tokens_[pos_++] : dummy;
|
|
}
|
|
|
|
|
|
bool Parser::isParagraph()
|
|
{
|
|
// A new paragraph in TeX ist started
|
|
// - either by a newline, following any amount of whitespace
|
|
// characters (including zero), and another newline
|
|
// - or the token \par
|
|
if (curr_token().cat() == catNewline &&
|
|
(curr_token().cs().size() > 1 ||
|
|
(next_token().cat() == catSpace &&
|
|
next_next_token().cat() == catNewline)))
|
|
return true;
|
|
if (curr_token().cat() == catEscape && curr_token().cs() == "par")
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
|
|
bool Parser::skip_spaces(bool skip_comments)
|
|
{
|
|
// We just silently return if we have no more tokens.
|
|
// skip_spaces() should be callable at any time,
|
|
// the caller must check p::good() anyway.
|
|
bool skipped = false;
|
|
while (good()) {
|
|
get_token();
|
|
if (isParagraph()) {
|
|
putback();
|
|
break;
|
|
}
|
|
if (curr_token().cat() == catSpace ||
|
|
curr_token().cat() == catNewline) {
|
|
skipped = true;
|
|
continue;
|
|
}
|
|
if ((curr_token().cat() == catComment && curr_token().cs().empty()))
|
|
continue;
|
|
if (skip_comments && curr_token().cat() == catComment) {
|
|
// If positions_ is not empty we are doing some kind
|
|
// of look ahead
|
|
if (!positions_.empty())
|
|
cerr << " Ignoring comment: "
|
|
<< curr_token().asInput();
|
|
} else {
|
|
putback();
|
|
break;
|
|
}
|
|
}
|
|
return skipped;
|
|
}
|
|
|
|
|
|
void Parser::unskip_spaces(bool skip_comments)
|
|
{
|
|
while (pos_ > 0) {
|
|
if ( curr_token().cat() == catSpace ||
|
|
(curr_token().cat() == catNewline && curr_token().cs().size() == 1))
|
|
putback();
|
|
else if (skip_comments && curr_token().cat() == catComment) {
|
|
// TODO: Get rid of this
|
|
// If positions_ is not empty we are doing some kind
|
|
// of look ahead
|
|
if (!positions_.empty())
|
|
cerr << "Unignoring comment: "
|
|
<< curr_token().asInput();
|
|
putback();
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
}
|
|
|
|
|
|
void Parser::putback()
|
|
{
|
|
--pos_;
|
|
}
|
|
|
|
|
|
void Parser::pushPosition()
|
|
{
|
|
positions_.push_back(pos_);
|
|
}
|
|
|
|
|
|
void Parser::popPosition()
|
|
{
|
|
pos_ = positions_.back();
|
|
positions_.pop_back();
|
|
}
|
|
|
|
|
|
bool Parser::good()
|
|
{
|
|
if (pos_ < tokens_.size())
|
|
return true;
|
|
tokenize_one();
|
|
return pos_ < tokens_.size();
|
|
}
|
|
|
|
|
|
char Parser::getChar()
|
|
{
|
|
if (!good())
|
|
error("The input stream is not well...");
|
|
return get_token().character();
|
|
}
|
|
|
|
|
|
bool Parser::hasOpt()
|
|
{
|
|
// An optional argument can occur in any of the following forms:
|
|
// - \foo[bar]
|
|
// - \foo [bar]
|
|
// - \foo
|
|
// [bar]
|
|
// - \foo %comment
|
|
// [bar]
|
|
|
|
// remember current position
|
|
unsigned int oldpos = pos_;
|
|
// skip spaces and comments
|
|
while (good()) {
|
|
get_token();
|
|
if (isParagraph()) {
|
|
putback();
|
|
break;
|
|
}
|
|
if (curr_token().cat() == catSpace ||
|
|
curr_token().cat() == catNewline ||
|
|
curr_token().cat() == catComment)
|
|
continue;
|
|
putback();
|
|
break;
|
|
}
|
|
bool const retval = (next_token().asInput() == "[");
|
|
pos_ = oldpos;
|
|
return retval;
|
|
}
|
|
|
|
|
|
/*!
 * Read an argument delimited by \p left and \p right.
 * Leading spaces and comments are skipped first. Comments inside the
 * argument are dropped (non-empty ones are reported on cerr).
 * If \p allow_escaping is true, an escaped \p right (e.g. "\]") does not
 * end the argument; otherwise it does, and a backslash is appended to the
 * result.
 * \returns (true, contents without delimiters) if the next token is
 * \p left, (false, "") otherwise. In the latter case the inspected token
 * is put back.
 */
Parser::Arg Parser::getFullArg(char left, char right, bool allow_escaping)
{
	skip_spaces(true);

	// This is needed if a partial file ends with a command without
	// arguments, e. g. \medskip
	if (!good())
		return make_pair(false, string());

	Token t = get_token();
	bool const opens = t.cat() != catComment &&
	                   t.cat() != catEscape &&
	                   t.character() == left;
	if (!opens) {
		putback();
		return make_pair(false, string());
	}

	string result;
	for (t = get_token(); good(); t = get_token()) {
		// Ignore comments
		if (t.cat() == catComment) {
			if (!t.cs().empty())
				cerr << "Ignoring comment: " << t.asInput();
			continue;
		}
		if (t.character() == right) {
			// an unescaped closing delimiter always ends the argument
			if (t.cat() != catEscape)
				break;
			// an escaped one only ends it if escaping is not allowed
			if (!allow_escaping) {
				result += '\\';
				break;
			}
		}
		result += t.asInput();
	}
	return make_pair(true, result);
}
|
|
|
|
|
|
/*!
 * Like getFullArg(), but only the contents of the argument are returned.
 * A missing argument and an empty argument both yield an empty string.
 */
string Parser::getArg(char left, char right, bool allow_escaping)
{
	Arg const parsed = getFullArg(left, right, allow_escaping);
	return parsed.second;
}
|
|
|
|
|
|
/*!
 * Read an optional argument in square brackets.
 * \returns the argument including the brackets ("[]" for an empty one),
 * or an empty string if there is no optional argument. In the latter
 * case the skipped whitespace is restored if \p keepws is true.
 */
string Parser::getFullOpt(bool keepws)
{
	Arg const opt = getFullArg('[', ']');
	if (!opt.first) {
		if (keepws)
			unskip_spaces(true);
		return string();
	}
	return '[' + opt.second + ']';
}
|
|
|
|
|
|
/*!
 * Read an optional argument in square brackets.
 * \returns the argument including the brackets, or an empty string if
 * getArg() delivers nothing. An empty argument "[]" is therefore
 * indistinguishable from a missing one here. In both of these cases
 * unskip_spaces() is called if \p keepws is true.
 */
string Parser::getOpt(bool keepws)
{
	string const contents = getArg('[', ']');
	if (!contents.empty())
		return '[' + contents + ']';

	if (keepws)
		unskip_spaces(true);
	return string();
}
|
|
|
|
|
|
/*!
 * Read an argument in parentheses.
 * \returns the argument including the parentheses, or an empty string if
 * there is no such argument.
 */
string Parser::getFullParentheseArg()
{
	Arg const parsed = getFullArg('(', ')');
	if (!parsed.first)
		return string();
	return '(' + parsed.second + ')';
}
|
|
|
|
|
|
/*!
 * Collect the contents of the environment \p name up to the next \end.
 * Brace groups are copied with verbatim_item() and nested environments
 * are handled by recursion. The terminating \end{...} is consumed but not
 * part of the result; if its name differs from \p name a message is
 * written to cerr.
 * \returns the collected text. If the input ends before an \end is found,
 * "unexpected end of input" is reported on cerr.
 */
string const Parser::verbatimEnvironment(string const & name)
{
	if (!good())
		return string();

	ostringstream os;
	for (Token t = get_token(); good(); t = get_token()) {
		if (t.cat() == catBegin) {
			// let verbatim_item() read the whole brace group
			putback();
			os << '{' << verbatim_item() << '}';
			continue;
		}

		string const text = t.asInput();
		if (text == "\\begin") {
			string const env = getArg('{', '}');
			os << "\\begin{" << env << '}'
			   << verbatimEnvironment(env)
			   << "\\end{" << env << '}';
		} else if (text == "\\end") {
			string const end = getArg('{', '}');
			if (end != name)
				cerr << "\\end{" << end
				     << "} does not match \\begin{" << name
				     << "}." << endl;
			return os.str();
		} else {
			os << text;
		}
	}
	cerr << "unexpected end of input" << endl;
	return os.str();
}
|
|
|
|
|
|
/*!
 * Collect everything up to the matching \end{\p name} without
 * interpreting it. An \end{...} with a different name is copied to the
 * result. The terminating \end{\p name} is consumed but not part of the
 * result.
 * \returns the collected text. If the input ends before the matching \end
 * is found, "unexpected end of input" is reported on cerr.
 */
string const Parser::plainEnvironment(string const & name)
{
	if (!good())
		return string();

	ostringstream os;
	for (Token t = get_token(); good(); t = get_token()) {
		string const text = t.asInput();
		if (text != "\\end") {
			os << text;
			continue;
		}
		string const end = getArg('{', '}');
		if (end == name)
			return os.str();
		os << "\\end{" << end << '}';
	}
	cerr << "unexpected end of input" << endl;
	return os.str();
}
|
|
|
|
|
|
/*!
 * Collect the argument of command \p name, delimited by \p left and
 * \p right, without interpreting it.
 * The next token must be \p left; if it is not, a message is written to
 * cerr and an empty string is returned.
 * \returns the text between the delimiters. If the input ends before
 * \p right is found, "unexpected end of input" is reported on cerr and
 * the text collected so far is returned.
 */
string const Parser::plainCommand(char left, char right, string const & name)
{
	if (!good())
		return string();
	// check if first token is really the start character
	Token const first = get_token();
	if (first.character() != left) {
		cerr << "first character does not match start character of command \\" << name << endl;
		return string();
	}
	ostringstream os;
	// Fetch the token inside the loop: good() tells whether there is a
	// token to fetch, so the very last token of the input is examined as
	// well. Otherwise a closing delimiter at the end of the input would
	// be reported as "unexpected end of input".
	while (good()) {
		Token const t = get_token();
		if (t.character() == right)
			return os.str();
		os << t.asInput();
	}
	cerr << "unexpected end of input" << endl;
	return os.str();
}
|
|
|
|
|
|
void Parser::tokenize_one()
|
|
{
|
|
catInit();
|
|
char_type c;
|
|
if (!is_.get(c))
|
|
return;
|
|
|
|
switch (catcode(c)) {
|
|
case catSpace: {
|
|
docstring s(1, c);
|
|
while (is_.get(c) && catcode(c) == catSpace)
|
|
s += c;
|
|
if (catcode(c) != catSpace)
|
|
is_.putback(c);
|
|
push_back(Token(s, catSpace));
|
|
break;
|
|
}
|
|
|
|
case catNewline: {
|
|
++lineno_;
|
|
docstring s(1, getNewline(is_, c));
|
|
while (is_.get(c) && catcode(c) == catNewline) {
|
|
++lineno_;
|
|
s += getNewline(is_, c);
|
|
}
|
|
if (catcode(c) != catNewline)
|
|
is_.putback(c);
|
|
push_back(Token(s, catNewline));
|
|
break;
|
|
}
|
|
|
|
case catComment: {
|
|
// We don't treat "%\n" combinations here specially because
|
|
// we want to preserve them in the preamble
|
|
docstring s;
|
|
while (is_.get(c) && catcode(c) != catNewline)
|
|
s += c;
|
|
// handle possible DOS line ending
|
|
if (catcode(c) == catNewline)
|
|
c = getNewline(is_, c);
|
|
// Note: The '%' at the beginning and the '\n' at the end
|
|
// of the comment are not stored.
|
|
++lineno_;
|
|
push_back(Token(s, catComment));
|
|
break;
|
|
}
|
|
|
|
case catEscape: {
|
|
is_.get(c);
|
|
if (!is_) {
|
|
error("unexpected end of input");
|
|
} else {
|
|
docstring s(1, c);
|
|
if (catcode(c) == catLetter) {
|
|
// collect letters
|
|
while (is_.get(c) && catcode(c) == catLetter)
|
|
s += c;
|
|
if (catcode(c) != catLetter)
|
|
is_.putback(c);
|
|
}
|
|
push_back(Token(s, catEscape));
|
|
}
|
|
break;
|
|
}
|
|
|
|
case catIgnore: {
|
|
cerr << "ignoring a char: " << c << "\n";
|
|
break;
|
|
}
|
|
|
|
default:
|
|
push_back(Token(docstring(1, c), catcode(c)));
|
|
}
|
|
//cerr << tokens_.back();
|
|
}
|
|
|
|
|
|
void Parser::dump() const
|
|
{
|
|
cerr << "\nTokens: ";
|
|
for (unsigned i = 0; i < tokens_.size(); ++i) {
|
|
if (i == pos_)
|
|
cerr << " <#> ";
|
|
cerr << tokens_[i];
|
|
}
|
|
cerr << " pos: " << pos_ << "\n";
|
|
}
|
|
|
|
|
|
/*!
 * Report the parse error \p msg together with the approximate line
 * number on cerr and dump the token list.
 * The function returns normally; parsing is not aborted.
 */
void Parser::error(string const & msg)
{
	cerr << "Line ~" << lineno_
	     << ": parse error: " << msg << endl;
	dump();
	//exit(1);
}
|
|
|
|
|
|
/*!
 * Read an optional argument in square brackets verbatim.
 * \returns the contents of the argument without the brackets, or an empty
 * string if the next token is not '['. Brace groups inside the argument
 * are read with verbatim_item(), so a ']' inside braces does not end the
 * argument.
 */
string Parser::verbatimOption()
{
	string res;
	if (next_token().character() != '[')
		return res;

	get_token(); // skip the opening bracket
	for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
		if (t.cat() == catBegin) {
			putback();
			res += '{' + verbatim_item() + '}';
		} else {
			// asInput() instead of cs(): control sequences must
			// keep their backslash (and comments their '%' and
			// newline), exactly as in verbatim_item().
			res += t.asInput();
		}
	}
	return res;
}
|
|
|
|
|
|
/*!
 * Read the next item verbatim.
 * Leading spaces are skipped. If the item is a brace group, its contents
 * are returned without the outer braces; nested groups keep theirs.
 * Otherwise the next single token is returned as text.
 * error() is called if no token is available on entry, but the function
 * goes on regardless.
 */
string Parser::verbatim_item()
{
	if (!good())
		error("stream bad");
	skip_spaces();

	// a single token
	if (next_token().cat() != catBegin)
		return get_token().asInput();

	// a brace group
	get_token(); // skip brace
	string res;
	for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
		if (t.cat() != catBegin) {
			res += t.asInput();
			continue;
		}
		// nested group: let the recursion find the matching brace
		putback();
		res += '{' + verbatim_item() + '}';
	}
	return res;
}
|
|
|
|
|
|
void Parser::reset()
|
|
{
|
|
pos_ = 0;
|
|
}
|
|
|
|
|
|
void Parser::setCatCode(char c, CatCode cat)
|
|
{
|
|
theCatcode[(unsigned char)c] = cat;
|
|
}
|
|
|
|
|
|
/*!
 * \returns the current category code of \p c.
 */
CatCode Parser::getCatCode(char c) const
{
	// Make sure the defaults are in place: before the first call of
	// catInit() the table does not contain the default catcodes yet.
	catInit();
	return theCatcode[(unsigned char)c];
}
|
|
|
|
|
|
} // namespace lyx
|