From 337c6d157754e2f613ddbc3bca072ca658282edf Mon Sep 17 00:00:00 2001 From: Jean-Marc Lasgouttes Date: Sun, 16 Nov 2008 17:02:00 +0000 Subject: [PATCH] First serious step for utf8 file format. tex2lyx is now able to read utf8 tex documents and translate them to lyxformat 249. There is still no code to discover the encoding and use it, but it is the easiest part (I hope). git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@27563 a592a061-630c-0410-9148-cb99ea01b6c8 --- src/tex2lyx/Parser.cpp | 60 +++++++++++++++++++--------------------- src/tex2lyx/Parser.h | 22 ++++++--------- src/tex2lyx/math.cpp | 2 +- src/tex2lyx/preamble.cpp | 2 +- src/tex2lyx/table.cpp | 19 ++++++------- src/tex2lyx/tex2lyx.cpp | 11 ++++---- src/tex2lyx/text.cpp | 6 ++-- 7 files changed, 55 insertions(+), 67 deletions(-) diff --git a/src/tex2lyx/Parser.cpp b/src/tex2lyx/Parser.cpp index b386d3c79e..83f4c47980 100644 --- a/src/tex2lyx/Parser.cpp +++ b/src/tex2lyx/Parser.cpp @@ -13,7 +13,6 @@ #include "Parser.h" #include -#include using namespace std; @@ -54,13 +53,12 @@ void catInit() theCatcode[int('@')] = catLetter; } - /*! * Translate a line ending to '\n'. * \p c must have catcode catNewline, and it must be the last character read * from \p is. 
*/ -char getNewline(istream & is, char c) +char getNewline(idocstream & is, char c) { // we have to handle 3 different line endings: // - UNIX (\n) @@ -68,9 +66,10 @@ char getNewline(istream & is, char c) // - DOS (\r\n) if (c == '\r') { // MAC or DOS - if (is.get(c) && c != '\n') { + char_type wc; + if (is.get(wc) && wc != '\n') { // MAC - is.putback(c); + is.putback(wc); } return '\n'; } @@ -78,18 +77,14 @@ char getNewline(istream & is, char c) return c; } -} - - -// -// catcodes -// - -CatCode catcode(unsigned char c) +CatCode catcode(char_type c) { - return theCatcode[c]; + if (c < 256) + return theCatcode[(unsigned char)c]; + return catOther; } +} // @@ -105,18 +100,18 @@ ostream & operator<<(ostream & os, Token const & t) else if (t.cat() == catEscape) os << '\\' << t.cs() << ' '; else if (t.cat() == catLetter) - os << t.character(); + os << t.cs(); else if (t.cat() == catNewline) os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n"; else - os << '[' << t.character() << ',' << t.cat() << ']'; + os << '[' << t.cs() << ',' << t.cat() << ']'; return os; } string Token::asString() const { - return cs_.size() ? cs_ : string(1, char_); + return cs_; } @@ -124,9 +119,9 @@ string Token::asInput() const { if (cat_ == catComment) return '%' + cs_ + '\n'; - if (cat_ == catSpace || cat_ == catNewline) - return cs_; - return char_ ? 
string(1, char_) : '\\' + cs_; + if (cat_ == catEscape) + return '\\' + cs_; + return cs_; } @@ -135,14 +130,15 @@ string Token::asInput() const // -Parser::Parser(istream & is) +Parser::Parser(idocstream & is) : lineno_(0), pos_(0), iss_(0), is_(is) { } Parser::Parser(string const & s) - : lineno_(0), pos_(0), iss_(new istringstream(s)), is_(*iss_) + : lineno_(0), pos_(0), + iss_(new idocstringstream(from_utf8(s))), is_(*iss_) { } @@ -267,7 +263,7 @@ char Parser::getChar() { if (!good()) error("The input stream is not well..."); - return tokens_[pos_++].character(); + return get_token().character(); } @@ -365,14 +361,13 @@ string const Parser::verbatimEnvironment(string const & name) void Parser::tokenize_one() { catInit(); - char c; + char_type c; if (!is_.get(c)) return; - //cerr << "reading c: " << c << "\n"; switch (catcode(c)) { case catSpace: { - string s(1, c); + docstring s(1, c); while (is_.get(c) && catcode(c) == catSpace) s += c; if (catcode(c) != catSpace) @@ -383,7 +378,7 @@ void Parser::tokenize_one() case catNewline: { ++lineno_; - string s(1, getNewline(is_, c)); + docstring s(1, getNewline(is_, c)); while (is_.get(c) && catcode(c) == catNewline) { ++lineno_; s += getNewline(is_, c); @@ -397,7 +392,7 @@ void Parser::tokenize_one() case catComment: { // We don't treat "%\n" combinations here specially because // we want to preserve them in the preamble - string s; + docstring s; while (is_.get(c) && catcode(c) != catNewline) s += c; // handle possible DOS line ending @@ -415,7 +410,7 @@ void Parser::tokenize_one() if (!is_) { error("unexpected end of input"); } else { - string s(1, c); + docstring s(1, c); if (catcode(c) == catLetter) { // collect letters while (is_.get(c) && catcode(c) == catLetter) @@ -429,13 +424,14 @@ void Parser::tokenize_one() } case catIgnore: { - cerr << "ignoring a char: " << int(c) << "\n"; + cerr << "ignoring a char: " << c << "\n"; break; } default: - push_back(Token(c, catcode(c))); + push_back(Token(docstring(1, c), 
catcode(c))); } + //cerr << tokens_.back(); } @@ -464,7 +460,7 @@ string Parser::verbatimOption() string res; if (next_token().character() == '[') { Token t = get_token(); - for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) { + for (t = get_token(); t.character() != ']' && good(); t = get_token()) { if (t.cat() == catBegin) { putback(); res += '{' + verbatim_item() + '}'; diff --git a/src/tex2lyx/Parser.h b/src/tex2lyx/Parser.h index 94c1100aae..7c70ecca97 100644 --- a/src/tex2lyx/Parser.h +++ b/src/tex2lyx/Parser.h @@ -12,10 +12,11 @@ #ifndef PARSER_H #define PARSER_H -#include #include #include +#include +#include "support/docstream.h" namespace lyx { @@ -46,9 +47,6 @@ enum CatCode { }; -CatCode catcode(unsigned char c); - - enum { FLAG_BRACE_LAST = 1 << 1, // last closing brace ends the parsing FLAG_RIGHT = 1 << 2, // next \\right ends the parsing process @@ -75,18 +73,16 @@ enum { class Token { public: /// - Token() : cs_(), char_(0), cat_(catIgnore) {} + Token() : cs_(), cat_(catIgnore) {} /// - Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {} - /// - Token(std::string const & cs, CatCode cat) : cs_(cs), char_(0), cat_(cat) {} + Token(docstring const & cs, CatCode cat) : cs_(to_utf8(cs)), cat_(cat) {} /// std::string const & cs() const { return cs_; } /// Returns the catcode of the token CatCode cat() const { return cat_; } /// - char character() const { return char_; } + char character() const { return cs_.empty() ? 
0 : cs_[0]; } /// Returns the token as string std::string asString() const; /// Returns the token verbatim @@ -96,8 +92,6 @@ private: /// std::string cs_; /// - char char_; - /// CatCode cat_; }; @@ -119,7 +113,7 @@ class Parser { public: /// - Parser(std::istream & is); + Parser(idocstream & is); /// Parser(std::string const & s); /// @@ -217,9 +211,9 @@ private: /// unsigned pos_; /// - std::istringstream * iss_; + idocstringstream * iss_; /// - std::istream & is_; + idocstream & is_; }; diff --git a/src/tex2lyx/math.cpp b/src/tex2lyx/math.cpp index be2800643e..44b0f08389 100644 --- a/src/tex2lyx/math.cpp +++ b/src/tex2lyx/math.cpp @@ -94,7 +94,7 @@ void parse_math(Parser & p, ostream & os, unsigned flags, const mode_type mode) t.cat() == catAlign || t.cat() == catActive || t.cat() == catParameter) - os << t.character(); + os << t.cs(); else if (t.cat() == catBegin) { os << '{'; diff --git a/src/tex2lyx/preamble.cpp b/src/tex2lyx/preamble.cpp index 28f44ee844..b911b373ae 100644 --- a/src/tex2lyx/preamble.cpp +++ b/src/tex2lyx/preamble.cpp @@ -413,7 +413,7 @@ void handle_package(string const & name, string const & opts, void end_preamble(ostream & os, TextClass const & /*textclass*/) { os << "#LyX file created by tex2lyx " << PACKAGE_VERSION << "\n" - << "\\lyxformat 247\n" + << "\\lyxformat 249\n" << "\\begin_document\n" << "\\begin_header\n" << "\\textclass " << h_textclass << "\n"; diff --git a/src/tex2lyx/table.cpp b/src/tex2lyx/table.cpp index e04be70bb2..d23db7088a 100644 --- a/src/tex2lyx/table.cpp +++ b/src/tex2lyx/table.cpp @@ -661,16 +661,15 @@ void parse_table(Parser & p, ostream & os, bool is_long_tabular, } } - else if (t.cat() == catSpace || t.cat() == catNewline) - os << t.cs(); - - else if (t.cat() == catLetter || - t.cat() == catSuper || - t.cat() == catSub || - t.cat() == catOther || - t.cat() == catActive || - t.cat() == catParameter) - os << t.character(); + else if (t.cat() == catSpace + || t.cat() == catNewline + || t.cat() == catLetter + || 
t.cat() == catSuper + || t.cat() == catSub + || t.cat() == catOther + || t.cat() == catActive + || t.cat() == catParameter) + os << t.cs(); else if (t.cat() == catBegin) { os << '{'; diff --git a/src/tex2lyx/tex2lyx.cpp b/src/tex2lyx/tex2lyx.cpp index 4c1366d85b..eacea79ac8 100644 --- a/src/tex2lyx/tex2lyx.cpp +++ b/src/tex2lyx/tex2lyx.cpp @@ -18,17 +18,16 @@ #include "TextClass.h" #include "Layout.h" -#include "support/lassert.h" #include "support/convert.h" #include "support/debug.h" #include "support/ExceptionMessage.h" #include "support/filetools.h" +#include "support/lassert.h" #include "support/lstrings.h" #include "support/os.h" #include "support/Package.h" #include -#include #include #include #include @@ -202,7 +201,7 @@ void read_environment(Parser & p, string const & begin, */ void read_syntaxfile(FileName const & file_name) { - ifstream is(file_name.toFilesystemEncoding().c_str()); + ifdocstream is(file_name.toFilesystemEncoding().c_str()); if (!is.good()) { cerr << "Could not open syntax file \"" << file_name << "\" for reading." << endl; @@ -389,7 +388,7 @@ namespace { * You must ensure that \p parentFilePath is properly set before calling * this function! 
*/ -void tex2lyx(istream & is, ostream & os) +void tex2lyx(idocstream & is, ostream & os) { Parser p(is); //p.dump(); @@ -411,7 +410,7 @@ void tex2lyx(istream & is, ostream & os) os << ss.str(); #ifdef TEST_PARSER p.reset(); - ofstream parsertest("parsertest.tex"); + ofdocstream parsertest("parsertest.tex"); while (p.good()) parsertest << p.get_token().asInput(); // and parsertest.tex should now have identical content @@ -422,7 +421,7 @@ void tex2lyx(istream & is, ostream & os) /// convert TeX from \p infilename to LyX and write it to \p os bool tex2lyx(FileName const & infilename, ostream & os) { - ifstream is(infilename.toFilesystemEncoding().c_str()); + ifdocstream is(infilename.toFilesystemEncoding().c_str()); if (!is.good()) { cerr << "Could not open input file \"" << infilename << "\" for reading." << endl; diff --git a/src/tex2lyx/text.cpp b/src/tex2lyx/text.cpp index df5f51175c..dfb9c1fe51 100644 --- a/src/tex2lyx/text.cpp +++ b/src/tex2lyx/text.cpp @@ -1262,7 +1262,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, t.cat() == catParameter) { // This translates "&" to "\\&" which may be wrong... context.check_layout(os); - os << t.character(); + os << t.cs(); } else if (p.isParagraph()) { @@ -1281,7 +1281,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, else os << "\\InsetSpace ~\n"; } else - os << t.character(); + os << t.cs(); } else if (t.cat() == catBegin && @@ -1309,7 +1309,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, next.character() == '*') { p.get_token(); if (p.next_token().cat() == catEnd) { - os << next.character(); + os << next.cs(); p.get_token(); } else { p.putback();