First serious step for the utf8 file format. tex2lyx is now able to
read utf8 tex documents and translate them to lyxformat 249.

There is still no code to detect the encoding of the input file and use it,
but that is the easiest part (I hope).


git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@27563 a592a061-630c-0410-9148-cb99ea01b6c8
This commit is contained in:
Jean-Marc Lasgouttes 2008-11-16 17:02:00 +00:00
parent f8f5a7b28d
commit 337c6d1577
7 changed files with 55 additions and 67 deletions

View File

@ -13,7 +13,6 @@
#include "Parser.h"
#include <iostream>
#include <sstream>
using namespace std;
@ -54,13 +53,12 @@ void catInit()
theCatcode[int('@')] = catLetter;
}
/*!
* Translate a line ending to '\n'.
* \p c must have catcode catNewline, and it must be the last character read
* from \p is.
*/
char getNewline(istream & is, char c)
char getNewline(idocstream & is, char c)
{
// we have to handle 3 different line endings:
// - UNIX (\n)
@ -68,9 +66,10 @@ char getNewline(istream & is, char c)
// - DOS (\r\n)
if (c == '\r') {
// MAC or DOS
if (is.get(c) && c != '\n') {
char_type wc;
if (is.get(wc) && wc != '\n') {
// MAC
is.putback(c);
is.putback(wc);
}
return '\n';
}
@ -78,18 +77,14 @@ char getNewline(istream & is, char c)
return c;
}
}
//
// catcodes
//
CatCode catcode(unsigned char c)
CatCode catcode(char_type c)
{
return theCatcode[c];
if (c < 256)
return theCatcode[(unsigned char)c];
return catOther;
}
}
//
@ -105,18 +100,18 @@ ostream & operator<<(ostream & os, Token const & t)
else if (t.cat() == catEscape)
os << '\\' << t.cs() << ' ';
else if (t.cat() == catLetter)
os << t.character();
os << t.cs();
else if (t.cat() == catNewline)
os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
else
os << '[' << t.character() << ',' << t.cat() << ']';
os << '[' << t.cs() << ',' << t.cat() << ']';
return os;
}
string Token::asString() const
{
return cs_.size() ? cs_ : string(1, char_);
return cs_;
}
@ -124,9 +119,9 @@ string Token::asInput() const
{
if (cat_ == catComment)
return '%' + cs_ + '\n';
if (cat_ == catSpace || cat_ == catNewline)
return cs_;
return char_ ? string(1, char_) : '\\' + cs_;
if (cat_ == catEscape)
return '\\' + cs_;
return cs_;
}
@ -135,14 +130,15 @@ string Token::asInput() const
//
Parser::Parser(istream & is)
Parser::Parser(idocstream & is)
: lineno_(0), pos_(0), iss_(0), is_(is)
{
}
Parser::Parser(string const & s)
: lineno_(0), pos_(0), iss_(new istringstream(s)), is_(*iss_)
: lineno_(0), pos_(0),
iss_(new idocstringstream(from_utf8(s))), is_(*iss_)
{
}
@ -267,7 +263,7 @@ char Parser::getChar()
{
if (!good())
error("The input stream is not well...");
return tokens_[pos_++].character();
return get_token().character();
}
@ -365,14 +361,13 @@ string const Parser::verbatimEnvironment(string const & name)
void Parser::tokenize_one()
{
catInit();
char c;
char_type c;
if (!is_.get(c))
return;
//cerr << "reading c: " << c << "\n";
switch (catcode(c)) {
case catSpace: {
string s(1, c);
docstring s(1, c);
while (is_.get(c) && catcode(c) == catSpace)
s += c;
if (catcode(c) != catSpace)
@ -383,7 +378,7 @@ void Parser::tokenize_one()
case catNewline: {
++lineno_;
string s(1, getNewline(is_, c));
docstring s(1, getNewline(is_, c));
while (is_.get(c) && catcode(c) == catNewline) {
++lineno_;
s += getNewline(is_, c);
@ -397,7 +392,7 @@ void Parser::tokenize_one()
case catComment: {
// We don't treat "%\n" combinations here specially because
// we want to preserve them in the preamble
string s;
docstring s;
while (is_.get(c) && catcode(c) != catNewline)
s += c;
// handle possible DOS line ending
@ -415,7 +410,7 @@ void Parser::tokenize_one()
if (!is_) {
error("unexpected end of input");
} else {
string s(1, c);
docstring s(1, c);
if (catcode(c) == catLetter) {
// collect letters
while (is_.get(c) && catcode(c) == catLetter)
@ -429,13 +424,14 @@ void Parser::tokenize_one()
}
case catIgnore: {
cerr << "ignoring a char: " << int(c) << "\n";
cerr << "ignoring a char: " << c << "\n";
break;
}
default:
push_back(Token(c, catcode(c)));
push_back(Token(docstring(1, c), catcode(c)));
}
//cerr << tokens_.back();
}
@ -464,7 +460,7 @@ string Parser::verbatimOption()
string res;
if (next_token().character() == '[') {
Token t = get_token();
for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
if (t.cat() == catBegin) {
putback();
res += '{' + verbatim_item() + '}';

View File

@ -12,10 +12,11 @@
#ifndef PARSER_H
#define PARSER_H
#include <vector>
#include <string>
#include <utility>
#include <vector>
#include "support/docstream.h"
namespace lyx {
@ -46,9 +47,6 @@ enum CatCode {
};
CatCode catcode(unsigned char c);
enum {
FLAG_BRACE_LAST = 1 << 1, // last closing brace ends the parsing
FLAG_RIGHT = 1 << 2, // next \\right ends the parsing process
@ -75,18 +73,16 @@ enum {
class Token {
public:
///
Token() : cs_(), char_(0), cat_(catIgnore) {}
Token() : cs_(), cat_(catIgnore) {}
///
Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
///
Token(std::string const & cs, CatCode cat) : cs_(cs), char_(0), cat_(cat) {}
Token(docstring const & cs, CatCode cat) : cs_(to_utf8(cs)), cat_(cat) {}
///
std::string const & cs() const { return cs_; }
/// Returns the catcode of the token
CatCode cat() const { return cat_; }
///
char character() const { return char_; }
char character() const { return cs_.empty() ? 0 : cs_[0]; }
/// Returns the token as string
std::string asString() const;
/// Returns the token verbatim
@ -96,8 +92,6 @@ private:
///
std::string cs_;
///
char char_;
///
CatCode cat_;
};
@ -119,7 +113,7 @@ class Parser {
public:
///
Parser(std::istream & is);
Parser(idocstream & is);
///
Parser(std::string const & s);
///
@ -217,9 +211,9 @@ private:
///
unsigned pos_;
///
std::istringstream * iss_;
idocstringstream * iss_;
///
std::istream & is_;
idocstream & is_;
};

View File

@ -94,7 +94,7 @@ void parse_math(Parser & p, ostream & os, unsigned flags, const mode_type mode)
t.cat() == catAlign ||
t.cat() == catActive ||
t.cat() == catParameter)
os << t.character();
os << t.cs();
else if (t.cat() == catBegin) {
os << '{';

View File

@ -413,7 +413,7 @@ void handle_package(string const & name, string const & opts,
void end_preamble(ostream & os, TextClass const & /*textclass*/)
{
os << "#LyX file created by tex2lyx " << PACKAGE_VERSION << "\n"
<< "\\lyxformat 247\n"
<< "\\lyxformat 249\n"
<< "\\begin_document\n"
<< "\\begin_header\n"
<< "\\textclass " << h_textclass << "\n";

View File

@ -661,16 +661,15 @@ void parse_table(Parser & p, ostream & os, bool is_long_tabular,
}
}
else if (t.cat() == catSpace || t.cat() == catNewline)
os << t.cs();
else if (t.cat() == catLetter ||
t.cat() == catSuper ||
t.cat() == catSub ||
t.cat() == catOther ||
t.cat() == catActive ||
t.cat() == catParameter)
os << t.character();
else if (t.cat() == catSpace
|| t.cat() == catNewline
|| t.cat() == catLetter
|| t.cat() == catSuper
|| t.cat() == catSub
|| t.cat() == catOther
|| t.cat() == catActive
|| t.cat() == catParameter)
os << t.cs();
else if (t.cat() == catBegin) {
os << '{';

View File

@ -18,17 +18,16 @@
#include "TextClass.h"
#include "Layout.h"
#include "support/lassert.h"
#include "support/convert.h"
#include "support/debug.h"
#include "support/ExceptionMessage.h"
#include "support/filetools.h"
#include "support/lassert.h"
#include "support/lstrings.h"
#include "support/os.h"
#include "support/Package.h"
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <sstream>
@ -202,7 +201,7 @@ void read_environment(Parser & p, string const & begin,
*/
void read_syntaxfile(FileName const & file_name)
{
ifstream is(file_name.toFilesystemEncoding().c_str());
ifdocstream is(file_name.toFilesystemEncoding().c_str());
if (!is.good()) {
cerr << "Could not open syntax file \"" << file_name
<< "\" for reading." << endl;
@ -389,7 +388,7 @@ namespace {
* You must ensure that \p parentFilePath is properly set before calling
* this function!
*/
void tex2lyx(istream & is, ostream & os)
void tex2lyx(idocstream & is, ostream & os)
{
Parser p(is);
//p.dump();
@ -411,7 +410,7 @@ void tex2lyx(istream & is, ostream & os)
os << ss.str();
#ifdef TEST_PARSER
p.reset();
ofstream parsertest("parsertest.tex");
ofdocstream parsertest("parsertest.tex");
while (p.good())
parsertest << p.get_token().asInput();
// <origfile> and parsertest.tex should now have identical content
@ -422,7 +421,7 @@ void tex2lyx(istream & is, ostream & os)
/// convert TeX from \p infilename to LyX and write it to \p os
bool tex2lyx(FileName const & infilename, ostream & os)
{
ifstream is(infilename.toFilesystemEncoding().c_str());
ifdocstream is(infilename.toFilesystemEncoding().c_str());
if (!is.good()) {
cerr << "Could not open input file \"" << infilename
<< "\" for reading." << endl;

View File

@ -1262,7 +1262,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
t.cat() == catParameter) {
// This translates "&" to "\\&" which may be wrong...
context.check_layout(os);
os << t.character();
os << t.cs();
}
else if (p.isParagraph()) {
@ -1281,7 +1281,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
else
os << "\\InsetSpace ~\n";
} else
os << t.character();
os << t.cs();
}
else if (t.cat() == catBegin &&
@ -1309,7 +1309,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
next.character() == '*') {
p.get_token();
if (p.next_token().cat() == catEnd) {
os << next.character();
os << next.cs();
p.get_token();
} else {
p.putback();