backport the tex2lyx unicode stuff from trunk

git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/branches/BRANCH_1_6_X@28858 a592a061-630c-0410-9148-cb99ea01b6c8
2025-01-18 21:45:24 +00:00 · 2009-03-18 22:40:08 +00:00 · 2009-03-18 22:40:08 +00:00 · f9a4e1cc95
commit f9a4e1cc95
parent 6dec0f3e0d
13 changed files with 316 additions and 239 deletions
--- a/src/Encoding.cpp
+++ b/src/Encoding.cpp
@ -530,6 +530,11 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, docstring & rem)

 void Encodings::initUnicodeMath(Buffer const & buffer, bool clear_sets)
 {
+#ifdef TEX2LYX
+	// The code below is not needed in tex2lyx and requires additional stuff
+	(void)buffer;
+	(void)clear_sets;
+#else
 	if (clear_sets) {
 		mathcmd.clear();
 		textcmd.clear();
@ -549,11 +554,18 @@ void Encodings::initUnicodeMath(Buffer const & buffer, bool clear_sets)
 	for (; bit != bend; ++bit)
 		if (buffer.isChild(*bit))
 			initUnicodeMath(**bit, false);
+#endif
 }


 void Encodings::validate(char_type c, LaTeXFeatures & features, bool for_mathed)
 {
+#ifdef TEX2LYX
+	// The code below is not needed in tex2lyx and requires additional stuff
+	(void)c;
+	(void)features;
+	(void)for_mathed;
+#else
 	CharInfoMap::const_iterator const it = unicodesymbols.find(c);
 	if (it != unicodesymbols.end()) {
 		// In mathed, c could be used both in textmode and mathmode
@ -592,6 +604,7 @@ void Encodings::validate(char_type c, LaTeXFeatures & features, bool for_mathed)
 		features.require("relsize");
 		features.require("lyxmathsym");
 	}
+#endif
 }


--- a/src/tex2lyx/Makefile.am
+++ b/src/tex2lyx/Makefile.am
@ -41,7 +41,8 @@ LINKED_FILES = \
 	../Lexer.cpp \
 	../lengthcommon.cpp \
 	../Color.cpp \
-	../Color.h
+	../Color.h \
+	../Encoding.cpp

 BUILT_SOURCES = $(PCH_FILE)

--- a/src/tex2lyx/Parser.cpp
+++ b/src/tex2lyx/Parser.cpp
@ -10,10 +10,10 @@

 #include <config.h>

+#include "Encoding.h"
 #include "Parser.h"

 #include <iostream>
-#include <sstream>

 using namespace std;

@ -25,6 +25,11 @@ CatCode theCatcode[256];

 void catInit()
 {
+	static bool init_done = false;
+	if (init_done) 
+		return;
+	init_done = true;
+
 	fill(theCatcode, theCatcode + 256, catOther);
 	fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
 	fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
@ -49,13 +54,12 @@ void catInit()
 	theCatcode[int('@')]  = catLetter;
 }

-
 /*!
 * Translate a line ending to '\n'.
 * \p c must have catcode catNewline, and it must be the last character read
 * from \p is.
 */
-char getNewline(istream & is, char c)
+char getNewline(idocstream & is, char c)
 {
 	// we have to handle 3 different line endings:
 	// - UNIX (\n)
@ -63,9 +67,10 @@ char getNewline(istream & is, char c)
 	// - DOS  (\r\n)
 	if (c == '\r') {
 		// MAC or DOS
-		if (is.get(c) && c != '\n') {
+		char_type wc;
+		if (is.get(wc) && wc != '\n') {
 			// MAC
-			is.putback(c);
+			is.putback(wc);
 		}
 		return '\n';
 	}
@ -73,18 +78,14 @@ char getNewline(istream & is, char c)
 	return c;
 }

-}
-
-
-//
-// catcodes
-//
-
-CatCode catcode(unsigned char c)
+CatCode catcode(char_type c)
 {
-	return theCatcode[c];
+	if (c < 256)
+		return theCatcode[(unsigned char)c];
+	return catOther;
 }

+}


 //
@ -100,18 +101,18 @@ ostream & operator<<(ostream & os, Token const & t)
 	else if (t.cat() == catEscape)
 		os << '\\' << t.cs() << ' ';
 	else if (t.cat() == catLetter)
-		os << t.character();
+		os << t.cs();
 	else if (t.cat() == catNewline)
 		os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
 	else
-		os << '[' << t.character() << ',' << t.cat() << ']';
+		os << '[' << t.cs() << ',' << t.cat() << ']';
 	return os;
 }


 string Token::asString() const
 {
-	return cs_.size() ? cs_ : string(1, char_);
+	return cs_;
 }


@ -119,9 +120,9 @@ string Token::asInput() const
 {
 	if (cat_ == catComment)
 		return '%' + cs_ + '\n';
-	if (cat_ == catSpace || cat_ == catNewline)
-		return cs_;
-	return char_ ? string(1, char_) : '\\' + cs_;
+	if (cat_ == catEscape)
+		return '\\' + cs_;
+	return cs_;
 }


@ -130,18 +131,32 @@ string Token::asInput() const
 //


-Parser::Parser(istream & is)
-	: lineno_(0), pos_(0)
+Parser::Parser(idocstream & is)
+	: lineno_(0), pos_(0), iss_(0), is_(is), encoding_latex_("utf8")
 {
-	tokenize(is);
 }


 Parser::Parser(string const & s)
-	: lineno_(0), pos_(0)
+	: lineno_(0), pos_(0), 
+	  iss_(new idocstringstream(from_utf8(s))), is_(*iss_), 
+	  encoding_latex_("utf8")
 {
-	istringstream is(s);
-	tokenize(is);
+}
+
+
+Parser::~Parser()
+{
+	delete iss_; // iss_ is 0 (delete is a no-op) unless the std::string constructor allocated the stream
+}
+
+
+/*!
+ * Switch the input stream to the encoding whose LaTeX name is \p e
+ * (e.g. "latin1", "utf8") and remember that name for getEncoding().
+ * If \p e is not a known LaTeX encoding name, a warning is printed and
+ * the current encoding is left untouched.
+ */
+void Parser::setEncoding(std::string const & e)
+{
+	Encoding const * const enc = encodings.fromLaTeXName(e);
+	if (!enc) {
+		// The name comes straight from user input (-e switch,
+		// inputenc options, \inputencoding), so it may be unknown.
+		// Do not dereference a null pointer; keep the old encoding.
+		cerr << "Unknown encoding " << e << ". Ignoring." << endl;
+		return;
+	}
+	//cerr << "setting encoding to " << enc->iconvName()<<std::endl;
+	// Only tokens read from is_ after this point are decoded with the
+	// new encoding.
+	is_ << lyx::setEncoding(enc->iconvName());
+	encoding_latex_ = e;
 }


@ -165,7 +180,7 @@ Token const & Parser::curr_token() const
 }


-Token const & Parser::next_token() const
+Token const & Parser::next_token()
 {
 	static const Token dummy;
 	return good() ? tokens_[pos_] : dummy;
@ -180,7 +195,7 @@ Token const & Parser::get_token()
 }


-bool Parser::isParagraph() const
+bool Parser::isParagraph()
 {
 	// A new paragraph in TeX ist started
 	// - either by a newline, following any amount of whitespace
@ -246,8 +261,11 @@ void Parser::putback()
 }


-bool Parser::good() const
+bool Parser::good()
 {
+	if (pos_ < tokens_.size())
+		return true;
+	tokenize_one();
 	return pos_ < tokens_.size();
 }

@ -256,7 +274,7 @@ char Parser::getChar()
 {
 	if (!good())
 		error("The input stream is not well...");
-	return tokens_[pos_++].character();
+	return get_token().character();
 }


@ -351,86 +369,80 @@ string const Parser::verbatimEnvironment(string const & name)
 }


-void Parser::tokenize(istream & is)
+void Parser::tokenize_one()
 {
-	static bool init_done = false;
+	catInit();
+	char_type c;
+	if (!is_.get(c)) 
+		return;

-	if (!init_done) {
-		catInit();
-		init_done = true;
+	switch (catcode(c)) {
+	case catSpace: {
+		docstring s(1, c);
+		while (is_.get(c) && catcode(c) == catSpace)
+			s += c;
+		if (catcode(c) != catSpace)
+			is_.putback(c);
+		push_back(Token(s, catSpace));
+		break;
 	}
-
-	char c;
-	while (is.get(c)) {
-		//cerr << "reading c: " << c << "\n";
-
-		switch (catcode(c)) {
-			case catSpace: {
-				string s(1, c);
-				while (is.get(c) && catcode(c) == catSpace)
-					s += c;
-				if (catcode(c) != catSpace)
-					is.putback(c);
-				push_back(Token(s, catSpace));
-				break;
-			}
-
-			case catNewline: {
-				++lineno_;
-				string s(1, getNewline(is, c));
-				while (is.get(c) && catcode(c) == catNewline) {
-					++lineno_;
-					s += getNewline(is, c);
-				}
-				if (catcode(c) != catNewline)
-					is.putback(c);
-				push_back(Token(s, catNewline));
-				break;
-			}
-
-			case catComment: {
-				// We don't treat "%\n" combinations here specially because
-				// we want to preserve them in the preamble
-				string s;
-				while (is.get(c) && catcode(c) != catNewline)
-					s += c;
-				// handle possible DOS line ending
-				if (catcode(c) == catNewline)
-					c = getNewline(is, c);
-				// Note: The '%' at the beginning and the '\n' at the end
-				// of the comment are not stored.
-				++lineno_;
-				push_back(Token(s, catComment));
-				break;
-			}
-
-			case catEscape: {
-				is.get(c);
-				if (!is) {
-					error("unexpected end of input");
-				} else {
-					string s(1, c);
-					if (catcode(c) == catLetter) {
-						// collect letters
-						while (is.get(c) && catcode(c) == catLetter)
-							s += c;
-						if (catcode(c) != catLetter)
-							is.putback(c);
-					}
-					push_back(Token(s, catEscape));
-				}
-				break;
-			}
-
-			case catIgnore: {
-				cerr << "ignoring a char: " << int(c) << "\n";
-				break;
-			}
-
-			default:
-				push_back(Token(c, catcode(c)));
+		
+	case catNewline: {
+		++lineno_;
+		docstring s(1, getNewline(is_, c));
+		while (is_.get(c) && catcode(c) == catNewline) {
+			++lineno_;
+			s += getNewline(is_, c);
 		}
+		if (catcode(c) != catNewline)
+			is_.putback(c);
+		push_back(Token(s, catNewline));
+		break;
 	}
+		
+	case catComment: {
+		// We don't treat "%\n" combinations here specially because
+		// we want to preserve them in the preamble
+		docstring s;
+		while (is_.get(c) && catcode(c) != catNewline)
+			s += c;
+		// handle possible DOS line ending
+		if (catcode(c) == catNewline)
+			c = getNewline(is_, c);
+		// Note: The '%' at the beginning and the '\n' at the end
+		// of the comment are not stored.
+		++lineno_;
+		push_back(Token(s, catComment));
+		break;
+	}
+		
+	case catEscape: {
+		is_.get(c);
+		if (!is_) {
+			error("unexpected end of input");
+		} else {
+			docstring s(1, c);
+			if (catcode(c) == catLetter) {
+				// collect letters
+				while (is_.get(c) && catcode(c) == catLetter)
+					s += c;
+				if (catcode(c) != catLetter)
+					is_.putback(c);
+			}
+			push_back(Token(s, catEscape));
+		}
+		break;
+	}
+		
+	case catIgnore: {
+		cerr << "ignoring a char: " << c << "\n";
+		break;
+	}
+		
+	default:
+		push_back(Token(docstring(1, c), catcode(c)));
+	}
+	//cerr << tokens_.back();
 }


@ -459,7 +471,7 @@ string Parser::verbatimOption()
 	string res;
 	if (next_token().character() == '[') {
 		Token t = get_token();
-		for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
+		for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
 			if (t.cat() == catBegin) {
 				putback();
 				res += '{' + verbatim_item() + '}';
--- a/src/tex2lyx/Parser.h
+++ b/src/tex2lyx/Parser.h
@ -12,10 +12,11 @@
 #ifndef PARSER_H
 #define PARSER_H

-#include <vector>
 #include <string>
 #include <utility>
+#include <vector>

+#include "support/docstream.h"

 namespace lyx {

@ -46,9 +47,6 @@ enum CatCode {
 };


-CatCode catcode(unsigned char c);
-
-
 enum {
 	FLAG_BRACE_LAST = 1 << 1,  //  last closing brace ends the parsing
 	FLAG_RIGHT      = 1 << 2,  //  next \\right ends the parsing process
@ -75,18 +73,16 @@ enum {
 class Token {
 public:
 	///
-	Token() : cs_(), char_(0), cat_(catIgnore) {}
+	Token() : cs_(), cat_(catIgnore) {}
 	///
-	Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
-	///
-	Token(std::string const & cs, CatCode cat) : cs_(cs), char_(0), cat_(cat) {}
+	Token(docstring const & cs, CatCode cat) : cs_(to_utf8(cs)), cat_(cat) {}

 	///
 	std::string const & cs() const { return cs_; }
 	/// Returns the catcode of the token
 	CatCode cat() const { return cat_; }
 	///
-	char character() const { return char_; }
+	char character() const { return cs_.empty() ? 0 : cs_[0]; }
 	/// Returns the token as string
 	std::string asString() const;
 	/// Returns the token verbatim
@ -96,8 +92,6 @@ private:
 	///
 	std::string cs_;
 	///
-	char char_;
-	///
 	CatCode cat_;
 };

@ -119,9 +113,16 @@ class Parser {

 public:
 	///
-	Parser(std::istream & is);
+	Parser(idocstream & is);
 	///
 	Parser(std::string const & s);
+	///
+	~Parser();
+
+	/// change the latex encoding of the input stream
+	void setEncoding(std::string const & encoding);
+	/// get the current latex encoding of the input stream
+	std::string getEncoding() const { return encoding_latex_; }

 	///
 	int lineno() const { return lineno_; }
@ -174,8 +175,8 @@ public:
 	char getChar();
 	///
 	void error(std::string const & msg);
-	/// Parses \p is into tokens
-	void tokenize(std::istream & is);
+	/// Parses at most one token from the input stream is_
+	void tokenize_one();
 	///
 	void push_back(Token const & t);
 	/// The previous token.
@ -183,11 +184,11 @@ public:
 	/// The current token.
 	Token const & curr_token() const;
 	/// The next token.
-	Token const & next_token() const;
+	Token const & next_token();
 	/// Make the next token current and return that.
 	Token const & get_token();
 	/// \return whether the current token starts a new paragraph
-	bool isParagraph() const;
+	bool isParagraph();
 	/// skips spaces (and comments if \p skip_comments is true)
 	void skip_spaces(bool skip_comments = false);
 	/// puts back spaces (and comments if \p skip_comments is true)
@ -195,7 +196,7 @@ public:
 	///
 	void lex(std::string const & s);
 	///
-	bool good() const;
+	bool good();
 	///
 	std::string verbatim_item();
 	///
@ -214,6 +215,12 @@ private:
 	std::vector<Token> tokens_;
 	///
 	unsigned pos_;
+	///
+	idocstringstream * iss_;
+	///
+	idocstream & is_;
+	/// latex name of the current encoding
+	std::string encoding_latex_;
 };


--- a/src/tex2lyx/math.cpp
+++ b/src/tex2lyx/math.cpp
@ -94,7 +94,7 @@ void parse_math(Parser & p, ostream & os, unsigned flags, const mode_type mode)
 			       t.cat() == catAlign ||
 			       t.cat() == catActive ||
 			       t.cat() == catParameter)
-			os << t.character();
+			os << t.cs();

 		else if (t.cat() == catBegin) {
 			os << '{';
--- a/src/tex2lyx/preamble.cpp
+++ b/src/tex2lyx/preamble.cpp
@ -248,7 +248,7 @@ string const scale_as_percentage(string const & scale)
 }


-void handle_package(string const & name, string const & opts,
+void handle_package(Parser &p, string const & name, string const & opts,
 		    bool in_lyx_preamble)
 {
 	vector<string> options = split_options(opts);
@ -326,9 +326,10 @@ void handle_package(string const & name, string const & opts,
 		; // ignore this

 	else if (name == "inputenc") {
-		// only set when there is not more than one inputenc option
-		// therefore check for the "," character
-		// also only set when there is not more then one babel language option
+		// only set when there is not more than one inputenc
+		// option, therefore check for the "," character; also
+		// only set when there is not more than one babel
+		// language option
 		if (opts.find(",") == string::npos && one_language == true) {
 			if (opts == "ascii")
 				//change ascii to auto to be in the unicode range, see
@ -337,6 +338,8 @@ void handle_package(string const & name, string const & opts,
 			else if (!opts.empty())
 				h_inputencoding = opts;
 		}
+		if (!options.empty())
+			p.setEncoding(options.back());
 		options.clear();
 	}

@ -413,7 +416,7 @@ void handle_package(string const & name, string const & opts,
 void end_preamble(ostream & os, TextClass const & /*textclass*/)
 {
 	os << "#LyX file created by tex2lyx " << PACKAGE_VERSION << "\n"
-	   << "\\lyxformat 247\n"
+	   << "\\lyxformat 249\n"
 	   << "\\begin_document\n"
 	   << "\\begin_header\n"
 	   << "\\textclass " << h_textclass << "\n";
@ -664,21 +667,19 @@ void parse_preamble(Parser & p, ostream & os,
 		else if (t.cs() == "usepackage") {
 			string const options = p.getArg('[', ']');
 			string const name = p.getArg('{', '}');
-			if (options.empty() && name.find(',')) {
-				vector<string> vecnames;
-				split(name, vecnames, ',');
-				vector<string>::const_iterator it  = vecnames.begin();
-				vector<string>::const_iterator end = vecnames.end();
-				for (; it != end; ++it)
-					handle_package(trim(*it), string(), 
-						       in_lyx_preamble);
-			} else {
-				handle_package(name, options, in_lyx_preamble);
-			}
+			vector<string> vecnames;
+			split(name, vecnames, ',');
+			vector<string>::const_iterator it  = vecnames.begin();
+			vector<string>::const_iterator end = vecnames.end();
+			for (; it != end; ++it)
+				handle_package(p, trim(*it), options, 
+					       in_lyx_preamble);
 		}

 		else if (t.cs() == "inputencoding") {
-			h_inputencoding = p.getArg('{','}');
+			string const encoding = p.getArg('{','}');
+			h_inputencoding = encoding;
+			p.setEncoding(encoding);
 		}

 		else if (t.cs() == "newenvironment") {
--- a/src/tex2lyx/table.cpp
+++ b/src/tex2lyx/table.cpp
@ -661,16 +661,15 @@ void parse_table(Parser & p, ostream & os, bool is_long_tabular,
 			}
 		}

-		else if (t.cat() == catSpace || t.cat() == catNewline)
-				os << t.cs();
-
-		else if (t.cat() == catLetter ||
-			       t.cat() == catSuper ||
-			       t.cat() == catSub ||
-			       t.cat() == catOther ||
-			       t.cat() == catActive ||
-			       t.cat() == catParameter)
-			os << t.character();
+		else if (t.cat() == catSpace 
+			 || t.cat() == catNewline
+			 || t.cat() == catLetter 
+			 || t.cat() == catSuper 
+			 || t.cat() == catSub 
+			 || t.cat() == catOther 
+			 || t.cat() == catActive 
+			 || t.cat() == catParameter)
+			os << t.cs();

 		else if (t.cat() == catBegin) {
 			os << '{';
--- a/src/tex2lyx/test/DummyDocument.tex
+++ b/src/tex2lyx/test/DummyDocument.tex
@ -2,4 +2,5 @@
 \subsection{\label{sub:External-Subsection}External Subsection}

 This is a small dummy child document to show how files can be inserted
-to another document.
+to another document. Here are some accented characters to make sure
+the encoding is passed to included files: éè
--- a/src/tex2lyx/test/test.ltx
+++ b/src/tex2lyx/test/test.ltx
@ -75,6 +75,12 @@ foo & bar \\
 bar & foo
 \end{tabular}

+Let's try a few unicode characters: the (R) symbol \textregistered
+(and the same one with braces \textregistered{} and a space after) or
+maybe an accented a \'{a} or this one \'a or this \^\i.
+
+Watch out: \textregistered should be glued to its successor here.
+
 Final Text.
 \end{document}

--- a/src/tex2lyx/tex2lyx.cpp
+++ b/src/tex2lyx/tex2lyx.cpp
@ -15,20 +15,20 @@
 #include "tex2lyx.h"

 #include "Context.h"
-#include "TextClass.h"
+#include "Encoding.h"
 #include "Layout.h"
+#include "TextClass.h"

-#include "support/lassert.h"
 #include "support/convert.h"
 #include "support/debug.h"
 #include "support/ExceptionMessage.h"
 #include "support/filetools.h"
+#include "support/lassert.h"
 #include "support/lstrings.h"
 #include "support/os.h"
 #include "support/Package.h"

 #include <cstdlib>
-#include <fstream>
 #include <iostream>
 #include <string>
 #include <sstream>
@ -202,7 +202,7 @@ void read_environment(Parser & p, string const & begin,
 */
 void read_syntaxfile(FileName const & file_name)
 {
-	ifstream is(file_name.toFilesystemEncoding().c_str());
+	ifdocstream is(file_name.toFilesystemEncoding().c_str());
 	if (!is.good()) {
 		cerr << "Could not open syntax file \"" << file_name
 		     << "\" for reading." << endl;
@ -236,6 +236,7 @@ void read_syntaxfile(FileName const & file_name)


 string documentclass;
+string default_encoding;
 string syntaxfile;
 bool overwrite_files = false;

@ -253,6 +254,7 @@ int parse_help(string const &, string const &)
 		"\t-userdir dir       try to set user directory to dir\n"
 		"\t-sysdir dir        try to set system directory to dir\n"
 		"\t-c textclass       declare the textclass\n"
+		"\t-e encoding        set the default encoding (latex name)\n"
 		"\t-n                 translate a noweb (aka literate programming) file.\n"
 		"\t-s syntaxfile      read additional syntax file" << endl;
 	exit(0);
@ -270,6 +272,17 @@ int parse_class(string const & arg, string const &)
 }


+int parse_encoding(string const & arg, string const &) // handler for the -e switch; arg is the LaTeX name of the encoding
+{
+	if (arg.empty()) {
+		cerr << "Missing encoding string after -e switch" << endl;
+		exit(1); // a missing argument is treated as a fatal usage error
+	}
+	default_encoding = arg; // handed to tex2lyx() in main() as the initial encoding
+	return 1; // presumably the number of extra arguments consumed -- confirm against easyParse()
+}
+
+
 int parse_syntaxfile(string const & arg, string const &)
 {
 	if (arg.empty()) {
@ -328,6 +341,7 @@ void easyParse(int & argc, char * argv[])
 	map<string, cmd_helper> cmdmap;

 	cmdmap["-c"] = parse_class;
+	cmdmap["-e"] = parse_encoding;
 	cmdmap["-f"] = parse_force;
 	cmdmap["-s"] = parse_syntaxfile;
 	cmdmap["-help"] = parse_help;
@ -389,9 +403,11 @@ namespace {
 *  You must ensure that \p parentFilePath is properly set before calling
 *  this function!
 */
-void tex2lyx(istream & is, ostream & os)
+void tex2lyx(idocstream & is, ostream & os, string const & encoding)
 {
 	Parser p(is);
+	if (!encoding.empty())
+		p.setEncoding(encoding);
 	//p.dump();

 	stringstream ss;
@ -411,7 +427,7 @@ void tex2lyx(istream & is, ostream & os)
 	os << ss.str();
 #ifdef TEST_PARSER
 	p.reset();
-	ofstream parsertest("parsertest.tex");
+	ofdocstream parsertest("parsertest.tex");
 	while (p.good())
 		parsertest << p.get_token().asInput();
 	// <origfile> and parsertest.tex should now have identical content
@ -420,9 +436,12 @@ void tex2lyx(istream & is, ostream & os)


 /// convert TeX from \p infilename to LyX and write it to \p os
-bool tex2lyx(FileName const & infilename, ostream & os)
+bool tex2lyx(FileName const & infilename, ostream & os, string const & encoding)
 {
-	ifstream is(infilename.toFilesystemEncoding().c_str());
+	ifdocstream is;
+	// forbid buffering on this stream
+	is.rdbuf()->pubsetbuf(0,0);
+	is.open(infilename.toFilesystemEncoding().c_str());
 	if (!is.good()) {
 		cerr << "Could not open input file \"" << infilename
 		     << "\" for reading." << endl;
@ -430,7 +449,7 @@ bool tex2lyx(FileName const & infilename, ostream & os)
 	}
 	string const oldParentFilePath = parentFilePath;
 	parentFilePath = onlyPath(infilename.absFilename());
-	tex2lyx(is, os);
+	tex2lyx(is, os, encoding);
 	parentFilePath = oldParentFilePath;
 	return true;
 }
@ -438,7 +457,8 @@ bool tex2lyx(FileName const & infilename, ostream & os)
 } // anonymous namespace


-bool tex2lyx(string const & infilename, FileName const & outfilename)
+bool tex2lyx(string const & infilename, FileName const & outfilename, 
+	     string const & encoding)
 {
 	if (outfilename.isReadableFile()) {
 		if (overwrite_files) {
@ -462,7 +482,7 @@ bool tex2lyx(string const & infilename, FileName const & outfilename)
 	cerr << "Input file: " << infilename << "\n";
 	cerr << "Output file: " << outfilename << "\n";
 #endif
-	return tex2lyx(FileName(infilename), os);
+	return tex2lyx(FileName(infilename), os, encoding);
 }

 } // namespace lyx
@ -485,11 +505,11 @@ int main(int argc, char * argv[])
 	os::init(argc, argv);

 	try { init_package(internal_path(to_utf8(from_local8bit(argv[0]))),
-		cl_system_support, cl_user_support,
-		top_build_dir_is_two_levels_up);
+			     cl_system_support, cl_user_support,
+			     top_build_dir_is_two_levels_up);
 	} catch (ExceptionMessage const & message) {
 		cerr << to_utf8(message.title_) << ":\n"
-			<< to_utf8(message.details_) << endl;
+		     << to_utf8(message.details_) << endl;
 		if (message.type_ == ErrorException)
 			exit(1);
 	}
@ -507,6 +527,7 @@ int main(int argc, char * argv[])
 	} else
 		outfilename = changeExtension(infilename, ".lyx");

+	// Read the syntax tables
 	FileName const system_syntaxfile = libFileSearch("", "syntax.default");
 	if (system_syntaxfile.empty()) {
 		cerr << "Error: Could not find syntax file \"syntax.default\"." << endl;
@ -516,16 +537,31 @@ int main(int argc, char * argv[])
 	if (!syntaxfile.empty())
 		read_syntaxfile(makeAbsPath(syntaxfile));

+	// Read the encodings table.
+	FileName const symbols_path = libFileSearch(string(), "unicodesymbols");
+	if (symbols_path.empty()) {
+		cerr << "Error: Could not find file \"unicodesymbols\"." 
+		     << endl;
+		exit(1);
+	}
+	FileName const enc_path = libFileSearch(string(), "encodings");
+	if (enc_path.empty()) {
+		cerr << "Error: Could not find file \"encodings\"." 
+		     << endl;
+		exit(1);
+	}
+	encodings.read(enc_path, symbols_path);
+
+	// The real work now.
 	masterFilePath = onlyPath(infilename);
 	parentFilePath = masterFilePath;
-
 	if (outfilename == "-") {
-		if (tex2lyx(FileName(infilename), cout))
+		if (tex2lyx(FileName(infilename), cout, default_encoding))
 			return EXIT_SUCCESS;
 		else
 			return EXIT_FAILURE;
 	} else {
-		if (tex2lyx(infilename, FileName(outfilename)))
+		if (tex2lyx(infilename, FileName(outfilename), default_encoding))
 			return EXIT_SUCCESS;
 		else
 			return EXIT_FAILURE;
--- a/src/tex2lyx/tex2lyx.h
+++ b/src/tex2lyx/tex2lyx.h
@ -114,13 +114,16 @@ extern std::string getParentFilePath();

 /*!
 *  Reads tex input from \a infilename and writes lyx output to \a outfilename.
+ *  The (latex) encoding can be provided as \a encoding.
 *  Uses some common settings for the preamble, so this should only
 *  be used more than once for included documents.
 *  Caution: Overwrites the existing preamble settings if the new document
 *  contains a preamble.
 *  \return true if the conversion was successful, else false.
 */
-bool tex2lyx(std::string const & infilename, support::FileName const & outfilename);
+bool tex2lyx(std::string const & infilename, 
+	     support::FileName const & outfilename, 
+	     std::string const & encoding);


 } // namespace lyx
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@ -17,6 +17,7 @@
 #include "tex2lyx.h"

 #include "Context.h"
+#include "Encoding.h"
 #include "FloatList.h"
 #include "Layout.h"
 #include "Length.h"
@ -344,8 +345,6 @@ void translate_box_len(string const & length, string & value, string & unit, str
 string find_file(string const & name, string const & path,
 		 char const * const * extensions)
 {
-	// FIXME UNICODE encoding of name and path may be wrong (makeAbsPath
-	// expects utf8)
 	for (char const * const * what = extensions; *what; ++what) {
 		string const trial = addExtension(name, *what);
 		if (makeAbsPath(trial, path).exists())
@ -509,7 +508,7 @@ void output_command_layout(ostream & os, Parser & p, bool outer,
 * The drawback is that the logic inside the function becomes
 * complicated, and that is the reason why it is not implemented.
 */
-void check_space(Parser const & p, ostream & os, Context & context)
+void check_space(Parser & p, ostream & os, Context & context)
 {
 	Token const next = p.next_token();
 	Token const curr = p.curr_token();
@ -1054,8 +1053,6 @@ void fix_relative_filename(string & name)
 	if (fname.isAbsolute())
 		return;

-	// FIXME UNICODE encoding of name may be wrong (makeAbsPath expects
-	// utf8)
 	name = to_utf8(makeRelPath(from_utf8(makeAbsPath(name, getMasterFilePath()).absFilename()),
 				   from_utf8(getParentFilePath())));
 }
@ -1262,7 +1259,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			       t.cat() == catParameter) {
 			// This translates "&" to "\\&" which may be wrong...
 			context.check_layout(os);
-			os << t.character();
+			os << t.cs();
 		}

 		else if (p.isParagraph()) {
@ -1281,7 +1278,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 				else
 					os << "\\InsetSpace ~\n";
 			} else
-				os << t.character();
+				os << t.cs();
 		}

 		else if (t.cat() == catBegin &&
@ -1309,7 +1306,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			    next.character() == '*') {
 				p.get_token();
 				if (p.next_token().cat() == catEnd) {
-					os << next.character();
+					os << next.cs();
 					p.get_token();
 				} else {
 					p.putback();
@ -1552,8 +1549,9 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			TeXFont const oldFont = context.font;
 			// save the current font size
 			string const size = oldFont.size;
-			// reset the font size to default, because the font size switches don't
-			// affect section headings and the like
+			// reset the font size to default, because the
+			// font size switches don't affect section
+			// headings and the like
 			context.font.size = known_coded_sizes[0];
 			output_font_change(os, oldFont, context.font);
 			// write the layout
@ -1605,8 +1603,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			string const path = getMasterFilePath();
 			// We want to preserve relative / absolute filenames,
 			// therefore path is only used for testing
-			// FIXME UNICODE encoding of name and path may be
-			// wrong (makeAbsPath expects utf8)
 			if (!makeAbsPath(name, path).exists()) {
 				// The file extension is probably missing.
 				// Now try to find it out.
@ -1637,8 +1633,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 					name = pdftex_name;
 			}

-			// FIXME UNICODE encoding of name and path may be
-			// wrong (makeAbsPath expects utf8)
 			if (makeAbsPath(name, path).exists())
 				fix_relative_filename(name);
 			else
@ -1763,6 +1757,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			p.skip_spaces();
 			context.check_layout(os);
 			string const s = p.verbatim_item();
+			//FIXME: this never triggers in UTF8
 			if (s == "\xb1" || s == "\xb3" || s == "\xb2" || s == "\xb5")
 				os << s;
 			else
@ -2127,25 +2122,31 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,

 		else if (t.cs() == "selectlanguage") {
 			context.check_layout(os);
-			// save the language for the case that a \foreignlanguage is used 
+			// save the language for the case that a
+			// \foreignlanguage is used 
+
+			//FIXME: this is wrong, the language should
+			// be saved in the context. (JMarc)
 			selectlang = subst(p.verbatim_item(), "\n", " ");
 			os << "\\lang " << selectlang << "\n";
-			
 		}

 		else if (t.cs() == "foreignlanguage") {
 			context.check_layout(os);
 			os << "\n\\lang " << subst(p.verbatim_item(), "\n", " ") << "\n";
 			os << subst(p.verbatim_item(), "\n", " ");
+			// FIXME: the second argument of \foreignlanguage
+			// has to be parsed (like for \textsf, for
+			// example).
 			// set back to last selectlanguage
 			os << "\n\\lang " << selectlang << "\n";
 		}

-		else if (t.cs() == "inputencoding")
-			// write nothing because this is done by LyX using the "\lang"
-			// information given by selectlanguage and foreignlanguage
-			subst(p.verbatim_item(), "\n", " ");
-		
+		else if (t.cs() == "inputencoding") {
+			// nothing to write here
+			string const enc = subst(p.verbatim_item(), "\n", " ");
+			p.setEncoding(enc);
+		}
 		else if (t.cs() == "LyX" || t.cs() == "TeX"
 			 || t.cs() == "LaTeX") {
 			context.check_layout(os);
@ -2238,18 +2239,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			handle_ert(os, oss.str(), context);
 		}

-		else if (t.cs() == "\"") {
-			context.check_layout(os);
-			string const name = p.verbatim_item();
-			     if (name == "a") os << '\xe4';
-			else if (name == "o") os << '\xf6';
-			else if (name == "u") os << '\xfc';
-			else if (name == "A") os << '\xc4';
-			else if (name == "O") os << '\xd6';
-			else if (name == "U") os << '\xdc';
-			else handle_ert(os, "\"{" + name + "}", context);
-		}
-
 		// Problem: \= creates a tabstop inside the tabbing environment
 		// and else an accent. In the latter case we really would want
 		// \={o} instead of \= o.
@ -2260,30 +2249,22 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			 || t.cs() == "'" || t.cs() == "`"
 			 || t.cs() == "~" || t.cs() == "." || t.cs() == "=") {
 			// we need the trim as the LyX parser chokes on such spaces
-			// The argument of InsetLatexAccent is parsed as a
-			// subset of LaTeX, so don't parse anything here,
-			// but use the raw argument.
-			// Otherwise we would convert \~{\i} wrongly.
-			// This will of course not translate \~{\ss} to \~{ß},
-			// but that does at least compile and does only look
-			// strange on screen.
 			context.check_layout(os);
-			os << "\\i \\" << t.cs() << "{"
-			   << trim(p.verbatim_item(), " ")
-			   << "}\n";
-		}
-
-		else if (t.cs() == "ss") {
-			context.check_layout(os);
-			os << "\xdf";
-			skip_braces(p); // eat {}
-		}
-
-		else if (t.cs() == "i" || t.cs() == "j" || t.cs() == "l" ||
-			 t.cs() == "L") {
-			context.check_layout(os);
-			os << "\\i \\" << t.cs() << "{}\n";
-			skip_braces(p); // eat {}
+			// try to see whether the string is in unicodesymbols
+			docstring rem;
+			string command = t.asInput() + "{" 
+				+ trim(p.verbatim_item())
+				+ "}";
+			docstring s = encodings.fromLaTeXCommand(from_utf8(command), rem);
+			if (!s.empty()) {
+				if (!rem.empty())
+					cerr << "When parsing " << command 
+					     << ", result is " << to_utf8(s)
+					     << "+" << to_utf8(rem) << endl;
+				os << to_utf8(s);
+			} else
+				// we did not find a non-ert version
+				handle_ert(os, command, context);
 		}

 		else if (t.cs() == "\\") {
@ -2319,8 +2300,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			string const path = getMasterFilePath();
 			// We want to preserve relative / absolute filenames,
 			// therefore path is only used for testing
-			// FIXME UNICODE encoding of filename and path may be
-			// wrong (makeAbsPath expects utf8)
 			if ((t.cs() == "include" || t.cs() == "input") &&
 			    !makeAbsPath(filename, path).exists()) {
 				// The file extension is probably missing.
@ -2331,8 +2310,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 				if (!tex_name.empty())
 					filename = tex_name;
 			}
-			// FIXME UNICODE encoding of filename and path may be
-			// wrong (makeAbsPath expects utf8)
 			if (makeAbsPath(filename, path).exists()) {
 				string const abstexname =
 					makeAbsPath(filename, path).absFilename();
@ -2342,7 +2319,8 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 				string const lyxname =
 					changeExtension(filename, ".lyx");
 				if (t.cs() != "verbatiminput" &&
-				    tex2lyx(abstexname, FileName(abslyxname))) {
+				    tex2lyx(abstexname, FileName(abslyxname),
+					    p.getEncoding())) {
 					os << name << '{' << lyxname << "}\n";
 				} else {
 					os << name << '{' << filename << "}\n";
@ -2537,6 +2515,19 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 		}

 		else {
+			// try to see whether the string is in unicodesymbols
+			docstring rem;
+			docstring s = encodings.fromLaTeXCommand(from_utf8(t.asInput()), rem);
+			if (!s.empty()) {
+				if (!rem.empty())
+					cerr << "When parsing " << t.cs() 
+					     << ", result is " << to_utf8(s)
+					     << "+" << to_utf8(rem) << endl;
+				context.check_layout(os);
+				os << to_utf8(s);
+				p.skip_spaces();
+				skip_braces(p); // eat {}
+			}
 			//cerr << "#: " << t << " mode: " << mode << endl;
 			// heuristic: read up to next non-nested space
 			/*
@ -2550,14 +2541,16 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			cerr << "found ERT: " << s << endl;
 			handle_ert(os, s + ' ', context);
 			*/
-			string name = t.asInput();
-			if (p.next_token().asInput() == "*") {
-				// Starred commands like \vspace*{}
-				p.get_token();				// Eat '*'
-				name += '*';
+			else {
+				string name = t.asInput();
+				if (p.next_token().asInput() == "*") {
+					// Starred commands like \vspace*{}
+					p.get_token();	// Eat '*'
+					name += '*';
+				}
+				if (!parse_command(name, p, os, outer, context))
+					handle_ert(os, name, context);
 			}
-			if (! parse_command(name, p, os, outer, context))
-				handle_ert(os, name, context);
 		}

 		if (flags & FLAG_LEAVE) {
--- a/status.16x
+++ b/status.16x
@ -33,6 +33,11 @@ What's new

 * DOCUMENT INPUT/OUTPUT

+- tex2lyx is now able to read files in all encodings supported by LaTeX and
+  transform them into the proper unicode-based format introduced in 1.5.0.
+  This paves the way for many other improvements in LaTeX->LyX translation
+  (bugs 3035, 4379, 4917).
+
 - Quotes in InsetListings are now normal quotes, rather than InsetQuote
  entries (bug 5782).