Open tex2lyx input files with correct encoding

We now open the input file twice: The first time in latin1 encoding to read
the document encoding from the preamble. This always works, since
traditional TeX does not allow non-ASCII contents without an encoding-changing
command (except for comments, but we do not need them, and using latin1 rather
than utf8 ensures that they do not produce an iconv exception, but are simply
recorded with wrong characters), and we do detect the utf8-based TeX engines
XeTeX and LuaTeX as well. The second time we open the file directly with the
document encoding.

This fixes a few tex2lyx tests on OS X, since changing the encoding of an
open file stream does not work with clang on OS X. Files using more than one
encoding are still broken, but all single-encoding files are fixed now.
This commit is contained in:
Georg Baum 2016-01-31 12:54:59 +01:00
parent 92d0835e14
commit de3e5280f6
5 changed files with 106 additions and 31 deletions

View File

@ -334,6 +334,12 @@ ifdocstream::ifdocstream() : base()
}
/// Construct an unopened input stream whose encoding is taken from \p enc.
ifdocstream::ifdocstream(SetEnc const & enc) : base()
{
	// No file is attached yet; only the encoding for the input
	// direction is installed on this stream.
	setEncoding(*this, enc.encoding, ios_base::in);
}
ifdocstream::ifdocstream(const char* s, ios_base::openmode mode,
string const & encoding)
: base()
@ -349,6 +355,12 @@ ofdocstream::ofdocstream(): base()
}
/// Construct an unopened output stream whose encoding is taken from \p enc.
ofdocstream::ofdocstream(SetEnc const & enc) : base()
{
	// No file is attached yet; only the encoding for the output
	// direction is installed on this stream.
	setEncoding(*this, enc.encoding, ios_base::out);
}
ofdocstream::ofdocstream(const char* s, ios_base::openmode mode,
string const & encoding)
: base()

View File

@ -40,6 +40,8 @@ typedef std::basic_istream<char_type> idocstream;
*/
typedef std::basic_ostream<char_type> odocstream;
struct SetEnc;
/// File stream for reading UTF8-encoded files with automatic conversion to
/// UCS4.
/// Buffering must be switched off if the encoding is changed after
@ -48,6 +50,9 @@ class ifdocstream : public std::basic_ifstream<char_type> {
typedef std::basic_ifstream<char_type> base;
public:
ifdocstream();
/// Create a stream with a specific encoding \p enc.
/// We must not pass \p enc as string, to avoid confusing it with a file name.
explicit ifdocstream(SetEnc const & enc);
explicit ifdocstream(const char* s,
std::ios_base::openmode mode = std::ios_base::in,
std::string const & encoding = "UTF-8");
@ -61,6 +66,9 @@ class ofdocstream : public std::basic_ofstream<char_type> {
typedef std::basic_ofstream<char_type> base;
public:
ofdocstream();
/// Create a stream with a specific encoding \p enc.
/// We must not pass \p enc as string, to avoid confusing it with a file name.
explicit ofdocstream(SetEnc const & enc);
explicit ofdocstream(const char* s,
std::ios_base::openmode mode = std::ios_base::out|std::ios_base::trunc,
std::string const & encoding = "UTF-8");

View File

@ -672,7 +672,8 @@ void Preamble::handle_geometry(vector<string> & options)
void Preamble::handle_package(Parser &p, string const & name,
string const & opts, bool in_lyx_preamble)
string const & opts, bool in_lyx_preamble,
bool detectEncoding)
{
vector<string> options = split_options(opts);
add_package(name, options);
@ -909,9 +910,11 @@ void Preamble::handle_package(Parser &p, string const & name,
string const encoding = options.back();
Encoding const * const enc = encodings.fromLaTeXName(
encoding, Encoding::inputenc, true);
if (!enc)
cerr << "Unknown encoding " << encoding << ". Ignoring." << std::endl;
else {
if (!enc) {
if (!detectEncoding)
cerr << "Unknown encoding " << encoding
<< ". Ignoring." << std::endl;
} else {
if (!enc->unsafe() && options.size() == 1 && one_language == true)
h_inputencoding = enc->name();
p.setEncoding(enc->iconvName());
@ -1026,7 +1029,7 @@ void Preamble::handle_package(Parser &p, string const & name,
}
// We need to do something with the options...
if (!options.empty())
if (!options.empty() && !detectEncoding)
cerr << "Ignoring options '" << join(options, ",")
<< "' of package " << name << '.' << endl;
@ -1260,6 +1263,13 @@ void Preamble::parse(Parser & p, string const & forceclass,
{
// initialize fixed types
special_columns_['D'] = 3;
parse(p, forceclass, false, tc);
}
void Preamble::parse(Parser & p, string const & forceclass,
bool detectEncoding, TeX2LyXDocClass & tc)
{
bool is_full_document = false;
bool is_lyx_file = false;
bool in_lyx_preamble = false;
@ -1275,11 +1285,19 @@ void Preamble::parse(Parser & p, string const & forceclass,
}
p.reset();
if (detectEncoding && !is_full_document)
return;
while (is_full_document && p.good()) {
if (detectEncoding && h_inputencoding != "auto" &&
h_inputencoding != "default")
return;
Token const & t = p.get_token();
#ifdef FILEDEBUG
cerr << "t: " << t << "\n";
if (!detectEncoding)
cerr << "t: " << t << '\n';
#endif
//
@ -1314,7 +1332,8 @@ void Preamble::parse(Parser & p, string const & forceclass,
if (comment.size() > magicXeLaTeX.size()
&& comment.substr(0, magicXeLaTeX.size()) == magicXeLaTeX
&& h_inputencoding == "auto") {
cerr << "XeLaTeX comment found, switching to UTF8\n";
if (!detectEncoding)
cerr << "XeLaTeX comment found, switching to UTF8\n";
h_inputencoding = "utf8";
}
smatch sub;
@ -1657,16 +1676,18 @@ void Preamble::parse(Parser & p, string const & forceclass,
vector<string>::const_iterator end = vecnames.end();
for (; it != end; ++it)
handle_package(p, trimSpaceAndEol(*it), options,
in_lyx_preamble);
in_lyx_preamble, detectEncoding);
}
else if (t.cs() == "inputencoding") {
string const encoding = p.getArg('{','}');
Encoding const * const enc = encodings.fromLaTeXName(
encoding, Encoding::inputenc, true);
if (!enc)
cerr << "Unknown encoding " << encoding << ". Ignoring." << std::endl;
else {
if (!enc) {
if (!detectEncoding)
cerr << "Unknown encoding " << encoding
<< ". Ignoring." << std::endl;
} else {
if (!enc->unsafe())
h_inputencoding = enc->name();
p.setEncoding(enc->iconvName());
@ -1949,6 +1970,16 @@ void Preamble::parse(Parser & p, string const & forceclass,
}
/// Parse the preamble read through \p p in encoding-detection mode.
/// \return the detected input encoding name, or an empty string if
/// h_inputencoding is still "auto" or "default" after parsing.
string Preamble::parseEncoding(Parser & p, string const & forceclass)
{
	// Throw-away document class: it is only passed because parse() takes one.
	TeX2LyXDocClass dummy;
	parse(p, forceclass, true, dummy);

	bool const undetected =
		h_inputencoding == "auto" || h_inputencoding == "default";
	return undetected ? string() : h_inputencoding;
}
string babel2lyx(string const & language)
{
char const * const * where = is_known(language, known_languages);

View File

@ -92,6 +92,8 @@ public:
/// Parses the LaTeX preamble into internal data
void parse(Parser & p, std::string const & forceclass,
TeX2LyXDocClass & tc);
/// Parse the encoding from a preamble. *this is unusable afterwards.
std::string parseEncoding(Parser & p, std::string const & forceclass);
/// Writes the LyX file header from internal data
bool writeLyXHeader(std::ostream & os, bool subdoc,
std::string const & outfiledir);
@ -103,6 +105,9 @@ public:
private:
///
void parse(Parser & p, std::string const & forceclass,
bool detectEncoding, TeX2LyXDocClass & tc);
///
std::map<std::string, std::vector<std::string> > used_packages;
/// Packages that will be loaded automatically by LyX
std::set<std::string> auto_packages;
@ -218,7 +223,8 @@ private:
void handle_geometry(std::vector<std::string> & options);
///
void handle_package(Parser &p, std::string const & name,
std::string const & opts, bool in_lyx_preamble);
std::string const & opts, bool in_lyx_preamble,
bool detectEncoding);
///
void handle_if(Parser & p, bool in_lyx_preamble);

View File

@ -839,24 +839,9 @@ namespace {
* You must ensure that \p parentFilePathTeX is properly set before calling
* this function!
*/
bool tex2lyx(idocstream & is, ostream & os, string encoding,
bool tex2lyx(idocstream & is, ostream & os, string const & encoding,
string const & outfiledir)
{
// Set a sensible default encoding.
// This is used until an encoding command is found.
// For child documents use the encoding of the master, else ISO-8859-1,
// (formerly known by its latex name latin1), since ISO-8859-1 does not
// cause an iconv error if the actual encoding is different (bug 7509).
if (encoding.empty()) {
if (preamble.inputencoding() == "auto")
encoding = "ISO-8859-1";
else {
Encoding const * const enc = encodings.fromLyXName(
preamble.inputencoding(), true);
encoding = enc->iconvName();
}
}
Parser p(is, fixed_encoding ? default_encoding : string());
p.setEncoding(encoding);
//p.dump();
@ -925,12 +910,45 @@ bool tex2lyx(idocstream & is, ostream & os, string encoding,
/// convert TeX from \p infilename to LyX and write it to \p os
bool tex2lyx(FileName const & infilename, ostream & os, string const & encoding,
bool tex2lyx(FileName const & infilename, ostream & os, string encoding,
string const & outfiledir)
{
ifdocstream is;
// Set a sensible default encoding.
// This is used until an encoding command is found.
// For child documents use the encoding of the master, else try to
// detect it from the preamble, since setting an encoding of an open
// fstream does currently not work on OS X.
// Always start with ISO-8859-1, (formerly known by its latex name
// latin1), since ISO-8859-1 does not cause an iconv error if the
// actual encoding is different (bug 7509).
if (encoding.empty()) {
Encoding const * enc = 0;
if (preamble.inputencoding() == "auto") {
ifdocstream is(setEncoding("ISO-8859-1"));
// forbid buffering on this stream
is.rdbuf()->pubsetbuf(0, 0);
is.open(infilename.toFilesystemEncoding().c_str());
if (is.good()) {
Parser ep(is, string());
ep.setEncoding("ISO-8859-1");
Preamble encodingpreamble;
string const e = encodingpreamble
.parseEncoding(ep, documentclass);
if (!e.empty())
enc = encodings.fromLyXName(e, true);
}
} else
enc = encodings.fromLyXName(
preamble.inputencoding(), true);
if (enc)
encoding = enc->iconvName();
else
encoding = "ISO-8859-1";
}
ifdocstream is(setEncoding(encoding));
// forbid buffering on this stream
is.rdbuf()->pubsetbuf(0,0);
is.rdbuf()->pubsetbuf(0, 0);
is.open(infilename.toFilesystemEncoding().c_str());
if (!is.good()) {
cerr << "Could not open input file \"" << infilename