From ee3ce572a0b4fc2b71fcf7da0f1136174c2f4928 Mon Sep 17 00:00:00 2001
From: Georg Baum <baum@lyx.org>
Date: Sun, 30 Dec 2012 20:39:33 +0100
Subject: [PATCH] Fix import of shift-jis and big5 CJK texts.

These encodings were not defined, since they must not be used as document
encodings (the characters {, } and \ may appear in high bytes, and latex
would be confused). However, they are supported by CJK.sty (which uses a
preprocessor to circumvent the limitations of the latex executable). These
encodings are now defined, but used for import in tex2lyx only.
The test case CJK.tex contained fake tests for shift-jis and big5 (the
Japanese and Chinese characters were entered using the utf8 encoding), and
therefore the wrong interpretation of these encodings looked as if it worked.
The comments about missing iconv support of shift-jis and big5 were wrong as
well (otherwise shift-jis-plain would not work either).
---
 lib/encodings                     | 24 +++++++----
 src/Encoding.cpp                  | 26 +++++++++---
 src/Encoding.h                    | 15 +++++--
 src/frontends/qt4/GuiDocument.cpp |  9 ++--
 src/tex2lyx/Parser.cpp            |  6 ++-
 src/tex2lyx/test/CJK.lyx.lyx      | 70 +++++++++++++++++++++++++------
 src/tex2lyx/test/CJK.tex          | 12 +++++-
 src/tex2lyx/text.cpp              | 26 ++++++++----
 8 files changed, 142 insertions(+), 46 deletions(-)
diff --git a/lib/encodings b/lib/encodings
index 899a1a813a..f6b9b2b091 100644
--- a/lib/encodings
+++ b/lib/encodings
@@ -6,8 +6,12 @@
 # It does support singlebyte encodings with variable with (e.g. utf8).
 # These are marked with the "variable" keyword.
 # Fixed width encodings are marked with the "fixed" keyword.
+# The code points of TeX control characters like {, } and \ can occur in the
+# second byte of some variable width encodings. These encodings must not be
+# set as document encodings and are marked with the "variableunsafe" keyword.
+# They are only needed for proper tex2lyx import.
 
-# Syntax: Encoding <LyX name> <LaTeX name> <GUI name> <iconv name> fixed|variable <package> End
+# Syntax: Encoding <LyX name> <LaTeX name> <GUI name> <iconv name> fixed|variable|variableunsafe <package> End
 
 # encodings used by inputenc.sty
 
@@ -131,16 +135,19 @@ End
 # Bg5+, GBt
 # See the NOTES file of libiconv for details.
 
-# The following encodings can't be handled directly, because the code points
-# of TeX control characters like {, } and \ can occur in the second byte:
-
 # For traditional chinese
-#Encoding big5 Bg5 "Chinese (traditional) (Big5)" BIG5 variable CJK
-#End
+Encoding big5 Bg5 "Chinese (traditional) (Big5)" BIG5 variableunsafe CJK
+End
 
 # For japanese
-#Encoding shift-jis SJIS "Japanese (CJK) (SJIS)" SJIS variable CJK
-#End
+# We use CP932 and not SJIS, since it is a superset of SJIS except that it
+# translates SJIS 0x5c to U+005C (REVERSE SOLIDUS aka backslash) instead of
+# U+00A5 (YEN SIGN). Strictly speaking this is wrong, but there is a long
+# standing confusion which translation is correct, and using CP932 is also
+# consistent with the SJIS example file <texmf>/doc/latex/cjk/examples/SJIS.tex
+# of the CJK package. The CP932 extensions over SJIS do not matter here.
+Encoding shift-jis SJIS "Japanese (CJK) (SJIS)" CP932 variableunsafe CJK
+End
 
 # The following encodings need hardcoded support of the encodable unicode
 # range, but are known by iconv:
@@ -181,6 +188,7 @@ Encoding euc-jp-plain euc "Japanese (non-CJK) (EUC-JP)" EUC-JP variable japanese
 End
 Encoding jis-plain jis "Japanese (non-CJK) (JIS)" ISO-2022-JP variable japanese
 End
+# FIXME: Should use variableunsafe (would be a file format change)
 Encoding shift-jis-plain sjis "Japanese (non-CJK) (SJIS)" CP932 variable japanese
 End
 Encoding utf8 utf8 "Japanese (non-CJK) (utf8)" UTF-8 variable japanese
diff --git a/src/Encoding.cpp b/src/Encoding.cpp
index 298fc6a0ae..48e9271395 100644
--- a/src/Encoding.cpp
+++ b/src/Encoding.cpp
@@ -305,8 +305,9 @@ const char * EncodingException::what() const throw()
 
 
 Encoding::Encoding(string const & n, string const & l, string const & g,
-		   string const & i, bool f, Encoding::Package p)
-	: name_(n), latexName_(l), guiName_(g), iconvName_(i), fixedwidth_(f), package_(p)
+		   string const & i, bool f, bool u, Encoding::Package p)
+	: name_(n), latexName_(l), guiName_(g), iconvName_(i), fixedwidth_(f),
+	  unsafe_(u), package_(p)
 {
 	if (n == "ascii") {
 		// ASCII can encode 128 code points and nothing else
@@ -823,14 +824,18 @@ bool Encodings::isMathAlpha(char_type c)
 }
 
 
-Encoding const * Encodings::fromLyXName(string const & name) const
+Encoding const *
+Encodings::fromLyXName(string const & name, bool allowUnsafe) const
 {
 	EncodingList::const_iterator const it = encodinglist.find(name);
+	if (it != encodinglist.end() && !allowUnsafe && it->second.unsafe())
+		return 0; // known, but TeX unsafe and the caller did not allow that
 	return it != encodinglist.end() ? &it->second : 0;
 }
 
 
-Encoding const * Encodings::fromLaTeXName(string const & n) const
+Encoding const *
+Encodings::fromLaTeXName(string const & n, bool allowUnsafe) const
 {
 	string name = n;
 	// FIXME: if we have to test for too many of these synonyms,
@@ -845,8 +850,11 @@ Encoding const * Encodings::fromLaTeXName(string const & n) const
 	// most at the top of lib/encodings.
 	EncodingList::const_iterator const end = encodinglist.end();
 	for (EncodingList::const_iterator it = encodinglist.begin(); it != end; ++it)
-		if (it->second.latexName() == name)
+		if (it->second.latexName() == name) {
+			if (!allowUnsafe && it->second.unsafe())
+				return 0;
 			return &it->second;
+		}
 	return 0;
 }
 
@@ -1005,10 +1013,15 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 			lex.next();
 			string const width = lex.getString();
 			bool fixedwidth = false;
+			bool unsafe = false;
 			if (width == "fixed")
 				fixedwidth = true;
 			else if (width == "variable")
 				fixedwidth = false;
+			else if (width == "variableunsafe") {
+				fixedwidth = false;
+				unsafe = true;
+			}
 			else
 				lex.printError("Unknown width");
 
@@ -1028,7 +1041,8 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 
 			LYXERR(Debug::INFO, "Reading encoding " << name);
 			encodinglist[name] = Encoding(name, latexname,
-				guiname, iconvname, fixedwidth, package);
+				guiname, iconvname, fixedwidth, unsafe,
+				package);
 
 			if (lex.lex() != et_end)
 				lex.printError("Missing end");
diff --git a/src/Encoding.h b/src/Encoding.h
index b57ad7aa08..445c80be3b 100644
--- a/src/Encoding.h
+++ b/src/Encoding.h
@@ -54,7 +54,7 @@ public:
 	///
 	Encoding(std::string const & n, std::string const & l,
 		 std::string const & g, std::string const & i,
-		 bool f, Package p);
+		 bool f, bool u, Package p);
 	///
 	void init() const;
 	///
@@ -66,7 +66,9 @@ public:
 	///
 	std::string const & iconvName() const { return iconvName_; }
 	///
-	bool const & hasFixedWidth() const { return fixedwidth_; }
+	bool hasFixedWidth() const { return fixedwidth_; }
+	///
+	bool unsafe() const { return unsafe_; }
 	/// \p c is representable in this encoding without a LaTeX macro
 	bool encodable(char_type c) const;
 	/**
@@ -108,6 +110,9 @@ private:
 	std::string iconvName_;
 	/// Is this a fixed width encoding?
 	bool fixedwidth_;
+	/// Is this encoding TeX unsafe, e.g. control characters like {, }
+	/// and \\ may appear in high bytes?
+	bool unsafe_;
 	///
 	typedef std::set<char_type> CharSet;
 	/// Set of UCS4 characters that we can encode (for singlebyte
@@ -157,9 +162,11 @@ public:
 	void read(support::FileName const & encfile,
 		  support::FileName const & symbolsfile);
 	/// Get encoding from LyX name \p name
-	Encoding const * fromLyXName(std::string const & name) const;
+	Encoding const *
+	fromLyXName(std::string const & name, bool allowUnsafe = false) const;
 	/// Get encoding from LaTeX name \p name
-	Encoding const * fromLaTeXName(std::string const & name) const;
+	Encoding const *
+	fromLaTeXName(std::string const & name, bool allowUnsafe = false) const;
 
 	///
 	const_iterator begin() const { return encodinglist.begin(); }
diff --git a/src/frontends/qt4/GuiDocument.cpp b/src/frontends/qt4/GuiDocument.cpp
index 2ce8393da6..49a166c09a 100644
--- a/src/frontends/qt4/GuiDocument.cpp
+++ b/src/frontends/qt4/GuiDocument.cpp
@@ -1068,7 +1068,8 @@ GuiDocument::GuiDocument(GuiView & lv)
 	Encodings::const_iterator it = encodings.begin();
 	Encodings::const_iterator const end = encodings.end();
 	for (; it != end; ++it)
-		encodinglist.append(qt_(it->guiName()));
+		if (!it->unsafe())
+			encodinglist.append(qt_(it->guiName()));
 	encodinglist.sort();
 	langModule->encodingCO->addItems(encodinglist);
 
@@ -2558,7 +2559,8 @@ void GuiDocument::applyView()
 			Encodings::const_iterator const end = encodings.end();
 			bool found = false;
 			for (; it != end; ++it) {
-				if (qt_(it->guiName()) == enc_gui) {
+				if (qt_(it->guiName()) == enc_gui &&
+				    !it->unsafe()) {
 					bp_.inputenc = it->latexName();
 					found = true;
 					break;
@@ -2966,7 +2968,8 @@ void GuiDocument::paramsToDialog()
 			Encodings::const_iterator it = encodings.begin();
 			Encodings::const_iterator const end = encodings.end();
 			for (; it != end; ++it) {
-				if (it->latexName() == bp_.inputenc) {
+				if (it->latexName() == bp_.inputenc &&
+				    !it->unsafe()) {
 					enc_gui = it->guiName();
 					break;
 				}
diff --git a/src/tex2lyx/Parser.cpp b/src/tex2lyx/Parser.cpp
index 010e9dd7c9..6309ab4379 100644
--- a/src/tex2lyx/Parser.cpp
+++ b/src/tex2lyx/Parser.cpp
@@ -179,7 +179,11 @@ Parser::~Parser()
 
 void Parser::setEncoding(std::string const & e)
 {
-	Encoding const * enc = encodings.fromLaTeXName(e);
+	// We may (and need to) use unsafe encodings here: Since the text is
+	// converted to unicode while reading from is_, we never see text in
+	// the original encoding of the parser, but operate on utf8 strings
+	// instead. Therefore, we cannot misparse high bytes as {, } or \\.
+	Encoding const * enc = encodings.fromLaTeXName(e, true);
 	if (!enc) {
 		cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
 		return;
diff --git a/src/tex2lyx/test/CJK.lyx.lyx b/src/tex2lyx/test/CJK.lyx.lyx
index c8c1a7e7b6..3af96805b1 100644
--- a/src/tex2lyx/test/CJK.lyx.lyx
+++ b/src/tex2lyx/test/CJK.lyx.lyx
@@ -16,7 +16,7 @@
 \end_preamble
 \use_default_options false
 \maintain_unincluded_children false
-\language english
+\language japanese-cjk
 \language_package default
 \inputencoding utf8
 \fontencoding T1
@@ -146,29 +146,51 @@ Chinese simplified
 \begin_inset ERT
 status collapsed
 
+\begin_layout Plain Layout
+% The following Bg5 encoded text cannot be processed as is by latex.
+\end_layout
+
 \begin_layout Plain Layout
 
-\backslash
-begin{CJK}{Bg5}{}
 \end_layout
 
 \end_inset
 
 
-Big5 鏂囬紟妤锋浉
-
 \begin_inset ERT
 status collapsed
 
+\begin_layout Plain Layout
+% It needs to be preprocessed by bg5conv or bg5latex needs to be used.
+\end_layout
+
 \begin_layout Plain Layout
 
-\backslash
-end{CJK}
 \end_layout
 
 \end_inset
 
 
+\begin_inset ERT
+status collapsed
+
+\begin_layout Plain Layout
+% See the documentation of CJK.sty for details.
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\lang chinese-traditional
+ Big5 鏂囬紟妤锋浉 
 \end_layout
 
 \begin_layout Standard
@@ -182,29 +204,51 @@ English
 \begin_inset ERT
 status collapsed
 
+\begin_layout Plain Layout
+% The following SJIS encoded text cannot be processed as is by latex.
+\end_layout
+
 \begin_layout Plain Layout
 
-\backslash
-begin{CJK}{SJIS}{}
 \end_layout
 
 \end_inset
 
 
-Shift_JIS 鏃ユ湰瑾炪伄鏂囩珷
-
 \begin_inset ERT
 status collapsed
 
+\begin_layout Plain Layout
+% It needs to be preprocessed by sjisconv or sjislatex needs to be used.
+\end_layout
+
 \begin_layout Plain Layout
 
-\backslash
-end{CJK}
 \end_layout
 
 \end_inset
 
 
+\begin_inset ERT
+status collapsed
+
+\begin_layout Plain Layout
+% See the documentation of CJK.sty for details.
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\lang japanese-cjk
+ Shift_JIS 鏃ユ湰瑾炪伄鏂囩珷 
 \end_layout
 
 \begin_layout Standard
diff --git a/src/tex2lyx/test/CJK.tex b/src/tex2lyx/test/CJK.tex
index b469b895d7..434477b85f 100644
--- a/src/tex2lyx/test/CJK.tex
+++ b/src/tex2lyx/test/CJK.tex
@@ -18,16 +18,24 @@ hello
 Chinese simplified \end{CJK}\begin{CJK}{EUC-JP}{hei} Japanese \end{CJK} \begin{CJK}{GB}{}Chinese simplified
 \end{CJK}
 
+% The following Bg5 encoded text cannot be processed as is by latex.
+% It needs to be preprocessed by bg5conv or bg5latex needs to be used.
+% See the documentation of CJK.sty for details.
+
 \begin{CJK}{Bg5}{}
-Big5 鏂囬紟妤锋浉
+Big5 ゅ供发
 \end{CJK}
 
 \inputencoding{latin9}%
 \selectlanguage{english}%
 English
 
+% The following SJIS encoded text cannot be processed as is by latex.
+% It needs to be preprocessed by sjisconv or sjislatex needs to be used.
+% See the documentation of CJK.sty for details.
+
 \begin{CJK}{SJIS}{}
-Shift_JIS 鏃ユ湰瑾炪伄鏂囩珷
+Shift\_JIS 擔杮岅偺暥復
 \end{CJK}
 
 hello
diff --git a/src/tex2lyx/text.cpp b/src/tex2lyx/text.cpp
index 90728a960e..cdf274094b 100644
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@@ -119,19 +119,23 @@ char const * const known_coded_ref_commands[] = { "ref", "pageref", "vref",
 
 /**
  * supported CJK encodings
- * SJIS and Bg5 cannot be supported as they are not
- * supported by iconv
  * JIS does not work with LyX's encoding conversion
  */
 const char * const supported_CJK_encodings[] = {
-"EUC-JP", "KS", "GB", "UTF8", 0};
+"EUC-JP", "KS", "GB", "UTF8",
+"Bg5", /*"JIS",*/ "SJIS", 0};
 
 /**
  * the same as supported_CJK_encodings with their corresponding LyX language name
+ * FIXME: The mapping "UTF8" => "chinese-traditional" is only correct for files
+ *        created by LyX.
+ * NOTE: "Bg5", "JIS" and "SJIS" are not supported by LyX, on re-export the
+ *       encodings "UTF8", "EUC-JP" and "EUC-JP" will be used.
  * please keep this in sync with supported_CJK_encodings line by line!
  */
 const char * const supported_CJK_languages[] = {
-"japanese-cjk", "korean", "chinese-simplified", "chinese-traditional", 0};
+"japanese-cjk", "korean", "chinese-simplified", "chinese-traditional",
+"chinese-traditional", /*"japanese-cjk",*/ "japanese-cjk", 0};
 
 /*!
  * natbib commands.
@@ -1454,9 +1458,13 @@ void parse_environment(Parser & p, ostream & os, bool outer,
 		// store the encoding to be able to reset it
 		string const encoding_old = p.getEncoding();
 		string const encoding = p.getArg('{', '}');
-		// SJIS and Bg5 cannot be handled by iconv
-		// JIS does not work with LyX's encoding conversion
-		if (encoding != "Bg5" && encoding != "JIS" && encoding != "SJIS")
+		// FIXME: For some reason JIS does not work. Although the text
+		// in tests/CJK.tex is identical with the SJIS version if you
+		// convert both snippets using the recode command line utility,
+		// the resulting .lyx file contains some extra characters if
+		// you set buggy_encoding to false for JIS.
+		bool const buggy_encoding = encoding == "JIS";
+		if (!buggy_encoding)
 			p.setEncoding(encoding);
 		else {
 			// FIXME: This will read garbage, since the data is not encoded in utf8.
@@ -1467,10 +1475,10 @@ void parse_environment(Parser & p, ostream & os, bool outer,
 		string const mapping = trim(p.getArg('{', '}'));
 		char const * const * const where =
 			is_known(encoding, supported_CJK_encodings);
-		if (!preamble.fontCJKSet())
+		if (!buggy_encoding && !preamble.fontCJKSet())
 			preamble.fontCJK(mapping);
 		bool knownMapping = mapping == preamble.fontCJK();
-		if (!knownMapping || !where) {
+		if (buggy_encoding || !knownMapping || !where) {
 			parent_context.check_layout(os);
 			handle_ert(os, "\\begin{" + name + "}{" + encoding + "}{" + mapping + "}",
 				       parent_context);