Fix import of shift-jis and big5 CJK texts.

These encodings were not defined, since they must not be used as document encodings (the byte values of the characters {, } and \ may appear as the second byte of a multibyte character, and LaTeX would be confused). However, they are supported by CJK.sty (which uses a preprocessor to circumvent the limitations of the latex executable). These encodings are now defined, but used for import in tex2lyx only. The test case CJK.tex contained fake tests for shift-jis and big5 (the Japanese and Chinese characters were entered using the utf8 encoding), and therefore the wrong interpretation of these encodings looked as if it worked. The comments about missing iconv support of shift-jis and big5 were wrong as well (otherwise shift-jis-plain would not work either).
2024-11-25 10:58:52 +00:00 · 2012-12-30 20:39:33 +01:00 · 2012-12-30 20:39:33 +01:00 · ee3ce572a0
commit ee3ce572a0
parent 81f1f33a79
8 changed files with 142 additions and 46 deletions
--- a/lib/encodings
+++ b/lib/encodings
@ -6,8 +6,12 @@
 # It does support singlebyte encodings with variable with (e.g. utf8).
 # These are marked with the "variable" keyword.
 # Fixed width encodings are marked with the "fixed" keyword.
+# The code points of TeX control characters like {, } and \ can occur in the
+# second byte of some variable width encodings. These encodings must not be
+# set as document encodings and are marked with the "variableunsafe" keyword.
+# They are only needed for proper tex2lyx import.

-# Syntax: Encoding <LyX name> <LaTeX name> <GUI name> <iconv name> fixed|variable <package> End
+# Syntax: Encoding <LyX name> <LaTeX name> <GUI name> <iconv name> fixed|variable|variableunsafe <package> End

 # encodings used by inputenc.sty

@ -131,16 +135,19 @@ End
 # Bg5+, GBt
 # See the NOTES file of libiconv for details.

-# The following encodings can't be handled directly, because the code points
-# of TeX control characters like {, } and \ can occur in the second byte:
-
 # For traditional chinese
-#Encoding big5 Bg5 "Chinese (traditional) (Big5)" BIG5 variable CJK
-#End
+Encoding big5 Bg5 "Chinese (traditional) (Big5)" BIG5 variableunsafe CJK
+End

 # For japanese
-#Encoding shift-jis SJIS "Japanese (CJK) (SJIS)" SJIS variable CJK
-#End
+# We use CP932 and not SJIS, since it is a superset of SJIS except that it
+# translates SJIS 0x5c to U+005C (REVERSE SOLIDUS aka backslash) instead of
+# U+00A5 (YEN SIGN). Strictly speaking this is wrong, but there is long-standing
+# confusion about which translation is correct, and using CP932 is also
+# consistent with the SJIS example file <texmf>/doc/latex/cjk/examples/SJIS.tex
+# of the CJK package. The CP932 extensions over SJIS do not matter here.
+Encoding shift-jis SJIS "Japanese (CJK) (SJIS)" CP932 variableunsafe CJK
+End

 # The following encodings need hardcoded support of the encodable unicode
 # range, but are known by iconv:
@ -181,6 +188,7 @@ Encoding euc-jp-plain euc "Japanese (non-CJK) (EUC-JP)" EUC-JP variable japanese
 End
 Encoding jis-plain jis "Japanese (non-CJK) (JIS)" ISO-2022-JP variable japanese
 End
+# FIXME: Should use variableunsafe (would be a file format change)
 Encoding shift-jis-plain sjis "Japanese (non-CJK) (SJIS)" CP932 variable japanese
 End
 Encoding utf8 utf8 "Japanese (non-CJK) (utf8)" UTF-8 variable japanese
--- a/src/Encoding.cpp
+++ b/src/Encoding.cpp
@ -305,8 +305,9 @@ const char * EncodingException::what() const throw()


 Encoding::Encoding(string const & n, string const & l, string const & g,
-		   string const & i, bool f, Encoding::Package p)
-	: name_(n), latexName_(l), guiName_(g), iconvName_(i), fixedwidth_(f), package_(p)
+		   string const & i, bool f, bool u, Encoding::Package p)
+	: name_(n), latexName_(l), guiName_(g), iconvName_(i), fixedwidth_(f),
+	  unsafe_(u), package_(p)
 {
 	if (n == "ascii") {
 		// ASCII can encode 128 code points and nothing else
@ -823,14 +824,18 @@ bool Encodings::isMathAlpha(char_type c)
 }


-Encoding const * Encodings::fromLyXName(string const & name) const
+Encoding const *
+Encodings::fromLyXName(string const & name, bool allowUnsafe) const
 {
 	EncodingList::const_iterator const it = encodinglist.find(name);
+	if (!allowUnsafe && it != encodinglist.end() && it->second.unsafe())
+		return 0; // known but TeX-unsafe, and the caller did not opt in
 	return it != encodinglist.end() ? &it->second : 0;
 }


-Encoding const * Encodings::fromLaTeXName(string const & n) const
+Encoding const *
+Encodings::fromLaTeXName(string const & n, bool allowUnsafe) const
 {
 	string name = n;
 	// FIXME: if we have to test for too many of these synonyms,
@ -845,8 +850,11 @@ Encoding const * Encodings::fromLaTeXName(string const & n) const
 	// most at the top of lib/encodings.
 	EncodingList::const_iterator const end = encodinglist.end();
 	for (EncodingList::const_iterator it = encodinglist.begin(); it != end; ++it)
-		if (it->second.latexName() == name)
+		if (it->second.latexName() == name) {
+			if (!allowUnsafe && it->second.unsafe())
+				return 0;
 			return &it->second;
+		}
 	return 0;
 }

@ -1005,10 +1013,15 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 			lex.next();
 			string const width = lex.getString();
 			bool fixedwidth = false;
+			bool unsafe = false;
 			if (width == "fixed")
 				fixedwidth = true;
 			else if (width == "variable")
 				fixedwidth = false;
+			else if (width == "variableunsafe") {
+				fixedwidth = false;
+				unsafe = true;
+			}
 			else
 				lex.printError("Unknown width");

@ -1028,7 +1041,8 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)

 			LYXERR(Debug::INFO, "Reading encoding " << name);
 			encodinglist[name] = Encoding(name, latexname,
-				guiname, iconvname, fixedwidth, package);
+				guiname, iconvname, fixedwidth, unsafe,
+				package);

 			if (lex.lex() != et_end)
 				lex.printError("Missing end");
--- a/src/Encoding.h
+++ b/src/Encoding.h
@ -54,7 +54,7 @@ public:
 	///
 	Encoding(std::string const & n, std::string const & l,
 		 std::string const & g, std::string const & i,
-		 bool f, Package p);
+		 bool f, bool u, Package p);
 	///
 	void init() const;
 	///
@ -66,7 +66,9 @@ public:
 	///
 	std::string const & iconvName() const { return iconvName_; }
 	///
-	bool const & hasFixedWidth() const { return fixedwidth_; }
+	bool hasFixedWidth() const { return fixedwidth_; }
+	///
+	bool unsafe() const { return unsafe_; }
 	/// \p c is representable in this encoding without a LaTeX macro
 	bool encodable(char_type c) const;
 	/**
@ -108,6 +110,9 @@ private:
 	std::string iconvName_;
 	/// Is this a fixed width encoding?
 	bool fixedwidth_;
+	/// Is this encoding TeX unsafe, e.g. control characters like {, }
+	/// and \\ may appear in high bytes?
+	bool unsafe_;
 	///
 	typedef std::set<char_type> CharSet;
 	/// Set of UCS4 characters that we can encode (for singlebyte
@ -157,9 +162,11 @@ public:
 	void read(support::FileName const & encfile,
 		  support::FileName const & symbolsfile);
 	/// Get encoding from LyX name \p name
-	Encoding const * fromLyXName(std::string const & name) const;
+	Encoding const *
+	fromLyXName(std::string const & name, bool allowUnsafe = false) const;
 	/// Get encoding from LaTeX name \p name
-	Encoding const * fromLaTeXName(std::string const & name) const;
+	Encoding const *
+	fromLaTeXName(std::string const & name, bool allowUnsafe = false) const;

 	///
 	const_iterator begin() const { return encodinglist.begin(); }
--- a/src/frontends/qt4/GuiDocument.cpp
+++ b/src/frontends/qt4/GuiDocument.cpp
@ -1068,7 +1068,8 @@ GuiDocument::GuiDocument(GuiView & lv)
 	Encodings::const_iterator it = encodings.begin();
 	Encodings::const_iterator const end = encodings.end();
 	for (; it != end; ++it)
-		encodinglist.append(qt_(it->guiName()));
+		if (!it->unsafe())
+			encodinglist.append(qt_(it->guiName()));
 	encodinglist.sort();
 	langModule->encodingCO->addItems(encodinglist);

@ -2558,7 +2559,8 @@ void GuiDocument::applyView()
 			Encodings::const_iterator const end = encodings.end();
 			bool found = false;
 			for (; it != end; ++it) {
-				if (qt_(it->guiName()) == enc_gui) {
+				if (qt_(it->guiName()) == enc_gui &&
+				    !it->unsafe()) {
 					bp_.inputenc = it->latexName();
 					found = true;
 					break;
@ -2966,7 +2968,8 @@ void GuiDocument::paramsToDialog()
 			Encodings::const_iterator it = encodings.begin();
 			Encodings::const_iterator const end = encodings.end();
 			for (; it != end; ++it) {
-				if (it->latexName() == bp_.inputenc) {
+				if (it->latexName() == bp_.inputenc &&
+				    !it->unsafe()) {
 					enc_gui = it->guiName();
 					break;
 				}
--- a/src/tex2lyx/Parser.cpp
+++ b/src/tex2lyx/Parser.cpp
@ -179,7 +179,11 @@ Parser::~Parser()

 void Parser::setEncoding(std::string const & e)
 {
-	Encoding const * enc = encodings.fromLaTeXName(e);
+	// We may (and need to) use unsafe encodings here: Since the text is
+	// converted to unicode while reading from is_, we never see text in
+	// the original encoding of the parser, but operate on utf8 strings
+	// instead. Therefore, we cannot misparse high bytes as {, } or \\.
+	Encoding const * enc = encodings.fromLaTeXName(e, true);
 	if (!enc) {
 		cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
 		return;
--- a/src/tex2lyx/test/CJK.lyx.lyx
+++ b/src/tex2lyx/test/CJK.lyx.lyx
@ -16,7 +16,7 @@
 \end_preamble
 \use_default_options false
 \maintain_unincluded_children false
-\language english
+\language japanese-cjk
 \language_package default
 \inputencoding utf8
 \fontencoding T1
@ -146,29 +146,51 @@ Chinese simplified
 \begin_inset ERT
 status collapsed

+\begin_layout Plain Layout
+% The following Bg5 encoded text cannot be processed as is by latex.
+\end_layout
+
 \begin_layout Plain Layout

-\backslash
-begin{CJK}{Bg5}{}
 \end_layout

 \end_inset


-Big5 文鼎楷書
-
 \begin_inset ERT
 status collapsed

+\begin_layout Plain Layout
+% It needs to be preprocessed by bg5conv or bg5latex needs to be used.
+\end_layout
+
 \begin_layout Plain Layout

-\backslash
-end{CJK}
 \end_layout

 \end_inset


+\begin_inset ERT
+status collapsed
+
+\begin_layout Plain Layout
+% See the documentation of CJK.sty for details.
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\lang chinese-traditional
+ Big5 文鼎楷書 
 \end_layout

 \begin_layout Standard
@ -182,29 +204,51 @@ English
 \begin_inset ERT
 status collapsed

+\begin_layout Plain Layout
+% The following SJIS encoded text cannot be processed as is by latex.
+\end_layout
+
 \begin_layout Plain Layout

-\backslash
-begin{CJK}{SJIS}{}
 \end_layout

 \end_inset


-Shift_JIS 日本語の文章
-
 \begin_inset ERT
 status collapsed

+\begin_layout Plain Layout
+% It needs to be preprocessed by sjisconv or sjislatex needs to be used.
+\end_layout
+
 \begin_layout Plain Layout

-\backslash
-end{CJK}
 \end_layout

 \end_inset


+\begin_inset ERT
+status collapsed
+
+\begin_layout Plain Layout
+% See the documentation of CJK.sty for details.
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\lang japanese-cjk
+ Shift_JIS 日本語の文章 
 \end_layout

 \begin_layout Standard
--- a/src/tex2lyx/test/CJK.tex
+++ b/src/tex2lyx/test/CJK.tex
@ -18,16 +18,24 @@ hello
 Chinese simplified \end{CJK}\begin{CJK}{EUC-JP}{hei} Japanese \end{CJK} \begin{CJK}{GB}{}Chinese simplified
 \end{CJK}

+% The following Bg5 encoded text cannot be processed as is by latex.
+% It needs to be preprocessed by bg5conv or bg5latex needs to be used.
+% See the documentation of CJK.sty for details.
+
 \begin{CJK}{Bg5}{}
-Big5 文鼎楷書
+Big5 ､蟷ｩｷ｢ｮﾑ
 \end{CJK}

 \inputencoding{latin9}%
 \selectlanguage{english}%
 English

+% The following SJIS encoded text cannot be processed as is by latex.
+% It needs to be preprocessed by sjisconv or sjislatex needs to be used.
+% See the documentation of CJK.sty for details.
+
 \begin{CJK}{SJIS}{}
-Shift_JIS 日本語の文章
+Shift\_JIS 日本語の文章
 \end{CJK}

 hello
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@ -119,19 +119,23 @@ char const * const known_coded_ref_commands[] = { "ref", "pageref", "vref",

 /**
 * supported CJK encodings
- * SJIS and Bg5 cannot be supported as they are not
- * supported by iconv
 * JIS does not work with LyX's encoding conversion
 */
 const char * const supported_CJK_encodings[] = {
-"EUC-JP", "KS", "GB", "UTF8", 0};
+"EUC-JP", "KS", "GB", "UTF8",
+"Bg5", /*"JIS",*/ "SJIS", 0};

 /**
 * the same as supported_CJK_encodings with their corresponding LyX language name
+ * FIXME: The mapping "UTF8" => "chinese-traditional" is only correct for files
+ *        created by LyX.
+ * NOTE: "Bg5", "JIS" and "SJIS" are not supported by LyX, on re-export the
+ *       encodings "UTF8", "EUC-JP" and "EUC-JP" will be used.
 * please keep this in sync with supported_CJK_encodings line by line!
 */
 const char * const supported_CJK_languages[] = {
-"japanese-cjk", "korean", "chinese-simplified", "chinese-traditional", 0};
+"japanese-cjk", "korean", "chinese-simplified", "chinese-traditional",
+"chinese-traditional", /*"japanese-cjk",*/ "japanese-cjk", 0};

 /*!
 * natbib commands.
@ -1454,9 +1458,13 @@ void parse_environment(Parser & p, ostream & os, bool outer,
 		// store the encoding to be able to reset it
 		string const encoding_old = p.getEncoding();
 		string const encoding = p.getArg('{', '}');
-		// SJIS and Bg5 cannot be handled by iconv
-		// JIS does not work with LyX's encoding conversion
-		if (encoding != "Bg5" && encoding != "JIS" && encoding != "SJIS")
+		// FIXME: For some reason JIS does not work. Although the text
+		// in tests/CJK.tex is identical with the SJIS version if you
+		// convert both snippets using the recode command line utility,
+		// the resulting .lyx file contains some extra characters if
+		// you set buggy_encoding to false for JIS.
+		bool const buggy_encoding = encoding == "JIS";
+		if (!buggy_encoding)
 			p.setEncoding(encoding);
 		else {
 			// FIXME: This will read garbage, since the data is not encoded in utf8.
@ -1467,10 +1475,10 @@ void parse_environment(Parser & p, ostream & os, bool outer,
 		string const mapping = trim(p.getArg('{', '}'));
 		char const * const * const where =
 			is_known(encoding, supported_CJK_encodings);
-		if (!preamble.fontCJKSet())
+		if (!buggy_encoding && !preamble.fontCJKSet())
 			preamble.fontCJK(mapping);
 		bool knownMapping = mapping == preamble.fontCJK();
-		if (!knownMapping || !where) {
+		if (buggy_encoding || !knownMapping || !where) {
 			parent_context.check_layout(os);
 			handle_ert(os, "\\begin{" + name + "}{" + encoding + "}{" + mapping + "}",
 				       parent_context);