Set CJK document language instead of adding a note.

Instead of annoying the user with an automatically created note in the output document, which she then needs to delete manually, determine the document language automatically for documents that use CJK. This is done with a heuristic that roughly counts the number of characters in each language and selects the language that is used most often. This is not perfect, but it works for the two major use cases: a document with only some CJK parts (in this case the babel language is used), and a document that is mainly written in one CJK language. It is only a minor problem if the heuristic is wrong, since the TeX export is still correct, and there is no spell-checking support for CJK anyway. All regression tests now pass, except for some relative path issues that depend on the location of the build directory.
2024-12-22 05:16:21 +00:00 · 2012-10-06 09:38:14 +02:00 · 2012-10-06 09:38:14 +02:00 · 23d7d6449f
commit 23d7d6449f
parent 8863ac62aa
5 changed files with 109 additions and 45 deletions
--- a/src/tex2lyx/Preamble.cpp
+++ b/src/tex2lyx/Preamble.cpp
@ -44,9 +44,8 @@ Preamble preamble;

 namespace {

-// "chinese-simplified", "chinese-traditional", "japanese-cjk", "korean"
-// cannot be supported because it is impossible to determine the correct document
-// language if CJK is used.
+// CJK languages are handled in text.cpp, polyglossia languages are listed
+// further down.
 /**
 * known babel language names (including synonyms)
 * not in standard babel: arabic, arabtex, armenian, belarusian, serbian-latin, thai
@ -88,6 +87,9 @@ const char * const known_coded_languages[] = {"french", "afrikaans", "albanian",
 "uppersorbian", "uppersorbian", "english", "english", "vietnamese", "welsh",
 0};

+/// languages with danish quotes (.lyx names)
+const char * const known_danish_quotes_languages[] = {"danish", 0};
+
 /// languages with english quotes (.lyx names)
 const char * const known_english_quotes_languages[] = {"american", "australian",
 "bahasa", "bahasam", "brazilian", "canadian", "chinese-simplified", "english",
@ -639,6 +641,7 @@ void Preamble::handle_package(Parser &p, string const & name,
 	vector<string> options = split_options(opts);
 	add_package(name, options);
 	string scale;
+	char const * const * where = 0;

 	if (is_known(name, known_xetex_packages)) {
 		xetex = true;
@ -753,9 +756,6 @@ void Preamble::handle_package(Parser &p, string const & name,
 	}

 	else if (name == "CJK") {
-		// It is impossible to determine the document language if CJK is used.
-		// All we can do is to notify the user that he has to set this by himself.
-		have_CJK = true;
 		// set the encoding to "auto" because it might be set to "default" by the babel handling
 		// and this would not be correct for CJK
 		if (h_inputencoding == "default")
@ -833,8 +833,8 @@ void Preamble::handle_package(Parser &p, string const & name,
 	else if (name == "subfig")
 		; // ignore this FIXME: Use the package separator mechanism instead

-	else if (is_known(name, known_languages))
-		h_language = name;
+	else if ((where = is_known(name, known_languages)))
+		h_language = known_coded_languages[where - known_languages];

 	else if (name == "natbib") {
 		h_biblio_style = "plainnat";
@ -914,7 +914,7 @@ bool Preamble::writeLyXHeader(ostream & os, bool subdoc)
 	// http://en.wikipedia.org/wiki/Quotation_mark,_non-English_usage
 	// (quotes for kazakh and interlingua are unknown)
 	// danish
-	if (h_language == "danish")
+	if (is_known(h_language, known_danish_quotes_languages))
 		h_quotes_language = "danish";
 	// french
 	else if (is_known(h_language, known_french_quotes_languages))
@ -1644,6 +1644,16 @@ void Preamble::parse(Parser & p, string const & forceclass,
 		ss << tc.sides();
 		h_papersides = ss.str();
 	}
+
+	// If the CJK package is used we cannot set the document language from
+	// the babel options. Instead, we guess which language is used most
+	// and set this one.
+	default_language = h_language;
+	if (is_full_document && auto_packages.find("CJK") != auto_packages.end()) {
+		p.pushPosition();
+		h_language = guessLanguage(p, default_language);
+		p.popPosition();
+	}
 }


--- a/src/tex2lyx/Preamble.h
+++ b/src/tex2lyx/Preamble.h
@ -38,8 +38,10 @@ public:
 	std::string inputencoding() const { return h_inputencoding; }
 	///
 	std::string notefontcolor() const { return h_notefontcolor; }
-	///
-	std::string language() const { return h_language; }
+	/// The document language
+	std::string docLanguage() const { return h_language; }
+	/// The language of text which is not explicitly marked
+	std::string defaultLanguage() const  { return default_language; }
 	///
 	std::string use_indices() const { return h_use_indices; }
 	///
@ -89,6 +91,8 @@ private:

 	/// needed to handle encodings with babel
 	bool one_language;
+	/// the main non-CJK language
+	std::string default_language;

 	/// was at least one title layout found?
 	bool title_layout_found;
--- a/src/tex2lyx/tex2lyx.cpp
+++ b/src/tex2lyx/tex2lyx.cpp
@ -332,7 +332,6 @@ bool checkModule(string const & name, bool command)
 bool noweb_mode = false;
 bool pdflatex = false;
 bool xetex = false;
-bool have_CJK = false;
 bool is_nonCJKJapanese = false;
 bool roundtrip = false;

@ -704,7 +703,7 @@ bool tex2lyx(idocstream & is, ostream & os, string encoding)
 	stringstream ss;
 	// store the document language in the context to be able to handle the
 	// commands like \foreignlanguage and \textenglish etc.
-	context.font.language = preamble.language();
+	context.font.language = preamble.defaultLanguage();
 	// parse the main text
 	parse_text(p, ss, FLAG_END, true, context);
 	if (Context::empty)
--- a/src/tex2lyx/tex2lyx.h
+++ b/src/tex2lyx/tex2lyx.h
@ -67,6 +67,10 @@ void parse_text_in_inset(Parser & p, std::ostream & os, unsigned flags,
                         bool outer, Context const & context,
                         InsetLayout const * layout = 0);

+/// Guess the document language from \p p if CJK is used (returns the language with the highest character count).
+/// \p lang is the language that all non-CJK contents are counted for.
+std::string guessLanguage(Parser & p, std::string const & lang);
+

 /// in math.cpp
 void parse_math(Parser & p, std::ostream & os, unsigned flags, mode_type mode);
@ -166,8 +170,6 @@ extern bool noweb_mode;
 extern bool pdflatex;
 /// Did we recognize any xetex-only construct?
 extern bool xetex;
-/// Do we have CJK?
-extern bool have_CJK;
 /// Do we have non-CJK Japanese?
 extern bool is_nonCJKJapanese;
 /// LyX format that is created by tex2lyx
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@ -130,17 +130,9 @@ const char * const supported_CJK_encodings[] = {
 * the same as supported_CJK_encodings with their corresponding LyX language name
 * please keep this in sync with supported_CJK_encodings line by line!
 */
-const char * const coded_supported_CJK_encodings[] = {
+const char * const supported_CJK_languages[] = {
 "japanese-cjk", "korean", "chinese-simplified", "chinese-traditional", 0};

-string CJK2lyx(string const & encoding)
-{
-	char const * const * where = is_known(encoding, supported_CJK_encodings);
-	if (where)
-		return coded_supported_CJK_encodings[where - supported_CJK_encodings];
-	return encoding;
-}
-
 /*!
 * natbib commands.
 * The starred forms are also known except for "citefullauthor",
@ -1440,8 +1432,9 @@ void parse_environment(Parser & p, ostream & os, bool outer,
 		// LyX doesn't support the second argument so if
 		// this is used we need to output everything as ERT
 		string const mapping = p.getArg('{', '}');
-		if ((!mapping.empty() && mapping != " ")
-			|| (!is_known(encoding, supported_CJK_encodings))) {
+		char const * const * const where =
+			is_known(encoding, supported_CJK_encodings);
+		if ((!mapping.empty() && mapping != " ") || !where) {
 			parent_context.check_layout(os);
 			handle_ert(os, "\\begin{" + name + "}{" + encoding + "}{" + mapping + "}",
 				       parent_context);
@ -1459,7 +1452,8 @@ void parse_environment(Parser & p, ostream & os, bool outer,
 			handle_ert(os, "\\end{" + name + "}",
 				       parent_context);
 		} else {
-			string const lang = CJK2lyx(encoding);
+			string const lang =
+				supported_CJK_languages[where - supported_CJK_encodings];
 			// store the language because we must reset it at the end
 			string const lang_old = parent_context.font.language;
 			parent_context.font.language = lang;
@ -2142,24 +2136,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 	while (p.good()) {
 		Token const & t = p.get_token();

-	// it is impossible to determine the correct document language if CJK is used.
-	// Therefore write a note at the beginning of the document
-	if (have_CJK) {
-		context.check_layout(os);
-		begin_inset(os, "Note Note\n");
-		os << "status open\n\\begin_layout Plain Layout\n"
-		   << "\\series bold\n"
-		   << "Important information:\n"
-		   << "\\end_layout\n\n"
-		   << "\\begin_layout Plain Layout\n"
-		   << "This document contains text in Chinese, Japanese or Korean.\n"
-		   << " It was therefore impossible for tex2lyx to set the correct document language for your document."
-		   << " Please set the language manually in the document settings.\n"
-		   << "\\end_layout\n";
-		end_inset(os);
-		have_CJK = false;
-	}
-
 	// it is impossible to determine the correct encoding for non-CJK Japanese.
 	// Therefore write a note at the beginning of the document
 	if (is_nonCJKJapanese) {
@ -4426,6 +4402,79 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 	}
 }

+
+/*!
+ * Guess the document language of a document that uses the CJK package.
+ *
+ * The remaining tokens of \p p are scanned and the number of characters
+ * is summed up per language:
+ * - the contents of each CJK environment with a known encoding are
+ *   counted for the LyX language that belongs to that encoding,
+ * - the contents of CJK environments with an unknown encoding are ignored,
+ * - comments and control sequences are not counted at all,
+ * - everything else is counted for \p lang.
+ *
+ * \param p    parser to read from. It is consumed up to the end, so the
+ *             caller has to save and restore the position if the contents
+ *             are needed afterwards.
+ * \param lang language of all contents outside of CJK environments
+ *             (the babel main language).
+ * \return the language with the highest character count. \p lang is
+ *         returned unless a CJK language has strictly more characters,
+ *         so a tie (e.g. no countable contents at all) never selects a
+ *         CJK language by accident.
+ */
+string guessLanguage(Parser & p, string const & lang)
+{
+	typedef std::map<std::string, size_t> LangMap;
+	// map from language names to number of characters
+	LangMap used;
+	used[lang] = 0;
+	for (char const * const * i = supported_CJK_languages; *i; ++i)
+		used[string(*i)] = 0;
+
+	while (p.good()) {
+		Token const t = p.get_token();
+		// comments are not counted for any language
+		if (t.cat() == catComment)
+			continue;
+		// commands are not counted as well, but we need to detect
+		// \begin{CJK} and switch encoding if needed
+		if (t.cat() == catEscape) {
+			if (t.cs() == "inputencoding") {
+				string const enc = subst(p.verbatim_item(), "\n", " ");
+				p.setEncoding(enc);
+				continue;
+			}
+			if (t.cs() != "begin")
+				continue;
+		} else {
+			// Non-CJK content is counted for lang.
+			// We do not care about the real language here:
+			// If we have more non-CJK contents than CJK contents,
+			// we simply use the language that was specified as
+			// babel main language.
+			used[lang] += t.asInput().length();
+			continue;
+		}
+		// Now we are starting an environment: peek at its name and
+		// rewind, so that the arguments are still unread afterwards.
+		p.pushPosition();
+		string const name = p.getArg('{', '}');
+		p.popPosition();
+		if (name != "CJK")
+			continue;
+		// It is a CJK environment
+		/* name = */ p.getArg('{', '}');
+		string const encoding = p.getArg('{', '}');
+		/* mapping = */ p.getArg('{', '}');
+		string const encoding_old = p.getEncoding();
+		char const * const * const where =
+			is_known(encoding, supported_CJK_encodings);
+		// The contents have to be read with some encoding even if they
+		// are thrown away afterwards, so fall back to utf8 for unknown ones.
+		if (where)
+			p.setEncoding(encoding);
+		else
+			p.setEncoding("utf8");
+		string const text = p.verbatimEnvironment("CJK");
+		p.setEncoding(encoding_old);
+		p.skip_spaces();
+		if (!where) {
+			// ignore contents in unknown CJK encoding
+			continue;
+		}
+		// the language of the text
+		string const cjk =
+			supported_CJK_languages[where - supported_CJK_encodings];
+		used[cjk] += text.length();
+	}
+	// Start with lang (which is always in the map, see above) and switch
+	// only to a language with strictly more characters. Starting with
+	// used.begin() would resolve ties in favour of the alphabetically
+	// first name, which is usually a CJK language.
+	LangMap::const_iterator use = used.find(lang);
+	for (LangMap::const_iterator it = used.begin(); it != used.end(); ++it) {
+		if (it->second > use->second)
+			use = it;
+	}
+	return use->first;
+}
+
 // }])