tex2lyx: update quote handling

* Consider new quote styles
* Consider changed quote styles
* Try to be a bit smarter with ambiguous quotation marks
2024-11-22 10:00:33 +00:00 · 2018-03-11 18:04:23 +01:00 · 2018-03-11 18:04:23 +01:00 · 8184f08f4a
commit 8184f08f4a
parent e5e6e16274
7 changed files with 172 additions and 85 deletions
--- a/src/tex2lyx/Preamble.cpp
+++ b/src/tex2lyx/Preamble.cpp
@ -88,35 +88,51 @@ const char * const known_coded_languages[] = {"french", "afrikaans", "albanian",
 "vietnamese", "welsh",
 0};
 /// languages with british quotes (.lyx names)
 const char * const known_british_quotes_languages[] = {"british", "welsh", 0};
 /// languages with cjk quotes (.lyx names)
 const char * const known_cjk_quotes_languages[] = {"chinese-traditional",
 "japanese", "japanese-cjk", 0};
 /// languages with cjk-angle quotes (.lyx names)
 const char * const known_cjkangle_quotes_languages[] = {"korean", 0};
 /// languages with danish quotes (.lyx names)
 const char * const known_danish_quotes_languages[] = {"danish", 0};
 /// languages with english quotes (.lyx names)
 const char * const known_english_quotes_languages[] = {"american", "australian",
 "bahasa", "bahasam", "brazilian", "canadian", "chinese-simplified", "english",
-"esperanto", "hebrew", "irish", "korean", "newzealand", "portuguese", "scottish",
+"esperanto", "farsi", "interlingua", "irish", "newzealand", "scottish",
-"thai", 0};
+"thai", "turkish", "vietnamese", 0};
 /// languages with french quotes (.lyx names)
-const char * const known_french_quotes_languages[] = {"albanian",
+const char * const known_french_quotes_languages[] = {"ancientgreek",
-"arabic_arabi", "arabic_arabtex", "asturian", "basque", "canadien", "catalan",
+"arabic_arabi", "arabic_arabtex", "asturian", "belarusian", "breton",
-"french", "friulan", "galician", "greek", "italian", "norsk", "nynorsk",
+"canadien", "catalan", "french", "friulan", "galician", "italian", "occitan",
-"piedmontese", "polutonikogreek", "russian", "spanish", "spanish-mexico",
+"piedmontese", "portuguese", "spanish", "spanish-mexico", 0};
 "turkish", "turkmen", "ukrainian", "vietnamese", 0};
 /// languages with german quotes (.lyx names)
 const char * const known_german_quotes_languages[] = {"austrian", "bulgarian",
-"czech", "german", "georgian", "icelandic", "lithuanian", "lowersorbian", "macedonian",
+"czech", "estonian", "georgian", "german", "icelandic", "latvian", "lithuanian",
-"naustrian", "ngerman", "romansh", "serbian", "serbian-latin", "slovak", "slovene",
+"lowersorbian", "macedonian", "naustrian", "ngerman", "romansh", "slovak", "slovene",
 "uppersorbian", 0};
 /// languages with polish quotes (.lyx names)
 const char * const known_polish_quotes_languages[] = {"afrikaans", "bosnian", "croatian",
-"dutch", "estonian", "magyar", "polish", "romanian", 0};
+"dutch", "magyar", "polish", "romanian", "serbian", "serbian-latin", 0};
 /// languages with russian quotes (.lyx names)
 const char * const known_russian_quotes_languages[] = {"russian", "ukrainian", 0};
 /// languages with swedish quotes (.lyx names)
-const char * const known_swedish_quotes_languages[] = {"finnish",
+const char * const known_swedish_quotes_languages[] = {"finnish", "swedish", 0};
-"swedish", 0};
+
 /// languages with swiss quotes (.lyx names)
 const char * const known_swiss_quotes_languages[] = {"albanian",
 "armenian", "basque", "german-ch", "german-ch-old",
 "norsk", "nynorsk", "turkmen", "ukrainian", "vietnamese", 0};
 /// known language packages from the times before babel
 const char * const known_old_language_packages[] = {"french", "frenchle",
@ -1214,33 +1230,6 @@ void Preamble::handle_if(Parser & p, bool in_lyx_preamble)
 bool Preamble::writeLyXHeader(ostream & os, bool subdoc, string const & outfiledir)
 {
 	// set the quote language
 	// LyX only knows the following quotes languages:
 	// english, swedish, german, polish, french and danish
 	// (quotes for "japanese" and "chinese-traditional" are missing because
 	//  they wouldn't be useful: https://www.lyx.org/trac/ticket/6383)
 	// conversion list taken from
 	// https://en.wikipedia.org/wiki/Quotation_mark,_non-English_usage
 	// (quotes for kazakh and interlingua are unknown)
 	// danish
 	if (is_known(h_language, known_danish_quotes_languages))
 		h_quotes_style = "danish";
 	// french
 	else if (is_known(h_language, known_french_quotes_languages))
 		h_quotes_style = "french";
 	// german
 	else if (is_known(h_language, known_german_quotes_languages))
 		h_quotes_style = "german";
 	// polish
 	else if (is_known(h_language, known_polish_quotes_languages))
 		h_quotes_style = "polish";
 	// swedish
 	else if (is_known(h_language, known_swedish_quotes_languages))
 		h_quotes_style = "swedish";
 	//english
 	else if (is_known(h_language, known_english_quotes_languages))
 		h_quotes_style = "english";
 	if (contains(h_float_placement, "H"))
 		registerAutomaticallyLoadedPackage("float");
 	if (h_spacing != "single" && h_spacing != "default")
@ -2264,6 +2253,47 @@ void Preamble::parse(Parser & p, string const & forceclass,
 				h_options += ',' + lyx2babel(default_language);
 		}
 	}
 	// Finally, set the quote style.
 	// LyX knows the following quotes styles:
 	// british, cjk, cjkangle, danish, english, french, german,
 	// polish, russian, swedish and swiss
 	// conversion list taken from
 	// https://en.wikipedia.org/wiki/Quotation_mark,_non-English_usage
 	// (quotes for kazakh are unknown)
 	// british
 	if (is_known(h_language, known_british_quotes_languages))
 		h_quotes_style = "british";
 	// cjk
 	else if (is_known(h_language, known_cjk_quotes_languages))
 		h_quotes_style = "cjk";
 	// cjkangle
 	else if (is_known(h_language, known_cjkangle_quotes_languages))
 		h_quotes_style = "cjkangle";
 	// danish
 	else if (is_known(h_language, known_danish_quotes_languages))
 		h_quotes_style = "danish";
 	// french
 	else if (is_known(h_language, known_french_quotes_languages))
 		h_quotes_style = "french";
 	// german
 	else if (is_known(h_language, known_german_quotes_languages))
 		h_quotes_style = "german";
 	// polish
 	else if (is_known(h_language, known_polish_quotes_languages))
 		h_quotes_style = "polish";
 	// russian
 	else if (is_known(h_language, known_russian_quotes_languages))
 		h_quotes_style = "russian";
 	// swedish
 	else if (is_known(h_language, known_swedish_quotes_languages))
 		h_quotes_style = "swedish";
 	// swiss
 	else if (is_known(h_language, known_swiss_quotes_languages))
 		h_quotes_style = "swiss";
 	// english
 	else if (is_known(h_language, known_english_quotes_languages))
 		h_quotes_style = "english";
 }
--- a/src/tex2lyx/Preamble.h
+++ b/src/tex2lyx/Preamble.h
@ -54,6 +54,8 @@ public:
 	std::string docLanguage() const { return h_language; }
 	/// The language of text which is not explicitly marked
 	std::string defaultLanguage() const  { return default_language; }
 	/// The quotation marks style
 	std::string quotesStyle() const { return h_quotes_style; }
 	///
 	bool usePolyglossia() const;
 	///
--- a/src/tex2lyx/TODO.txt
+++ b/src/tex2lyx/TODO.txt
@ -75,23 +75,6 @@ Format LaTeX feature                        LyX feature
       \twocolumn[]{}{}                      Layout Twocolumn, InsetArgument
       \item[]<>                             InsetArgument
       \begin{enumerate|itemize|...}[]       InsetArgument
 520    Plain InsetQuote Style:
       \textquotesingle                      \begin_inset Quotes qls, \begin_inset Quotes qrs
       \textquotedbl                         \begin_inset Quotes qld, \begin_inset Quotes qrd
 521    New Quote Styles                      InsetQuote
       - british                             \begin_inset Quotes b..
       - swiss                               \begin_inset Quotes c..
       - swedishg                            \begin_inset Quotes w..
       - frenchin                            \begin_inset Quotes i..
       - russian                             \begin_inset Quotes r..
       Change default behavior               \begin_inset Quotes f..
       of French quote style:
       - Inner quotes are now ``...''.
       - Former french style is now
         called "swiss"
 523    CJK Quote Styles                      InsetQuote
       - cjk (corner brackets)               \begin_inset Quotes j..
       - cjkangle (angle brackets)           \begin_inset Quotes k..
 526   Plural and capitalized refstyles      InsetRef
 533   Multibib support
      \begin{btUnit}...\end{btUnit}        \multibib {none|part|chapter|section|subsection}
--- a/src/tex2lyx/test/CJK.lyx.lyx
+++ b/src/tex2lyx/test/CJK.lyx.lyx
@ -74,7 +74,7 @@
 \paragraph_indentation default
 \is_math_indent 0
 \math_numbering_side default
-\quotes_style english
+\quotes_style cjk
 \dynamic_quotes 0
 \papercolumns 1
 \papersides 1
--- a/src/tex2lyx/test/CJKutf8.lyx.lyx
+++ b/src/tex2lyx/test/CJKutf8.lyx.lyx
@ -74,7 +74,7 @@
 \paragraph_indentation default
 \is_math_indent 0
 \math_numbering_side default
-\quotes_style english
+\quotes_style cjk
 \dynamic_quotes 0
 \papercolumns 1
 \papersides 1
--- a/src/tex2lyx/test/XeTeX-polyglossia.lyx.lyx
+++ b/src/tex2lyx/test/XeTeX-polyglossia.lyx.lyx
@ -73,7 +73,7 @@
 \paragraph_indentation default
 \is_math_indent 0
 \math_numbering_side default
-\quotes_style english
+\quotes_style british
 \dynamic_quotes 0
 \papercolumns 1
 \papersides 1
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@ -201,13 +201,14 @@ bool need_commentbib = false;
 char const * const known_quotes[] = { "dq", "guillemotleft", "flqq", "og",
 "guillemotright", "frqq", "fg", "glq", "glqq", "textquoteleft", "grq", "grqq",
 "quotedblbase", "textquotedblleft", "quotesinglbase", "textquoteright", "flq",
-"guilsinglleft", "frq", "guilsinglright", 0};
+"guilsinglleft", "frq", "guilsinglright", "textquotedblright", "textquotesingle",
 "textquotedbl", 0};
 /// the same as known_quotes with .lyx names
-char const * const known_coded_quotes[] = { "prd", "ard", "ard", "ard",
+char const * const known_coded_quotes[] = { "qrd", "ard", "ard", "ard",
-"ald", "ald", "ald", "gls", "gld", "els", "els", "grd",
+"ald", "ald", "ald", "gls", "gld", "els", "els", "eld",
-"gld", "grd", "gls", "ers", "fls",
+"gld", "eld", "gls", "ers", "ars",
-"fls", "frs", "frs", 0};
+"ars", "als", "als", "erd", "qrs", "qrd", 0};
 /// LaTeX names for font sizes
 char const * const known_sizes[] = { "tiny", "scriptsize", "footnotesize",
@ -446,6 +447,78 @@ bool translate_len(string const & length, string & valstring, string & unit)
 	return true;
 }
 /// Disambiguate a provisional quote inset code.
 ///
 /// Some LaTeX quotation marks can stand for different LyX quote insets
 /// depending on the quote style in use. Given the provisional code \c in,
 /// this returns the code that fits the main quote style reported by
 /// preamble.quotesStyle(), or \c in unchanged if no rule applies.
 /// \c opening is the caller's guess about the side of the mark; it is
 /// only consulted for straight quotes ("qr...") and for the swedish style.
 ///
 /// NOTE(review): a straight quote is turned into "ql..." when \c opening
 /// is false, whereas the swedish rules pick "sl..." when it is true. The
 /// two rules seem to read the flag with opposite polarity -- confirm
 /// against the callers.
 string guessQuoteStyle(string in, bool const opening)
 {
 	// Straight quotes are handled without looking at the quote style.
 	if (prefixIs(in, "qr")) {
 		if (opening)
 			return in;
 		return subst(in, "r", "l");
 	}
 
 	// One entry per (provisional code, main style) pair that needs to be
 	// remapped. The list is 0-terminated.
 	struct Rule {
 		char const * code;      // provisional inset code
 		char const * style;     // main quote style
 		char const * if_open;   // result when opening is true
 		char const * if_closed; // result when opening is false
 	};
 	static Rule const rules[] = {
 		// ``
 		{"eld", "german",  "grd", "grd"},
 		{"eld", "british", "bls", "bls"},
 		{"eld", "french",  "fls", "fls"},
 		{"eld", "russian", "rrs", "rrs"},
 		// ''
 		{"erd", "polish",  "prd", "prd"},
 		{"erd", "british", "brs", "brs"},
 		{"erd", "french",  "frs", "frs"},
 		{"erd", "swedish", "sld", "srd"},
 		// `
 		{"els", "german",  "grs", "grs"},
 		{"els", "british", "bld", "bld"},
 		// '
 		{"ers", "polish",  "prs", "prs"},
 		{"ers", "british", "brd", "brd"},
 		{"ers", "swedish", "sls", "srs"},
 		// >>
 		{"ard", "swiss",   "cld", "cld"},
 		{"ard", "french",  "fld", "fld"},
 		{"ard", "russian", "rld", "rld"},
 		// <<
 		{"ald", "swiss",   "crd", "crd"},
 		{"ald", "french",  "frd", "frd"},
 		{"ald", "russian", "rrd", "rrd"},
 		// >
 		{"ars", "swiss",   "cls", "cls"},
 		// <
 		{"als", "swiss",   "crs", "crs"},
 		// ,,
 		{"gld", "polish",  "pld", "pld"},
 		{"gld", "russian", "rls", "rls"},
 		// ,
 		{"gls", "polish",  "pls", "pls"},
 		{0, 0, 0, 0}
 	};
 
 	// Look the main style up once; every rule compares against it.
 	string const style = preamble.quotesStyle();
 	for (Rule const * rule = rules; rule->code; ++rule) {
 		if (in == rule->code && style == rule->style)
 			return opening ? rule->if_open : rule->if_closed;
 	}
 
 	// No rule matched: the provisional code is kept as is.
 	return in;
 }
 } // namespace
@ -2634,14 +2707,17 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			continue;
 		}
-		// Basic support for english quotes. This should be
+		// Basic support for quotes. We try to disambiguate
-		// extended to other quotes, but is not so easy (a
+		// quotes from the context (e.g., a left english quote is
-		// left english quote is the same as a right german
+		// the same as a right german quote...).
-		// quote...)
+		// Try to make a smart guess about the side
 		Token const prev = p.prev_token();
 		bool const opening = (prev.cat() != catSpace && prev.character() != 0
 				&& prev.character() != '\n' && prev.character() != '~');
 		if (t.asInput() == "`" && p.next_token().asInput() == "`") {
 			context.check_layout(os);
 			begin_inset(os, "Quotes ");
-			os << "eld";
+			os << guessQuoteStyle("eld", opening);
 			end_inset(os);
 			p.get_token();
 			skip_braces(p);
@ -2650,7 +2726,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 		if (t.asInput() == "'" && p.next_token().asInput() == "'") {
 			context.check_layout(os);
 			begin_inset(os, "Quotes ");
-			os << "erd";
+			os << guessQuoteStyle("erd", opening);
 			end_inset(os);
 			p.get_token();
 			skip_braces(p);
@ -2660,7 +2736,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 		if (t.asInput() == ">" && p.next_token().asInput() == ">") {
 			context.check_layout(os);
 			begin_inset(os, "Quotes ");
-			os << "ald";
+			os << guessQuoteStyle("ald", opening);
 			end_inset(os);
 			p.get_token();
 			skip_braces(p);
@ -2681,9 +2757,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			if (!has_chunk) {
 				context.check_layout(os);
 				begin_inset(os, "Quotes ");
-				//FIXME: this is a right danish quote;
+				os << guessQuoteStyle("ard", opening);
 				// why not a left french quote?
 				os << "ard";
 				end_inset(os);
 				p.get_token();
 				skip_braces(p);
@ -2809,8 +2883,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			           is_known(next.cs(), known_quotes) &&
 			           end.cat() == catEnd) {
 				// Something like {\textquoteright} (e.g.
-				// from writer2latex). LyX writes
+				// from writer2latex). We may skip the
 				// \textquoteright{}, so we may skip the
 				// braces here for better readability.
 				parse_text_snippet(p, os, FLAG_BRACE_LAST,
 				                   outer, context);
@ -4375,7 +4448,13 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 		if ((where = is_known(t.cs(), known_quotes))) {
 			context.check_layout(os);
 			begin_inset(os, "Quotes ");
-			os << known_coded_quotes[where - known_quotes];
+			string quotetype = known_coded_quotes[where - known_quotes];
 			// try to make a smart guess about the side
 			Token const prev = p.prev_token();
 			bool const opening = (prev.cat() != catSpace && prev.character() != 0
 					&& prev.character() != '\n' && prev.character() != '~');
 			quotetype = guessQuoteStyle(quotetype, opening);
 			os << quotetype;
 			end_inset(os);
 			// LyX adds {} after the quote, so we have to eat
 			// spaces here if there are any before a possible
@ -4386,7 +4465,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 		}
 		if ((where = is_known(t.cs(), known_sizes)) &&
-			 context.new_layout_allowed) {
+			context.new_layout_allowed) {
 			context.check_layout(os);
 			TeXFont const oldFont = context.font;
 			context.font.size = known_coded_sizes[where - known_sizes];
@ -4551,13 +4630,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 			continue;
 		}
 		if (t.cs() == "textquotedbl") {
 			context.check_layout(os);
 			os << "\"";
 			skip_braces(p);
 			continue;
 		}
 		if (t.cs() == "_" || t.cs() == "&" || t.cs() == "#"
 			    || t.cs() == "$" || t.cs() == "{" || t.cs() == "}"
 			    || t.cs() == "%" || t.cs() == "-") {