Move convertLaTeXCommands from BiblioInfo to Encodings

This way it can also be used in other contexts. The change also includes an improvement to math parsing.
2024-12-23 05:25:26 +00:00 · 2024-07-20 17:37:45 +02:00 · 2024-07-20 17:37:45 +02:00 · 9291fc465b
commit 9291fc465b
parent 485f5afc21
3 changed files with 159 additions and 140 deletions
--- a/src/BiblioInfo.cpp
+++ b/src/BiblioInfo.cpp
@ -329,141 +329,6 @@ bool multipleAuthors(docstring const & author)
 }
 // Converts a string containing LaTeX commands into unicode for display.
 //
 // The input is consumed from the front by a small state machine:
 //  - everything from an unescaped '$' up to the next unescaped '$' is
 //    copied verbatim, delimiters included;
 //  - '~' becomes NO-BREAK SPACE (U+00A0), "\," becomes THIN SPACE (U+2009);
 //  - "{\v a}"-style text accents are rewritten to "\v{a}", and "\"u"-style
 //    accents to "\"{u}", before Encodings::fromLaTeXCommand() is asked
 //    for a replacement;
 //  - "\S" becomes SECTION SIGN (U+00A7);
 //  - the name of any other alphabetic command is dropped, a non-alphabetic
 //    character following a backslash is output as is, and braces are
 //    dropped.
 //
 // \param str text that may contain LaTeX markup
 // \return the text with the markup above replaced or removed
 docstring convertLaTeXCommands(docstring const & str)
 {
 	docstring val = str;
 	docstring ret;
 	bool scanning_cmd = false;
 	bool scanning_math = false;
 	bool is_section = false;
 	bool escaped = false; // used to catch \$, etc.
 	while (!val.empty()) {
 		char_type const ch = val[0];
 		// if we're scanning math, we output everything until we
 		// find an unescaped $, at which point we break out.
 		if (scanning_math) {
 			if (escaped)
 				escaped = false;
 			else if (ch == '\\')
 				escaped = true;
 			else if (ch == '$')
 				scanning_math = false;
 			ret += ch;
 			val = val.substr(1);
 			continue;
 		}
 		// if we're scanning a command name, then we just
 		// discard characters until we hit something that
 		// isn't alpha.
 		if (scanning_cmd) {
 			// A leading 'S' might be the whole command (\S); we only
 			// know once we have seen the character after it.
 			if (!is_section && ch == 'S') {
 				is_section = true;
 				val = val.substr(1);
 				continue;
 			}
 			if (isAlphaASCII(ch)) {
 				is_section = false;
 				val = val.substr(1);
 				escaped = false;
 				continue;
 			} else if (is_section) {
 				// The command was exactly \S: output SECTION SIGN.
 				ret.push_back(0x00a7);
 				is_section = false;
 				// The command is complete. Clear the flag left over
 				// from its backslash, otherwise the character that
 				// follows (e.g. '~', '{' or ',') would be treated
 				// as an escaped character.
 				scanning_cmd = false;
 				escaped = false;
 				// ch has not been consumed; it is handled normally
 				// in the next iteration.
 				continue;
 			}
 			// so we're done with this command.
 			// now we fall through and check this character.
 			is_section = false;
 			scanning_cmd = false;
 		}
 		// was the last character a \? If so, then this is something like:
 		// \\ or \$, so we'll just output it. That's probably not always right...
 		if (escaped) {
 			// exception: output \, as THIN SPACE
 			if (ch == ',')
 				ret.push_back(0x2009);
 			else
 				ret += ch;
 			val = val.substr(1);
 			escaped = false;
 			continue;
 		}
 		// ~ is output as NO-BREAK SPACE
 		if (ch == '~') {
 			ret += char_type(0x00a0);
 			val = val.substr(1);
 			continue;
 		}
 		// an unescaped $ starts math, which is passed through verbatim
 		if (ch == '$') {
 			ret += ch;
 			val = val.substr(1);
 			scanning_math = true;
 			continue;
 		}
 		// Change text mode accents in the form
 		// {\v a} to \v{a} (see #9340).
 		// FIXME: This is a sort of mini-tex2lyx.
 		//        Use the real tex2lyx instead!
 		static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
 		if (regex_search(to_utf8(val), tma_reg)) {
 			// drop the opening brace and turn the blank after the
 			// accent letter into one; the closing brace is kept.
 			val = val.substr(1);
 			val.replace(2, 1, from_ascii("{"));
 			continue;
 		}
 		// Apart from the above, we just ignore braces
 		if (ch == '{' || ch == '}') {
 			val = val.substr(1);
 			continue;
 		}
 		// we're going to check things that look like commands, so if
 		// this doesn't, just output it.
 		if (ch != '\\') {
 			ret += ch;
 			val = val.substr(1);
 			continue;
 		}
 		// ok, could be a command of some sort
 		// let's see if it corresponds to some unicode
 		// unicodesymbols has things in the form: \"{u},
 		// whereas we may see things like: \"u. So we'll
 		// look for that and change it, if necessary.
 		// FIXME: This is a sort of mini-tex2lyx.
 		//        Use the real tex2lyx instead!
 		static regex const reg("^\\\\\\W\\w");
 		if (regex_search(to_utf8(val), reg)) {
 			// insert the closing brace first so that the position
 			// of the opening one is not shifted
 			val.insert(3, from_ascii("}"));
 			val.insert(2, from_ascii("{"));
 		}
 		bool termination;
 		docstring rem;
 		docstring const cnvtd = Encodings::fromLaTeXCommand(val,
 				Encodings::TEXT_CMD, termination, rem);
 		if (!cnvtd.empty()) {
 			// it did, so we'll take that bit and proceed with what's left
 			ret += cnvtd;
 			val = rem;
 			continue;
 		}
 		// it's a command of some sort
 		scanning_cmd = true;
 		escaped = true;
 		val = val.substr(1);
 	}
 	// The input ended directly after \S, so the section sign has not
 	// been output yet.
 	if (is_section)
 		ret.push_back(0x00a7);
 	return ret;
 }
 // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string.
 docstring processRichtext(docstring const & str, bool richtext)
 {
@ -639,7 +504,7 @@ docstring const BibTeXInfo::getAuthorList(Buffer const * buf,
 			retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 	}
-	return convertLaTeXCommands(retval);
+	return Encodings::convertLaTeXCommands(retval);
 }
@ -1068,7 +933,7 @@ docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
 	}
 	if (!richtext && !info_.empty()) {
-		info_ = convertLaTeXCommands(processRichtext(info_, false));
+		info_ = Encodings::convertLaTeXCommands(processRichtext(info_, false));
 		return info_;
 	}
 	if (richtext && !info_richtext_.empty())
@ -1090,11 +955,11 @@ docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
 	}
 	if (richtext) {
-		info_richtext_ = convertLaTeXCommands(processRichtext(info_, true));
+		info_richtext_ = Encodings::convertLaTeXCommands(processRichtext(info_, true));
 		return info_richtext_;
 	}
-	info_ = convertLaTeXCommands(processRichtext(info_, false));
+	info_ = Encodings::convertLaTeXCommands(processRichtext(info_, false));
 	return info_;
 }
@ -1110,7 +975,7 @@ docstring const BibTeXInfo::getLabel(BibTeXInfoList const & xrefs,
 	if (!loclabel.empty() && !next) {
 		loclabel = processRichtext(loclabel, ci.richtext);
-		loclabel = convertLaTeXCommands(loclabel);
+		loclabel = Encodings::convertLaTeXCommands(loclabel);
 	}
 	return loclabel;
--- a/src/Encoding.cpp
+++ b/src/Encoding.cpp
@ -26,6 +26,7 @@
 #include <algorithm>
 #include <cstdint>
 #include <iterator>
 #include <regex>
 #include <sstream>
 using namespace std;
@ -613,6 +614,156 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype,
 }
 // Converts a string containing LaTeX commands into unicode for display.
 //
 // The input is consumed from the front by a small state machine:
 //  - text between unescaped '$' is collected and, at the closing '$',
 //    converted piecewise with fromLaTeXCommand() in MATH_CMD mode; the
 //    '$' delimiters themselves are not output;
 //  - '~' becomes NO-BREAK SPACE (U+00A0), "\," becomes THIN SPACE (U+2009);
 //  - "{\v a}"-style text accents are rewritten to "\v{a}", and "\"u"-style
 //    accents to "\"{u}", before fromLaTeXCommand() is asked for a
 //    replacement in TEXT_CMD mode;
 //  - "\S" becomes SECTION SIGN (U+00A7);
 //  - the name of any other alphabetic command is dropped, a non-alphabetic
 //    character following a backslash is output as is, and braces are
 //    dropped.
 //
 // \param str text that may contain LaTeX markup
 // \return the text with the markup above replaced or removed
 docstring Encodings::convertLaTeXCommands(docstring const & str)
 {
 	docstring val = str;
 	docstring ret;
 	// raw content of the math span currently being scanned
 	docstring mret;
 	bool scanning_cmd = false;
 	bool scanning_math = false;
 	bool is_section = false;
 	bool escaped = false; // used to catch \$, etc.
 	while (!val.empty()) {
 		char_type const ch = val[0];
 		// if we're scanning math, we collect everything until we
 		// find an unescaped $, at which point we convert what we
 		// have collected and break out.
 		if (scanning_math) {
 			bool math_end = false;
 			if (escaped)
 				escaped = false;
 			else if (ch == '\\')
 				escaped = true;
 			else if (ch == '$')
 				math_end = true;
 			if (math_end) {
 				scanning_math = false;
 				bool termination;
 				docstring rem;
 				ret += fromLaTeXCommand(mret, MATH_CMD, termination, rem);
 				// parse remaining math
 				while (!rem.empty()) {
 					docstring rrem;
 					// split command from normal text
 					docstring const cmd = split(rem, rrem, '\\');
 					ret += rrem;
 					// done if no command was found
 					if (cmd.empty())
 						break;
 					// go on ...
 					docstring const input = from_ascii("\\") + cmd;
 					ret += fromLaTeXCommand(input, MATH_CMD, termination, rem);
 					// If the remainder is exactly what we passed in,
 					// the next iteration would repeat the very same
 					// call forever. Output the unconverted text as
 					// is and stop.
 					if (rem == input) {
 						ret += rem;
 						break;
 					}
 				}
 				// The closing $ is a delimiter only. It must not
 				// be stored, or it would become the first
 				// character of the next math span.
 				mret.clear();
 			} else
 				mret += ch;
 			val = val.substr(1);
 			continue;
 		}
 		// if we're scanning a command name, then we just
 		// discard characters until we hit something that
 		// isn't alpha.
 		if (scanning_cmd) {
 			// A leading 'S' might be the whole command (\S); we only
 			// know once we have seen the character after it.
 			if (!is_section && ch == 'S') {
 				is_section = true;
 				val = val.substr(1);
 				continue;
 			}
 			if (isAlphaASCII(ch)) {
 				is_section = false;
 				val = val.substr(1);
 				escaped = false;
 				continue;
 			} else if (is_section) {
 				// The command was exactly \S: output SECTION SIGN.
 				ret.push_back(0x00a7);
 				is_section = false;
 				// The command is complete. Clear the flag left over
 				// from its backslash, otherwise the character that
 				// follows (e.g. '~', '{' or ',') would be treated
 				// as an escaped character.
 				scanning_cmd = false;
 				escaped = false;
 				// ch has not been consumed; it is handled normally
 				// in the next iteration.
 				continue;
 			}
 			// so we're done with this command.
 			// now we fall through and check this character.
 			is_section = false;
 			scanning_cmd = false;
 		}
 		// was the last character a \? If so, then this is something like:
 		// \\ or \$, so we'll just output it. That's probably not always right...
 		if (escaped) {
 			// exception: output \, as THIN SPACE
 			if (ch == ',')
 				ret.push_back(0x2009);
 			else
 				ret += ch;
 			val = val.substr(1);
 			escaped = false;
 			continue;
 		}
 		// ~ is output as NO-BREAK SPACE
 		if (ch == '~') {
 			ret += char_type(0x00a0);
 			val = val.substr(1);
 			continue;
 		}
 		// an unescaped $ starts math; the delimiter is not output
 		if (ch == '$') {
 			val = val.substr(1);
 			scanning_math = true;
 			continue;
 		}
 		// Change text mode accents in the form
 		// {\v a} to \v{a} (see #9340).
 		// FIXME: This is a sort of mini-tex2lyx.
 		//        Use the real tex2lyx instead!
 		static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
 		if (regex_search(to_utf8(val), tma_reg)) {
 			// drop the opening brace and turn the blank after the
 			// accent letter into one; the closing brace is kept.
 			val = val.substr(1);
 			val.replace(2, 1, from_ascii("{"));
 			continue;
 		}
 		// Apart from the above, we just ignore braces
 		if (ch == '{' || ch == '}') {
 			val = val.substr(1);
 			continue;
 		}
 		// we're going to check things that look like commands, so if
 		// this doesn't, just output it.
 		if (ch != '\\') {
 			ret += ch;
 			val = val.substr(1);
 			continue;
 		}
 		// ok, could be a command of some sort
 		// let's see if it corresponds to some unicode
 		// unicodesymbols has things in the form: \"{u},
 		// whereas we may see things like: \"u. So we'll
 		// look for that and change it, if necessary.
 		// FIXME: This is a sort of mini-tex2lyx.
 		//        Use the real tex2lyx instead!
 		static regex const reg("^\\\\\\W\\w");
 		if (regex_search(to_utf8(val), reg)) {
 			// insert the closing brace first so that the position
 			// of the opening one is not shifted
 			val.insert(3, from_ascii("}"));
 			val.insert(2, from_ascii("{"));
 		}
 		bool termination;
 		docstring rem;
 		docstring const cnvtd = fromLaTeXCommand(val,
 				TEXT_CMD, termination, rem);
 		if (!cnvtd.empty()) {
 			// it did, so we'll take that bit and proceed with what's left
 			ret += cnvtd;
 			val = rem;
 			continue;
 		}
 		// it's a command of some sort
 		scanning_cmd = true;
 		escaped = true;
 		val = val.substr(1);
 	}
 	// The input ended directly after \S, so the section sign has not
 	// been output yet.
 	if (is_section)
 		ret.push_back(0x00a7);
 	// A $ without a closing partner: output the text as it was
 	// written instead of silently dropping it.
 	if (scanning_math) {
 		ret += char_type('$');
 		ret += mret;
 	}
 	return ret;
 }
 CharInfo const & Encodings::unicodeCharInfo(char_type c)
 {
 	static CharInfo empty;
--- a/src/Encoding.h
+++ b/src/Encoding.h
@ -349,6 +349,9 @@ public:
 	static char_type fromLaTeXCommand(docstring const & cmd, int cmdtype,
 			bool & combining, bool & needsTermination,
 			std::set<std::string> * req = nullptr);
 	/// converts a string containing LaTeX commands into unicode
 	/// for display.
 	static docstring convertLaTeXCommands(docstring const & str);
 	///
 	enum LatexCmd {
 		///