From 9291fc465b0d25a519fe52cdc7d37d3bc7fbf643 Mon Sep 17 00:00:00 2001 From: Juergen Spitzmueller Date: Sat, 20 Jul 2024 17:37:45 +0200 Subject: [PATCH] mode convertLaTeXCommands from BiblioInfo to Encodings So it can also be used in other contexts. This includes also an improvement of math parsing --- src/BiblioInfo.cpp | 145 ++----------------------------------------- src/Encoding.cpp | 151 +++++++++++++++++++++++++++++++++++++++++++++ src/Encoding.h | 3 + 3 files changed, 159 insertions(+), 140 deletions(-) diff --git a/src/BiblioInfo.cpp b/src/BiblioInfo.cpp index d70de5d169..d933bda4c9 100644 --- a/src/BiblioInfo.cpp +++ b/src/BiblioInfo.cpp @@ -329,141 +329,6 @@ bool multipleAuthors(docstring const & author) } -// converts a string containing LaTeX commands into unicode -// for display. -docstring convertLaTeXCommands(docstring const & str) -{ - docstring val = str; - docstring ret; - - bool scanning_cmd = false; - bool scanning_math = false; - bool is_section = false; - bool escaped = false; // used to catch \$, etc. - while (!val.empty()) { - char_type const ch = val[0]; - - // if we're scanning math, we output everything until we - // find an unescaped $, at which point we break out. - if (scanning_math) { - if (escaped) - escaped = false; - else if (ch == '\\') - escaped = true; - else if (ch == '$') - scanning_math = false; - ret += ch; - val = val.substr(1); - continue; - } - - // if we're scanning a command name, then we just - // discard characters until we hit something that - // isn't alpha. - if (scanning_cmd) { - if (!is_section && ch == 'S') { - is_section = true; - val = val.substr(1); - continue; - } - if (isAlphaASCII(ch)) { - is_section = false; - val = val.substr(1); - escaped = false; - continue; - } else if (is_section) { - ret.push_back(0x00a7); - is_section = false; - continue; - } - // so we're done with this command. - // now we fall through and check this character. - is_section = false; - scanning_cmd = false; - } - - // was the last character a \? If so, then this is something like: - // \\ or \$, so we'll just output it. That's probably not always right... - if (escaped) { - // exception: output \, as THIN SPACE - if (ch == ',') - ret.push_back(0x2009); - else - ret += ch; - val = val.substr(1); - escaped = false; - continue; - } - - if (ch == '~') { - ret += char_type(0x00a0); - val = val.substr(1); - continue; - } - - if (ch == '$') { - ret += ch; - val = val.substr(1); - scanning_math = true; - continue; - } - - // Change text mode accents in the form - // {\v a} to \v{a} (see #9340). - // FIXME: This is a sort of mini-tex2lyx. - // Use the real tex2lyx instead! - static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}"); - if (regex_search(to_utf8(val), tma_reg)) { - val = val.substr(1); - val.replace(2, 1, from_ascii("{")); - continue; - } - - // Apart from the above, we just ignore braces - if (ch == '{' || ch == '}') { - val = val.substr(1); - continue; - } - - // we're going to check things that look like commands, so if - // this doesn't, just output it. - if (ch != '\\') { - ret += ch; - val = val.substr(1); - continue; - } - - // ok, could be a command of some sort - // let's see if it corresponds to some unicode - // unicodesymbols has things in the form: \"{u}, - // whereas we may see things like: \"u. So we'll - // look for that and change it, if necessary. - // FIXME: This is a sort of mini-tex2lyx. - // Use the real tex2lyx instead! - static regex const reg("^\\\\\\W\\w"); - if (regex_search(to_utf8(val), reg)) { - val.insert(3, from_ascii("}")); - val.insert(2, from_ascii("{")); - } - bool termination; - docstring rem; - docstring const cnvtd = Encodings::fromLaTeXCommand(val, - Encodings::TEXT_CMD, termination, rem); - if (!cnvtd.empty()) { - // it did, so we'll take that bit and proceed with what's left - ret += cnvtd; - val = rem; - continue; - } - // it's a command of some sort - scanning_cmd = true; - escaped = true; - val = val.substr(1); - } - return ret; -} - - // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string. docstring processRichtext(docstring const & str, bool richtext) { @@ -639,7 +504,7 @@ docstring const BibTeXInfo::getAuthorList(Buffer const * buf, retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal)); } - return convertLaTeXCommands(retval); + return Encodings::convertLaTeXCommands(retval); } @@ -1068,7 +933,7 @@ docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs, } if (!richtext && !info_.empty()) { - info_ = convertLaTeXCommands(processRichtext(info_, false)); + info_ = Encodings::convertLaTeXCommands(processRichtext(info_, false)); return info_; } if (richtext && !info_richtext_.empty()) @@ -1090,11 +955,11 @@ docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs, } if (richtext) { - info_richtext_ = convertLaTeXCommands(processRichtext(info_, true)); + info_richtext_ = Encodings::convertLaTeXCommands(processRichtext(info_, true)); return info_richtext_; } - info_ = convertLaTeXCommands(processRichtext(info_, false)); + info_ = Encodings::convertLaTeXCommands(processRichtext(info_, false)); return info_; } @@ -1110,7 +975,7 @@ docstring const BibTeXInfo::getLabel(BibTeXInfoList const & xrefs, if (!loclabel.empty() && !next) { loclabel = processRichtext(loclabel, ci.richtext); - loclabel = convertLaTeXCommands(loclabel); + loclabel = Encodings::convertLaTeXCommands(loclabel); } return loclabel; diff --git a/src/Encoding.cpp b/src/Encoding.cpp index 2dd3c9de8f..538eead253 100644 --- a/src/Encoding.cpp +++ b/src/Encoding.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include using namespace std; @@ -613,6 +614,156 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, } +docstring Encodings::convertLaTeXCommands(docstring const & str) +{ + docstring val = str; + docstring ret; + docstring mret; + + bool scanning_cmd = false; + bool scanning_math = false; + bool is_section = false; + bool escaped = false; // used to catch \$, etc. + while (!val.empty()) { + char_type const ch = val[0]; + + // if we're scanning math, we output everything until we + // find an unescaped $, at which point we break out. + if (scanning_math) { + if (escaped) + escaped = false; + else if (ch == '\\') + escaped = true; + else if (ch == '$') { + scanning_math = false; + bool termination; + docstring rem; + ret += fromLaTeXCommand(mret, MATH_CMD, termination, rem); + // parse remaining math + while (!rem.empty()) { + docstring rrem; + // split command from normal text + docstring cmd = split(rem, rrem, '\\'); + ret += rrem; + // done of no command was found + if (cmd.empty()) + break; + // go on ... + ret += fromLaTeXCommand(from_ascii("\\") + cmd, MATH_CMD, termination, rem); + } + mret = docstring(); + } + mret += ch; + val = val.substr(1); + continue; + } + + // if we're scanning a command name, then we just + // discard characters until we hit something that + // isn't alpha. + if (scanning_cmd) { + if (!is_section && ch == 'S') { + is_section = true; + val = val.substr(1); + continue; + } + if (isAlphaASCII(ch)) { + is_section = false; + val = val.substr(1); + escaped = false; + continue; + } else if (is_section) { + ret.push_back(0x00a7); + is_section = false; + continue; + } + // so we're done with this command. + // now we fall through and check this character. + is_section = false; + scanning_cmd = false; + } + + // was the last character a \? If so, then this is something like: + // \\ or \$, so we'll just output it. That's probably not always right... + if (escaped) { + // exception: output \, as THIN SPACE + if (ch == ',') + ret.push_back(0x2009); + else + ret += ch; + val = val.substr(1); + escaped = false; + continue; + } + + if (ch == '~') { + ret += char_type(0x00a0); + val = val.substr(1); + continue; + } + + if (ch == '$') { + val = val.substr(1); + scanning_math = true; + continue; + } + + // Change text mode accents in the form + // {\v a} to \v{a} (see #9340). + // FIXME: This is a sort of mini-tex2lyx. + // Use the real tex2lyx instead! + static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}"); + if (regex_search(to_utf8(val), tma_reg)) { + val = val.substr(1); + val.replace(2, 1, from_ascii("{")); + continue; + } + + // Apart from the above, we just ignore braces + if (ch == '{' || ch == '}') { + val = val.substr(1); + continue; + } + + // we're going to check things that look like commands, so if + // this doesn't, just output it. + if (ch != '\\') { + ret += ch; + val = val.substr(1); + continue; + } + + // ok, could be a command of some sort + // let's see if it corresponds to some unicode + // unicodesymbols has things in the form: \"{u}, + // whereas we may see things like: \"u. So we'll + // look for that and change it, if necessary. + // FIXME: This is a sort of mini-tex2lyx. + // Use the real tex2lyx instead! + static regex const reg("^\\\\\\W\\w"); + if (regex_search(to_utf8(val), reg)) { + val.insert(3, from_ascii("}")); + val.insert(2, from_ascii("{")); + } + bool termination; + docstring rem; + docstring const cnvtd = fromLaTeXCommand(val, + TEXT_CMD, termination, rem); + if (!cnvtd.empty()) { + // it did, so we'll take that bit and proceed with what's left + ret += cnvtd; + val = rem; + continue; + } + // it's a command of some sort + scanning_cmd = true; + escaped = true; + val = val.substr(1); + } + return ret; +} + + CharInfo const & Encodings::unicodeCharInfo(char_type c) { static CharInfo empty; diff --git a/src/Encoding.h b/src/Encoding.h index 4913ddc238..3177132514 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -349,6 +349,9 @@ public: static char_type fromLaTeXCommand(docstring const & cmd, int cmdtype, bool & combining, bool & needsTermination, std::set * req = nullptr); + /// converts a string containing LaTeX commands into unicode + /// for display. + static docstring convertLaTeXCommands(docstring const & str); /// enum LatexCmd { ///