From 0dc7f34e00e06e8b378236e32f86023313067b7c Mon Sep 17 00:00:00 2001 From: Thibaut Cuvelier Date: Sat, 19 Feb 2022 02:10:45 +0100 Subject: [PATCH] unicodesymbols: parse supplementary lines to encode alternative ways to encode symbols in raw LaTeX. --- lib/unicodesymbols | 2 + src/Encoding.cpp | 132 +++++++++++++++++++++++----------------- src/Encoding.h | 5 ++ src/insets/InsetERT.cpp | 2 + 4 files changed, 84 insertions(+), 57 deletions(-) diff --git a/lib/unicodesymbols b/lib/unicodesymbols index 043f4dcc43..1de6910f36 100644 --- a/lib/unicodesymbols +++ b/lib/unicodesymbols @@ -60,6 +60,7 @@ # 0x00a0 "~" "" "force=cp862;cp1255;cp1256;koi8-u;iso8859-6;iso8859-7;utf8-platex,notermination=both" "~" "" # NO-BREAK SPACE 0x00a1 "\\textexclamdown" "" "force=cp862;cp1255;euc-jp;euc-jp-platex;euc-kr;utf8-platex" # INVERTED EXCLAMATION MARK +0x00a1 "!`" "" 0x00a2 "\\textcent" "textcomp" "force=cp862;cp1255;cp1256;euc-jp;euc-jp-platex;jis;shift-jis-platex" #"\\mathcent" "txfonts|pxfonts" # CENT SIGN 0x00a3 "\\pounds" "" "force=cp862;cp1255;cp1256;iso8859-7;euc-jp;euc-jp-platex;jis;shift-jis-platex" "\\pounds" "" # £ POUND SIGN 0x00a4 "\\textcurrency" "textcomp" "force=cp1256;euc-cn;euc-jp;euc-jp-platex;euc-kr;gbk;iso8859-6;utf8-platex" # CURRENCY SYMBOL @@ -90,6 +91,7 @@ 0x00bd "\\textonehalf" "textcomp" "force=cp862;cp1255;cp1256;iso8859-7;euc-kr" "\\sfrac{1}{2}" "xfrac" # 1/2 FRACTION 0x00be "\\textthreequarters" "textcomp" "force=cp1255;cp1256;euc-kr" "\\sfrac{3}{4}" "xfrac" # 3/4 FRACTION 0x00bf "\\textquestiondown" "" "force=cp862;cp1255;euc-jp;euc-kr" # INVERTED QUESTION MARK +0x00bf "?`" "" 0x00c0 "\\`{A}" "" "mathalpha,force=euc-jp" "\\grave{A}" # LATIN CAPITAL LETTER A WITH GRAVE 0x00c1 "\\'{A}" "" "mathalpha,force=euc-jp" "\\acute{A}" # LATIN CAPITAL LETTER A WITH ACUTE 0x00c2 "\\^{A}" "" "mathalpha,force=euc-jp" "\\hat{A}" # LATIN CAPITAL LETTER A WITH CIRCUMFLEX diff --git a/src/Encoding.cpp b/src/Encoding.cpp index b0d56495cd..4e47daef8b 100644 --- 
a/src/Encoding.cpp +++ b/src/Encoding.cpp @@ -50,7 +50,7 @@ CharInfoMap unicodesymbols; typedef set CharSet; typedef map CharSetMap; CharSet forced; -CharSetMap forcedselected; +CharSetMap forcedSelected; typedef set MathAlphaSet; MathAlphaSet mathalpha; @@ -99,7 +99,7 @@ CharInfo::CharInfo( Encoding::Encoding(string const & n, string const & l, string const & g, string const & i, bool f, bool u, Encoding::Package p) : name_(n), latexName_(l), guiName_(g), iconvName_(i), fixedwidth_(f), - unsafe_(u), forced_(&forcedselected[n]), package_(p) + unsafe_(u), forced_(&forcedSelected[n]), package_(p) { if (n == "ascii") { // ASCII can encode 128 code points and nothing else @@ -705,20 +705,20 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) { // We must read the symbolsfile first, because the Encoding // constructor depends on it. - CharSetMap forcednotselected; - Lexer symbolslex; - symbolslex.setFile(symbolsfile); + CharSetMap forcedNotSelected; + Lexer symbolsLex; + symbolsLex.setFile(symbolsfile); bool getNextToken = true; - while (symbolslex.isOK()) { + while (symbolsLex.isOK()) { char_type symbol; if (getNextToken) { - if (!symbolslex.next(true)) + if (!symbolsLex.next(true)) break; } else getNextToken = true; - istringstream is(symbolslex.getString()); + istringstream is(symbolsLex.getString()); // reading symbol directly does not work if // char_type == wchar_t. uint32_t tmp; @@ -726,20 +726,38 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) break; symbol = tmp; - if (!symbolslex.next(true)) - break; - docstring textcommand = symbolslex.getDocString(); - if (!symbolslex.next(true)) - break; - string textpreamble = symbolslex.getString(); - if (!symbolslex.next(true)) - break; - string sflags = symbolslex.getString(); + // Special case: more than one entry for one character (to add other LaTeX commands). 
+ if (unicodesymbols.count(symbol) != 0) { + if (!symbolsLex.next(true)) + break; + docstring textCommand = symbolsLex.getDocString(); + if (!symbolsLex.next(true)) + break; + docstring mathCommand = symbolsLex.getDocString(); - string tipashortcut; + if (!textCommand.empty()) + unicodesymbols.at(symbol).addTextCommand(textCommand); + if (!mathCommand.empty()) + unicodesymbols.at(symbol).addMathCommand(mathCommand); + + continue; + } + + // The symbol is not in the table yet: parse this line as a complete new entry. + if (!symbolsLex.next(true)) + break; + docstring textCommand = symbolsLex.getDocString(); + if (!symbolsLex.next(true)) + break; + string textPreamble = symbolsLex.getString(); + if (!symbolsLex.next(true)) + break; + string sflags = symbolsLex.getString(); + + string tipaShortcut; int flags = 0; - if (suffixIs(textcommand, '}')) + if (suffixIs(textCommand, '}')) flags |= CharInfoTextNoTermination; while (!sflags.empty()) { string flag; @@ -753,13 +771,13 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) vector encs = getVectorFromString(flag.substr(6), ";"); for (auto const & enc : encs) - forcedselected[enc].insert(symbol); + forcedSelected[enc].insert(symbol); flags |= CharInfoForceSelected; } else if (prefixIs(flag, "force!=")) { vector encs = getVectorFromString(flag.substr(7), ";"); for (auto const & enc : encs) - forcednotselected[enc].insert(symbol); + forcedNotSelected[enc].insert(symbol); flags |= CharInfoForceSelected; } else if (flag == "mathalpha") { mathalpha.insert(symbol); @@ -773,8 +791,8 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) } else if (flag == "notermination=none") { flags &= ~CharInfoTextNoTermination; flags &= ~CharInfoMathNoTermination; - } else if (contains(flag, "tipaShortcut=")) { - tipashortcut = split(flag, '='); + } else if (contains(flag, "tipashortcut=")) { + tipaShortcut = split(flag, '='); } else if (flag == "deprecated") { flags |= 
CharInfoDeprecated; } else { @@ -786,25 +804,25 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) } // mathCommand and mathPreamble have been added for 1.6.0. // make them optional so that old files still work. - int const lineno = symbolslex.lineNumber(); + int const lineNo = symbolsLex.lineNumber(); bool breakout = false; - docstring mathcommand; - string mathpreamble; - if (symbolslex.next(true)) { - if (symbolslex.lineNumber() != lineno) { + docstring mathCommand; + string mathPreamble; + if (symbolsLex.next(true)) { + if (symbolsLex.lineNumber() != lineNo) { // line in old format without mathCommand and mathPreamble getNextToken = false; } else { - mathcommand = symbolslex.getDocString(); - if (suffixIs(mathcommand, '}')) + mathCommand = symbolsLex.getDocString(); + if (suffixIs(mathCommand, '}')) flags |= CharInfoMathNoTermination; - if (symbolslex.next(true)) { - if (symbolslex.lineNumber() != lineno) { + if (symbolsLex.next(true)) { + if (symbolsLex.lineNumber() != lineNo) { // line in new format with mathCommand only getNextToken = false; } else { // line in new format with mathCommand and mathPreamble - mathpreamble = symbolslex.getString(); + mathPreamble = symbolsLex.getString(); } } else breakout = true; @@ -814,20 +832,20 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) } // backward compatibility - if (mathpreamble == "esintoramsmath") - mathpreamble = "esint|amsmath"; + if (mathPreamble == "esintoramsmath") + mathPreamble = "esint|amsmath"; - if (!textpreamble.empty()) - if (textpreamble[0] != '\\') + if (!textPreamble.empty()) + if (textPreamble[0] != '\\') flags |= CharInfoTextFeature; - if (!mathpreamble.empty()) - if (mathpreamble[0] != '\\') + if (!mathPreamble.empty()) + if (mathPreamble[0] != '\\') flags |= CharInfoMathFeature; CharInfo info = CharInfo( - textcommand, mathcommand, - textpreamble, mathpreamble, - tipashortcut, flags); + textCommand, mathCommand, + textPreamble, 
mathPreamble, + tipaShortcut, flags); LYXERR(Debug::INFO, "Read unicode symbol " << symbol << " '" << to_utf8(info.textCommand()) << "' '" << info.textPreamble() << " '" << info.textFeature() << ' ' << info.textNoTermination() @@ -851,12 +869,12 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) et_end }; - LexerKeyword encodingtags[] = { + LexerKeyword encodingTags[] = { { "encoding", et_encoding }, { "end", et_end } }; - Lexer lex(encodingtags); + Lexer lex(encodingTags); lex.setFile(encfile); lex.setContext("Encodings::read"); while (lex.isOK()) { @@ -866,21 +884,21 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) lex.next(); string const name = lex.getString(); lex.next(); - string const latexname = lex.getString(); + string const latexName = lex.getString(); lex.next(); - string const guiname = lex.getString(); + string const guiName = lex.getString(); lex.next(); - string const iconvname = lex.getString(); + string const iconvName = lex.getString(); lex.next(); string const width = lex.getString(); - bool fixedwidth = false; + bool fixedWidth = false; bool unsafe = false; if (width == "fixed") - fixedwidth = true; + fixedWidth = true; else if (width == "variable") - fixedwidth = false; + fixedWidth = false; else if (width == "variableunsafe") { - fixedwidth = false; + fixedWidth = false; unsafe = true; } else @@ -901,9 +919,9 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) lex.printError("Unknown package"); LYXERR(Debug::INFO, "Reading encoding " << name); - encodinglist[name] = Encoding(name, latexname, - guiname, iconvname, fixedwidth, unsafe, - package); + encodinglist[name] = Encoding(name, latexName, + guiName, iconvName, fixedWidth, unsafe, + package); if (lex.lex() != et_end) lex.printError("Missing end"); @@ -920,9 +938,9 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) } } - // Move all information from forcednotselected to 
forcedselected - for (CharSetMap::const_iterator it1 = forcednotselected.begin(); it1 != forcednotselected.end(); ++it1) { - for (CharSetMap::iterator it2 = forcedselected.begin(); it2 != forcedselected.end(); ++it2) { + // Move all information from forcedNotSelected to forcedSelected + for (CharSetMap::const_iterator it1 = forcedNotSelected.begin(); it1 != forcedNotSelected.end(); ++it1) { + for (CharSetMap::iterator it2 = forcedSelected.begin(); it2 != forcedSelected.end(); ++it2) { if (it2->first != it1->first) it2->second.insert(it1->second.begin(), it1->second.end()); } diff --git a/src/Encoding.h b/src/Encoding.h index 2bfbda051d..6016f05f02 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -69,6 +69,11 @@ public: std::vector const & text_commands, std::vector const & math_commands, std::string const & text_preamble, std::string const & math_preamble, std::string const & tipa_shortcut, unsigned int flags); + /// Append an alternative LaTeX command (text mode) for this character + void addTextCommand(docstring const & newTextCommand) { text_commands_.emplace_back(newTextCommand); } + /// Append an alternative LaTeX command (math mode) for this character + void addMathCommand(docstring const & newMathCommand) { math_commands_.emplace_back(newMathCommand); } + // we assume that at least one command is nonempty when using unicodesymbols bool isUnicodeSymbol() const { return !text_commands_.empty() || !math_commands_.empty(); } /// LaTeX command (text mode) for this character