From feb789596568ab01d21943f1595ddb321142797d Mon Sep 17 00:00:00 2001 From: Georg Baum Date: Sun, 28 Jan 2007 21:27:45 +0000 Subject: [PATCH] Add machinery to output arbitrary unicode characters with LaTeX commands read from a text file. * src/encoding.[Ch] (Encoding::latexChar): New, output a character to LaTeX (Encoding::validate): New, add needed preamble stuff for a character (Encodings::read): Read new unicodesymbols file (Encodings::isCombiningChar): New, is a character a combining char? * src/paragraph_pimpl.C (isEncoding): Delete, no longer needed (getEncoding): New, get the real encoding of a font (Paragraph::Pimpl::latexSurrogatePair): New, output a surrogate pair to LaTeX (Paragraph::Pimpl::simpleTeXBlanks): Use latexSurrogatePair if needed (Paragraph::Pimpl::simpleTeXSpecialChars): Ditto, and replace several hardcoded characters with a call of encoding.latexChar() (Paragraph::Pimpl::validate): replace several hardcoded characters with a call of encoding.validate() * src/support/debugstream.h (basic_debugstream::disable): New, disable the stream completely (basic_debugstream::enable): New, reenable the stream * src/lyx_main.[Ch]: Adjust to changes above * src/paragraph.C: Ditto * lib/unicodesymbols: New file with UCS4 -> LaTeX command mapping. It is far from complete yet, but contains most accents on latin characters. 
* lib/Makefile.am: add lib/unicodesymbols * development/scons/scons_manifest.py: ditto * development/tools/unicodesymbols.py: Helper script to update lib/unicodesymbols with new symbols git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@16920 a592a061-630c-0410-9148-cb99ea01b6c8 --- development/scons/scons_manifest.py | 1 + development/tools/unicodesymbols.py | 119 +++++ lib/Makefile.am | 2 +- lib/unicodesymbols | 681 ++++++++++++++++++++++++++++ src/encoding.C | 166 ++++++- src/encoding.h | 43 +- src/lyx_main.C | 22 +- src/lyx_main.h | 7 +- src/paragraph.C | 20 +- src/paragraph_pimpl.C | 189 +++----- src/paragraph_pimpl.h | 18 +- src/support/debugstream.h | 27 +- 12 files changed, 1148 insertions(+), 147 deletions(-) create mode 100644 development/tools/unicodesymbols.py create mode 100644 lib/unicodesymbols diff --git a/development/scons/scons_manifest.py b/development/scons/scons_manifest.py index 1991432fb4..1a9b7b963f 100644 --- a/development/scons/scons_manifest.py +++ b/development/scons/scons_manifest.py @@ -1275,6 +1275,7 @@ lib_files = Split(''' languages symbols syntax.default + unicodesymbols configure.py ''') diff --git a/development/tools/unicodesymbols.py b/development/tools/unicodesymbols.py new file mode 100644 index 0000000000..d186275796 --- /dev/null +++ b/development/tools/unicodesymbols.py @@ -0,0 +1,119 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +# file unicodesymbols.py +# This file is part of LyX, the document processor. +# Licence details can be found in the file COPYING. 
+ +# author Georg Baum + +# Full author contact details are available in file CREDITS + +# This script reads a unicode symbol file and completes it in the given range + + +import os, re, string, sys, unicodedata + +def usage(prog_name): + return ("Usage: %s start stop inputfile outputfile\n" % prog_name + + "or %s start stop < inputfile > outputfile" % prog_name) + + +def error(message): + sys.stderr.write(message + '\n') + sys.exit(1) + + +def trim_eol(line): + " Remove end of line char(s)." + if line[-2:-1] == '\r': + return line[:-2] + elif line[-1:] == '\r' or line[-1:] == '\n': + return line[:-1] + else: + # file with no EOL in last line + return line + + +def read(input): + " Read input file, strip lineendings and return a list of [code point, line] pairs (code point is -1 if the line does not start with a 0x or #0x token)." + lines = list() + while 1: + line = input.readline() + if not line: + break + line = trim_eol(line) + tokens = line.split() + char = -1 + if len(tokens) > 0: + if tokens[0][0:2] == "0x": + char = int(tokens[0][2:], 16) + elif tokens[0][0:3] == "#0x": + char = int(tokens[0][3:], 16) + lines.append([char, line]) + return lines + + +def write(output, lines): + " Write output file with native lineendings." 
+ for line in lines: + output.write(line[1] + os.linesep) + + +def complete(lines, start, stop): # insert a commented-out stub line for every named code point in [start, stop) that has no entry yet + l = 0 + for i in range(start, stop): + # This catches both comments (lines[l][0] == -1) and code points less than i + while l < len(lines) and lines[l][0] < i: + print >> sys.stderr, lines[l] # progress goes to stderr: stdout may be the output file + l = l + 1 + continue + if l >= len(lines) or lines[l][0] != i: + c = unichr(i) + name = unicodedata.name(c, "") + if name != "": + if unicodedata.combining(c): + combining = "combining" + else: + combining = "" + line = [i, '#0x%04x "" "" "%s" # %s' % (i, combining, name)] + lines.insert(l, line) + print >> sys.stderr, lines[l] # progress goes to stderr: stdout may be the output file + l = l + 1 + + +def main(argv): + + # Open files + if len(argv) == 3: + input = sys.stdin + output = sys.stdout + elif len(argv) == 5: + input = open(argv[3], 'rb') + output = open(argv[4], 'wb') + else: + error(usage(argv[0])) + if argv[1][:2] == "0x": + start = int(argv[1][2:], 16) + else: + start = int(argv[1]) + if argv[2][:2] == "0x": + stop = int(argv[2][2:], 16) + else: + stop = int(argv[2]) + + # Do the real work + lines = read(input) + complete(lines, start, stop) + write(output, lines) + + # Close files (only the ones opened above, never stdin/stdout) + if len(argv) == 5: + input.close() + output.close() + + return 0 + + +if __name__ == "__main__": + main(sys.argv) diff --git a/lib/Makefile.am b/lib/Makefile.am index a5995dc0b5..9012fe0735 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -5,7 +5,7 @@ SUBDIRS = doc lyx2lyx CHMOD = chmod dist_pkgdata_DATA = CREDITS chkconfig.ltx \ - external_templates encodings languages symbols syntax.default + external_templates encodings languages symbols syntax.default unicodesymbols # Note that we "chmod 755" manually this file in install-data-hook. dist_pkgdata_PYTHON = configure.py diff --git a/lib/unicodesymbols b/lib/unicodesymbols new file mode 100644 index 0000000000..daccbc2c01 --- /dev/null +++ b/lib/unicodesymbols @@ -0,0 +1,681 @@ +# +# file unicodesymbols +# This file is part of LyX, the document processor. +# Licence details can be found in the file COPYING. 
+# +# author Georg Baum +# +# Full author contact details are available in file CREDITS. + +# This file is a database of LaTeX commands for unicode characters. +# These commands will be used by LyX for LaTeX export for all characters +# that are not representable in the chosen encoding. + +# syntax: +# ucs4 command preamble flags +# preamble can either be a known feature, or a LaTeX command. +# Known flags: +# - combining This is a combining char that will get combined with a base char +# - force Always output replacement command + +#0x00a0 "" "" "" # NO-BREAK SPACE +#0x00a1 "" "" "" # INVERTED EXCLAMATION MARK +0x00a2 "\\textcent" "textcomp" "" # CENT SIGN +0x00a3 "\\pounds" "" "" # £ POUND SIGN +0x00a4 "\\textcurrency" "textcomp" "" # CURRENCY SYMBOL +0x00a5 "\\textyen" "textcomp" "" # YEN SIGN +0x00a6 "\\textbrokenbar" "textcomp" "" # BROKEN BAR +0x00a7 "\\textsection" "textcomp" "" # SECTION SIGN +0x00a8 "\\textasciidieresis" "textcomp" "" # DIAERESIS +0x00a9 "\\textcopyright" "textcomp" "" # COPYRIGHT SIGN +0x00aa "\\textordfeminine" "textcomp" "" # FEMININE ORDINAL INDICATOR +#0x00ab "" "" "" # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0x00ac "\\textlnot" "textcomp" "force" # ¬ NOT SIGN +#0x00ad "" "" "" # SOFT HYPHEN +0x00ae "\\textregistered" "textcomp" "" # REGISTERED SIGN +0x00af "\\textasciimacron" "textcomp" "" # MACRON +0x00b0 "\\textdegree" "textcomp" "" # DEGREE SIGN +0x00b1 "\\textpm" "textcomp" "force" # ± PLUS-MINUS SIGN +0x00b2 "\\texttwosuperior" "textcomp" "force" # ² SUPERSCRIPT TWO +0x00b3 "\\textthreesuperior" "textcomp" "force" # ³ SUPERSCRIPT THREE +0x00b4 "\\textasciiacute" "textcomp" "" # ACUTE ACCENT +0x00b5 "\\textmu" "textcomp" "force" # µ MICRO SIGN +0x00b6 "\\textpilcrow" "textcomp" "" # PILCROW SIGN +#0x00b7 "" "" "" # MIDDLE DOT +0x00b8 "\\c\\ " "" "" # CEDILLA (command from latin1.def) +0x00b9 "\\textonesuperior" "textcomp" "force" # ¹ SUPERSCRIPT ONE +0x00ba "\\textordmasculine" "textcomp" "" # MASCULINE ORDINAL INDICATOR 
+#0x00bb "" "" "" # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0x00bc "\\textonequarter" "textcomp" "" # 1/4 FRACTION +0x00bd "\\textonehalf" "textcomp" "" # 1/2 FRACTION +0x00be "\\textthreequarters" "textcomp" "" # 3/4 FRACTION +#0x00bf "" "" "" # INVERTED QUESTION MARK +0x00c0 "\\`{A}" "" "" # LATIN CAPITAL LETTER A WITH GRAVE +0x00c1 "\\'{A}" "" "" # LATIN CAPITAL LETTER A WITH ACUTE +0x00c2 "\\^{A}" "" "" # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0x00c3 "\\~{A}" "" "" # LATIN CAPITAL LETTER A WITH TILDE +0x00c4 "\\\"{A}" "" "" # LATIN CAPITAL LETTER A WITH DIAERESIS +0x00c5 "\\r{A}" "" "" # LATIN CAPITAL LETTER A WITH RING ABOVE +#0x00c6 "" "" "" # LATIN CAPITAL LETTER AE +0x00c7 "\\c{C}" "" "" # LATIN CAPITAL LETTER C WITH CEDILLA +0x00c8 "\\`{E}" "" "" # LATIN CAPITAL LETTER E WITH GRAVE +0x00c9 "\\'{E}" "" "" # LATIN CAPITAL LETTER E WITH ACUTE +0x00ca "\\^{E}" "" "" # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0x00cb "\\\"{E}" "" "" # LATIN CAPITAL LETTER E WITH DIAERESIS +0x00cc "\\`{I}" "" "" # LATIN CAPITAL LETTER I WITH GRAVE +0x00cd "\\'{I}" "" "" # LATIN CAPITAL LETTER I WITH ACUTE +0x00ce "\\^{I}" "" "" # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0x00cf "\\\"{I}" "" "" # LATIN CAPITAL LETTER I WITH DIAERESIS +#0x00d0 "" "" "" # LATIN CAPITAL LETTER ETH +0x00d1 "\\~{N}" "" "" # LATIN CAPITAL LETTER N WITH TILDE +0x00d2 "\\`{O}" "" "" # LATIN CAPITAL LETTER O WITH GRAVE +0x00d3 "\\'{O}" "" "" # LATIN CAPITAL LETTER O WITH ACUTE +0x00d4 "\\^{O}" "" "" # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0x00d5 "\\~{O}" "" "" # LATIN CAPITAL LETTER O WITH TILDE +0x00d6 "\\\"{O}" "" "" # LATIN CAPITAL LETTER O WITH DIAERESIS +0x00d7 "\\texttimes" "textcomp" "force" # × MULTIPLICATION SIGN +#0x00d8 "" "" "" # LATIN CAPITAL LETTER O WITH STROKE +0x00d9 "\\`{U}" "" "" # LATIN CAPITAL LETTER U WITH GRAVE +0x00da "\\'{U}" "" "" # LATIN CAPITAL LETTER U WITH ACUTE +0x00db "\\^{U}" "" "" # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0x00dc "\\\"{U}" "" "" # LATIN CAPITAL LETTER U 
WITH DIAERESIS +0x00dd "\\'{Y}" "" "" # LATIN CAPITAL LETTER Y WITH ACUTE +#0x00de "" "" "" # LATIN CAPITAL LETTER THORN +#0x00df "" "" "" # LATIN SMALL LETTER SHARP S +0x00e0 "\\`{a}" "" "" # LATIN SMALL LETTER A WITH GRAVE +0x00e1 "\\'{a}" "" "" # LATIN SMALL LETTER A WITH ACUTE +0x00e2 "\\^{a}" "" "" # LATIN SMALL LETTER A WITH CIRCUMFLEX +0x00e3 "\\~{a}" "" "" # LATIN SMALL LETTER A WITH TILDE +0x00e4 "\\\"{a}" "" "" # LATIN SMALL LETTER A WITH DIAERESIS +0x00e5 "\\r{a}" "" "" # LATIN SMALL LETTER A WITH RING ABOVE +#0x00e6 "" "" "" # LATIN SMALL LETTER AE +0x00e7 "\\c{c}" "" "" # LATIN SMALL LETTER C WITH CEDILLA +0x00e8 "\\`{e}" "" "" # LATIN SMALL LETTER E WITH GRAVE +0x00e9 "\\'{e}" "" "" # LATIN SMALL LETTER E WITH ACUTE +0x00ea "\\^{e}" "" "" # LATIN SMALL LETTER E WITH CIRCUMFLEX +0x00eb "\\\"{e}" "" "" # LATIN SMALL LETTER E WITH DIAERESIS +0x00ec "\\`{\\i}" "" "" # LATIN SMALL LETTER I WITH GRAVE +0x00ed "\\'{\\i}" "" "" # LATIN SMALL LETTER I WITH ACUTE +0x00ee "\\^{\\i}" "" "" # LATIN SMALL LETTER I WITH CIRCUMFLEX +0x00ef "\\\"{\\i}" "" "" # LATIN SMALL LETTER I WITH DIAERESIS +#0x00f0 "" "" "" # LATIN SMALL LETTER ETH +0x00f1 "\\~{n}" "" "" # LATIN SMALL LETTER N WITH TILDE +0x00f2 "\\`{o}" "" "" # LATIN SMALL LETTER O WITH GRAVE +0x00f3 "\\'{o}" "" "" # LATIN SMALL LETTER O WITH ACUTE +0x00f4 "\\^{o}" "" "" # LATIN SMALL LETTER O WITH CIRCUMFLEX +0x00f5 "\\~{o}" "" "" # LATIN SMALL LETTER O WITH TILDE +0x00f6 "\\\"{o}" "" "" # LATIN SMALL LETTER O WITH DIAERESIS +0x00f7 "\\textdiv" "textcomp" "force" # ÷ DIVISION SIGN +#0x00f8 "" "" "" # LATIN SMALL LETTER O WITH STROKE +0x00f9 "\\`{u}" "" "" # LATIN SMALL LETTER U WITH GRAVE +0x00fa "\\'{u}" "" "" # LATIN SMALL LETTER U WITH ACUTE +0x00fb "\\^{u}" "" "" # LATIN SMALL LETTER U WITH CIRCUMFLEX +0x00fc "\\\"{u}" "" "" # LATIN SMALL LETTER U WITH DIAERESIS +0x00fd "\\'{y}" "" "" # LATIN SMALL LETTER Y WITH ACUTE +#0x00fe "" "" "" # LATIN SMALL LETTER THORN +0x00ff "\\\"{y}" "" "" # LATIN SMALL LETTER 
Y WITH DIAERESIS +0x0100 "\\={A}" "" "" # LATIN CAPITAL LETTER A WITH MACRON +0x0101 "\\={a}" "" "" # LATIN SMALL LETTER A WITH MACRON +0x0102 "\\u{A}" "" "" # LATIN CAPITAL LETTER A WITH BREVE +0x0103 "\\u{a}" "" "" # LATIN SMALL LETTER A WITH BREVE +0x0104 "\\k{A}" "" "" # LATIN CAPITAL LETTER A WITH OGONEK +0x0105 "\\k{a}" "" "" # LATIN SMALL LETTER A WITH OGONEK +0x0106 "\\'{C}" "" "" # LATIN CAPITAL LETTER C WITH ACUTE +0x0107 "\\'{c}" "" "" # LATIN SMALL LETTER C WITH ACUTE +0x0108 "\\^{C}" "" "" # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +0x0109 "\\^{c}" "" "" # LATIN SMALL LETTER C WITH CIRCUMFLEX +0x010a "\\.{C}" "" "" # LATIN CAPITAL LETTER C WITH DOT ABOVE +0x010b "\\.{c}" "" "" # LATIN SMALL LETTER C WITH DOT ABOVE +0x010c "\\v{C}" "" "" # LATIN CAPITAL LETTER C WITH CARON +0x010d "\\v{c}" "" "" # LATIN SMALL LETTER C WITH CARON +0x010e "\\v{D}" "" "" # LATIN CAPITAL LETTER D WITH CARON +0x010f "\\v{d}" "" "" # LATIN SMALL LETTER D WITH CARON +#0x0110 "" "" "" # LATIN CAPITAL LETTER D WITH STROKE +#0x0111 "" "" "" # LATIN SMALL LETTER D WITH STROKE +0x0112 "\\={E}" "" "" # LATIN CAPITAL LETTER E WITH MACRON +0x0113 "\\={e}" "" "" # LATIN SMALL LETTER E WITH MACRON +0x0114 "\\u{E}" "" "" # LATIN CAPITAL LETTER E WITH BREVE +0x0115 "\\u{e}" "" "" # LATIN SMALL LETTER E WITH BREVE +0x0116 "\\.{E}" "" "" # LATIN CAPITAL LETTER E WITH DOT ABOVE +0x0117 "\\.{e}" "" "" # LATIN SMALL LETTER E WITH DOT ABOVE +0x0118 "\\k{E}" "" "" # LATIN CAPITAL LETTER E WITH OGONEK +0x0119 "\\k{e}" "" "" # LATIN SMALL LETTER E WITH OGONEK +0x011a "\\v{E}" "" "" # LATIN CAPITAL LETTER E WITH CARON +0x011b "\\v{e}" "" "" # LATIN SMALL LETTER E WITH CARON +0x011c "\\^{G}" "" "" # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +0x011d "\\^{g}" "" "" # LATIN SMALL LETTER G WITH CIRCUMFLEX +0x011e "\\u{G}" "" "" # LATIN CAPITAL LETTER G WITH BREVE +0x011f "\\u{g}" "" "" # LATIN SMALL LETTER G WITH BREVE +0x0120 "\\.{G}" "" "" # LATIN CAPITAL LETTER G WITH DOT ABOVE +0x0121 "\\.{g}" "" "" # 
LATIN SMALL LETTER G WITH DOT ABOVE +0x0122 "\\c{G}" "" "" # LATIN CAPITAL LETTER G WITH CEDILLA +0x0123 "\\c{g}" "" "" # LATIN SMALL LETTER G WITH CEDILLA +0x0124 "\\^{H}" "" "" # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0x0125 "\\^{h}" "" "" # LATIN SMALL LETTER H WITH CIRCUMFLEX +#0x0126 "" "" "" # LATIN CAPITAL LETTER H WITH STROKE +#0x0127 "" "" "" # LATIN SMALL LETTER H WITH STROKE +0x0128 "\\~{I}" "" "" # LATIN CAPITAL LETTER I WITH TILDE +0x0129 "\\~{\\i}" "" "" # LATIN SMALL LETTER I WITH TILDE +0x012a "\\={I}" "" "" # LATIN CAPITAL LETTER I WITH MACRON +0x012b "\\={\\i}" "" "" # LATIN SMALL LETTER I WITH MACRON +0x012c "\\u{I}" "" "" # LATIN CAPITAL LETTER I WITH BREVE +0x012d "\\u{\\i}" "" "" # LATIN SMALL LETTER I WITH BREVE +0x012e "\\k{I}" "" "" # LATIN CAPITAL LETTER I WITH OGONEK +0x012f "\\k{i}" "" "" # LATIN SMALL LETTER I WITH OGONEK +0x0130 "\\.{I}" "" "" # LATIN CAPITAL LETTER I WITH DOT ABOVE +0x0131 "\\i" "" "" # LATIN SMALL LETTER DOTLESS I +#0x0132 "" "" "" # LATIN CAPITAL LIGATURE IJ +#0x0133 "" "" "" # LATIN SMALL LIGATURE IJ +0x0134 "\\^{J}" "" "" # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0x0135 "\\^{\\j}" "" "" # LATIN SMALL LETTER J WITH CIRCUMFLEX +0x0136 "\\c{K}" "" "" # LATIN CAPITAL LETTER K WITH CEDILLA +0x0137 "\\c{k}" "" "" # LATIN SMALL LETTER K WITH CEDILLA +#0x0138 "" "" "" # LATIN SMALL LETTER KRA +0x0139 "\\'{L}" "" "" # LATIN CAPITAL LETTER L WITH ACUTE +0x013a "\\'{l}" "" "" # LATIN SMALL LETTER L WITH ACUTE +0x013b "\\c{L}" "" "" # LATIN CAPITAL LETTER L WITH CEDILLA +0x013c "\\c{l}" "" "" # LATIN SMALL LETTER L WITH CEDILLA +0x013d "\\v{L}" "" "" # LATIN CAPITAL LETTER L WITH CARON +0x013e "\\v{l}" "" "" # LATIN SMALL LETTER L WITH CARON +#0x013f "" "" "" # LATIN CAPITAL LETTER L WITH MIDDLE DOT +#0x0140 "" "" "" # LATIN SMALL LETTER L WITH MIDDLE DOT +0x0141 "\\L" "" "" # LATIN CAPITAL LETTER L WITH STROKE +0x0142 "\\l" "" "" # LATIN SMALL LETTER L WITH STROKE +0x0143 "\\'{N}" "" "" # LATIN CAPITAL LETTER N WITH ACUTE 
+0x0144 "\\'{n}" "" "" # LATIN SMALL LETTER N WITH ACUTE +0x0145 "\\c{N}" "" "" # LATIN CAPITAL LETTER N WITH CEDILLA +0x0146 "\\c{n}" "" "" # LATIN SMALL LETTER N WITH CEDILLA +0x0147 "\\v{N}" "" "" # LATIN CAPITAL LETTER N WITH CARON +0x0148 "\\v{n}" "" "" # LATIN SMALL LETTER N WITH CARON +#0x0149 "" "" "" # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +#0x014a "" "" "" # LATIN CAPITAL LETTER ENG +#0x014b "" "" "" # LATIN SMALL LETTER ENG +0x014c "\\={O}" "" "" # LATIN CAPITAL LETTER O WITH MACRON +0x014d "\\={o}" "" "" # LATIN SMALL LETTER O WITH MACRON +0x014e "\\u{O}" "" "" # LATIN CAPITAL LETTER O WITH BREVE +0x014f "\\u{o}" "" "" # LATIN SMALL LETTER O WITH BREVE +0x0150 "\\H{O}" "" "" # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0x0151 "\\H{o}" "" "" # LATIN SMALL LETTER O WITH DOUBLE ACUTE +#0x0152 "" "" "" # LATIN CAPITAL LIGATURE OE +#0x0153 "" "" "" # LATIN SMALL LIGATURE OE +0x0154 "\\'{R}" "" "" # LATIN CAPITAL LETTER R WITH ACUTE +0x0155 "\\'{r}" "" "" # LATIN SMALL LETTER R WITH ACUTE +0x0156 "\\c{R}" "" "" # LATIN CAPITAL LETTER R WITH CEDILLA +0x0157 "\\c{r}" "" "" # LATIN SMALL LETTER R WITH CEDILLA +0x0158 "\\v{R}" "" "" # LATIN CAPITAL LETTER R WITH CARON +0x0159 "\\v{r}" "" "" # LATIN SMALL LETTER R WITH CARON +0x015a "\\'{S}" "" "" # LATIN CAPITAL LETTER S WITH ACUTE +0x015b "\\'{s}" "" "" # LATIN SMALL LETTER S WITH ACUTE +0x015c "\\^{S}" "" "" # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +0x015d "\\^{s}" "" "" # LATIN SMALL LETTER S WITH CIRCUMFLEX +0x015e "\\c{S}" "" "" # LATIN CAPITAL LETTER S WITH CEDILLA +0x015f "\\c{s}" "" "" # LATIN SMALL LETTER S WITH CEDILLA +0x0160 "\\v{S}" "" "" # LATIN CAPITAL LETTER S WITH CARON +0x0161 "\\v{s}" "" "" # LATIN SMALL LETTER S WITH CARON +0x0162 "\\c{T}" "" "" # LATIN CAPITAL LETTER T WITH CEDILLA +0x0163 "\\c{t}" "" "" # LATIN SMALL LETTER T WITH CEDILLA +0x0164 "\\v{T}" "" "" # LATIN CAPITAL LETTER T WITH CARON +0x0165 "\\v{t}" "" "" # LATIN SMALL LETTER T WITH CARON +#0x0166 "" "" "" # LATIN CAPITAL 
LETTER T WITH STROKE +#0x0167 "" "" "" # LATIN SMALL LETTER T WITH STROKE +0x0168 "\\~{U}" "" "" # LATIN CAPITAL LETTER U WITH TILDE +0x0169 "\\~{u}" "" "" # LATIN SMALL LETTER U WITH TILDE +0x016a "\\={U}" "" "" # LATIN CAPITAL LETTER U WITH MACRON +0x016b "\\={u}" "" "" # LATIN SMALL LETTER U WITH MACRON +0x016c "\\u{U}" "" "" # LATIN CAPITAL LETTER U WITH BREVE +0x016d "\\u{u}" "" "" # LATIN SMALL LETTER U WITH BREVE +0x016e "\\r{U}" "" "" # LATIN CAPITAL LETTER U WITH RING ABOVE +0x016f "\\r{u}" "" "" # LATIN SMALL LETTER U WITH RING ABOVE +0x0170 "\\H{U}" "" "" # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0x0171 "\\H{u}" "" "" # LATIN SMALL LETTER U WITH DOUBLE ACUTE +0x0172 "\\k{U}" "" "" # LATIN CAPITAL LETTER U WITH OGONEK +0x0173 "\\k{u}" "" "" # LATIN SMALL LETTER U WITH OGONEK +0x0174 "\\^{W}" "" "" # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0x0175 "\\^{w}" "" "" # LATIN SMALL LETTER W WITH CIRCUMFLEX +0x0176 "\\^{Y}" "" "" # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0x0177 "\\^{y}" "" "" # LATIN SMALL LETTER Y WITH CIRCUMFLEX +0x0178 "\\\"{Y}" "" "" # LATIN CAPITAL LETTER Y WITH DIAERESIS +0x0179 "\\'{Z}" "" "" # LATIN CAPITAL LETTER Z WITH ACUTE +0x017a "\\'{z}" "" "" # LATIN SMALL LETTER Z WITH ACUTE +0x017b "\\.{Z}" "" "" # LATIN CAPITAL LETTER Z WITH DOT ABOVE +0x017c "\\.{z}" "" "" # LATIN SMALL LETTER Z WITH DOT ABOVE +0x017d "\\v{Z}" "" "" # LATIN CAPITAL LETTER Z WITH CARON +0x017e "\\v{z}" "" "" # LATIN SMALL LETTER Z WITH CARON +#0x017f "" "" "" # LATIN SMALL LETTER LONG S +#0x0180 "" "" "" # LATIN SMALL LETTER B WITH STROKE +#0x0181 "" "" "" # LATIN CAPITAL LETTER B WITH HOOK +#0x0182 "" "" "" # LATIN CAPITAL LETTER B WITH TOPBAR +#0x0183 "" "" "" # LATIN SMALL LETTER B WITH TOPBAR +#0x0184 "" "" "" # LATIN CAPITAL LETTER TONE SIX +#0x0185 "" "" "" # LATIN SMALL LETTER TONE SIX +#0x0186 "" "" "" # LATIN CAPITAL LETTER OPEN O +#0x0187 "" "" "" # LATIN CAPITAL LETTER C WITH HOOK +#0x0188 "" "" "" # LATIN SMALL LETTER C WITH HOOK +#0x0189 "" "" "" # 
LATIN CAPITAL LETTER AFRICAN D +#0x018a "" "" "" # LATIN CAPITAL LETTER D WITH HOOK +#0x018b "" "" "" # LATIN CAPITAL LETTER D WITH TOPBAR +#0x018c "" "" "" # LATIN SMALL LETTER D WITH TOPBAR +#0x018d "" "" "" # LATIN SMALL LETTER TURNED DELTA +#0x018e "" "" "" # LATIN CAPITAL LETTER REVERSED E +#0x018f "" "" "" # LATIN CAPITAL LETTER SCHWA +#0x0190 "" "" "" # LATIN CAPITAL LETTER OPEN E +#0x0191 "" "" "" # LATIN CAPITAL LETTER F WITH HOOK +#0x0192 "" "" "" # LATIN SMALL LETTER F WITH HOOK +#0x0193 "" "" "" # LATIN CAPITAL LETTER G WITH HOOK +#0x0194 "" "" "" # LATIN CAPITAL LETTER GAMMA +#0x0195 "" "" "" # LATIN SMALL LETTER HV +#0x0196 "" "" "" # LATIN CAPITAL LETTER IOTA +#0x0197 "" "" "" # LATIN CAPITAL LETTER I WITH STROKE +#0x0198 "" "" "" # LATIN CAPITAL LETTER K WITH HOOK +#0x0199 "" "" "" # LATIN SMALL LETTER K WITH HOOK +#0x019a "" "" "" # LATIN SMALL LETTER L WITH BAR +#0x019b "" "" "" # LATIN SMALL LETTER LAMBDA WITH STROKE +#0x019c "" "" "" # LATIN CAPITAL LETTER TURNED M +#0x019d "" "" "" # LATIN CAPITAL LETTER N WITH LEFT HOOK +#0x019e "" "" "" # LATIN SMALL LETTER N WITH LONG RIGHT LEG +#0x019f "" "" "" # LATIN CAPITAL LETTER O WITH MIDDLE TILDE +#0x01a0 "" "" "" # LATIN CAPITAL LETTER O WITH HORN +#0x01a1 "" "" "" # LATIN SMALL LETTER O WITH HORN +#0x01a2 "" "" "" # LATIN CAPITAL LETTER OI +#0x01a3 "" "" "" # LATIN SMALL LETTER OI +#0x01a4 "" "" "" # LATIN CAPITAL LETTER P WITH HOOK +#0x01a5 "" "" "" # LATIN SMALL LETTER P WITH HOOK +#0x01a6 "" "" "" # LATIN LETTER YR +#0x01a7 "" "" "" # LATIN CAPITAL LETTER TONE TWO +#0x01a8 "" "" "" # LATIN SMALL LETTER TONE TWO +#0x01a9 "" "" "" # LATIN CAPITAL LETTER ESH +#0x01aa "" "" "" # LATIN LETTER REVERSED ESH LOOP +#0x01ab "" "" "" # LATIN SMALL LETTER T WITH PALATAL HOOK +#0x01ac "" "" "" # LATIN CAPITAL LETTER T WITH HOOK +#0x01ad "" "" "" # LATIN SMALL LETTER T WITH HOOK +#0x01ae "" "" "" # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +#0x01af "" "" "" # LATIN CAPITAL LETTER U WITH HORN +#0x01b0 "" "" 
"" # LATIN SMALL LETTER U WITH HORN +#0x01b1 "" "" "" # LATIN CAPITAL LETTER UPSILON +#0x01b2 "" "" "" # LATIN CAPITAL LETTER V WITH HOOK +#0x01b3 "" "" "" # LATIN CAPITAL LETTER Y WITH HOOK +#0x01b4 "" "" "" # LATIN SMALL LETTER Y WITH HOOK +#0x01b5 "" "" "" # LATIN CAPITAL LETTER Z WITH STROKE +#0x01b6 "" "" "" # LATIN SMALL LETTER Z WITH STROKE +#0x01b7 "" "" "" # LATIN CAPITAL LETTER EZH +#0x01b8 "" "" "" # LATIN CAPITAL LETTER EZH REVERSED +#0x01b9 "" "" "" # LATIN SMALL LETTER EZH REVERSED +#0x01ba "" "" "" # LATIN SMALL LETTER EZH WITH TAIL +#0x01bb "" "" "" # LATIN LETTER TWO WITH STROKE +#0x01bc "" "" "" # LATIN CAPITAL LETTER TONE FIVE +#0x01bd "" "" "" # LATIN SMALL LETTER TONE FIVE +#0x01be "" "" "" # LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE +#0x01bf "" "" "" # LATIN LETTER WYNN +#0x01c0 "" "" "" # LATIN LETTER DENTAL CLICK +#0x01c1 "" "" "" # LATIN LETTER LATERAL CLICK +#0x01c2 "" "" "" # LATIN LETTER ALVEOLAR CLICK +#0x01c3 "" "" "" # LATIN LETTER RETROFLEX CLICK +#0x01c4 "" "" "" # LATIN CAPITAL LETTER DZ WITH CARON +#0x01c5 "" "" "" # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +#0x01c6 "" "" "" # LATIN SMALL LETTER DZ WITH CARON +#0x01c7 "" "" "" # LATIN CAPITAL LETTER LJ +#0x01c8 "" "" "" # LATIN CAPITAL LETTER L WITH SMALL LETTER J +#0x01c9 "" "" "" # LATIN SMALL LETTER LJ +#0x01ca "" "" "" # LATIN CAPITAL LETTER NJ +#0x01cb "" "" "" # LATIN CAPITAL LETTER N WITH SMALL LETTER J +#0x01cc "" "" "" # LATIN SMALL LETTER NJ +0x01cd "\\v{A}" "" "" # LATIN CAPITAL LETTER A WITH CARON +0x01ce "\\v{a}" "" "" # LATIN SMALL LETTER A WITH CARON +0x01cf "\\v{I}" "" "" # LATIN CAPITAL LETTER I WITH CARON +0x01d0 "\\v{\\i}" "" "" # LATIN SMALL LETTER I WITH CARON +0x01d1 "\\v{O}" "" "" # LATIN CAPITAL LETTER O WITH CARON +0x01d2 "\\v{o}" "" "" # LATIN SMALL LETTER O WITH CARON +0x01d3 "\\v{U}" "" "" # LATIN CAPITAL LETTER U WITH CARON +0x01d4 "\\v{u}" "" "" # LATIN SMALL LETTER U WITH CARON +#0x01d5 "" "" "" # LATIN CAPITAL LETTER U WITH 
DIAERESIS AND MACRON +#0x01d6 "" "" "" # LATIN SMALL LETTER U WITH DIAERESIS AND MACRON +#0x01d7 "" "" "" # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +#0x01d8 "" "" "" # LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE +#0x01d9 "" "" "" # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +#0x01da "" "" "" # LATIN SMALL LETTER U WITH DIAERESIS AND CARON +#0x01db "" "" "" # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +#0x01dc "" "" "" # LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE +#0x01dd "" "" "" # LATIN SMALL LETTER TURNED E +#0x01de "" "" "" # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +#0x01df "" "" "" # LATIN SMALL LETTER A WITH DIAERESIS AND MACRON +#0x01e0 "" "" "" # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +#0x01e1 "" "" "" # LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON +#0x01e2 "" "" "" # LATIN CAPITAL LETTER AE WITH MACRON +#0x01e3 "" "" "" # LATIN SMALL LETTER AE WITH MACRON +#0x01e4 "" "" "" # LATIN CAPITAL LETTER G WITH STROKE +#0x01e5 "" "" "" # LATIN SMALL LETTER G WITH STROKE +0x01e6 "\\v{G}" "" "" # LATIN CAPITAL LETTER G WITH CARON +0x01e7 "\\v{g}" "" "" # LATIN SMALL LETTER G WITH CARON +0x01e8 "\\v{K}" "" "" # LATIN CAPITAL LETTER K WITH CARON +0x01e9 "\\v{k}" "" "" # LATIN SMALL LETTER K WITH CARON +0x01ea "\\k{O}" "" "" # LATIN CAPITAL LETTER O WITH OGONEK +0x01eb "\\k{o}" "" "" # LATIN SMALL LETTER O WITH OGONEK +#0x01ec "" "" "" # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +#0x01ed "" "" "" # LATIN SMALL LETTER O WITH OGONEK AND MACRON +#0x01ee "" "" "" # LATIN CAPITAL LETTER EZH WITH CARON +#0x01ef "" "" "" # LATIN SMALL LETTER EZH WITH CARON +0x01f0 "\\v{\\j}" "" "" # LATIN SMALL LETTER J WITH CARON +0x01ce "\\v{a}" "" "" # LATIN SMALL LETTER A WITH CARON +#0x01f1 "" "" "" # LATIN CAPITAL LETTER DZ +#0x01f2 "" "" "" # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +#0x01f3 "" "" "" # LATIN SMALL LETTER DZ +0x01f4 "\\'{G}" "" "" # LATIN CAPITAL LETTER G WITH ACUTE +0x01f5 "\\'{g}" "" "" # LATIN SMALL LETTER G WITH ACUTE 
+#0x01f6 "" "" "" # LATIN CAPITAL LETTER HWAIR +#0x01f7 "" "" "" # LATIN CAPITAL LETTER WYNN +0x01f8 "\\`{N}" "" "" # LATIN CAPITAL LETTER N WITH GRAVE +0x01f9 "\\`{n}" "" "" # LATIN SMALL LETTER N WITH GRAVE +#0x01fa "" "" "" # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +#0x01fb "" "" "" # LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE +#0x01fc "" "" "" # LATIN CAPITAL LETTER AE WITH ACUTE +#0x01fd "" "" "" # LATIN SMALL LETTER AE WITH ACUTE +#0x01fe "" "" "" # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE +#0x01ff "" "" "" # LATIN SMALL LETTER O WITH STROKE AND ACUTE +#0x0200 "" "" "" # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +#0x0201 "" "" "" # LATIN SMALL LETTER A WITH DOUBLE GRAVE +#0x0202 "" "" "" # LATIN CAPITAL LETTER A WITH INVERTED BREVE +#0x0203 "" "" "" # LATIN SMALL LETTER A WITH INVERTED BREVE +#0x0204 "" "" "" # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +#0x0205 "" "" "" # LATIN SMALL LETTER E WITH DOUBLE GRAVE +#0x0206 "" "" "" # LATIN CAPITAL LETTER E WITH INVERTED BREVE +#0x0207 "" "" "" # LATIN SMALL LETTER E WITH INVERTED BREVE +#0x0208 "" "" "" # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +#0x0209 "" "" "" # LATIN SMALL LETTER I WITH DOUBLE GRAVE +#0x020a "" "" "" # LATIN CAPITAL LETTER I WITH INVERTED BREVE +#0x020b "" "" "" # LATIN SMALL LETTER I WITH INVERTED BREVE +#0x020c "" "" "" # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +#0x020d "" "" "" # LATIN SMALL LETTER O WITH DOUBLE GRAVE +#0x020e "" "" "" # LATIN CAPITAL LETTER O WITH INVERTED BREVE +#0x020f "" "" "" # LATIN SMALL LETTER O WITH INVERTED BREVE +#0x0210 "" "" "" # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +#0x0211 "" "" "" # LATIN SMALL LETTER R WITH DOUBLE GRAVE +#0x0212 "" "" "" # LATIN CAPITAL LETTER R WITH INVERTED BREVE +#0x0213 "" "" "" # LATIN SMALL LETTER R WITH INVERTED BREVE +#0x0214 "" "" "" # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +#0x0215 "" "" "" # LATIN SMALL LETTER U WITH DOUBLE GRAVE +#0x0216 "" "" "" # LATIN CAPITAL LETTER U WITH INVERTED BREVE +#0x0217 "" "" 
"" # LATIN SMALL LETTER U WITH INVERTED BREVE +#0x0218 "" "" "" # LATIN CAPITAL LETTER S WITH COMMA BELOW +#0x0219 "" "" "" # LATIN SMALL LETTER S WITH COMMA BELOW +#0x021a "" "" "" # LATIN CAPITAL LETTER T WITH COMMA BELOW +#0x021b "" "" "" # LATIN SMALL LETTER T WITH COMMA BELOW +#0x021c "" "" "" # LATIN CAPITAL LETTER YOGH +#0x021d "" "" "" # LATIN SMALL LETTER YOGH +0x021e "\\v{H}" "" "" # LATIN CAPITAL LETTER H WITH CARON +0x021f "\\v{h}" "" "" # LATIN SMALL LETTER H WITH CARON +#0x0220 "" "" "" # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +#0x0222 "" "" "" # LATIN CAPITAL LETTER OU +#0x0223 "" "" "" # LATIN SMALL LETTER OU +#0x0224 "" "" "" # LATIN CAPITAL LETTER Z WITH HOOK +#0x0225 "" "" "" # LATIN SMALL LETTER Z WITH HOOK +0x0226 "\\.{A}" "" "" # LATIN CAPITAL LETTER A WITH DOT ABOVE +0x0227 "\\.{a}" "" "" # LATIN SMALL LETTER A WITH DOT ABOVE +0x0228 "\\c{E}" "" "" # LATIN CAPITAL LETTER E WITH CEDILLA +0x0229 "\\c{e}" "" "" # LATIN SMALL LETTER E WITH CEDILLA +#0x022a "" "" "" # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON +#0x022b "" "" "" # LATIN SMALL LETTER O WITH DIAERESIS AND MACRON +#0x022c "" "" "" # LATIN CAPITAL LETTER O WITH TILDE AND MACRON +#0x022d "" "" "" # LATIN SMALL LETTER O WITH TILDE AND MACRON +0x022e "\\.{O}" "" "" # LATIN CAPITAL LETTER O WITH DOT ABOVE +0x022f "\\.{o}" "" "" # LATIN SMALL LETTER O WITH DOT ABOVE +#0x0230 "" "" "" # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +#0x0231 "" "" "" # LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON +0x0232 "\\={Y}" "" "" # LATIN CAPITAL LETTER Y WITH MACRON +0x0233 "\\={y}" "" "" # LATIN SMALL LETTER Y WITH MACRON +0x0237 "\\j" "" "" # LATIN SMALL LETTER DOTLESS J +#0x0250 "" "" "" # LATIN SMALL LETTER TURNED A +#0x0251 "" "" "" # LATIN SMALL LETTER ALPHA +#0x0252 "" "" "" # LATIN SMALL LETTER TURNED ALPHA +#0x0253 "" "" "" # LATIN SMALL LETTER B WITH HOOK +#0x0254 "" "" "" # LATIN SMALL LETTER OPEN O +#0x0255 "" "" "" # LATIN SMALL LETTER C WITH CURL +#0x0256 "" "" "" # LATIN 
SMALL LETTER D WITH TAIL +#0x0257 "" "" "" # LATIN SMALL LETTER D WITH HOOK +#0x0258 "" "" "" # LATIN SMALL LETTER REVERSED E +#0x0259 "" "" "" # LATIN SMALL LETTER SCHWA +#0x025a "" "" "" # LATIN SMALL LETTER SCHWA WITH HOOK +#0x025b "" "" "" # LATIN SMALL LETTER OPEN E +#0x025c "" "" "" # LATIN SMALL LETTER REVERSED OPEN E +#0x025d "" "" "" # LATIN SMALL LETTER REVERSED OPEN E WITH HOOK +#0x025e "" "" "" # LATIN SMALL LETTER CLOSED REVERSED OPEN E +#0x025f "" "" "" # LATIN SMALL LETTER DOTLESS J WITH STROKE +#0x0260 "" "" "" # LATIN SMALL LETTER G WITH HOOK +#0x0261 "" "" "" # LATIN SMALL LETTER SCRIPT G +#0x0262 "" "" "" # LATIN LETTER SMALL CAPITAL G +#0x0263 "" "" "" # LATIN SMALL LETTER GAMMA +#0x0264 "" "" "" # LATIN SMALL LETTER RAMS HORN +#0x0265 "" "" "" # LATIN SMALL LETTER TURNED H +#0x0266 "" "" "" # LATIN SMALL LETTER H WITH HOOK +#0x0267 "" "" "" # LATIN SMALL LETTER HENG WITH HOOK +#0x0268 "" "" "" # LATIN SMALL LETTER I WITH STROKE +#0x0269 "" "" "" # LATIN SMALL LETTER IOTA +#0x026a "" "" "" # LATIN LETTER SMALL CAPITAL I +#0x026b "" "" "" # LATIN SMALL LETTER L WITH MIDDLE TILDE +#0x026c "" "" "" # LATIN SMALL LETTER L WITH BELT +#0x026d "" "" "" # LATIN SMALL LETTER L WITH RETROFLEX HOOK +#0x026e "" "" "" # LATIN SMALL LETTER LEZH +#0x026f "" "" "" # LATIN SMALL LETTER TURNED M +#0x0270 "" "" "" # LATIN SMALL LETTER TURNED M WITH LONG LEG +#0x0271 "" "" "" # LATIN SMALL LETTER M WITH HOOK +#0x0272 "" "" "" # LATIN SMALL LETTER N WITH LEFT HOOK +#0x0273 "" "" "" # LATIN SMALL LETTER N WITH RETROFLEX HOOK +#0x0274 "" "" "" # LATIN LETTER SMALL CAPITAL N +#0x0275 "" "" "" # LATIN SMALL LETTER BARRED O +#0x0276 "" "" "" # LATIN LETTER SMALL CAPITAL OE +#0x0277 "" "" "" # LATIN SMALL LETTER CLOSED OMEGA +#0x0278 "" "" "" # LATIN SMALL LETTER PHI +#0x0279 "" "" "" # LATIN SMALL LETTER TURNED R +#0x027a "" "" "" # LATIN SMALL LETTER TURNED R WITH LONG LEG +#0x027b "" "" "" # LATIN SMALL LETTER TURNED R WITH HOOK +#0x027c "" "" "" # LATIN SMALL LETTER R 
WITH LONG LEG +#0x027d "" "" "" # LATIN SMALL LETTER R WITH TAIL +#0x027e "" "" "" # LATIN SMALL LETTER R WITH FISHHOOK +#0x027f "" "" "" # LATIN SMALL LETTER REVERSED R WITH FISHHOOK +#0x0280 "" "" "" # LATIN LETTER SMALL CAPITAL R +#0x0281 "" "" "" # LATIN LETTER SMALL CAPITAL INVERTED R +#0x0282 "" "" "" # LATIN SMALL LETTER S WITH HOOK +#0x0283 "" "" "" # LATIN SMALL LETTER ESH +#0x0284 "" "" "" # LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK +#0x0285 "" "" "" # LATIN SMALL LETTER SQUAT REVERSED ESH +#0x0286 "" "" "" # LATIN SMALL LETTER ESH WITH CURL +#0x0287 "" "" "" # LATIN SMALL LETTER TURNED T +#0x0288 "" "" "" # LATIN SMALL LETTER T WITH RETROFLEX HOOK +#0x0289 "" "" "" # LATIN SMALL LETTER U BAR +#0x028a "" "" "" # LATIN SMALL LETTER UPSILON +#0x028b "" "" "" # LATIN SMALL LETTER V WITH HOOK +#0x028c "" "" "" # LATIN SMALL LETTER TURNED V +#0x028d "" "" "" # LATIN SMALL LETTER TURNED W +#0x028e "" "" "" # LATIN SMALL LETTER TURNED Y +#0x028f "" "" "" # LATIN LETTER SMALL CAPITAL Y +#0x0290 "" "" "" # LATIN SMALL LETTER Z WITH RETROFLEX HOOK +#0x0291 "" "" "" # LATIN SMALL LETTER Z WITH CURL +#0x0292 "" "" "" # LATIN SMALL LETTER EZH +#0x0293 "" "" "" # LATIN SMALL LETTER EZH WITH CURL +#0x0294 "" "" "" # LATIN LETTER GLOTTAL STOP +#0x0295 "" "" "" # LATIN LETTER PHARYNGEAL VOICED FRICATIVE +#0x0296 "" "" "" # LATIN LETTER INVERTED GLOTTAL STOP +#0x0297 "" "" "" # LATIN LETTER STRETCHED C +#0x0298 "" "" "" # LATIN LETTER BILABIAL CLICK +#0x0299 "" "" "" # LATIN LETTER SMALL CAPITAL B +#0x029a "" "" "" # LATIN SMALL LETTER CLOSED OPEN E +#0x029b "" "" "" # LATIN LETTER SMALL CAPITAL G WITH HOOK +#0x029c "" "" "" # LATIN LETTER SMALL CAPITAL H +#0x029d "" "" "" # LATIN SMALL LETTER J WITH CROSSED-TAIL +#0x029e "" "" "" # LATIN SMALL LETTER TURNED K +#0x029f "" "" "" # LATIN LETTER SMALL CAPITAL L +#0x02a0 "" "" "" # LATIN SMALL LETTER Q WITH HOOK +#0x02a1 "" "" "" # LATIN LETTER GLOTTAL STOP WITH STROKE +#0x02a2 "" "" "" # LATIN LETTER REVERSED GLOTTAL 
STOP WITH STROKE +#0x02a3 "" "" "" # LATIN SMALL LETTER DZ DIGRAPH +#0x02a4 "" "" "" # LATIN SMALL LETTER DEZH DIGRAPH +#0x02a5 "" "" "" # LATIN SMALL LETTER DZ DIGRAPH WITH CURL +#0x02a6 "" "" "" # LATIN SMALL LETTER TS DIGRAPH +#0x02a7 "" "" "" # LATIN SMALL LETTER TESH DIGRAPH +#0x02a8 "" "" "" # LATIN SMALL LETTER TC DIGRAPH WITH CURL +#0x02a9 "" "" "" # LATIN SMALL LETTER FENG DIGRAPH +#0x02aa "" "" "" # LATIN SMALL LETTER LS DIGRAPH +#0x02ab "" "" "" # LATIN SMALL LETTER LZ DIGRAPH +#0x02ac "" "" "" # LATIN LETTER BILABIAL PERCUSSIVE +#0x02ad "" "" "" # LATIN LETTER BIDENTAL PERCUSSIVE +#0x02b0 "" "" "" # MODIFIER LETTER SMALL H +#0x02b1 "" "" "" # MODIFIER LETTER SMALL H WITH HOOK +#0x02b2 "" "" "" # MODIFIER LETTER SMALL J +#0x02b3 "" "" "" # MODIFIER LETTER SMALL R +#0x02b4 "" "" "" # MODIFIER LETTER SMALL TURNED R +#0x02b5 "" "" "" # MODIFIER LETTER SMALL TURNED R WITH HOOK +#0x02b6 "" "" "" # MODIFIER LETTER SMALL CAPITAL INVERTED R +#0x02b7 "" "" "" # MODIFIER LETTER SMALL W +#0x02b8 "" "" "" # MODIFIER LETTER SMALL Y +#0x02b9 "" "" "" # MODIFIER LETTER PRIME +#0x02ba "" "" "" # MODIFIER LETTER DOUBLE PRIME +#0x02bb "" "" "" # MODIFIER LETTER TURNED COMMA +#0x02bc "" "" "" # MODIFIER LETTER APOSTROPHE +#0x02bd "" "" "" # MODIFIER LETTER REVERSED COMMA +#0x02be "" "" "" # MODIFIER LETTER RIGHT HALF RING +#0x02bf "" "" "" # MODIFIER LETTER LEFT HALF RING +#0x02c0 "" "" "" # MODIFIER LETTER GLOTTAL STOP +#0x02c1 "" "" "" # MODIFIER LETTER REVERSED GLOTTAL STOP +#0x02c2 "" "" "" # MODIFIER LETTER LEFT ARROWHEAD +#0x02c3 "" "" "" # MODIFIER LETTER RIGHT ARROWHEAD +#0x02c4 "" "" "" # MODIFIER LETTER UP ARROWHEAD +#0x02c5 "" "" "" # MODIFIER LETTER DOWN ARROWHEAD +#0x02c6 "" "" "" # MODIFIER LETTER CIRCUMFLEX ACCENT +0x02c7 "\\v{ }" "" "" # CARON +#0x02c8 "" "" "" # MODIFIER LETTER VERTICAL LINE +#0x02c9 "" "" "" # MODIFIER LETTER MACRON +#0x02ca "" "" "" # MODIFIER LETTER ACUTE ACCENT +#0x02cb "" "" "" # MODIFIER LETTER GRAVE ACCENT +#0x02cc "" "" "" # 
MODIFIER LETTER LOW VERTICAL LINE +#0x02cd "" "" "" # MODIFIER LETTER LOW MACRON +#0x02ce "" "" "" # MODIFIER LETTER LOW GRAVE ACCENT +#0x02cf "" "" "" # MODIFIER LETTER LOW ACUTE ACCENT +#0x02d0 "" "" "" # MODIFIER LETTER TRIANGULAR COLON +#0x02d1 "" "" "" # MODIFIER LETTER HALF TRIANGULAR COLON +#0x02d2 "" "" "" # MODIFIER LETTER CENTRED RIGHT HALF RING +#0x02d3 "" "" "" # MODIFIER LETTER CENTRED LEFT HALF RING +#0x02d4 "" "" "" # MODIFIER LETTER UP TACK +#0x02d5 "" "" "" # MODIFIER LETTER DOWN TACK +#0x02d6 "" "" "" # MODIFIER LETTER PLUS SIGN +#0x02d7 "" "" "" # MODIFIER LETTER MINUS SIGN +0x02d8 "\\u{ }" "" "" # BREVE +0x02d9 "\\.{ }" "" "" # DOT ABOVE +0x02da "\\r{ }" "" "" # RING ABOVE +0x02db "\\k{ }" "" "" # OGONEK +0x02dc "\\~{ }" "" "" # SMALL TILDE +0x02dd "\\H{ }" "" "" # DOUBLE ACUTE ACCENT +#0x02de "" "" "" # MODIFIER LETTER RHOTIC HOOK +#0x02df "" "" "" # MODIFIER LETTER CROSS ACCENT +#0x02e0 "" "" "" # MODIFIER LETTER SMALL GAMMA +#0x02e1 "" "" "" # MODIFIER LETTER SMALL L +#0x02e2 "" "" "" # MODIFIER LETTER SMALL S +#0x02e3 "" "" "" # MODIFIER LETTER SMALL X +#0x02e4 "" "" "" # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP +#0x02e5 "" "" "" # MODIFIER LETTER EXTRA-HIGH TONE BAR +#0x02e6 "" "" "" # MODIFIER LETTER HIGH TONE BAR +#0x02e7 "" "" "" # MODIFIER LETTER MID TONE BAR +#0x02e8 "" "" "" # MODIFIER LETTER LOW TONE BAR +#0x02e9 "" "" "" # MODIFIER LETTER EXTRA-LOW TONE BAR +#0x02ea "" "" "" # MODIFIER LETTER YIN DEPARTING TONE MARK +#0x02eb "" "" "" # MODIFIER LETTER YANG DEPARTING TONE MARK +#0x02ec "" "" "" # MODIFIER LETTER VOICING +#0x02ed "" "" "" # MODIFIER LETTER UNASPIRATED +#0x02ee "" "" "" # MODIFIER LETTER DOUBLE APOSTROPHE +0x0300 "\\`" "" "combining" # COMBINING GRAVE ACCENT +0x0301 "\\'" "" "combining" # COMBINING ACUTE ACCENT +0x0302 "\\^" "" "combining" # COMBINING CIRCUMFLEX ACCENT +0x0303 "\\~" "" "combining" # COMBINING TILDE +0x0304 "\\=" "" "combining" # COMBINING MACRON +#0x0305 "" "" "combining" # COMBINING OVERLINE 
+0x0306 "\\u" "" "combining" # COMBINING BREVE +0x0307 "\\." "" "combining" # COMBINING DOT ABOVE +0x0308 "\\\"" "" "combining" # COMBINING DIAERESIS +#0x0309 "" "" "combining" # COMBINING HOOK ABOVE +0x030a "\\r" "" "combining" # COMBINING RING ABOVE +0x030b "\\H" "" "combining" # COMBINING DOUBLE ACUTE ACCENT +0x030c "\\v" "" "combining" # COMBINING CARON +#0x030d "" "" "combining" # COMBINING VERTICAL LINE ABOVE +#0x030e "" "" "combining" # COMBINING DOUBLE VERTICAL LINE ABOVE +#0x030f "" "" "combining" # COMBINING DOUBLE GRAVE ACCENT +#0x0310 "" "" "combining" # COMBINING CANDRABINDU +#0x0311 "" "" "combining" # COMBINING INVERTED BREVE +#0x0312 "" "" "combining" # COMBINING TURNED COMMA ABOVE +#0x0313 "" "" "combining" # COMBINING COMMA ABOVE +#0x0314 "" "" "combining" # COMBINING REVERSED COMMA ABOVE +#0x0315 "" "" "combining" # COMBINING COMMA ABOVE RIGHT +#0x0316 "" "" "combining" # COMBINING GRAVE ACCENT BELOW +#0x0317 "" "" "combining" # COMBINING ACUTE ACCENT BELOW +#0x0318 "" "" "combining" # COMBINING LEFT TACK BELOW +#0x0319 "" "" "combining" # COMBINING RIGHT TACK BELOW +#0x031a "" "" "combining" # COMBINING LEFT ANGLE ABOVE +#0x031b "" "" "combining" # COMBINING HORN +#0x031c "" "" "combining" # COMBINING LEFT HALF RING BELOW +#0x031d "" "" "combining" # COMBINING UP TACK BELOW +#0x031e "" "" "combining" # COMBINING DOWN TACK BELOW +#0x031f "" "" "combining" # COMBINING PLUS SIGN BELOW +0x0320 "\\b" "" "combining" # COMBINING MINUS SIGN BELOW +#0x0321 "" "" "combining" # COMBINING PALATALIZED HOOK BELOW +#0x0322 "" "" "combining" # COMBINING RETROFLEX HOOK BELOW +0x0323 "\\d" "" "combining" # COMBINING DOT BELOW +#0x0324 "" "" "combining" # COMBINING DIAERESIS BELOW +#0x0325 "" "" "combining" # COMBINING RING BELOW +#0x0326 "" "" "combining" # COMBINING COMMA BELOW +0x0327 "\\c" "" "combining" # COMBINING CEDILLA +0x0328 "\\k" "" "combining" # COMBINING OGONEK +#0x0329 "" "" "combining" # COMBINING VERTICAL LINE BELOW +#0x032a "" "" "combining" # 
COMBINING BRIDGE BELOW +#0x032b "" "" "combining" # COMBINING INVERTED DOUBLE ARCH BELOW +#0x032c "" "" "combining" # COMBINING CARON BELOW +#0x032d "" "" "combining" # COMBINING CIRCUMFLEX ACCENT BELOW +#0x032e "" "" "combining" # COMBINING BREVE BELOW +#0x032f "" "" "combining" # COMBINING INVERTED BREVE BELOW +#0x0330 "" "" "combining" # COMBINING TILDE BELOW +#0x0331 "" "" "combining" # COMBINING MACRON BELOW +#0x0332 "" "" "combining" # COMBINING LOW LINE +#0x0333 "" "" "combining" # COMBINING DOUBLE LOW LINE +#0x0334 "" "" "combining" # COMBINING TILDE OVERLAY +#0x0335 "" "" "combining" # COMBINING SHORT STROKE OVERLAY +#0x0336 "" "" "combining" # COMBINING LONG STROKE OVERLAY +#0x0337 "" "" "combining" # COMBINING SHORT SOLIDUS OVERLAY +#0x0338 "" "" "combining" # COMBINING LONG SOLIDUS OVERLAY +#0x0339 "" "" "combining" # COMBINING RIGHT HALF RING BELOW +#0x033a "" "" "combining" # COMBINING INVERTED BRIDGE BELOW +#0x033b "" "" "combining" # COMBINING SQUARE BELOW +#0x033c "" "" "combining" # COMBINING SEAGULL BELOW +#0x033d "" "" "combining" # COMBINING X ABOVE +#0x033e "" "" "combining" # COMBINING VERTICAL TILDE +#0x033f "" "" "combining" # COMBINING DOUBLE OVERLINE +#0x0340 "" "" "combining" # COMBINING GRAVE TONE MARK +#0x0341 "" "" "combining" # COMBINING ACUTE TONE MARK +#0x0342 "" "" "combining" # COMBINING GREEK PERISPOMENI +#0x0343 "" "" "combining" # COMBINING GREEK KORONIS +#0x0344 "" "" "combining" # COMBINING GREEK DIALYTIKA TONOS +#0x0345 "" "" "combining" # COMBINING GREEK YPOGEGRAMMENI +#0x0346 "" "" "combining" # COMBINING BRIDGE ABOVE +#0x0347 "" "" "combining" # COMBINING EQUALS SIGN BELOW +#0x0348 "" "" "combining" # COMBINING DOUBLE VERTICAL LINE BELOW +#0x0349 "" "" "combining" # COMBINING LEFT ANGLE BELOW +#0x034a "" "" "combining" # COMBINING NOT TILDE ABOVE +#0x034b "" "" "combining" # COMBINING HOMOTHETIC ABOVE +#0x034c "" "" "combining" # COMBINING ALMOST EQUAL TO ABOVE +#0x034d "" "" "combining" # COMBINING LEFT RIGHT ARROW 
BELOW +#0x034e "" "" "combining" # COMBINING UPWARDS ARROW BELOW +#0x034f "" "" "" # COMBINING GRAPHEME JOINER +#0x0360 "" "" "combining" # COMBINING DOUBLE TILDE +0x0361 "\\t" "" "combining" # COMBINING DOUBLE INVERTED BREVE (ligature tie) +#0x0362 "" "" "combining" # COMBINING DOUBLE RIGHTWARDS ARROW BELOW +#0x0363 "" "" "combining" # COMBINING LATIN SMALL LETTER A +#0x0364 "" "" "combining" # COMBINING LATIN SMALL LETTER E +#0x0365 "" "" "combining" # COMBINING LATIN SMALL LETTER I +#0x0366 "" "" "combining" # COMBINING LATIN SMALL LETTER O +#0x0367 "" "" "combining" # COMBINING LATIN SMALL LETTER U +#0x0368 "" "" "combining" # COMBINING LATIN SMALL LETTER C +#0x0369 "" "" "combining" # COMBINING LATIN SMALL LETTER D +#0x036a "" "" "combining" # COMBINING LATIN SMALL LETTER H +#0x036b "" "" "combining" # COMBINING LATIN SMALL LETTER M +#0x036c "" "" "combining" # COMBINING LATIN SMALL LETTER R +#0x036d "" "" "combining" # COMBINING LATIN SMALL LETTER T +#0x036e "" "" "combining" # COMBINING LATIN SMALL LETTER V +#0x036f "" "" "combining" # COMBINING LATIN SMALL LETTER X +0x1ea1 "\\d{a}" "" "" # LATIN SMALL LETTER A WITH DOT BELOW +0x20ac "\\texteuro" "textcomp" "" # EURO SIGN + diff --git a/src/encoding.C b/src/encoding.C index d8e70377b0..aa5ed9304a 100644 --- a/src/encoding.C +++ b/src/encoding.C @@ -15,14 +15,19 @@ #include "encoding.h" #include "debug.h" +#include "LaTeXFeatures.h" #include "lyxlex.h" #include "lyxrc.h" #include "support/filename.h" +#include "support/lstrings.h" +#include "support/unicode.h" namespace lyx { +using support::FileName; + #ifndef CXX_GLOBAL_CSTD using std::strtol; #endif @@ -177,9 +182,97 @@ char_type arabic_table[63][2] = { char_type const arabic_start = 0xc1; + +/// Information about a single UCS4 character +struct CharInfo { + /// LaTeX command for this character + docstring command; + /// Needed LaTeX preamble (or feature) + string preamble; + /// Is this a combining character? 
+ bool combining; + /// Is \c preamble a feature known by LaTeXFeatures, or a raw LaTeX + /// command? + bool feature; + /// Always force the LaTeX command, even if the encoding contains + /// this character? + bool force; +}; + + +typedef std::map CharInfoMap; +CharInfoMap unicodesymbols; + } // namespace anon +Encoding::Encoding(string const & n, string const & l, string const & i) + : Name_(n), LatexName_(l), iconvName_(i) +{ + if (n == "utf8") + // UTF8 can encode all 1<<20 + 1<<16 UCS4 code points + start_encodable_ = 0x110000; + else { + start_encodable_ = 0; + // temporarily switch off lyxerr, since we will generate iconv errors + lyxerr.disable(); + for (unsigned short j = 0; j < 256; ++j) { + char const c = j; + std::vector const ucs4 = eightbit_to_ucs4(&c, 1, i); + if (ucs4.size() == 1) { + char_type const c = ucs4[0]; + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + if (it == unicodesymbols.end() || !it->second.force) + encodable_.insert(c); + } + } + lyxerr.enable(); + CharSet::iterator it = encodable_.find(start_encodable_); + while (it != encodable_.end()) { + encodable_.erase(it); + ++start_encodable_; + it = encodable_.find(start_encodable_); + } + } +} + + +docstring const Encoding::latexChar(char_type c) const +{ + if (c < start_encodable_) + return docstring(1, c); + if (encodable_.find(c) == encodable_.end()) { + // c cannot be encoded in this encoding + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + if (it == unicodesymbols.end()) + lyxerr << "Could not find LaTeX command for character 0x" + << std::hex << c << ".\nLaTeX export will fail." 
+ << endl; + else + return it->second.command; + } + return docstring(1, c); +} + + +void Encoding::validate(char_type c, LaTeXFeatures & features) const +{ + if (c < start_encodable_) + return; + + if (encodable_.find(c) != encodable_.end()) + return; + + // c cannot be encoded in this encoding + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + if (it != unicodesymbols.end() && !it->second.preamble.empty()) { + if (it->second.feature) + features.require(it->second.preamble); + else + features.addPreambleSnippet(it->second.preamble); + } +} + bool Encodings::isComposeChar_hebrew(char_type c) { @@ -226,6 +319,15 @@ char_type Encodings::transformChar(char_type c, } +bool Encodings::isCombiningChar(char_type c) +{ + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + if (it != unicodesymbols.end()) + return it->second.combining; + return false; +} + + Encoding const * Encodings::getFromLyXName(string const & name) const { EncodingList::const_iterator it = encodinglist.find(name); @@ -255,8 +357,68 @@ Encodings::Encodings() { } -void Encodings::read(support::FileName const & filename) + +void Encodings::read(FileName const & encfile, FileName const & symbolsfile) { + // We must read the symbolsfile first, because the Encoding + // constructor depends on it. + LyXLex symbolslex(0, 0); + symbolslex.setFile(symbolsfile); + while (symbolslex.isOK()) { + char_type symbol; + CharInfo info; + string flags; + + if (symbolslex.next(true)) { + std::istringstream is(symbolslex.getString()); + // reading symbol directly does not work if + // char_type == std::wchar_t. 
+ boost::uint32_t tmp; + if(!(is >> std::hex >> tmp)) + break; + symbol = tmp; + } else + break; + if (symbolslex.next(true)) + info.command = symbolslex.getDocString(); + else + break; + if (symbolslex.next(true)) + info.preamble = symbolslex.getString(); + else + break; + if (symbolslex.next(true)) + flags = symbolslex.getString(); + else + break; + + info.combining = false; + info.force = false; + while (!flags.empty()) { + string flag; + flags = support::split(flags, flag, ','); + if (flag == "combining") + info.combining = true; + else if (flag == "force") + info.force = true; + else + lyxerr << "Ignoring unknown flag `" << flag + << "' for symbol `0x" << std::hex + << symbol << "'." << endl; + } + + if (!info.preamble.empty()) + info.feature = info.preamble[0] != '\\'; + + lyxerr[Debug::INFO] + << "Read unicode symbol " << symbol << " '" + << to_utf8(info.command) << "' '" << info.preamble + << "' " << info.combining << ' ' << info.feature + << endl; + unicodesymbols[symbol] = info; + } + + // Now read the encodings enum Encodingtags { et_encoding = 1, et_end, @@ -269,7 +431,7 @@ void Encodings::read(support::FileName const & filename) }; LyXLex lex(encodingtags, et_last - 1); - lex.setFile(filename); + lex.setFile(encfile); while (lex.isOK()) { switch (lex.lex()) { case et_encoding: diff --git a/src/encoding.h b/src/encoding.h index a817cdff6c..b738893132 100644 --- a/src/encoding.h +++ b/src/encoding.h @@ -13,15 +13,17 @@ #ifndef ENCODING_H #define ENCODING_H -#include -#include +#include "support/docstring.h" -#include "support/types.h" +#include namespace lyx { namespace support { class FileName; } +class LaTeXFeatures; + + /// class Encoding { public: @@ -29,16 +31,25 @@ public: Encoding() {} /// Encoding(std::string const & n, std::string const & l, - std::string const & i) - : Name_(n), LatexName_(l), iconvName_(i) - { - } + std::string const & i); /// std::string const & name() const { return Name_; } /// std::string const & latexName() const { return 
LatexName_; } /// std::string const & iconvName() const { return iconvName_; } + /** + * Convert \p c to something that LaTeX can understand. + * This is either the character itself (if it is representable + * in this encoding), or a LaTeX macro. + * If the character is not representable in this encoding, but no + * LaTeX macro is known, a warning is given on lyxerr, and the + * character is returned. + */ + docstring const latexChar(char_type c) const; + /// Add the preamble snippet needed for the output of latexChar(c) + /// to \p features. + void validate(char_type c, LaTeXFeatures & features) const; private: /// std::string Name_; @@ -46,6 +57,15 @@ private: std::string LatexName_; /// std::string iconvName_; + /// + typedef std::set CharSet; + /// Set of UCS4 characters that we can encode (for singlebyte + /// encodings only) + CharSet encodable_; + /// All code points below this are encodable. This helps us to avoid + /// lookup of ASCII characters in encodable_ and gives about 1 sec + /// speedup on export of the Userguide. + char_type start_encodable_; }; class Encodings { @@ -64,8 +84,11 @@ public: }; /// Encodings(); - /// - void read(support::FileName const & filename); + /// Read the encodings. + /// \param encfile encodings definition file + /// \param symbolsfile unicode->LaTeX mapping file + void read(support::FileName const & encfile, + support::FileName const & symbolsfile); /// Get encoding from LyX name \p name Encoding const * getFromLyXName(std::string const & name) const; /// Get encoding from LaTeX name \p name @@ -97,6 +120,8 @@ public: static bool is_arabic(char_type c); /// static char_type transformChar(char_type c, Letter_Form form); + /// Is this a combining char?
+ static bool isCombiningChar(char_type c); private: /// diff --git a/src/lyx_main.C b/src/lyx_main.C index 97f71100ff..d9291ae68c 100644 --- a/src/lyx_main.C +++ b/src/lyx_main.C @@ -880,7 +880,7 @@ bool LyX::init() if (!readRcFile("preferences")) return false; - if (!readEncodingsFile("encodings")) + if (!readEncodingsFile("encodings", "unicodesymbols")) return false; if (!readLanguagesFile("languages")) return false; @@ -1247,16 +1247,24 @@ bool LyX::readLanguagesFile(string const & name) // Read the encodings file `name' -bool LyX::readEncodingsFile(string const & name) +bool LyX::readEncodingsFile(string const & enc_name, + string const & symbols_name) { - lyxerr[Debug::INIT] << "About to read " << name << "..." << endl; + lyxerr[Debug::INIT] << "About to read " << enc_name << " and " + << symbols_name << "..." << endl; - FileName const enc_path = libFileSearch(string(), name); - if (enc_path.empty()) { - showFileError(name); + FileName const symbols_path = libFileSearch(string(), symbols_name); + if (symbols_path.empty()) { + showFileError(symbols_name); return false; } - encodings.read(enc_path); + + FileName const enc_path = libFileSearch(string(), enc_name); + if (enc_path.empty()) { + showFileError(enc_name); + return false; + } + encodings.read(enc_path, symbols_path); return true; } diff --git a/src/lyx_main.h b/src/lyx_main.h index 4145c78b84..6c53bc6a01 100644 --- a/src/lyx_main.h +++ b/src/lyx_main.h @@ -149,8 +149,11 @@ private: bool readUIFile(std::string const & name, bool include = false); /// read the given languages file bool readLanguagesFile(std::string const & name); - /// read the given encodings file - bool readEncodingsFile(std::string const & name); + /// read the encodings. + /// \param enc_name encodings definition file + /// \param symbols_name unicode->LaTeX mapping file + bool readEncodingsFile(std::string const & enc_name, + std::string const & symbols_name); /// parsing of non-gui LyX options. 
void easyParse(int & argc, char * argv[]); /// shows up a parsing error on screen diff --git a/src/paragraph.C b/src/paragraph.C index 439e42187e..95287686f5 100644 --- a/src/paragraph.C +++ b/src/paragraph.C @@ -1007,6 +1007,8 @@ bool Paragraph::simpleTeXOnePar(Buffer const & buf, runparams.moving_arg); } + // Computed only once per paragraph since bparams.encoding() is expensive + Encoding const & doc_encoding = bparams.encoding(); for (pos_type i = 0; i < size(); ++i) { ++column; // First char in paragraph or after label? @@ -1066,10 +1068,18 @@ bool Paragraph::simpleTeXOnePar(Buffer const & buf, if (c == ' ') { // Do not print the separation of the optional argument if (i != body_pos - 1) { - // FIXME: change tracking - // Is this correct WRT change tracking? - pimpl_->simpleTeXBlanks(os, texrow, i, - column, font, *style); + if (pimpl_->simpleTeXBlanks(bparams, + doc_encoding, os, texrow, + i, column, font, *style)) + // A surrogate pair was output. We + // must not call simpleTeXSpecialChars + // in this iteration, since + // simpleTeXBlanks incremented i, and + // simpleTeXSpecialChars would output + // the combining character again. + // FIXME: change tracking + // Is this correct WRT change tracking? + continue; } } @@ -1101,7 +1111,7 @@ bool Paragraph::simpleTeXOnePar(Buffer const & buf, rp.local_font = &font; rp.intitle = style->intitle; pimpl_->simpleTeXSpecialChars(buf, bparams, - os, texrow, rp, + doc_encoding, os, texrow, rp, font, running_font, basefont, outerfont, open_font, runningChangeType, diff --git a/src/paragraph_pimpl.C b/src/paragraph_pimpl.C index 1abb68321f..900837a49a 100644 --- a/src/paragraph_pimpl.C +++ b/src/paragraph_pimpl.C @@ -59,16 +59,15 @@ special_phrase const special_phrases[] = { size_t const phrases_nr = sizeof(special_phrases)/sizeof(special_phrase); -bool isEncoding(BufferParams const & bparams, LyXFont const & font, - string const & encoding) +/// Get the real encoding of a character with font \p font. 
+/// doc_encoding == bparams.encoding(), but we use a precomputed variable +/// since bparams.encoding() is expensive +inline Encoding const & getEncoding(BufferParams const & bparams, + Encoding const & doc_encoding, LyXFont const & font) { - // We do ignore bparams.inputenc == "default" here because characters - // in this encoding could be treated by TeX as something different, - // e.g. if they are inside a CJK environment. See also - // http://bugzilla.lyx.org/show_bug.cgi?id=3043. - return (bparams.inputenc == encoding - || (bparams.inputenc == "auto" - && font.language()->encoding()->latexName() == encoding)); + if (bparams.inputenc == "auto" || bparams.inputenc == "default") + return *(font.language()->encoding()); + return doc_encoding; } } // namespace anon @@ -381,14 +380,44 @@ int Paragraph::Pimpl::eraseChars(pos_type start, pos_type end, bool trackChanges } -void Paragraph::Pimpl::simpleTeXBlanks(odocstream & os, TexRow & texrow, - pos_type const i, +int Paragraph::Pimpl::latexSurrogatePair(odocstream & os, value_type c, + value_type next, Encoding const & encoding) +{ + // Writing next here may circumvent a possible font change between + // c and next. Since next is only output if it forms a surrogate pair + // with c we can ignore this: + // A font change inside a surrogate pair does not make sense and is + // hopefully impossible to input. + // FIXME: change tracking + // Is this correct WRT change tracking? 
+ docstring const latex1 = encoding.latexChar(next); + docstring const latex2 = encoding.latexChar(c); + os << latex1 << '{' << latex2 << '}'; + return latex1.length() + latex2.length() + 2; +} + + +bool Paragraph::Pimpl::simpleTeXBlanks(BufferParams const & bparams, + Encoding const & doc_encoding, + odocstream & os, TexRow & texrow, + pos_type & i, unsigned int & column, LyXFont const & font, LyXLayout const & style) { if (style.pass_thru) - return; + return false; + + if (i < size() - 1) { + char_type next = getChar(i + 1); + if (Encodings::isCombiningChar(next)) { + // This space has an accent, so we must always output it. + Encoding const & encoding = getEncoding(bparams, doc_encoding, font); + column += latexSurrogatePair(os, ' ', next, encoding) - 1; + ++i; + return true; + } + } if (lyxrc.plaintext_linelen > 0 && column > lyxrc.plaintext_linelen @@ -413,6 +442,7 @@ void Paragraph::Pimpl::simpleTeXBlanks(odocstream & os, TexRow & texrow, } else { os << ' '; } + return false; } @@ -448,6 +478,7 @@ bool Paragraph::Pimpl::isTextAt(string const & str, pos_type pos) const void Paragraph::Pimpl::simpleTeXSpecialChars(Buffer const & buf, BufferParams const & bparams, + Encoding const & doc_encoding, odocstream & os, TexRow & texrow, OutputParams const & runparams, @@ -465,6 +496,8 @@ void Paragraph::Pimpl::simpleTeXSpecialChars(Buffer const & buf, if (style.pass_thru) { if (c != Paragraph::META_INSET) { if (c != '\0') + // FIXME UNICODE: This can fail if c cannot + // be encoded in the current encoding. os.put(c); } else owner_->getInset(i)->plaintext(buf, os, runparams); @@ -581,25 +614,6 @@ void Paragraph::Pimpl::simpleTeXSpecialChars(Buffer const & buf, // would be wrongly converted on systems where char is signed, so we give // the code points. // This also makes us independant from the encoding of this source file. 
- case 0xb1: // ± PLUS-MINUS SIGN - case 0xb2: // ² SUPERSCRIPT TWO - case 0xb3: // ³ SUPERSCRIPT THREE - case 0xd7: // × MULTIPLICATION SIGN - case 0xf7: // ÷ DIVISION SIGN - case 0xb9: // ¹ SUPERSCRIPT ONE - case 0xac: // ¬ NOT SIGN - case 0xb5: // µ MICRO SIGN - if (isEncoding(bparams, font, "latin1") - || isEncoding(bparams, font, "latin9")) { - os << "\\ensuremath{"; - os.put(c); - os << '}'; - column += 13; - } else { - os.put(c); - } - break; - case '|': case '<': case '>': // In T1 encoding, these characters exist if (lyxrc.fontenc == "T1") { @@ -658,82 +672,6 @@ void Paragraph::Pimpl::simpleTeXSpecialChars(Buffer const & buf, column += 9; break; - case 0xa3: // £ POUND SIGN - if (bparams.inputenc == "default") { - os << "\\pounds{}"; - column += 8; - } else { - os.put(c); - } - break; - - case 0x20ac: // EURO SIGN - if (isEncoding(bparams, font, "latin9") - || isEncoding(bparams, font, "cp1251") - || isEncoding(bparams, font, "utf8") - || isEncoding(bparams, font, "latin10") - || isEncoding(bparams, font, "cp858")) { - os.put(c); - } else { - os << "\\texteuro{}"; - column += 10; - } - break; - - // These characters are covered by latin1, but not - // by latin9 (a.o.). 
We have to support them because - // we switched the default of latin1-languages to latin9 - case 0xa4: // CURRENCY SYMBOL - case 0xa6: // BROKEN BAR - case 0xa8: // DIAERESIS - case 0xb4: // ACUTE ACCENT - case 0xb8: // CEDILLA - case 0xbd: // 1/2 FRACTION - case 0xbc: // 1/4 FRACTION - case 0xbe: // 3/4 FRACTION - if (isEncoding(bparams, font, "latin1") - || isEncoding(bparams, font, "latin5") - || isEncoding(bparams, font, "utf8")) { - os.put(c); - break; - } else { - switch (c) { - case 0xa4: - os << "\\textcurrency{}"; - column += 15; - break; - case 0xa6: - os << "\\textbrokenbar{}"; - column += 16; - break; - case 0xa8: - os << "\\textasciidieresis{}"; - column += 20; - break; - case 0xb4: - os << "\\textasciiacute{}"; - column += 17; - break; - case 0xb8: // from latin1.def: - os << "\\c\\ "; - column += 3; - break; - case 0xbd: - os << "\\textonehalf{}"; - column += 14; - break; - case 0xbc: - os << "\\textonequarter{}"; - column += 17; - break; - case 0xbe: - os << "\\textthreequarters{}"; - column += 20; - break; - } - break; - } - case '$': case '&': case '%': case '#': case '{': case '}': case '_': @@ -771,6 +709,8 @@ void Paragraph::Pimpl::simpleTeXSpecialChars(Buffer const & buf, default: // I assume this is hack treating typewriter as verbatim + // FIXME UNICODE: This can fail if c cannot be encoded + // in the current encoding. 
if (font.family() == LyXFont::TYPEWRITER_FAMILY) { if (c != '\0') { os.put(c); @@ -798,7 +738,27 @@ void Paragraph::Pimpl::simpleTeXSpecialChars(Buffer const & buf, } if (pnr == phrases_nr && c != '\0') { - os.put(c); + Encoding const & encoding = getEncoding(bparams, doc_encoding, font); + if (i < size() - 1) { + char_type next = getChar(i + 1); + if (Encodings::isCombiningChar(next)) { + column += latexSurrogatePair(os, c, next, encoding) - 1; + ++i; + break; + } + } + docstring const latex = encoding.latexChar(c); + if (latex.length() > 1 && + latex[latex.length() - 1] != '}') { + // Prevent eating of a following + // space or command corruption by + // following characters + column += latex.length() + 1; + os << latex << "{}"; + } else { + column += latex.length() - 1; + os << latex; + } } break; } @@ -876,6 +836,7 @@ void Paragraph::Pimpl::validate(LaTeXFeatures & features, } // then the contents + Encoding const & doc_encoding = bparams.encoding(); for (pos_type i = 0; i < size() ; ++i) { for (size_t pnr = 0; pnr < phrases_nr; ++pnr) { if (!special_phrases[pnr].builtin @@ -884,12 +845,12 @@ void Paragraph::Pimpl::validate(LaTeXFeatures & features, break; } } - // these glyphs require the textcomp package - if (getChar(i) == 0x20ac || getChar(i) == 0xa4 - || getChar(i) == 0xa6 || getChar(i) == 0xa8 - || getChar(i) == 0xb4 || getChar(i) == 0xbd - || getChar(i) == 0xbc || getChar(i) == 0xbe) - features.require("textcomp"); + // We do not need the completely realized font, since we are + // only interested in the language, and that is never inherited. + // Therefore we can use getFontSettings instead of getFont. 
+ LyXFont const & font = owner_->getFontSettings(bparams, i); + Encoding const & encoding = getEncoding(bparams, doc_encoding, font); + encoding.validate(getChar(i), features); } } diff --git a/src/paragraph_pimpl.h b/src/paragraph_pimpl.h index 04281d7a5b..3b8e0b0fab 100644 --- a/src/paragraph_pimpl.h +++ b/src/paragraph_pimpl.h @@ -26,6 +26,7 @@ namespace lyx { +class Encoding; class LyXLayout; @@ -123,16 +124,23 @@ public: /// FontList fontlist; - /// - void simpleTeXBlanks(odocstream &, TexRow & texrow, - pos_type const i, + /// Output the surrogate pair formed by \p c and \p next to \p os. + /// \return the number of characters written. + int latexSurrogatePair(odocstream & os, value_type c, value_type next, + Encoding const &); + /// Output a space in appropriate formatting (or a surrogate pair + /// if the next character is a combining character). + /// \return whether a surrogate pair was output. + bool simpleTeXBlanks(BufferParams const &, Encoding const &, + odocstream &, TexRow & texrow, + pos_type & i, unsigned int & column, LyXFont const & font, LyXLayout const & style); /// void simpleTeXSpecialChars(Buffer const &, BufferParams const &, - odocstream &, TexRow & texrow, - OutputParams const &, + Encoding const &, odocstream &, + TexRow & texrow, OutputParams const &, LyXFont & font, LyXFont & running_font, LyXFont & basefont, LyXFont const & outerfont, diff --git a/src/support/debugstream.h b/src/support/debugstream.h index b5e9283f7e..63744ef4c5 100644 --- a/src/support/debugstream.h +++ b/src/support/debugstream.h @@ -64,12 +64,14 @@ public: typedef typename debug::type Type; basic_debugstream() - : std::basic_ostream(0), dt(debug::NONE) + : std::basic_ostream(0), dt(debug::NONE), + realbuf_(0), enabled_(true) {} /// Constructor, sets the debug level to t. 
explicit basic_debugstream(std::basic_streambuf * buf) - : std::basic_ostream(buf), dt(debug::NONE) + : std::basic_ostream(buf), dt(debug::NONE), + realbuf_(0), enabled_(true) {} /// Sets the debug level to t. @@ -99,11 +101,32 @@ public: return *this; return nullstream; } + /// Disable the stream completely + void disable() + { + if (enabled_) { + realbuf_ = this->rdbuf(); + rdbuf(nullstream.rdbuf()); + enabled_ = false; + } + } + /// Enable the stream after a possible call of disable() + void enable() + { + if (!enabled_) { + this->rdbuf(realbuf_); + enabled_ = true; + } + } private: /// The current debug level Type dt; /// The no-op stream. boost::basic_onullstream nullstream; + /// The buffer of the real stream + std::streambuf * realbuf_; + /// Is the stream enabled? + bool enabled_; }; typedef basic_debugstream debugstream;