lyx2lyx refactoring.

* use unicode.transform() instead of loop over replacements
* telling variable names
* remove trailing whitespace
* documentation update
* don't set use_ligature_dashes if both dash types are found
* remove spurious warning, normalize indentation, and use
  Python idioms in revert_baselineskip()
This commit is contained in:
Günter Milde 2018-01-31 15:09:32 +01:00
parent e4c0c6afdf
commit 2fce4d49ee
8 changed files with 251 additions and 200 deletions

View File

@ -94,7 +94,7 @@ Test reversion/conversion between 2.3 and 2.2 formats with lyx2lyx.
\end_layout \end_layout
\begin_layout Description \begin_layout Description
Allowbreak: allowbreak:
\bar under \bar under
\begin_inset Box Boxed \begin_inset Box Boxed
@ -169,5 +169,69 @@ without spaces.
\end_layout \end_layout
\begin_layout Description
baselineskip%:
\begin_inset Box Boxed
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 1
use_makebox 0
width "250baselineskip%"
special "none"
height "50baselineskip%"
height_special "none"
thickness "4baselineskip%"
separation "9baselineskip%"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
test
\end_layout
\end_inset
\begin_inset CommandInset line
LatexCommand rule
offset "40baselineskip%"
width "800baselineskip%"
height "5.3baselineskip%"
\end_inset
\end_layout
\begin_deeper
\begin_layout Standard
\begin_inset VSpace 200baselineskip%
\end_inset
\end_layout
\begin_layout Standard
Vertical space above this paragraph is 2·baselineskip.
\end_layout
\begin_layout Standard
\begin_inset space \hspace*{}
\length 75.2baselineskip%
\end_inset
Paragraph with
\begin_inset space \hspace{}
\length 135baselineskip%
\end_inset
horizontal space insets using baselineskip.
\end_layout
\end_deeper
\end_body \end_body
\end_document \end_document

View File

@ -17,8 +17,8 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
''' '''
This module offers several free functions to help with lyx2lyx'ing. This module offers several free functions to help with lyx2lyx'ing.
More documentation is below, but here is a quick guide to what More documentation is below, but here is a quick guide to what
they do. Optional arguments are marked by brackets. they do. Optional arguments are marked by brackets.
add_to_preamble(document, text): add_to_preamble(document, text):
@ -37,8 +37,8 @@ insert_to_preamble(document, text[, index]):
default index is 0, so the material is inserted at the beginning. default index is 0, so the material is inserted at the beginning.
Prepends a comment "% Added by lyx2lyx" to text. Prepends a comment "% Added by lyx2lyx" to text.
put_cmd_in_ert(arg): put_cmd_in_ert(cmd):
Here arg should be a list of strings (lines), which we want to Here cmd should be a list of strings (lines), which we want to
wrap in ERT. Returns a list of strings so wrapped. wrap in ERT. Returns a list of strings so wrapped.
A call to this routine will often go something like this: A call to this routine will often go something like this:
i = find_token('\\begin_inset FunkyInset', ...) i = find_token('\\begin_inset FunkyInset', ...)
@ -81,7 +81,6 @@ import string
from parser_tools import find_token, find_end_of_inset from parser_tools import find_token, find_end_of_inset
from unicode_symbols import unicode_reps from unicode_symbols import unicode_reps
# This will accept either a list of lines or a single line. # This will accept either a list of lines or a single line.
# It is bad practice to pass something with embedded newlines, # It is bad practice to pass something with embedded newlines,
# though we will handle that. # though we will handle that.
@ -118,34 +117,37 @@ def add_to_preamble(document, text):
# It should really be a list. # It should really be a list.
def insert_to_preamble(document, text, index = 0): def insert_to_preamble(document, text, index = 0):
""" Insert text to the preamble at a given line""" """ Insert text to the preamble at a given line"""
if not type(text) is list: if not type(text) is list:
# split on \n just in case # split on \n just in case
# it'll give us the one element list we want # it'll give us the one element list we want
# if there's no \n, too # if there's no \n, too
text = text.split('\n') text = text.split('\n')
text.insert(0, "% Added by lyx2lyx") text.insert(0, "% Added by lyx2lyx")
document.preamble[index:index] = text document.preamble[index:index] = text
def put_cmd_in_ert(arg): # A dictionary of Unicode->LICR mappings for use in a Unicode string's translate() method
''' # Created from the reversed list to keep the first of alternative definitions.
arg should be a list of lines we want to wrap in ERT. licr_table = dict((ord(ch), cmd) for cmd, ch in unicode_reps[::-1])
Returns a list of strings, with the lines so wrapped.
''' def put_cmd_in_ert(cmd):
"""
Return ERT inset wrapping `cmd` as a list of strings.
`cmd` can be a string or list of lines. Non-ASCII characters are converted
to the respective LICR macros if defined in unicodesymbols.
"""
ret = ["\\begin_inset ERT", "status collapsed", "", "\\begin_layout Plain Layout", ""] ret = ["\\begin_inset ERT", "status collapsed", "", "\\begin_layout Plain Layout", ""]
# It will be faster for us to work with a single string internally. # It will be faster to work with a single string internally.
# That way, we only go through the unicode_reps loop once. if isinstance(cmd, list):
if type(arg) is list: cmd = u"\n".join(cmd)
s = "\n".join(arg)
else: else:
s = arg cmd = u"%s" % cmd # ensure it is an unicode instance
for rep in unicode_reps: cmd = cmd.translate(licr_table)
s = s.replace(rep[1], rep[0]) cmd = cmd.replace("\\", "\\backslash\n")
s = s.replace('\\', "\\backslash\n") ret += cmd.splitlines()
ret += s.splitlines()
ret += ["\\end_layout", "", "\\end_inset"] ret += ["\\end_layout", "", "\\end_inset"]
return ret return ret
@ -300,7 +302,7 @@ def lyx2verbatim(document, lines):
def latex_length(slen): def latex_length(slen):
''' '''
Convert lengths to their LaTeX representation. Returns (bool, length), Convert lengths to their LaTeX representation. Returns (bool, length),
where the bool tells us if it was a percentage, and the length is the where the bool tells us if it was a percentage, and the length is the
LaTeX representation. LaTeX representation.
@ -314,9 +316,14 @@ def latex_length(slen):
# the + always precedes the - # the + always precedes the -
# Convert relative lengths to LaTeX units # Convert relative lengths to LaTeX units
units = {"text%":"\\textwidth", "col%":"\\columnwidth", units = {"col%": "\\columnwidth",
"page%":"\\paperwidth", "line%":"\\linewidth", "text%": "\\textwidth",
"theight%":"\\textheight", "pheight%":"\\paperheight"} "page%": "\\paperwidth",
"line%": "\\linewidth",
"theight%": "\\textheight",
"pheight%": "\\paperheight",
"baselineskip%": "\\baselineskip"
}
for unit in list(units.keys()): for unit in list(units.keys()):
i = slen.find(unit) i = slen.find(unit)
if i == -1: if i == -1:

View File

@ -23,7 +23,7 @@ import unicodedata
import sys, os import sys, os
from parser_tools import find_token, find_end_of, find_tokens, get_value from parser_tools import find_token, find_end_of, find_tokens, get_value
from unicode_symbols import read_unicodesymbols from unicode_symbols import unicode_reps
#################################################################### ####################################################################
# Private helper functions # Private helper functions
@ -146,54 +146,6 @@ def set_option(document, m, option, value):
return l return l
# FIXME: Remove this function if the version imported from unicode_symbols works.
# This function was the predecessor from that function, that in the meanwhile got
# new fixes.
def read_unicodesymbols2():
" Read the unicodesymbols list of unicode characters and corresponding commands."
# Provide support for both python 2 and 3
PY2 = sys.version_info[0] == 2
if not PY2:
unichr = chr
# End of code to support for both python 2 and 3
pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols'))
spec_chars = []
# Two backslashes, followed by some non-word character, and then a character
# in brackets. The idea is to check for constructs like: \"{u}, which is how
# they are written in the unicodesymbols file; but they can also be written
# as: \"u or even \" u.
r = re.compile(r'\\\\(\W)\{(\w)\}')
for line in fp.readlines():
if line[0] != '#' and line.strip() != "":
line=line.replace(' "',' ') # remove all quotation marks with spaces before
line=line.replace('" ',' ') # remove all quotation marks with spaces after
line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
try:
[ucs4,command,dead] = line.split(None,2)
if command[0:1] != "\\":
continue
spec_chars.append([command, unichr(eval(ucs4))])
except:
continue
m = r.match(command)
if m != None:
command = "\\\\"
# If the character is a double-quote, then we need to escape it, too,
# since it is done that way in the LyX file.
if m.group(1) == "\"":
command += "\\"
commandbl = command
command += m.group(1) + m.group(2)
commandbl += m.group(1) + ' ' + m.group(2)
spec_chars.append([command, unichr(eval(ucs4))])
spec_chars.append([commandbl, unichr(eval(ucs4))])
fp.close()
return spec_chars
def extract_argument(line): def extract_argument(line):
'Extracts a LaTeX argument from the start of line. Returns (arg, rest).' 'Extracts a LaTeX argument from the start of line. Returns (arg, rest).'
@ -280,8 +232,6 @@ def latex2ert(line, isindex):
return retval return retval
unicode_reps = read_unicodesymbols()
#Bug 5022.... #Bug 5022....
#Might should do latex2ert first, then deal with stuff that DOESN'T #Might should do latex2ert first, then deal with stuff that DOESN'T
#end up inside ERT. That routine could be modified so that it returned #end up inside ERT. That routine could be modified so that it returned

View File

@ -746,10 +746,10 @@ def convert_phrases(document):
if len(words) > 1 and words[0] == "\\begin_inset" and \ if len(words) > 1 and words[0] == "\\begin_inset" and \
words[1] in ["CommandInset", "External", "Formula", "Graphics", "listings"]: words[1] in ["CommandInset", "External", "Formula", "Graphics", "listings"]:
# must not replace anything in insets that store LaTeX contents in .lyx files # must not replace anything in insets that store LaTeX contents in .lyx files
# (math and command insets withut overridden read() and write() methods # (math and command insets without overridden read() and write() methods)
j = find_end_of_inset(document.body, i) j = find_end_of_inset(document.body, i)
if j == -1: if j == -1:
document.warning("Malformed LyX document: Can't find end of Formula inset at line " + str(i)) document.warning("Malformed LyX document: Can't find end of inset at line " + str(i))
i += 1 i += 1
else: else:
i = j i = j

View File

@ -27,15 +27,14 @@ import sys, os
from parser_tools import (del_token, del_value, del_complete_lines, from parser_tools import (del_token, del_value, del_complete_lines,
find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset, find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset,
find_re, find_token, find_token_backwards, get_containing_inset, find_re, find_token, find_token_backwards, get_containing_inset,
get_containing_layout, get_bool_value, get_value, get_quoted_value) get_containing_layout, get_bool_value, get_value, get_quoted_value,
# find_tokens, find_token_exact, is_in_inset, is_in_inset)
# check_token, get_option_value # find_tokens, find_token_exact, check_token, get_option_value
from lyx2lyx_tools import add_to_preamble, put_cmd_in_ert, revert_font_attrs, \ from lyx2lyx_tools import (add_to_preamble, put_cmd_in_ert, revert_font_attrs,
insert_to_preamble insert_to_preamble, latex_length)
# get_ert, lyx2latex, \ # get_ert, lyx2latex, lyx2verbatim, length_in_bp, convert_info_insets
# lyx2verbatim, length_in_bp, convert_info_insets # revert_flex_inset, hex2ratio, str2bool
# latex_length, revert_flex_inset, hex2ratio, str2bool
#################################################################### ####################################################################
# Private helper functions # Private helper functions
@ -88,13 +87,12 @@ def convert_dateinset(document):
continue continue
if get_value(document.body, 'template', i, j) == "Date": if get_value(document.body, 'template', i, j) == "Date":
document.body[i : j + 1] = put_cmd_in_ert("\\today ") document.body[i : j + 1] = put_cmd_in_ert("\\today ")
i += 1 i = j+1 # skip inset
continue
def convert_inputenc(document): def convert_inputenc(document):
" Replace no longer supported input encoding settings. " " Replace no longer supported input encoding settings. "
i = find_token(document.header, "\\inputenc", 0) i = find_token(document.header, "\\inputenc")
if i == -1: if i == -1:
return return
if get_value(document.header, "\\inputencoding", i) == "pt254": if get_value(document.header, "\\inputencoding", i) == "pt254":
@ -1859,7 +1857,7 @@ def convert_dashligatures(document):
while i+1 < len(lines): while i+1 < len(lines):
i += 1 i += 1
line = lines[i] line = lines[i]
# skip lines without any dashes: # skip lines without dashes:
if not re.search(u"[\u2013\u2014]|\\twohyphens|\\threehyphens", line): if not re.search(u"[\u2013\u2014]|\\twohyphens|\\threehyphens", line):
continue continue
# skip label width string (see bug 10243): # skip label width string (see bug 10243):
@ -1867,28 +1865,28 @@ def convert_dashligatures(document):
continue continue
# do not touch hyphens in some insets (cf. lyx_2_2.convert_dashes): # do not touch hyphens in some insets (cf. lyx_2_2.convert_dashes):
try: try:
value, start, end = get_containing_inset(lines, i) inset_type, start, end = get_containing_inset(lines, i)
except TypeError: # no containing inset except TypeError: # no containing inset
value, start, end = "no inset", -1, -1 inset_type, start, end = "no inset", -1, -1
if (value.split()[0] in if (inset_type.split()[0] in
["CommandInset", "ERT", "External", "Formula", ["CommandInset", "ERT", "External", "Formula",
"FormulaMacro", "Graphics", "IPA", "listings"] "FormulaMacro", "Graphics", "IPA", "listings"]
or value == "Flex Code"): or inset_type == "Flex Code"):
i = end i = end
continue continue
try: try:
layout, start, end, j = get_containing_layout(lines, i) layoutname, start, end, j = get_containing_layout(lines, i)
except TypeError: # no (or malformed) containing layout except TypeError: # no (or malformed) containing layout
document.warning("Malformed LyX document: " document.warning("Malformed LyX document: "
"Can't find layout at line %d" % i) "Can't find layout at line %d" % i)
continue continue
if layout == "LyX-Code": if layoutname == "LyX-Code":
i = end i = end
continue continue
# literal dash followed by a word or no-break space: # literal dash followed by a word or no-break space:
if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line, if re.search(u"[\u2013\u2014]([\w\u00A0]|$)",
flags=re.UNICODE): line, flags=re.UNICODE):
has_literal_dashes = True has_literal_dashes = True
# ligature dash followed by word or no-break space on next line: # ligature dash followed by word or no-break space on next line:
if (re.search(r"(\\twohyphens|\\threehyphens)", line) and if (re.search(r"(\\twohyphens|\\threehyphens)", line) and
@ -1900,14 +1898,15 @@ def convert_dashligatures(document):
'"ligature" dashes.\n Line breaks may have changed. ' '"ligature" dashes.\n Line breaks may have changed. '
'See UserGuide chapter 3.9.1 for details.') 'See UserGuide chapter 3.9.1 for details.')
break break
if has_literal_dashes:
if has_literal_dashes and not has_ligature_dashes:
use_dash_ligatures = False use_dash_ligatures = False
elif has_ligature_dashes: elif has_ligature_dashes and not has_literal_dashes:
use_dash_ligatures = True use_dash_ligatures = True
# insert the setting if there is a preferred value # insert the setting if there is a preferred value
if use_dash_ligatures is not None: if use_dash_ligatures is not None:
i = find_token(document.header, "\\graphics") document.header.insert(-1, "\\use_dash_ligatures %s"
document.header.insert(i, "\\use_dash_ligatures %s"
% str(use_dash_ligatures).lower()) % str(use_dash_ligatures).lower())
@ -2020,64 +2019,37 @@ def revert_mathindent(document):
def revert_baselineskip(document): def revert_baselineskip(document):
" Revert baselineskips to TeX code " " Revert baselineskips to TeX code "
i = 0 i = 0
vspaceLine = 0 regexp = re.compile(r'.*baselineskip%.*')
hspaceLine = 0 while True:
while True: i = i + 1
regexp = re.compile(r'^.*baselineskip%.*$') i = find_re(document.body, regexp, i)
i = find_re(document.body, regexp, i) if i == -1:
if i == -1: return
return if document.body[i].startswith("\\begin_inset VSpace"):
vspaceLine = find_token(document.body, "\\begin_inset VSpace", i) # output VSpace inset as TeX code
if vspaceLine == i: end = find_end_of_inset(document.body, i)
# output VSpace inset as TeX code if end == -1:
# first read out the values document.warning("Malformed LyX document: "
beg = document.body[i].rfind("VSpace "); "Can't find end of VSpace inset at line %d." % i)
end = document.body[i].rfind("baselineskip%"); continue
baselineskip = float(document.body[i][beg + 7:end]); # read out the value
# we store the value in percent, thus divide by 100 baselineskip = document.body[i].split()[-1]
baselineskip = baselineskip/100; # check if it is the starred version
baselineskip = str(baselineskip); star = '*' if '*' in document.body[i] else ''
# check if it is the starred version # now output TeX code
if document.body[i].find('*') != -1: cmd = "\\vspace%s{%s}" %(star, latex_length(baselineskip)[1])
star = '*' document.body[i:end+1] = put_cmd_in_ert(cmd)
else: i += 8
star = '' continue
# now output TeX code begin, end = is_in_inset(document.body, i, "\\begin_inset space \\hspace")
endInset = find_end_of_inset(document.body, i) if begin != - 1:
if endInset == -1: # output space inset as TeX code
document.warning("Malformed LyX document: Missing '\\end_inset' of VSpace inset.") baselineskip = document.body[i].split()[-1]
return star = '*' if '*' in document.body[i-1] else ''
else: cmd = "\\hspace%s{%s}" %(star, latex_length(baselineskip)[1])
document.body[vspaceLine: endInset + 1] = put_cmd_in_ert("\\vspace" + star + '{' + baselineskip + "\\baselineskip}") document.body[begin:end+1] = put_cmd_in_ert(cmd)
hspaceLine = find_token(document.body, "\\begin_inset space \\hspace", i - 1)
document.warning("hspaceLine: " + str(hspaceLine))
document.warning("i: " + str(i))
if hspaceLine == i - 1:
# output space inset as TeX code
# first read out the values
beg = document.body[i].rfind("\\length ");
end = document.body[i].rfind("baselineskip%");
baselineskip = float(document.body[i][beg + 7:end]);
document.warning("baselineskip: " + str(baselineskip))
# we store the value in percent, thus divide by 100
baselineskip = baselineskip/100;
baselineskip = str(baselineskip);
# check if it is the starred version
if document.body[i-1].find('*') != -1:
star = '*'
else:
star = ''
# now output TeX code
endInset = find_end_of_inset(document.body, i)
if endInset == -1:
document.warning("Malformed LyX document: Missing '\\end_inset' of space inset.")
return
else:
document.body[hspaceLine: endInset + 1] = put_cmd_in_ert("\\hspace" + star + '{' + baselineskip + "\\baselineskip}")
i = i + 1
def revert_rotfloat(document): def revert_rotfloat(document):

View File

@ -23,7 +23,7 @@ This module offers several free functions to help parse lines.
More documentation is below, but here is a quick guide to what More documentation is below, but here is a quick guide to what
they do. Optional arguments are marked by brackets. they do. Optional arguments are marked by brackets.
find_token(lines, token, start[, end[, ignorews]]): find_token(lines, token[, start[, end[, ignorews]]]):
Returns the first line i, start <= i < end, on which Returns the first line i, start <= i < end, on which
token is found at the beginning. Returns -1 if not token is found at the beginning. Returns -1 if not
found. found.
@ -31,10 +31,10 @@ find_token(lines, token, start[, end[, ignorews]]):
in whitespace do not count, except that there must be no in whitespace do not count, except that there must be no
extra whitespace following token itself. extra whitespace following token itself.
find_token_exact(lines, token, start[, end]): find_token_exact(lines, token[, start[, end]]]):
As find_token, but with ignorews set to True. As find_token, but with ignorews set to True.
find_tokens(lines, tokens, start[, end[, ignorews]]): find_tokens(lines, tokens[, start[, end[, ignorews]]]):
Returns the first line i, start <= i < end, on which Returns the first line i, start <= i < end, on which
one of the tokens in tokens is found at the beginning. one of the tokens in tokens is found at the beginning.
Returns -1 if not found. Returns -1 if not found.
@ -42,7 +42,7 @@ find_tokens(lines, tokens, start[, end[, ignorews]]):
in whitespace do not count, except that there must be no in whitespace do not count, except that there must be no
extra whitespace following token itself. extra whitespace following token itself.
find_tokens_exact(lines, token, start[, end]): find_tokens_exact(lines, token[, start[, end]]):
As find_tokens, but with ignorews True. As find_tokens, but with ignorews True.
find_token_backwards(lines, token, start): find_token_backwards(lines, token, start):
@ -543,8 +543,9 @@ def is_in_inset(lines, i, inset, default=(-1,-1)):
is_in_inset(document.body, i, "\\begin_inset Tabular") is_in_inset(document.body, i, "\\begin_inset Tabular")
returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table). returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table).
If it is, then it returns the line on which the table begins and the one If it is, then it returns the line on which the table begins and the one
on which it ends. Note that this pair will evaluate to on which it ends.
boolean True, so Note that this pair will evaluate to boolean True, so (with the optional
default value set to False)
if is_in_inset(..., default=False): if is_in_inset(..., default=False):
will do what you expect. will do what you expect.
""" """

View File

@ -0,0 +1,52 @@
# This file is part of lyx2lyx
# -*- coding: utf-8 -*-
# Copyright (C) 2018 The LyX team
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
" This modules tests the auxiliary functions for lyx2lyx."
from lyx2lyx_tools import *
import unittest
class TestParserTools(unittest.TestCase):
    """Unit tests for the helper functions in lyx2lyx_tools."""

    def test_put_cmd_in_ert(self):
        """put_cmd_in_ert() wraps a command in an ERT inset.

        Non-ASCII characters are translated to their LICR macros, and
        backslashes become '\\backslash' followed by a line break.
        """
        expected = ['\\begin_inset ERT',
                    'status collapsed',
                    '',
                    '\\begin_layout Plain Layout',
                    '',
                    u'\\backslash',
                    u'texttt{Gr\\backslash',
                    u'"{u}\\backslash',
                    u'ss{}e}',
                    '\\end_layout',
                    '',
                    '\\end_inset']
        # A bare string and a one-element list of lines must produce
        # the same wrapped inset.
        self.assertEqual(put_cmd_in_ert(u"\\texttt{Grüße}"), expected)
        self.assertEqual(put_cmd_in_ert([u"\\texttt{Grüße}"]), expected)

    def test_latex_length(self):
        """latex_length() returns (was_percentage, latex_representation)."""
        # Relative lengths: the percentage is divided by 100 and the
        # unit becomes the corresponding LaTeX length macro.
        self.assertEqual(latex_length("-30.5col%"),
                         (True, "-0.305\\columnwidth"))
        self.assertEqual(latex_length("35baselineskip%"),
                         (True, "0.35\\baselineskip"))
        # Absolute lengths pass through unchanged.
        self.assertEqual(latex_length("11em"), (False, "11em"))
        self.assertEqual(latex_length("-0.4pt"), (False, "-0.4pt"))
# Allow running this test module directly (e.g. `python <this file>`).
if __name__ == '__main__':
    unittest.main()

View File

@ -18,7 +18,7 @@
" Import unicode_reps from this module for access to the unicode<->LaTeX mapping. " " Import unicode_reps from this module for access to the unicode<->LaTeX mapping. "
import sys, os, re import sys, os, re, codecs
# Provide support for both python 2 and 3 # Provide support for both python 2 and 3
PY2 = sys.version_info[0] == 2 PY2 = sys.version_info[0] == 2
@ -28,14 +28,13 @@ if not PY2:
def read_unicodesymbols(): def read_unicodesymbols():
" Read the unicodesymbols list of unicode characters and corresponding commands." " Read the unicodesymbols list of unicode characters and corresponding commands."
pathname = os.path.abspath(os.path.dirname(sys.argv[0])) pathname = os.path.abspath(os.path.dirname(__file__))
filename = os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols') filename = os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols')
# For python 3+ we have to specify the encoding for those systems # Read as Unicode strings in both, Python 2 and 3
# where the default is not UTF-8 # Specify the encoding for those systems where the default is not UTF-8
fp = open(filename, encoding="utf8") if (not PY2) else open(filename) fp = codecs.open(filename, encoding="utf8")
spec_chars = []
# A backslash, followed by some non-word character, and then a character # A backslash, followed by some non-word character, and then a character
# in brackets. The idea is to check for constructs like: \"{u}, which is how # in brackets. The idea is to check for constructs like: \"{u}, which is how
# they are written in the unicodesymbols file; but they can also be written # they are written in the unicodesymbols file; but they can also be written
@ -43,36 +42,42 @@ def read_unicodesymbols():
# The two backslashes in the string literal are needed to specify a literal # The two backslashes in the string literal are needed to specify a literal
# backslash in the regex. Without r prefix, these would be four backslashes. # backslash in the regex. Without r prefix, these would be four backslashes.
r = re.compile(r'\\(\W)\{(\w)\}') r = re.compile(r'\\(\W)\{(\w)\}')
spec_chars = []
for line in fp.readlines(): for line in fp.readlines():
if line[0] != '#' and line.strip() != "": if not line.strip() or line.startswith('#'):
# Note: backslashes in the string literals with r prefix are not escaped, # skip empty lines and comments
# so one backslash in the source file equals one backslash in memory. continue
# Without r prefix backslashes are escaped, so two backslashes in the # Note: backslashes in the string literals with r prefix are not escaped,
# source file equal one backslash in memory. # so one backslash in the source file equals one backslash in memory.
line=line.replace(' "',' ') # remove all quotation marks with spaces before # Without r prefix backslashes are escaped, so two backslashes in the
line=line.replace('" ',' ') # remove all quotation marks with spaces after # source file equal one backslash in memory.
line=line.replace(r'\"','"') # unescape " line=line.replace(' "',' ') # remove all quotation marks with spaces before
line=line.replace(r'\\','\\') # unescape \ line=line.replace('" ',' ') # remove all quotation marks with spaces after
try: line=line.replace(r'\"','"') # unescape "
[ucs4,command,dead] = line.split(None,2) line=line.replace(r'\\','\\') # unescape \
if command[0:1] != "\\": try:
continue [ucs4,command,dead] = line.split(None,2)
if (line.find("notermination=text") < 0 and if command[0:1] != "\\":
line.find("notermination=both") < 0 and command[-1] != "}"):
command = command + "{}"
spec_chars.append([command, unichr(eval(ucs4))])
except:
continue continue
m = r.match(command) literal_char = unichr(int(ucs4, 16))
if m != None: if (line.find("notermination=text") < 0 and
command = "\\" line.find("notermination=both") < 0 and command[-1] != "}"):
commandbl = command command = command + "{}"
command += m.group(1) + m.group(2) spec_chars.append([command, literal_char])
commandbl += m.group(1) + ' ' + m.group(2) except:
spec_chars.append([command, unichr(eval(ucs4))]) continue
spec_chars.append([commandbl, unichr(eval(ucs4))]) m = r.match(command)
if m != None:
command = "\\"
commandbl = command
command += m.group(1) + m.group(2)
commandbl += m.group(1) + ' ' + m.group(2)
spec_chars.append([command, literal_char])
spec_chars.append([commandbl, literal_char])
fp.close() fp.close()
return spec_chars return spec_chars
unicode_reps = read_unicodesymbols() unicode_reps = read_unicodesymbols()