lyx2lyx refactoring.

* use unicode.transform() instead of loop over replacements
* telling variable names
* remove trailing whitespace
* documentation update
* don't set use_ligature_dashes if both dash types are found
* remove spurious warning, normalize indentation, and use
  Python idioms in revert_baselineskip()
This commit is contained in:
Günter Milde 2018-01-31 15:09:32 +01:00
parent e4c0c6afdf
commit 2fce4d49ee
8 changed files with 251 additions and 200 deletions

View File

@ -94,7 +94,7 @@ Test reversion/conversion between 2.3 and 2.2 formats with lyx2lyx.
\end_layout
\begin_layout Description
Allowbreak:
allowbreak:
\bar under
\begin_inset Box Boxed
@ -169,5 +169,69 @@ without spaces.
\end_layout
\begin_layout Description
baselineskip%:
\begin_inset Box Boxed
position "t"
hor_pos "c"
has_inner_box 1
inner_pos "t"
use_parbox 1
use_makebox 0
width "250baselineskip%"
special "none"
height "50baselineskip%"
height_special "none"
thickness "4baselineskip%"
separation "9baselineskip%"
shadowsize "4pt"
framecolor "black"
backgroundcolor "none"
status open
\begin_layout Plain Layout
test
\end_layout
\end_inset
\begin_inset CommandInset line
LatexCommand rule
offset "40baselineskip%"
width "800baselineskip%"
height "5.3baselineskip%"
\end_inset
\end_layout
\begin_deeper
\begin_layout Standard
\begin_inset VSpace 200baselineskip%
\end_inset
\end_layout
\begin_layout Standard
Vertical space above this paragraph is 2·baselineskip.
\end_layout
\begin_layout Standard
\begin_inset space \hspace*{}
\length 75.2baselineskip%
\end_inset
Paragraph with
\begin_inset space \hspace{}
\length 135baselineskip%
\end_inset
horizontal space insets using baselineskip.
\end_layout
\end_deeper
\end_body
\end_document

View File

@ -17,8 +17,8 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
'''
This module offers several free functions to help with lyx2lyx'ing.
More documentation is below, but here is a quick guide to what
This module offers several free functions to help with lyx2lyx'ing.
More documentation is below, but here is a quick guide to what
they do. Optional arguments are marked by brackets.
add_to_preamble(document, text):
@ -37,8 +37,8 @@ insert_to_preamble(document, text[, index]):
default index is 0, so the material is inserted at the beginning.
Prepends a comment "% Added by lyx2lyx" to text.
put_cmd_in_ert(arg):
Here arg should be a list of strings (lines), which we want to
put_cmd_in_ert(cmd):
Here cmd should be a list of strings (lines), which we want to
wrap in ERT. Returns a list of strings so wrapped.
A call to this routine will often go something like this:
i = find_token('\\begin_inset FunkyInset', ...)
@ -81,7 +81,6 @@ import string
from parser_tools import find_token, find_end_of_inset
from unicode_symbols import unicode_reps
# This will accept either a list of lines or a single line.
# It is bad practice to pass something with embedded newlines,
# though we will handle that.
@ -118,34 +117,37 @@ def add_to_preamble(document, text):
# It should really be a list.
def insert_to_preamble(document, text, index = 0):
    """Insert `text` into the document preamble at line `index`.

    `text` may be a single string (it is split on newlines) or a list
    of lines.  A "% Added by lyx2lyx" marker comment is prepended so
    the insertion remains identifiable in the converted document.
    """
    if type(text) is not list:
        # Splitting a newline-free string still yields the one-element
        # list we need, so plain strings are handled transparently.
        text = text.split('\n')
    text.insert(0, "% Added by lyx2lyx")
    document.preamble[index:index] = text
# A dictionary of Unicode->LICR mappings for use in a Unicode string's
# translate() method.
# Created from the reversed list to keep the first of alternative definitions.
licr_table = dict((ord(ch), cmd) for cmd, ch in unicode_reps[::-1])

def put_cmd_in_ert(cmd):
    """
    Return an ERT inset wrapping `cmd` as a list of strings.

    `cmd` can be a string or a list of lines.  Non-ASCII characters are
    converted to the respective LICR macros if defined in unicodesymbols;
    backslashes are rewritten to the LyX "\\backslash" token.
    """
    ret = ["\\begin_inset ERT", "status collapsed", "", "\\begin_layout Plain Layout", ""]
    # It will be faster to work with a single string internally
    # (one translate() pass instead of a loop over replacements).
    if isinstance(cmd, list):
        cmd = u"\n".join(cmd)
    else:
        cmd = u"%s" % cmd # ensure it is a unicode instance
    cmd = cmd.translate(licr_table)
    cmd = cmd.replace("\\", "\\backslash\n")
    ret += cmd.splitlines()
    ret += ["\\end_layout", "", "\\end_inset"]
    return ret
@ -300,7 +302,7 @@ def lyx2verbatim(document, lines):
def latex_length(slen):
'''
'''
Convert lengths to their LaTeX representation. Returns (bool, length),
where the bool tells us if it was a percentage, and the length is the
LaTeX representation.
@ -314,9 +316,14 @@ def latex_length(slen):
# the + always precedes the -
# Convert relative lengths to LaTeX units
units = {"text%":"\\textwidth", "col%":"\\columnwidth",
"page%":"\\paperwidth", "line%":"\\linewidth",
"theight%":"\\textheight", "pheight%":"\\paperheight"}
units = {"col%": "\\columnwidth",
"text%": "\\textwidth",
"page%": "\\paperwidth",
"line%": "\\linewidth",
"theight%": "\\textheight",
"pheight%": "\\paperheight",
"baselineskip%": "\\baselineskip"
}
for unit in list(units.keys()):
i = slen.find(unit)
if i == -1:

View File

@ -23,7 +23,7 @@ import unicodedata
import sys, os
from parser_tools import find_token, find_end_of, find_tokens, get_value
from unicode_symbols import read_unicodesymbols
from unicode_symbols import unicode_reps
####################################################################
# Private helper functions
@ -146,54 +146,6 @@ def set_option(document, m, option, value):
return l
# FIXME: Remove this function if the version imported from unicode_symbols works.
# This function was the predecessor from that function, that in the meanwhile got
# new fixes.
def read_unicodesymbols2():
    " Read the unicodesymbols list of unicode characters and corresponding commands."
    # Provide support for both python 2 and 3
    PY2 = sys.version_info[0] == 2
    if not PY2:
        unichr = chr
    # End of code to support for both python 2 and 3
    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
    # NOTE(review): str.strip('lyx2lyx') strips those *characters* from both
    # ends rather than removing the "lyx2lyx" suffix; it happens to work for
    # the expected install layout, so it is kept as-is.
    filename = os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols')
    spec_chars = []
    # Two backslashes, followed by some non-word character, and then a character
    # in brackets. The idea is to check for constructs like: \"{u}, which is how
    # they are written in the unicodesymbols file; but they can also be written
    # as: \"u or even \" u.
    r = re.compile(r'\\\\(\W)\{(\w)\}')
    # with-statement guarantees the file is closed even if parsing raises
    with open(filename) as fp:
        for line in fp.readlines():
            if line[0] == '#' or not line.strip():
                # skip comments and empty lines
                continue
            line = line.replace(' "', ' ')   # remove all quotation marks with spaces before
            line = line.replace('" ', ' ')   # remove all quotation marks with spaces after
            line = line.replace(r'\"', '"')  # replace \" by " (for characters with diaeresis)
            try:
                [ucs4, command, dead] = line.split(None, 2)
                if command[0:1] != "\\":
                    continue
                # int(ucs4, 16) parses the hex code point safely; the
                # previous eval() would execute arbitrary file content.
                literal_char = unichr(int(ucs4, 16))
                spec_chars.append([command, literal_char])
            except Exception:
                # best effort: skip malformed lines (as before)
                continue
            m = r.match(command)
            if m != None:
                command = "\\\\"
                # If the character is a double-quote, then we need to escape it, too,
                # since it is done that way in the LyX file.
                if m.group(1) == "\"":
                    command += "\\"
                commandbl = command
                command += m.group(1) + m.group(2)
                commandbl += m.group(1) + ' ' + m.group(2)
                spec_chars.append([command, literal_char])
                spec_chars.append([commandbl, literal_char])
    return spec_chars
def extract_argument(line):
'Extracts a LaTeX argument from the start of line. Returns (arg, rest).'
@ -280,8 +232,6 @@ def latex2ert(line, isindex):
return retval
unicode_reps = read_unicodesymbols()
#Bug 5022....
#Might should do latex2ert first, then deal with stuff that DOESN'T
#end up inside ERT. That routine could be modified so that it returned

View File

@ -746,10 +746,10 @@ def convert_phrases(document):
if len(words) > 1 and words[0] == "\\begin_inset" and \
words[1] in ["CommandInset", "External", "Formula", "Graphics", "listings"]:
# must not replace anything in insets that store LaTeX contents in .lyx files
# (math and command insets withut overridden read() and write() methods
# (math and command insets without overridden read() and write() methods)
j = find_end_of_inset(document.body, i)
if j == -1:
document.warning("Malformed LyX document: Can't find end of Formula inset at line " + str(i))
document.warning("Malformed LyX document: Can't find end of inset at line " + str(i))
i += 1
else:
i = j

View File

@ -27,15 +27,14 @@ import sys, os
from parser_tools import (del_token, del_value, del_complete_lines,
find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset,
find_re, find_token, find_token_backwards, get_containing_inset,
get_containing_layout, get_bool_value, get_value, get_quoted_value)
# find_tokens, find_token_exact, is_in_inset,
# check_token, get_option_value
get_containing_layout, get_bool_value, get_value, get_quoted_value,
is_in_inset)
# find_tokens, find_token_exact, check_token, get_option_value
from lyx2lyx_tools import add_to_preamble, put_cmd_in_ert, revert_font_attrs, \
insert_to_preamble
# get_ert, lyx2latex, \
# lyx2verbatim, length_in_bp, convert_info_insets
# latex_length, revert_flex_inset, hex2ratio, str2bool
from lyx2lyx_tools import (add_to_preamble, put_cmd_in_ert, revert_font_attrs,
insert_to_preamble, latex_length)
# get_ert, lyx2latex, lyx2verbatim, length_in_bp, convert_info_insets
# revert_flex_inset, hex2ratio, str2bool
####################################################################
# Private helper functions
@ -88,13 +87,12 @@ def convert_dateinset(document):
continue
if get_value(document.body, 'template', i, j) == "Date":
document.body[i : j + 1] = put_cmd_in_ert("\\today ")
i += 1
continue
i = j+1 # skip inset
def convert_inputenc(document):
" Replace no longer supported input encoding settings. "
i = find_token(document.header, "\\inputenc", 0)
i = find_token(document.header, "\\inputenc")
if i == -1:
return
if get_value(document.header, "\\inputencoding", i) == "pt254":
@ -1859,7 +1857,7 @@ def convert_dashligatures(document):
while i+1 < len(lines):
i += 1
line = lines[i]
# skip lines without any dashes:
# skip lines without dashes:
if not re.search(u"[\u2013\u2014]|\\twohyphens|\\threehyphens", line):
continue
# skip label width string (see bug 10243):
@ -1867,28 +1865,28 @@ def convert_dashligatures(document):
continue
# do not touch hyphens in some insets (cf. lyx_2_2.convert_dashes):
try:
value, start, end = get_containing_inset(lines, i)
inset_type, start, end = get_containing_inset(lines, i)
except TypeError: # no containing inset
value, start, end = "no inset", -1, -1
if (value.split()[0] in
inset_type, start, end = "no inset", -1, -1
if (inset_type.split()[0] in
["CommandInset", "ERT", "External", "Formula",
"FormulaMacro", "Graphics", "IPA", "listings"]
or value == "Flex Code"):
or inset_type == "Flex Code"):
i = end
continue
try:
layout, start, end, j = get_containing_layout(lines, i)
layoutname, start, end, j = get_containing_layout(lines, i)
except TypeError: # no (or malformed) containing layout
document.warning("Malformed LyX document: "
"Can't find layout at line %d" % i)
continue
if layout == "LyX-Code":
if layoutname == "LyX-Code":
i = end
continue
# literal dash followed by a word or no-break space:
if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line,
flags=re.UNICODE):
if re.search(u"[\u2013\u2014]([\w\u00A0]|$)",
line, flags=re.UNICODE):
has_literal_dashes = True
# ligature dash followed by word or no-break space on next line:
if (re.search(r"(\\twohyphens|\\threehyphens)", line) and
@ -1900,14 +1898,15 @@ def convert_dashligatures(document):
'"ligature" dashes.\n Line breaks may have changed. '
'See UserGuide chapter 3.9.1 for details.')
break
if has_literal_dashes:
if has_literal_dashes and not has_ligature_dashes:
use_dash_ligatures = False
elif has_ligature_dashes:
elif has_ligature_dashes and not has_literal_dashes:
use_dash_ligatures = True
# insert the setting if there is a preferred value
if use_dash_ligatures is not None:
i = find_token(document.header, "\\graphics")
document.header.insert(i, "\\use_dash_ligatures %s"
document.header.insert(-1, "\\use_dash_ligatures %s"
% str(use_dash_ligatures).lower())
@ -2020,64 +2019,37 @@ def revert_mathindent(document):
def revert_baselineskip(document):
    """Revert baselineskip-relative lengths to TeX code.

    VSpace insets and horizontal space insets whose length uses the
    "baselineskip%" unit are replaced by ERT with the equivalent
    \\vspace{...}/\\hspace{...} commands (starred variants preserved).
    """
    i = 0
    regexp = re.compile(r'.*baselineskip%.*')
    while True:
        i = i + 1
        i = find_re(document.body, regexp, i)
        if i == -1:
            return
        if document.body[i].startswith("\\begin_inset VSpace"):
            # output VSpace inset as TeX code
            end = find_end_of_inset(document.body, i)
            if end == -1:
                document.warning("Malformed LyX document: "
                                 "Can't find end of VSpace inset at line %d." % i)
                continue
            # read out the value (last word of the inset line)
            baselineskip = document.body[i].split()[-1]
            # check if it is the starred version
            star = '*' if '*' in document.body[i] else ''
            # now output TeX code
            cmd = "\\vspace%s{%s}" % (star, latex_length(baselineskip)[1])
            document.body[i:end+1] = put_cmd_in_ert(cmd)
            # skip the lines just inserted by put_cmd_in_ert
            i += 8
            continue
        begin, end = is_in_inset(document.body, i, "\\begin_inset space \\hspace")
        if begin != -1:
            # output space inset as TeX code
            baselineskip = document.body[i].split()[-1]
            # the starred variant is marked on the \hspace* line above
            star = '*' if '*' in document.body[i-1] else ''
            cmd = "\\hspace%s{%s}" % (star, latex_length(baselineskip)[1])
            document.body[begin:end+1] = put_cmd_in_ert(cmd)
def revert_rotfloat(document):

View File

@ -23,7 +23,7 @@ This module offers several free functions to help parse lines.
More documentation is below, but here is a quick guide to what
they do. Optional arguments are marked by brackets.
find_token(lines, token, start[, end[, ignorews]]):
find_token(lines, token[, start[, end[, ignorews]]]):
Returns the first line i, start <= i < end, on which
token is found at the beginning. Returns -1 if not
found.
@ -31,10 +31,10 @@ find_token(lines, token, start[, end[, ignorews]]):
in whitespace do not count, except that there must be no
extra whitespace following token itself.
find_token_exact(lines, token, start[, end]):
find_token_exact(lines, token[, start[, end]]]):
As find_token, but with ignorews set to True.
find_tokens(lines, tokens, start[, end[, ignorews]]):
find_tokens(lines, tokens[, start[, end[, ignorews]]]):
Returns the first line i, start <= i < end, on which
one of the tokens in tokens is found at the beginning.
Returns -1 if not found.
@ -42,7 +42,7 @@ find_tokens(lines, tokens, start[, end[, ignorews]]):
in whitespace do not count, except that there must be no
extra whitespace following token itself.
find_tokens_exact(lines, token, start[, end]):
find_tokens_exact(lines, token[, start[, end]]):
As find_tokens, but with ignorews True.
find_token_backwards(lines, token, start):
@ -543,8 +543,9 @@ def is_in_inset(lines, i, inset, default=(-1,-1)):
is_in_inset(document.body, i, "\\begin_inset Tabular")
returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table).
If it is, then it returns the line on which the table begins and the one
on which it ends. Note that this pair will evaluate to
boolean True, so
on which it ends.
Note that this pair will evaluate to boolean True, so (with the optional
default value set to False)
if is_in_inset(..., default=False):
will do what you expect.
"""

View File

@ -0,0 +1,52 @@
# This file is part of lyx2lyx
# -*- coding: utf-8 -*-
# Copyright (C) 2018 The LyX team
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
" This modules tests the auxiliary functions for lyx2lyx."
from lyx2lyx_tools import *
import unittest
class TestParserTools(unittest.TestCase):
    """Unit tests for the lyx2lyx_tools helper functions."""

    def test_put_cmd_in_ert(self):
        """A command (string or one-element list) is wrapped in an ERT inset."""
        expected = [
            '\\begin_inset ERT',
            'status collapsed',
            '',
            '\\begin_layout Plain Layout',
            '',
            u'\\backslash',
            u'texttt{Gr\\backslash',
            u'"{u}\\backslash',
            u'ss{}e}',
            '\\end_layout',
            '',
            '\\end_inset',
        ]
        # Both input forms must yield the identical inset lines.
        for argument in (u"\\texttt{Grüße}", [u"\\texttt{Grüße}"]):
            self.assertEqual(put_cmd_in_ert(argument), expected)

    def test_latex_length(self):
        """latex_length() returns (is_percentage, LaTeX representation)."""
        cases = [
            ("-30.5col%", (True, "-0.305\\columnwidth")),
            ("35baselineskip%", (True, "0.35\\baselineskip")),
            ("11em", (False, "11em")),
            ("-0.4pt", (False, "-0.4pt")),
        ]
        for length, expected in cases:
            self.assertEqual(latex_length(length), expected)


if __name__ == '__main__':
    unittest.main()

View File

@ -18,7 +18,7 @@
" Import unicode_reps from this module for access to the unicode<->LaTeX mapping. "
import sys, os, re
import sys, os, re, codecs
# Provide support for both python 2 and 3
PY2 = sys.version_info[0] == 2
@ -28,14 +28,13 @@ if not PY2:
def read_unicodesymbols():
" Read the unicodesymbols list of unicode characters and corresponding commands."
pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
pathname = os.path.abspath(os.path.dirname(__file__))
filename = os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols')
# For python 3+ we have to specify the encoding for those systems
# where the default is not UTF-8
fp = open(filename, encoding="utf8") if (not PY2) else open(filename)
# Read as Unicode strings in both, Python 2 and 3
# Specify the encoding for those systems where the default is not UTF-8
fp = codecs.open(filename, encoding="utf8")
spec_chars = []
# A backslash, followed by some non-word character, and then a character
# in brackets. The idea is to check for constructs like: \"{u}, which is how
# they are written in the unicodesymbols file; but they can also be written
@ -43,36 +42,42 @@ def read_unicodesymbols():
# The two backslashes in the string literal are needed to specify a literal
# backslash in the regex. Without r prefix, these would be four backslashes.
r = re.compile(r'\\(\W)\{(\w)\}')
spec_chars = []
for line in fp.readlines():
if line[0] != '#' and line.strip() != "":
# Note: backslashes in the string literals with r prefix are not escaped,
# so one backslash in the source file equals one backslash in memory.
# Without r prefix backslahses are escaped, so two backslashes in the
# source file equal one backslash in memory.
line=line.replace(' "',' ') # remove all quotation marks with spaces before
line=line.replace('" ',' ') # remove all quotation marks with spaces after
line=line.replace(r'\"','"') # unescape "
line=line.replace(r'\\','\\') # unescape \
try:
[ucs4,command,dead] = line.split(None,2)
if command[0:1] != "\\":
continue
if (line.find("notermination=text") < 0 and
line.find("notermination=both") < 0 and command[-1] != "}"):
command = command + "{}"
spec_chars.append([command, unichr(eval(ucs4))])
except:
if not line.strip() or line.startswith('#'):
# skip empty lines and comments
continue
# Note: backslashes in the string literals with r prefix are not escaped,
# so one backslash in the source file equals one backslash in memory.
# Without r prefix backslahses are escaped, so two backslashes in the
# source file equal one backslash in memory.
line=line.replace(' "',' ') # remove all quotation marks with spaces before
line=line.replace('" ',' ') # remove all quotation marks with spaces after
line=line.replace(r'\"','"') # unescape "
line=line.replace(r'\\','\\') # unescape \
try:
[ucs4,command,dead] = line.split(None,2)
if command[0:1] != "\\":
continue
m = r.match(command)
if m != None:
command = "\\"
commandbl = command
command += m.group(1) + m.group(2)
commandbl += m.group(1) + ' ' + m.group(2)
spec_chars.append([command, unichr(eval(ucs4))])
spec_chars.append([commandbl, unichr(eval(ucs4))])
literal_char = unichr(int(ucs4, 16))
if (line.find("notermination=text") < 0 and
line.find("notermination=both") < 0 and command[-1] != "}"):
command = command + "{}"
spec_chars.append([command, literal_char])
except:
continue
m = r.match(command)
if m != None:
command = "\\"
commandbl = command
command += m.group(1) + m.group(2)
commandbl += m.group(1) + ' ' + m.group(2)
spec_chars.append([command, literal_char])
spec_chars.append([commandbl, literal_char])
fp.close()
return spec_chars
unicode_reps = read_unicodesymbols()