lyx2lyx refactoring.

* use unicode.translate() instead of a loop over replacements * more telling variable names * remove trailing whitespace * documentation update * don't set use_dash_ligatures if both dash types are found * remove spurious warnings, normalize indentation, and use Python idioms in revert_baselineskip()
2024-11-25 19:07:45 +00:00 · 2018-01-31 15:09:32 +01:00 · 2018-01-31 15:09:32 +01:00 · 2fce4d49ee
commit 2fce4d49ee
parent e4c0c6afdf
8 changed files with 251 additions and 200 deletions
--- a/autotests/export/lyx2lyx/lyx_2_3_test.lyx
+++ b/autotests/export/lyx2lyx/lyx_2_3_test.lyx
@ -94,7 +94,7 @@ Test reversion/conversion between 2.3 and 2.2 formats with lyx2lyx.
 \end_layout

 \begin_layout Description
-Allowbreak: 
+allowbreak: 
 \bar under

 \begin_inset Box Boxed
@ -169,5 +169,69 @@ without spaces.

 \end_layout

+\begin_layout Description
+baselineskip%: 
+\begin_inset Box Boxed
+position "t"
+hor_pos "c"
+has_inner_box 1
+inner_pos "t"
+use_parbox 1
+use_makebox 0
+width "250baselineskip%"
+special "none"
+height "50baselineskip%"
+height_special "none"
+thickness "4baselineskip%"
+separation "9baselineskip%"
+shadowsize "4pt"
+framecolor "black"
+backgroundcolor "none"
+status open
+
+\begin_layout Plain Layout
+test
+\end_layout
+
+\end_inset
+
+ 
+\begin_inset CommandInset line
+LatexCommand rule
+offset "40baselineskip%"
+width "800baselineskip%"
+height "5.3baselineskip%"
+
+\end_inset
+
+
+\end_layout
+
+\begin_deeper
+\begin_layout Standard
+\begin_inset VSpace 200baselineskip%
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Vertical space above this paragraph is 2·baselineskip.
+\end_layout
+
+\begin_layout Standard
+\begin_inset space \hspace*{}
+\length 75.2baselineskip%
+\end_inset
+
+Paragraph with
+\begin_inset space \hspace{}
+\length 135baselineskip%
+\end_inset
+
+horizontal space insets using baselineskip.
+\end_layout
+
+\end_deeper
 \end_body
 \end_document
--- a/lib/lyx2lyx/lyx2lyx_tools.py
+++ b/lib/lyx2lyx/lyx2lyx_tools.py
@ -37,8 +37,8 @@ insert_to_preamble(document, text[, index]):
  default index is 0, so the material is inserted at the beginning.
  Prepends a comment "% Added by lyx2lyx" to text.

-put_cmd_in_ert(arg):
-  Here arg should be a list of strings (lines), which we want to
+put_cmd_in_ert(cmd):
+  Here cmd should be a list of strings (lines), which we want to
  wrap in ERT. Returns a list of strings so wrapped.
  A call to this routine will often go something like this:
    i = find_token('\\begin_inset FunkyInset', ...)
@ -81,7 +81,6 @@ import string
 from parser_tools import find_token, find_end_of_inset
 from unicode_symbols import unicode_reps

-
 # This will accept either a list of lines or a single line.
 # It is bad practice to pass something with embedded newlines,
 # though we will handle that.
@ -129,23 +128,26 @@ def insert_to_preamble(document, text, index = 0):
    document.preamble[index:index] = text


-def put_cmd_in_ert(arg):
-    '''
-    arg should be a list of lines we want to wrap in ERT.
-    Returns a list of strings, with the lines so wrapped.
-    '''
+# A dictionary of Unicode->LICR mappings for use in a Unicode string's translate() method
+# Created from the reversed list to keep the first of alternative definitions.
+licr_table = dict((ord(ch), cmd) for cmd, ch in unicode_reps[::-1])

+def put_cmd_in_ert(cmd):
+    """
+    Return ERT inset wrapping `cmd` as a list of strings.
+
+    `cmd` can be a string or list of lines. Non-ASCII characters are converted
+    to the respective LICR macros if defined in unicodesymbols.
+    """
    ret = ["\\begin_inset ERT", "status collapsed", "", "\\begin_layout Plain Layout", ""]
-    # It will be faster for us to work with a single string internally. 
-    # That way, we only go through the unicode_reps loop once.
-    if type(arg) is list:
-      s = "\n".join(arg)
+    # It will be faster to work with a single string internally.
+    if isinstance(cmd, list):
+        cmd = u"\n".join(cmd)
    else:
-      s = arg
-    for rep in unicode_reps:
-      s = s.replace(rep[1], rep[0])
-    s = s.replace('\\', "\\backslash\n")
-    ret += s.splitlines()
+        cmd = u"%s" % cmd # ensure it is an unicode instance
+    cmd = cmd.translate(licr_table)
+    cmd = cmd.replace("\\", "\\backslash\n")
+    ret += cmd.splitlines()
    ret += ["\\end_layout", "", "\\end_inset"]
    return ret

@ -314,9 +316,14 @@ def latex_length(slen):
    # the + always precedes the -

    # Convert relative lengths to LaTeX units
-    units = {"text%":"\\textwidth", "col%":"\\columnwidth",
-             "page%":"\\paperwidth", "line%":"\\linewidth",
-             "theight%":"\\textheight", "pheight%":"\\paperheight"}
+    units = {"col%": "\\columnwidth",
+             "text%": "\\textwidth",
+             "page%": "\\paperwidth", 
+             "line%": "\\linewidth",
+             "theight%": "\\textheight",
+             "pheight%": "\\paperheight",
+             "baselineskip%": "\\baselineskip"
+            }
    for unit in list(units.keys()):
        i = slen.find(unit)
        if i == -1:
--- a/lib/lyx2lyx/lyx_1_6.py
+++ b/lib/lyx2lyx/lyx_1_6.py
@ -23,7 +23,7 @@ import unicodedata
 import sys, os

 from parser_tools import find_token, find_end_of, find_tokens, get_value
-from unicode_symbols import read_unicodesymbols
+from unicode_symbols import unicode_reps

 ####################################################################
 # Private helper functions
@ -146,54 +146,6 @@ def set_option(document, m, option, value):
    return l


-# FIXME: Remove this function if the version imported from unicode_symbols works.
-# This function was the predecessor from that function, that in the meanwhile got
-# new fixes.
-def read_unicodesymbols2():
-    " Read the unicodesymbols list of unicode characters and corresponding commands."
-
-    # Provide support for both python 2 and 3
-    PY2 = sys.version_info[0] == 2
-    if not PY2:
-        unichr = chr
-    # End of code to support for both python 2 and 3
-
-    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
-    fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols'))
-    spec_chars = []
-    # Two backslashes, followed by some non-word character, and then a character
-    # in brackets. The idea is to check for constructs like: \"{u}, which is how
-    # they are written in the unicodesymbols file; but they can also be written
-    # as: \"u or even \" u.
-    r = re.compile(r'\\\\(\W)\{(\w)\}')
-    for line in fp.readlines():
-        if line[0] != '#' and line.strip() != "":
-            line=line.replace(' "',' ') # remove all quotation marks with spaces before
-            line=line.replace('" ',' ') # remove all quotation marks with spaces after
-            line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis)
-            try:
-                [ucs4,command,dead] = line.split(None,2)
-                if command[0:1] != "\\":
-                    continue
-                spec_chars.append([command, unichr(eval(ucs4))])
-            except:
-                continue
-            m = r.match(command)
-            if m != None:
-                command = "\\\\"
-                # If the character is a double-quote, then we need to escape it, too,
-                # since it is done that way in the LyX file.
-                if m.group(1) == "\"":
-                    command += "\\"
-                commandbl = command
-                command += m.group(1) + m.group(2)
-                commandbl += m.group(1) + ' ' + m.group(2)
-                spec_chars.append([command, unichr(eval(ucs4))])
-                spec_chars.append([commandbl, unichr(eval(ucs4))])
-    fp.close()
-    return spec_chars
-
-
 def extract_argument(line):
    'Extracts a LaTeX argument from the start of line. Returns (arg, rest).'

@ -280,8 +232,6 @@ def latex2ert(line, isindex):
    return retval


-unicode_reps = read_unicodesymbols()
-
 #Bug 5022....
 #Might should do latex2ert first, then deal with stuff that DOESN'T
 #end up inside ERT. That routine could be modified so that it returned
--- a/lib/lyx2lyx/lyx_2_2.py
+++ b/lib/lyx2lyx/lyx_2_2.py
@ -746,10 +746,10 @@ def convert_phrases(document):
            if len(words) > 1 and words[0] == "\\begin_inset" and \
               words[1] in ["CommandInset", "External", "Formula", "Graphics", "listings"]:
                # must not replace anything in insets that store LaTeX contents in .lyx files
-                # (math and command insets withut overridden read() and write() methods
+                # (math and command insets without overridden read() and write() methods)
                j = find_end_of_inset(document.body, i)
                if j == -1:
-                    document.warning("Malformed LyX document: Can't find end of Formula inset at line " + str(i))
+                    document.warning("Malformed LyX document: Can't find end of inset at line " + str(i))
                    i += 1
                else:
                    i = j
--- a/lib/lyx2lyx/lyx_2_3.py
+++ b/lib/lyx2lyx/lyx_2_3.py
@ -27,15 +27,14 @@ import sys, os
 from parser_tools import (del_token, del_value, del_complete_lines,
    find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset,
    find_re, find_token, find_token_backwards, get_containing_inset,
-    get_containing_layout, get_bool_value, get_value, get_quoted_value)
-#  find_tokens, find_token_exact, is_in_inset,
-#  check_token, get_option_value
+    get_containing_layout, get_bool_value, get_value, get_quoted_value,
+    is_in_inset)
+#  find_tokens, find_token_exact, check_token, get_option_value

-from lyx2lyx_tools import add_to_preamble, put_cmd_in_ert, revert_font_attrs, \
-    insert_to_preamble
-#  get_ert, lyx2latex, \
-#  lyx2verbatim, length_in_bp, convert_info_insets
-#  latex_length, revert_flex_inset, hex2ratio, str2bool
+from lyx2lyx_tools import (add_to_preamble, put_cmd_in_ert, revert_font_attrs,
+                           insert_to_preamble, latex_length)
+#  get_ert, lyx2latex, lyx2verbatim, length_in_bp, convert_info_insets
+#  revert_flex_inset, hex2ratio, str2bool

 ####################################################################
 # Private helper functions
@ -88,13 +87,12 @@ def convert_dateinset(document):
            continue
        if get_value(document.body, 'template', i, j) == "Date":
            document.body[i : j + 1] = put_cmd_in_ert("\\today ")
-        i += 1
-        continue
+        i = j+1 # skip inset


 def convert_inputenc(document):
    " Replace no longer supported input encoding settings. "
-    i = find_token(document.header, "\\inputenc", 0)
+    i = find_token(document.header, "\\inputenc")
    if i == -1:
        return
    if get_value(document.header, "\\inputencoding", i) == "pt254":
@ -1859,7 +1857,7 @@ def convert_dashligatures(document):
        while i+1 < len(lines):
            i += 1
            line = lines[i]
-            # skip lines without any dashes:
+            # skip lines without dashes:
            if not re.search(u"[\u2013\u2014]|\\twohyphens|\\threehyphens", line):
                continue
            # skip label width string (see bug 10243):
@ -1867,28 +1865,28 @@ def convert_dashligatures(document):
                continue
            # do not touch hyphens in some insets (cf. lyx_2_2.convert_dashes):
            try:
-                value, start, end = get_containing_inset(lines, i)
+                inset_type, start, end = get_containing_inset(lines, i)
            except TypeError: # no containing inset
-                value, start, end = "no inset", -1, -1
-            if (value.split()[0] in
+                inset_type, start, end = "no inset", -1, -1
+            if (inset_type.split()[0] in
                ["CommandInset", "ERT", "External", "Formula",
                 "FormulaMacro", "Graphics", "IPA", "listings"]
-                or value == "Flex Code"):
+                or inset_type == "Flex Code"):
                i = end
                continue
            try:
-                layout, start, end, j = get_containing_layout(lines, i)
+                layoutname, start, end, j = get_containing_layout(lines, i)
            except TypeError: # no (or malformed) containing layout
                document.warning("Malformed LyX document: "
                                "Can't find layout at line %d" % i)
                continue
-            if layout == "LyX-Code":
+            if layoutname == "LyX-Code":
                i = end
                continue

            # literal dash followed by a word or no-break space:
-            if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line,
-                         flags=re.UNICODE):
+            if re.search(u"[\u2013\u2014]([\w\u00A0]|$)",
+                         line, flags=re.UNICODE):
                has_literal_dashes = True
            # ligature dash followed by word or no-break space on next line:
            if (re.search(r"(\\twohyphens|\\threehyphens)", line) and
@ -1900,14 +1898,15 @@ def convert_dashligatures(document):
                                 '"ligature" dashes.\n Line breaks may have changed. '
                                 'See UserGuide chapter 3.9.1 for details.')
                break
-        if has_literal_dashes:
+
+        if has_literal_dashes and not has_ligature_dashes:
            use_dash_ligatures = False
-        elif has_ligature_dashes:
+        elif has_ligature_dashes and not has_literal_dashes:
            use_dash_ligatures = True
+
    # insert the setting if there is a preferred value
    if use_dash_ligatures is not None:
-        i = find_token(document.header, "\\graphics")
-        document.header.insert(i, "\\use_dash_ligatures %s"
+        document.header.insert(-1, "\\use_dash_ligatures %s"
                               % str(use_dash_ligatures).lower())


@ -2022,62 +2021,35 @@ def revert_mathindent(document):
 def revert_baselineskip(document):
    " Revert baselineskips to TeX code "
    i = 0
-  vspaceLine = 0
-  hspaceLine = 0
+    regexp = re.compile(r'.*baselineskip%.*')
    while True:
-    regexp = re.compile(r'^.*baselineskip%.*$')
+        i = i + 1
        i = find_re(document.body, regexp, i)
        if i == -1:
            return
-    vspaceLine = find_token(document.body, "\\begin_inset VSpace", i)
-    if  vspaceLine == i:
+        if  document.body[i].startswith("\\begin_inset VSpace"):
            # output VSpace inset as TeX code
-      # first read out the values
-      beg = document.body[i].rfind("VSpace ");
-      end = document.body[i].rfind("baselineskip%");
-      baselineskip = float(document.body[i][beg + 7:end]);
-      # we store the value in percent, thus divide by 100
-      baselineskip = baselineskip/100;
-      baselineskip = str(baselineskip);
+            end = find_end_of_inset(document.body, i)
+            if end == -1:
+                document.warning("Malformed LyX document: "
+                        "Can't find end of VSpace inset at line %d." % i)
+                continue
+            # read out the value
+            baselineskip = document.body[i].split()[-1]
            # check if it is the starred version
-      if document.body[i].find('*') != -1:
-        star = '*'
-      else:
-        star = ''
+            star = '*' if '*' in document.body[i] else ''
            # now output TeX code
-      endInset = find_end_of_inset(document.body, i)
-      if endInset == -1:
-        document.warning("Malformed LyX document: Missing '\\end_inset' of VSpace inset.")
-        return
-      else:
-        document.body[vspaceLine: endInset + 1] = put_cmd_in_ert("\\vspace" + star + '{' + baselineskip + "\\baselineskip}")
-    hspaceLine = find_token(document.body, "\\begin_inset space \\hspace", i - 1)
-    document.warning("hspaceLine: " + str(hspaceLine))
-    document.warning("i: " + str(i))
-    if  hspaceLine == i - 1:
+            cmd = "\\vspace%s{%s}" %(star, latex_length(baselineskip)[1])
+            document.body[i:end+1] = put_cmd_in_ert(cmd)
+            i += 8
+            continue
+        begin, end = is_in_inset(document.body, i, "\\begin_inset space \\hspace")
+        if  begin != - 1:
            # output space inset as TeX code
-      # first read out the values
-      beg = document.body[i].rfind("\\length ");
-      end = document.body[i].rfind("baselineskip%");
-      baselineskip = float(document.body[i][beg + 7:end]);
-      document.warning("baselineskip: " + str(baselineskip))
-      # we store the value in percent, thus divide by 100
-      baselineskip = baselineskip/100;
-      baselineskip = str(baselineskip);
-      # check if it is the starred version
-      if document.body[i-1].find('*') != -1:
-        star = '*'
-      else:
-        star = ''
-      # now output TeX code
-      endInset = find_end_of_inset(document.body, i)
-      if endInset == -1:
-        document.warning("Malformed LyX document: Missing '\\end_inset' of space inset.")
-        return
-      else:
-        document.body[hspaceLine: endInset + 1] = put_cmd_in_ert("\\hspace" + star + '{' + baselineskip + "\\baselineskip}")
-
-    i = i + 1
+            baselineskip = document.body[i].split()[-1]
+            star = '*' if '*' in document.body[i-1] else ''
+            cmd = "\\hspace%s{%s}" %(star, latex_length(baselineskip)[1])
+            document.body[begin:end+1] = put_cmd_in_ert(cmd)


 def revert_rotfloat(document):
--- a/lib/lyx2lyx/parser_tools.py
+++ b/lib/lyx2lyx/parser_tools.py
@ -23,7 +23,7 @@ This module offers several free functions to help parse lines.
 More documentaton is below, but here is a quick guide to what
 they do. Optional arguments are marked by brackets.

-find_token(lines, token, start[, end[, ignorews]]):
+find_token(lines, token[, start[, end[, ignorews]]]):
  Returns the first line i, start <= i < end, on which
  token is found at the beginning. Returns -1 if not
  found.
@ -31,10 +31,10 @@ find_token(lines, token, start[, end[, ignorews]]):
  in whitespace do not count, except that there must be no
  extra whitespace following token itself.

-find_token_exact(lines, token, start[, end]):
+find_token_exact(lines, token[, start[, end]]):
  As find_token, but with ignorews set to True.

-find_tokens(lines, tokens, start[, end[, ignorews]]):
+find_tokens(lines, tokens[, start[, end[, ignorews]]]):
  Returns the first line i, start <= i < end, on which
  one of the tokens in tokens is found at the beginning.
  Returns -1 if not found.
@ -42,7 +42,7 @@ find_tokens(lines, tokens, start[, end[, ignorews]]):
  in whitespace do not count, except that there must be no
  extra whitespace following token itself.

-find_tokens_exact(lines, token, start[, end]):
+find_tokens_exact(lines, token[, start[, end]]):
  As find_tokens, but with ignorews True.

 find_token_backwards(lines, token, start):
@ -543,8 +543,9 @@ def is_in_inset(lines, i, inset, default=(-1,-1)):
      is_in_inset(document.body, i, "\\begin_inset Tabular")
    returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table).
    If it is, then it returns the line on which the table begins and the one
-    on which it ends. Note that this pair will evaulate to
-    boolean True, so
+    on which it ends.
+    Note that this pair will evaluate to boolean True, so (with the optional
+    default value set to False)
      if is_in_inset(..., default=False):
    will do what you expect.
    """
--- a/lib/lyx2lyx/test_lyx2lyx_tools.py
+++ b/lib/lyx2lyx/test_lyx2lyx_tools.py
@ -0,0 +1,52 @@
+# This file is part of lyx2lyx
+# -*- coding: utf-8 -*-
+# Copyright (C) 2018 The LyX team
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+" This module tests the auxiliary functions for lyx2lyx."
+
+from lyx2lyx_tools import *
+
+import unittest
+
+class TestParserTools(unittest.TestCase):
+
+    def test_put_cmd_in_ert(self):
+        ert =  ['\\begin_inset ERT',
+                'status collapsed',
+                '',
+                '\\begin_layout Plain Layout',
+                '',
+                u'\\backslash',
+                u'texttt{Gr\\backslash',
+                u'"{u}\\backslash',
+                u'ss{}e}',
+                '\\end_layout',
+                '',
+                '\\end_inset']
+        self.assertEqual(put_cmd_in_ert(u"\\texttt{Grüße}"), ert)
+        self.assertEqual(put_cmd_in_ert([u"\\texttt{Grüße}"]), ert)
+
+    def test_latex_length(self):
+        self.assertEqual(latex_length("-30.5col%"), (True, "-0.305\\columnwidth"))
+        self.assertEqual(latex_length("35baselineskip%"), (True, "0.35\\baselineskip"))
+        self.assertEqual(latex_length("11em"), (False, "11em"))
+        self.assertEqual(latex_length("-0.4pt"), (False, "-0.4pt"))
+        
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/lib/lyx2lyx/unicode_symbols.py
+++ b/lib/lyx2lyx/unicode_symbols.py
@ -18,7 +18,7 @@

 " Import unicode_reps from this module for access to the unicode<->LaTeX mapping. "

-import sys, os, re
+import sys, os, re, codecs

 # Provide support for both python 2 and 3
 PY2 = sys.version_info[0] == 2
@ -28,14 +28,13 @@ if not PY2:

 def read_unicodesymbols():
    " Read the unicodesymbols list of unicode characters and corresponding commands."
-    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
+    pathname = os.path.abspath(os.path.dirname(__file__))
    filename = os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols')

-    # For python 3+ we have to specify the encoding for those systems
-    # where the default is not UTF-8
-    fp = open(filename, encoding="utf8") if (not PY2) else open(filename)
+    # Read as Unicode strings in both Python 2 and 3.
+    # Specify the encoding for those systems where the default is not UTF-8
+    fp = codecs.open(filename, encoding="utf8")

-    spec_chars = []
    # A backslash, followed by some non-word character, and then a character
    # in brackets. The idea is to check for constructs like: \"{u}, which is how
    # they are written in the unicodesymbols file; but they can also be written
@ -43,8 +42,12 @@ def read_unicodesymbols():
    # The two backslashes in the string literal are needed to specify a literal
    # backslash in the regex. Without r prefix, these would be four backslashes.
    r = re.compile(r'\\(\W)\{(\w)\}')
+
+    spec_chars = []
    for line in fp.readlines():
-        if line[0] != '#' and line.strip() != "":
+        if not line.strip() or line.startswith('#'):
+            # skip empty lines and comments
+            continue
        # Note: backslashes in the string literals with r prefix are not escaped,
        #       so one backslash in the source file equals one backslash in memory.
        #       Without r prefix backslahses are escaped, so two backslashes in the
@ -57,10 +60,11 @@ def read_unicodesymbols():
            [ucs4,command,dead] = line.split(None,2)
            if command[0:1] != "\\":
                continue
+            literal_char = unichr(int(ucs4, 16))
            if (line.find("notermination=text") < 0 and
                line.find("notermination=both") < 0 and command[-1] != "}"):
                command = command + "{}"
-                spec_chars.append([command, unichr(eval(ucs4))])
+            spec_chars.append([command, literal_char])
        except:
            continue
        m = r.match(command)
@ -69,10 +73,11 @@ def read_unicodesymbols():
            commandbl = command
            command += m.group(1) + m.group(2)
            commandbl += m.group(1) + ' ' + m.group(2)
-                spec_chars.append([command, unichr(eval(ucs4))])
-                spec_chars.append([commandbl, unichr(eval(ucs4))])
+            spec_chars.append([command, literal_char])
+            spec_chars.append([commandbl, literal_char])
    fp.close()
    return spec_chars


 unicode_reps = read_unicodesymbols()
+