Fix lyx2lyx dash conversion and make it faster.

2025-01-03 08:28:25 +00:00 · 2018-01-24 17:38:19 +01:00 · 2018-01-24 17:38:19 +01:00 · a151b274bf
commit a151b274bf
parent 62f32992ab
3 changed files with 91 additions and 83 deletions
--- a/lib/lyx2lyx/lyx_2_1.py
+++ b/lib/lyx2lyx/lyx_2_1.py
@ -1557,10 +1557,11 @@ def convert_latexargs(document):
                    "theorems-chap-bytype", "theorems-chap", "theorems-named", "theorems-sec-bytype",
                    "theorems-sec", "theorems-starred", "theorems-std", "todonotes"]
    # Modules we need to take care of
-    caveat_modules = ["initials"]
+    caveat_modules = ["initials"] # TODO: , "graphicboxes", "bicaption"]
    # information about the relevant styles in caveat_modules (number of opt and req args)
    # use this if we get more caveat_modules. For now, use hard coding (see below).
    # initials = [{'Layout' : 'Initial', 'opt' : 1, 'req' : 1}]
    # graphicboxes = { ... }
    # Is this a known safe layout?
    safe_layout = document.textclass in safe_layouts
--- a/lib/lyx2lyx/lyx_2_2.py
+++ b/lib/lyx2lyx/lyx_2_2.py
@ -37,7 +37,7 @@ from lyx2lyx_tools import (add_to_preamble, put_cmd_in_ert, get_ert,
 from parser_tools import (check_token, del_complete_lines,
    find_end_of_inset, find_end_of_layout, find_nonempty_line, find_re,
    find_token, find_token_backwards, get_containing_layout,
-    get_value, is_in_inset)
+    get_containing_inset, get_value, is_in_inset)
 ####################################################################
@ -622,41 +622,40 @@ def convert_dashes(document):
    while i+1 < len(lines):
        i += 1
        line = lines[i]
-        words = line.split()
+        if "--" not in line:
        if (len(words) > 1 and words[0] == "\\begin_inset"
            and (words[1] in ["CommandInset", "ERT", "External", "Formula",
                              "FormulaMacro", "Graphics", "IPA", "listings"]
                 or line.endswith("Flex Code"))):
            # must not replace anything in insets that store LaTeX contents in .lyx files
            # (math and command insets without overridden read() and write() methods)
            # filtering out IPA makes Text::readParToken() simpler
            # skip ERT as well since it is not needed there
            # Flex Code is logical markup, typically rendered as typewriter
            j = find_end_of_inset(lines, i)
            if j == -1:
                document.warning("Malformed LyX document: Can't find end of " +
                                 words[1] + " inset at line " + str(i))
            else:
                i = j
            continue
        if lines[i] == "\\begin_layout LyX-Code":
            j = find_end_of_layout(lines, i)
            if j == -1:
                document.warning("Malformed LyX document: "
                    "Can't find end of %s layout at line %d" % (words[1],i))
            else:
                i = j
            continue
        # skip label width string (bug 10243):
        if line.startswith("\\labelwidthstring"):
            # skip label width string (bug 10243)
            continue
-
+        # Do not touch hyphens in some insets:
-        if "--" in line:
+        try:
-            # We can have an arbitrary number of consecutive hyphens.
+            value, start, end = get_containing_inset(lines, i)
-            # Replace as LaTeX does: First try emdash, then endash
+        except TypeError:
-            line = line.replace("---", "\\threehyphens\n")
+            # False means no (or malformed) containing inset
-            line = line.replace("--", "\\twohyphens\n")
+            value, start, end = "no inset", -1, -1
-            lines[i:i+1] = line.splitlines()
+        # We must not replace anything in insets that store LaTeX contents in .lyx files
        # (math and command insets without overridden read() and write() methods).
        # Filtering out IPA and ERT makes Text::readParToken() simpler,
        # Flex Code is logical markup, typically rendered as typewriter
        if (value.split()[0] in ["CommandInset", "ERT", "External", "Formula",
                                 "FormulaMacro", "Graphics", "IPA", "listings"]
            or value in ["Flex Code", "Flex URL"]):
            i = end
            continue
        try:
            layout, start, end, j = get_containing_layout(lines, i)
        except TypeError: # no (or malformed) containing layout
            document.warning("Malformed LyX document: "
                             "Can't find layout at line %d" % i)
            continue
        if layout == "LyX-Code":
            i = end
            continue
        # We can have an arbitrary number of consecutive hyphens.
        # Replace as LaTeX does: First try emdash, then endash
        line = line.replace("---", "\\threehyphens\n")
        line = line.replace("--", "\\twohyphens\n")
        lines[i:i+1] = line.splitlines()
    # remove ligature breaks between dashes
    i = 1
@ -672,40 +671,40 @@ def convert_dashes(document):
 def revert_dashes(document):
    """
    Remove preamble code from 2.3->2.2 conversion.
    Prevent ligatures of existing --- and --.
    Revert \\twohyphens and \\threehyphens to -- and ---.
    Remove preamble code from 2.3->2.2 conversion.
    """
    del_complete_lines(document.preamble,
                       ['% Added by lyx2lyx',
                        r'\renewcommand{\textendash}{--}',
                        r'\renewcommand{\textemdash}{---}'])
    # Insert ligature breaks to prevent ligation of hyphens to dashes:
    lines = document.body
    i = 0
    while i+1 < len(lines):
        i += 1
        line = lines[i]
        if "--" not in line:
            continue
        # skip label width string (bug 10243):
        if line.startswith("\\labelwidthstring"):
            continue
        # do not touch hyphens in some insets (cf. convert_dashes):
-        if line.startswith("\\begin_inset"):
+        try:
-            try:
+            value, start, end = get_containing_inset(lines, i)
-                if line.split()[1] in ["CommandInset", "ERT", "External",
+        except TypeError:
-                                       "Formula", "FormulaMacro", "Graphics",
+            # False means no (or malformed) containing inset
-                                       "IPA", "listings"]:
+            value, start, end = "no inset", -1, -1
-                    j = find_end_of_inset(lines, i)
+        if (value.split()[0] in ["CommandInset", "ERT", "External", "Formula",
-                    if j == -1:
+                                 "FormulaMacro", "Graphics", "IPA", "listings"]
-                        document.warning("Malformed LyX document: Can't find "
+            or value == "Flex URL"):
-                                    "end of %s inset at line %d." % (itype, i))
+            i = end
-                        continue
+            continue
-                    i = j
+        line = line.replace("--", "-\\SpecialChar \\textcompwordmark{}\n-")
-            except IndexError:
+        document.body[i:i+1] = line.split('\n')
-                continue
+
        if "--" in line:
            line = line.replace("--", "-\\SpecialChar \\textcompwordmark{}\n-")
            document.body[i:i+1] = line.split('\n')
    # Revert \twohyphens and \threehyphens:
    i = 1
    while i < len(lines):
--- a/lib/lyx2lyx/lyx_2_3.py
+++ b/lib/lyx2lyx/lyx_2_3.py
@ -26,7 +26,7 @@ import sys, os
 from parser_tools import (del_token, del_value, del_complete_lines,
    find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset,
-    find_re, find_token, find_token_backwards,
+    find_re, find_token, find_token_backwards, get_containing_inset,
    get_containing_layout, get_bool_value, get_value, get_quoted_value)
 #  find_tokens, find_token_exact, is_in_inset,
 #  check_token, get_option_value
@ -1853,46 +1853,54 @@ def convert_dashligatures(document):
    if use_dash_ligatures is None:
        # Look for dashes (Documents by LyX 2.1 or older have "\twohyphens\n"
        # or "\threehyphens\n" as interim representation for -- and ---.)
-        has_literal_dashes = False
+        lines = document.body
-        has_ligature_dashes = False
+        has_literal_dashes = has_ligature_dashes = False
-        j = 0
+        i = j = 0
-        for i, line in enumerate(document.body):
+        while i+1 < len(lines):
-            # Skip some document parts where dashes are not converted
+            i += 1
-            if (i < j) or line.startswith("\\labelwidthstring"):
+            line = lines[i]
            # skip lines without any dashes:
            if not re.search(u"[\u2013\u2014]|\\twohyphens|\\threehyphens", line):
                continue
-            if line.startswith("\\begin_inset"):
+            # skip label width string (see bug 10243):
-                try:
+            if line.startswith("\\labelwidthstring"):
                    it = line.split()[1]
                except IndexError:
                    continue
                if (it in ["CommandInset", "ERT", "External", "Formula",
                           "FormulaMacro", "Graphics", "IPA", "listings"]
                    or line.endswith("Flex Code")):
                    j = find_end_of_inset(document.body, i)
                    if j == -1:
                        document.warning("Malformed LyX document: Can't "
                            "find end of %s inset at line %d." % (itype, i))
                        continue
            if line == "\\begin_layout LyX-Code":
                j = find_end_of_layout(document.body, i)
                if j == -1:
                    document.warning("Malformed LyX document: "
                       "Can't find end of %s layout at line %d" % (words[1],i))
                continue
            # do not touch hyphens in some insets (cf. lyx_2_2.convert_dashes):
            try:
                value, start, end = get_containing_inset(lines, i)
            except TypeError: # no containing inset
                value, start, end = "no inset", -1, -1
            if (value.split()[0] in
                ["CommandInset", "ERT", "External", "Formula",
                 "FormulaMacro", "Graphics", "IPA", "listings"]
                or value == "Flex Code"):
                i = end
                continue
            try:
                layout, start, end, j = get_containing_layout(lines, i)
            except TypeError: # no (or malformed) containing layout
                document.warning("Malformed LyX document: "
                                "Can't find layout at line %d" % i)
                continue
            if layout == "LyX-Code":
                i = end
                continue
            # literal dash followed by a word or no-break space:
            if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line,
                         flags=re.UNICODE):
                has_literal_dashes = True
            # ligature dash followed by word or no-break space on next line:
            if (re.search(r"(\\twohyphens|\\threehyphens)", line) and
-                re.match(u"[\w\u00A0]", document.body[i+1], flags=re.UNICODE)):
+                re.match(u"[\w\u00A0]", lines[i+1], flags=re.UNICODE)):
                has_ligature_dashes = True
-        if has_literal_dashes and has_ligature_dashes:
+            if has_literal_dashes and has_ligature_dashes:
-            # TODO: insert a warning note in the document?
+                # TODO: insert a warning note in the document?
-            document.warning('This document contained both literal and '
+                document.warning('This document contained both literal and '
-                '"ligature" dashes.\n Line breaks may have changed. '
+                                 '"ligature" dashes.\n Line breaks may have changed. '
-                'See UserGuide chapter 3.9.1 for details.')
+                                 'See UserGuide chapter 3.9.1 for details.')
-        elif has_literal_dashes:
+                break
        if has_literal_dashes:
            use_dash_ligatures = False
        elif has_ligature_dashes:
            use_dash_ligatures = True