Fix lyx2lyx dash conversion and make it faster.

2025-01-21 23:09:40 +00:00 · 2018-01-24 17:38:19 +01:00 · 2018-01-24 17:38:19 +01:00 · a151b274bf
commit a151b274bf
parent 62f32992ab
3 changed files with 91 additions and 83 deletions
--- a/lib/lyx2lyx/lyx_2_1.py
+++ b/lib/lyx2lyx/lyx_2_1.py
@@ -1557,10 +1557,11 @@ def convert_latexargs(document):
                    "theorems-chap-bytype", "theorems-chap", "theorems-named", "theorems-sec-bytype",
                    "theorems-sec", "theorems-starred", "theorems-std", "todonotes"]
    # Modules we need to take care of
-    caveat_modules = ["initials"]
+    caveat_modules = ["initials"] # TODO: , "graphicboxes", "bicaption"]
    # information about the relevant styles in caveat_modules (number of opt and req args)
    # use this if we get more caveat_modules. For now, use hard coding (see below).
    # initials = [{'Layout' : 'Initial', 'opt' : 1, 'req' : 1}]
+    # graphicboxes = { ... }

    # Is this a known safe layout?
    safe_layout = document.textclass in safe_layouts
--- a/lib/lyx2lyx/lyx_2_2.py
+++ b/lib/lyx2lyx/lyx_2_2.py
@@ -37,7 +37,7 @@ from lyx2lyx_tools import (add_to_preamble, put_cmd_in_ert, get_ert,
 from parser_tools import (check_token, del_complete_lines,
    find_end_of_inset, find_end_of_layout, find_nonempty_line, find_re,
    find_token, find_token_backwards, get_containing_layout,
-    get_value, is_in_inset)
+    get_containing_inset, get_value, is_in_inset)


 ####################################################################
@@ -622,41 +622,40 @@ def convert_dashes(document):
    while i+1 < len(lines):
        i += 1
        line = lines[i]
-        words = line.split()
-        if (len(words) > 1 and words[0] == "\\begin_inset"
-            and (words[1] in ["CommandInset", "ERT", "External", "Formula",
-                              "FormulaMacro", "Graphics", "IPA", "listings"]
-                 or line.endswith("Flex Code"))):
-            # must not replace anything in insets that store LaTeX contents in .lyx files
-            # (math and command insets without overridden read() and write() methods
-            # filtering out IPA makes Text::readParToken() more simple
-            # skip ERT as well since it is not needed there
-            # Flex Code is logical markup, typically rendered as typewriter
-            j = find_end_of_inset(lines, i)
-            if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " +
-                                 words[1] + " inset at line " + str(i))
-            else:
-                i = j
-            continue
-        if lines[i] == "\\begin_layout LyX-Code":
-            j = find_end_of_layout(lines, i)
-            if j == -1:
-                document.warning("Malformed LyX document: "
-                    "Can't find end of %s layout at line %d" % (words[1],i))
-            else:
-                i = j
+        if "--" not in line:
            continue
+        # skip label width string (bug 10243):
        if line.startswith("\\labelwidthstring"):
-            # skip label width string (bug 10243)
            continue
-
-        if "--" in line:
-            # We can have an arbitrary number of consecutive hyphens.
-            # Replace as LaTeX does: First try emdash, then endash
-            line = line.replace("---", "\\threehyphens\n")
-            line = line.replace("--", "\\twohyphens\n")
-            lines[i:i+1] = line.splitlines()
+        # Do not touch hyphens in some insets:
+        try:
+            value, start, end = get_containing_inset(lines, i)
+        except TypeError:
+            # False means no (or malformed) containing inset
+            value, start, end = "no inset", -1, -1
+        # We must not replace anything in insets that store LaTeX contents in .lyx files
+        # (math and command insets without overridden read() and write() methods).
+        # Filtering out IPA and ERT makes Text::readParToken() simpler.
+        # Flex Code is logical markup, typically rendered as typewriter.
+        if (value.split()[0] in ["CommandInset", "ERT", "External", "Formula",
+                                 "FormulaMacro", "Graphics", "IPA", "listings"]
+            or value in ["Flex Code", "Flex URL"]):
+            i = end
+            continue
+        try:
+            layout, start, end, j = get_containing_layout(lines, i)
+        except TypeError: # no (or malformed) containing layout
+            document.warning("Malformed LyX document: "
+                             "Can't find layout at line %d" % i)
+            continue
+        if layout == "LyX-Code":
+            i = end
+            continue
+        # We can have an arbitrary number of consecutive hyphens.
+        # Replace as LaTeX does: First try emdash, then endash
+        line = line.replace("---", "\\threehyphens\n")
+        line = line.replace("--", "\\twohyphens\n")
+        lines[i:i+1] = line.splitlines()

    # remove ligature breaks between dashes
    i = 1
@@ -672,40 +671,40 @@ def convert_dashes(document):

 def revert_dashes(document):
    """
+    Remove preamble code from 2.3->2.2 conversion.
    Prevent ligatures of existing --- and --.
    Revert \\twohyphens and \\threehyphens to -- and ---.
-    Remove preamble code from 2.3->2.2 conversion.
    """
    del_complete_lines(document.preamble,
                       ['% Added by lyx2lyx',
                        r'\renewcommand{\textendash}{--}',
                        r'\renewcommand{\textemdash}{---}'])
+
    # Insert ligature breaks to prevent ligation of hyphens to dashes:
    lines = document.body
    i = 0
    while i+1 < len(lines):
        i += 1
        line = lines[i]
+        if "--" not in line:
+            continue
        # skip label width string (bug 10243):
        if line.startswith("\\labelwidthstring"):
            continue
        # do not touch hyphens in some insets (cf. convert_dashes):
-        if line.startswith("\\begin_inset"):
-            try:
-                if line.split()[1] in ["CommandInset", "ERT", "External",
-                                       "Formula", "FormulaMacro", "Graphics",
-                                       "IPA", "listings"]:
-                    j = find_end_of_inset(lines, i)
-                    if j == -1:
-                        document.warning("Malformed LyX document: Can't find "
-                                    "end of %s inset at line %d." % (itype, i))
-                        continue
-                    i = j
-            except IndexError:
-                continue
-        if "--" in line:
-            line = line.replace("--", "-\\SpecialChar \\textcompwordmark{}\n-")
-            document.body[i:i+1] = line.split('\n')
+        try:
+            value, start, end = get_containing_inset(lines, i)
+        except TypeError:
+            # False means no (or malformed) containing inset
+            value, start, end = "no inset", -1, -1
+        if (value.split()[0] in ["CommandInset", "ERT", "External", "Formula",
+                                 "FormulaMacro", "Graphics", "IPA", "listings"]
+            or value == "Flex URL"):
+            i = end
+            continue
+        line = line.replace("--", "-\\SpecialChar \\textcompwordmark{}\n-")
+        document.body[i:i+1] = line.split('\n')
+
    # Revert \twohyphens and \threehyphens:
    i = 1
    while i < len(lines):
--- a/lib/lyx2lyx/lyx_2_3.py
+++ b/lib/lyx2lyx/lyx_2_3.py
@@ -26,7 +26,7 @@ import sys, os

 from parser_tools import (del_token, del_value, del_complete_lines,
    find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset,
-    find_re, find_token, find_token_backwards,
+    find_re, find_token, find_token_backwards, get_containing_inset,
    get_containing_layout, get_bool_value, get_value, get_quoted_value)
 #  find_tokens, find_token_exact, is_in_inset,
 #  check_token, get_option_value
@@ -1853,46 +1853,54 @@ def convert_dashligatures(document):
    if use_dash_ligatures is None:
        # Look for dashes (Documents by LyX 2.1 or older have "\twohyphens\n"
        # or "\threehyphens\n" as interim representation for -- an ---.)
-        has_literal_dashes = False
-        has_ligature_dashes = False
-        j = 0
-        for i, line in enumerate(document.body):
-            # Skip some document parts where dashes are not converted
-            if (i < j) or line.startswith("\\labelwidthstring"):
+        lines = document.body
+        has_literal_dashes = has_ligature_dashes = False
+        i = j = 0
+        while i+1 < len(lines):
+            i += 1
+            line = lines[i]
+            # skip lines without any dashes:
+            if not re.search(u"[\u2013\u2014]|\\twohyphens|\\threehyphens", line):
                continue
-            if line.startswith("\\begin_inset"):
-                try:
-                    it = line.split()[1]
-                except IndexError:
-                    continue
-                if (it in ["CommandInset", "ERT", "External", "Formula",
-                           "FormulaMacro", "Graphics", "IPA", "listings"]
-                    or line.endswith("Flex Code")):
-                    j = find_end_of_inset(document.body, i)
-                    if j == -1:
-                        document.warning("Malformed LyX document: Can't "
-                            "find end of %s inset at line %d." % (itype, i))
-                        continue
-            if line == "\\begin_layout LyX-Code":
-                j = find_end_of_layout(document.body, i)
-                if j == -1:
-                    document.warning("Malformed LyX document: "
-                       "Can't find end of %s layout at line %d" % (words[1],i))
+            # skip label width string (see bug 10243):
+            if line.startswith("\\labelwidthstring"):
                continue
+            # do not touch hyphens in some insets (cf. lyx_2_2.convert_dashes):
+            try:
+                value, start, end = get_containing_inset(lines, i)
+            except TypeError: # no containing inset
+                value, start, end = "no inset", -1, -1
+            if (value.split()[0] in
+                ["CommandInset", "ERT", "External", "Formula",
+                 "FormulaMacro", "Graphics", "IPA", "listings"]
+                or value == "Flex Code"):
+                i = end
+                continue
+            try:
+                layout, start, end, j = get_containing_layout(lines, i)
+            except TypeError: # no (or malformed) containing layout
+                document.warning("Malformed LyX document: "
+                                "Can't find layout at line %d" % i)
+                continue
+            if layout == "LyX-Code":
+                i = end
+                continue
+
            # literal dash followed by a word or no-break space:
            if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line,
                         flags=re.UNICODE):
                has_literal_dashes = True
            # ligature dash followed by word or no-break space on next line:
            if (re.search(r"(\\twohyphens|\\threehyphens)", line) and
-                re.match(u"[\w\u00A0]", document.body[i+1], flags=re.UNICODE)):
+                re.match(u"[\w\u00A0]", lines[i+1], flags=re.UNICODE)):
                has_ligature_dashes = True
-        if has_literal_dashes and has_ligature_dashes:
-            # TODO: insert a warning note in the document?
-            document.warning('This document contained both literal and '
-                '"ligature" dashes.\n Line breaks may have changed. '
-                'See UserGuide chapter 3.9.1 for details.')
-        elif has_literal_dashes:
+            if has_literal_dashes and has_ligature_dashes:
+                # TODO: insert a warning note in the document?
+                document.warning('This document contained both literal and '
+                                 '"ligature" dashes.\n Line breaks may have changed. '
+                                 'See UserGuide chapter 3.9.1 for details.')
+                break
+        if has_literal_dashes:
            use_dash_ligatures = False
        elif has_ligature_dashes:
            use_dash_ligatures = True