Fix lyx2lyx conversion of dashes.

2024-12-22 13:18:28 +00:00 · 2017-09-30 23:26:02 +02:00 · 2017-09-30 23:26:02 +02:00 · 586d16a49c
commit 586d16a49c
parent 291d2f1037
2 changed files with 81 additions and 78 deletions
--- a/lib/lyx2lyx/lyx_2_2.py
+++ b/lib/lyx2lyx/lyx_2_2.py
@@ -659,6 +659,12 @@ def convert_dashes(document):
 def revert_dashes(document):
    "convert \\twohyphens and \\threehyphens to -- and ---"
    # eventually remove preamble code from 2.3->2.2 conversion:
    for i, line in enumerate(document.preamble):
        if i > 1 and line == r'\renewcommand{\textemdash}{---}':
            if (document.preamble[i-1] == r'\renewcommand{\textendash}{--}'
                and document.preamble[i-2] == '% Added by lyx2lyx'):
                del document.preamble[i-2:i+1]
    i = 0
    while i < len(document.body):
        words = document.body[i].split()
--- a/lib/lyx2lyx/lyx_2_3.py
+++ b/lib/lyx2lyx/lyx_2_3.py
@@ -1841,58 +1841,63 @@ def revert_chapterbib(document):
 def convert_dashligatures(document):
-    " Remove a zero-length space (U+200B) after en- and em-dashes. "
+    "Set 'use_dash_ligatures' according to content."
-
+    use_dash_ligatures = None
-    i = find_token(document.header, "\\use_microtype", 0)
+    # eventually remove preamble code from 2.3->2.2 conversion:
-    if i != -1:
+    for i, line in enumerate(document.preamble):
-        if document.initial_format > 474 and document.initial_format < 509:
+        if i > 1 and line == r'\renewcommand{\textemdash}{---}':
-            # This was created by LyX 2.2
+            if (document.preamble[i-1] == r'\renewcommand{\textendash}{--}'
-            document.header[i+1:i+1] = ["\\use_dash_ligatures false"]
+                and document.preamble[i-2] == '% Added by lyx2lyx'):
-        else:
+                del document.preamble[i-2:i+1]
-            # This was created by LyX 2.1 or earlier
+                use_dash_ligatures = True
-            document.header[i+1:i+1] = ["\\use_dash_ligatures true"]
+    if use_dash_ligatures is None:
-
+        # Look for dashes:
-    i = 0
+        # (Documents by LyX 2.1 or older have "\twohyphens\n" or "\threehyphens\n"
-    while i < len(document.body):
+        # as interim representation for dash ligatures in 2.2.)
-        words = document.body[i].split()
+        has_literal_dashes = False
-        # Skip some document parts where dashes are not converted
+        has_ligature_dashes = False
-        if len(words) > 1 and words[0] == "\\begin_inset" and \
+        j = 0
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
+        for i, line in enumerate(document.body):
-                        "FormulaMacro", "Graphics", "IPA", "listings"]:
+            # Skip some document parts where dashes are not converted
-            j = find_end_of_inset(document.body, i)
+            if (i < j) or line.startswith("\\labelwidthstring"):
-            if j == -1:
+                continue
-                document.warning("Malformed LyX document: Can't find end of " \
+            words = line.split()
-                                 + words[1] + " inset at line " + str(i))
+            if len(words) > 1 and words[0] == "\\begin_inset" and \
-                i += 1
+            words[1] in ["CommandInset", "ERT", "External", "Formula",
-            else:
+                         "FormulaMacro", "Graphics", "IPA", "listings"]:
-                i = j
+                j = find_end_of_inset(document.body, i)
-            continue
+                if j == -1:
-        if len(words) > 0 and words[0] in ["\\leftindent", \
+                    document.warning("Malformed LyX document: "
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
+                        "Can't find end of %s inset at line %d" % (words[1],i))
-            i += 1
+                continue
-            continue
+            # literal dash followed by a word or no-break space:
-
+            if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line,
-        start = 0
+                         flags=re.UNICODE):
-        while True:
+                has_literal_dashes = True
-            j = document.body[i].find(u"\u2013", start) # en-dash
+            # ligature dash followed by word or no-break space on next line:
-            k = document.body[i].find(u"\u2014", start) # em-dash
+            if re.search(ur"(\\twohyphens|\\threehyphens)", line,
-            if j == -1 and k == -1:
+                            flags=re.UNICODE) and re.match(u"[\w\u00A0]",
-                break
+                            document.body[i+1], flags=re.UNICODE):
-            if j == -1 or (k != -1 and k < j):
+                has_ligature_dashes = True
-                j = k
+        if has_literal_dashes and has_ligature_dashes:
-            after = document.body[i][j+1:]
+            # TODO: insert a warning note in the document?
-            if after.startswith(u"\u200B"):
+            document.warning('This document contained both literal and '
-                document.body[i] = document.body[i][:j+1] + after[1:]
+                '"ligature" dashes.\n Line breaks may have changed. '
-            else:
+                'See UserGuide chapter 3.9.1 for details.')
-                if len(after) == 0 and document.body[i+1].startswith(u"\u200B"):
+        elif has_literal_dashes:
-                    document.body[i+1] = document.body[i+1][1:]
+            use_dash_ligatures = False
-                    break
+        elif has_ligature_dashes:
-            start = j+1
+            use_dash_ligatures = True
-        i += 1
+    # insert the setting if there is a preferred value
-
+    if use_dash_ligatures is not None:
        i = find_token(document.header, "\\use_microtype", 0)
        if i != -1:
            document.header.insert(i+1, "\\use_dash_ligatures %s"
                                % str(use_dash_ligatures).lower())
 def revert_dashligatures(document):
-    " Remove font ligature settings for en- and em-dashes. "
+    """Remove font ligature settings for en- and em-dashes.
    Revert conversion of \twodashes or \threedashes to literal dashes."""
    i = find_token(document.header, "\\use_dash_ligatures", 0)
    if i == -1:
        return
@@ -1902,42 +1907,34 @@ def revert_dashligatures(document):
    i = find_token(document.header, "\\use_non_tex_fonts", 0)
    if i != -1:
        use_non_tex_fonts = get_bool_value(document.header, "\\use_non_tex_fonts", i)
-    if not use_dash_ligatures or use_non_tex_fonts:
+    if not use_dash_ligatures or document.backend != "latex":
        return
-    # Add a zero-length space (U+200B) after en- and em-dashes
+    j = 0
-    i = 0
+    new_body = []
-    while i < len(document.body):
+    for i, line in enumerate(document.body):
        words = document.body[i].split()
        # Skip some document parts where dashes are not converted
        if (i < j) or line.startswith("\\labelwidthstring"):
            new_body.append(line)
            continue
        words = line.split()
        if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
+           words[1] in ["CommandInset", "ERT", "External", "Formula",
                        "FormulaMacro", "Graphics", "IPA", "listings"]:
            j = find_end_of_inset(document.body, i)
            if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
+                document.warning("Malformed LyX document: Can't find end of "
                                 + words[1] + " inset at line " + str(i))
-                i += 1
+            new_body.append(line)
            else:
                i = j
            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
+        line = line.replace(u'\u2013', '\\twohyphens\n')
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
+        line = line.replace(u'\u2014', '\\threehyphens\n')
-            i += 1
+        lines = line.split('\n')
-            continue
+        new_body.extend(line.split('\n'))
-
+    document.body = new_body
-        start = 0
+    # redefine the dash LICRs to use ligature dashes:
-        while True:
+    add_to_preamble(document, [r'\renewcommand{\textendash}{--}',
-            j = document.body[i].find(u"\u2013", start) # en-dash
+                               r'\renewcommand{\textemdash}{---}'])
            k = document.body[i].find(u"\u2014", start) # em-dash
            if j == -1 and k == -1:
                break
            if j == -1 or (k != -1 and k < j):
                j = k
            after = document.body[i][j+1:]
            document.body[i] = document.body[i][:j+1] + u"\u200B" + after
            start = j+1
        i += 1
 def revert_noto(document):
@@ -2228,7 +2225,7 @@ def revert_mathnumberingname(document):
        else:
            l = find_token(document.header, "\\use_default_options", 0)
            document.header.insert(l, "\\options reqno")
-    # add the math_number_before tag   
+    # add the math_number_before tag
    regexp = re.compile(r'(\\math_numbering_side default)')
    i = find_re(document.header, regexp, 0)
    if i != -1: