Fix lyx2lyx conversion of dashes.

Update for UserGuide and de/UserGuide will follow.
2024-12-22 05:16:21 +00:00 · 2017-09-30 23:26:02 +02:00 · 2017-09-30 23:26:02 +02:00 · 39beb08bf8
commit 39beb08bf8
parent f4ef992966
3 changed files with 90 additions and 100 deletions
--- a/lib/RELEASE-NOTES
+++ b/lib/RELEASE-NOTES
@ -14,10 +14,9 @@
  if needed, as usual.

 * The new setting
-  "Document->Settings->Fonts->Disallow line breaks after dashes" forces
-  output of en- and em-dashes as \textendash and \textemdash when exporting
-  to LaTeX. It is is "false" by default but "true" when opening documents
-  edited with LyX 2.2.
+  "Document->Settings->Fonts->Disallow line breaks after dashes"
+  turns off the conversion of em- and en-dash characters to --- and --
+  respectively for LaTeX export. It is "false" by default.
  See chapter 3.9.1.1 "Dashes and Line Breaks" of the User Guide and
  "Caveats when upgrading from earlier versions to 2.3.x" below.

@ -214,27 +213,15 @@
  the external_templates file, you will have to move the modifications to
  the respective *.xtemplate file manually.

-* By default, LyX 2.3 outputs en- and em-dashes after which a line break can
-  occur in the output. Sometimes, this results in undesired line breaks.
+* By default, LyX 2.3 outputs en- and em-dashes as -- and --- respectively,
+  so that a line break can occur in the output immediately after the dash.
+  Sometimes, this results in undesired line breaks or overfull lines due to
+  suppression of hyphenation in the word preceding the dash.
  Select "Document->Settings->Fonts->Disallow line breaks after dashes"
-  to keep the LyX 2.2 behaviour, where such line breaks have been generally
-  suppressed. See chapter 3.9.1.1, "Dashes and Line Breaks", of the User Guide
+  to keep the LyX 2.2 behaviour.
+  See chapter 3.9.1.1, "Dashes and Line Breaks", of the User Guide
  for details.

-* ZWSP characters (u200b) following literal em- and en-dashes are deleted by
-  lyx2lyx when converting to 2.3 format. If you used them as optional line
-  breaks after dashes, convert them to space insets before opening your
-  document with LyX 2.3 or the optional line breaks will be lost!
-
-* If using TeX fonts and en- and em-dashes are output in breakable form,
-  when exporting documents containing en- and em-dashes to the format of
-  LyX 2.0 or earlier, the following line has to be manually added to the
-  unicodesymbols file of that LyX version:<br>
-  0x200b "\\hspace{0pt}" "" "" "" "" # ZERO WIDTH SPACE<br>
-  This avoids "uncodable character" issues if the document is actually
-  loaded by that LyX version. LyX 2.1 and later versions already have the
-  necessary definition in their unicodesymbols file.
-
 * If trying to compile documents using R scripts and sweave/knitr, LyX
  2.3.x would not allow for re-running the R scripts, unless the user:
  1) explicitly disables the "Forbid use of needauth converters"
--- a/lib/lyx2lyx/lyx_2_2.py
+++ b/lib/lyx2lyx/lyx_2_2.py
@ -659,6 +659,12 @@ def convert_dashes(document):
 def revert_dashes(document):
    "convert \\twohyphens and \\threehyphens to -- and ---"

+    # if present, remove preamble code from 2.3->2.2 conversion:
+    for i, line in enumerate(document.preamble):
+        if i > 1 and line == r'\renewcommand{\textemdash}{---}':
+            if (document.preamble[i-1] == r'\renewcommand{\textendash}{--}'
+                and document.preamble[i-2] == '% Added by lyx2lyx'):
+                del document.preamble[i-2:i+1]
    i = 0
    while i < len(document.body):
        words = document.body[i].split()
--- a/lib/lyx2lyx/lyx_2_3.py
+++ b/lib/lyx2lyx/lyx_2_3.py
@ -1841,58 +1841,63 @@ def revert_chapterbib(document):


 def convert_dashligatures(document):
-    " Remove a zero-length space (U+200B) after en- and em-dashes. "
-
-    i = find_token(document.header, "\\use_microtype", 0)
-    if i != -1:
-        if document.initial_format > 474 and document.initial_format < 509:
-            # This was created by LyX 2.2
-            document.header[i+1:i+1] = ["\\use_dash_ligatures false"]
-        else:
-            # This was created by LyX 2.1 or earlier
-            document.header[i+1:i+1] = ["\\use_dash_ligatures true"]
-
-    i = 0
-    while i < len(document.body):
-        words = document.body[i].split()
-        # Skip some document parts where dashes are not converted
-        if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
-                        "FormulaMacro", "Graphics", "IPA", "listings"]:
-            j = find_end_of_inset(document.body, i)
-            if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
-                                 + words[1] + " inset at line " + str(i))
-                i += 1
-            else:
-                i = j
-            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
-            i += 1
-            continue
-
-        start = 0
-        while True:
-            j = document.body[i].find(u"\u2013", start) # en-dash
-            k = document.body[i].find(u"\u2014", start) # em-dash
-            if j == -1 and k == -1:
-                break
-            if j == -1 or (k != -1 and k < j):
-                j = k
-            after = document.body[i][j+1:]
-            if after.startswith(u"\u200B"):
-                document.body[i] = document.body[i][:j+1] + after[1:]
-            else:
-                if len(after) == 0 and document.body[i+1].startswith(u"\u200B"):
-                    document.body[i+1] = document.body[i+1][1:]
-                    break
-            start = j+1
-        i += 1
-
+    "Set 'use_dash_ligatures' according to content."
+    use_dash_ligatures = None
+    # if present, remove preamble code from 2.3->2.2 conversion:
+    for i, line in enumerate(document.preamble):
+        if i > 1 and line == r'\renewcommand{\textemdash}{---}':
+            if (document.preamble[i-1] == r'\renewcommand{\textendash}{--}'
+                and document.preamble[i-2] == '% Added by lyx2lyx'):
+                del document.preamble[i-2:i+1]
+                use_dash_ligatures = True
+    if use_dash_ligatures is None:
+        # Look for dashes:
+        # (Documents by LyX 2.1 or older have "\twohyphens\n" or "\threehyphens\n"
+        # as interim representation for dash ligatures in 2.2.)
+        has_literal_dashes = False
+        has_ligature_dashes = False
+        j = 0
+        for i, line in enumerate(document.body):
+            # Skip some document parts where dashes are not converted
+            if (i < j) or line.startswith("\\labelwidthstring"):
+                continue
+            words = line.split()
+            if len(words) > 1 and words[0] == "\\begin_inset" and \
+            words[1] in ["CommandInset", "ERT", "External", "Formula",
+                         "FormulaMacro", "Graphics", "IPA", "listings"]:
+                j = find_end_of_inset(document.body, i)
+                if j == -1:
+                    document.warning("Malformed LyX document: "
+                        "Can't find end of %s inset at line %d" % (words[1],i))
+                continue
+            # literal dash followed by a word or no-break space:
+            if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line,
+                         flags=re.UNICODE):
+                has_literal_dashes = True
+            # ligature dash followed by word or no-break space on next line:
+            if re.search(u"(\\\\twohyphens|\\\\threehyphens)", line,
+                            flags=re.UNICODE) and re.match(u"[\w\u00A0]",
+                            document.body[i+1], flags=re.UNICODE):
+                has_ligature_dashes = True
+        if has_literal_dashes and has_ligature_dashes:
+            # TODO: insert a warning note in the document?
+            document.warning('This document contained both literal and '
+                '"ligature" dashes.\n Line breaks may have changed. '
+                'See UserGuide chapter 3.9.1 for details.')
+        elif has_literal_dashes:
+            use_dash_ligatures = False
+        elif has_ligature_dashes:
+            use_dash_ligatures = True
+    # insert the setting if there is a preferred value
+    if use_dash_ligatures is not None:
+        i = find_token(document.header, "\\use_microtype", 0)
+        if i != -1:
+            document.header.insert(i+1, "\\use_dash_ligatures %s"
+                                % str(use_dash_ligatures).lower())

 def revert_dashligatures(document):
-    " Remove font ligature settings for en- and em-dashes. "
+    """Remove font ligature settings for en- and em-dashes.
+    Revert conversion of \\twohyphens or \\threehyphens to literal dashes."""
    i = find_token(document.header, "\\use_dash_ligatures", 0)
    if i == -1:
        return
@ -1902,42 +1907,34 @@ def revert_dashligatures(document):
    i = find_token(document.header, "\\use_non_tex_fonts", 0)
    if i != -1:
        use_non_tex_fonts = get_bool_value(document.header, "\\use_non_tex_fonts", i)
-    if not use_dash_ligatures or use_non_tex_fonts:
+    if not use_dash_ligatures or document.backend != "latex":
        return

-    # Add a zero-length space (U+200B) after en- and em-dashes
-    i = 0
-    while i < len(document.body):
-        words = document.body[i].split()
+    j = 0
+    new_body = []
+    for i, line in enumerate(document.body):
        # Skip some document parts where dashes are not converted
+        if (i < j) or line.startswith("\\labelwidthstring"):
+            new_body.append(line)
+            continue
+        words = line.split()
        if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
+           words[1] in ["CommandInset", "ERT", "External", "Formula",
                        "FormulaMacro", "Graphics", "IPA", "listings"]:
            j = find_end_of_inset(document.body, i)
            if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
+                document.warning("Malformed LyX document: Can't find end of "
                                 + words[1] + " inset at line " + str(i))
-                i += 1
-            else:
-                i = j
+            new_body.append(line)
            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
-            i += 1
-            continue
-
-        start = 0
-        while True:
-            j = document.body[i].find(u"\u2013", start) # en-dash
-            k = document.body[i].find(u"\u2014", start) # em-dash
-            if j == -1 and k == -1:
-                break
-            if j == -1 or (k != -1 and k < j):
-                j = k
-            after = document.body[i][j+1:]
-            document.body[i] = document.body[i][:j+1] + u"\u200B" + after
-            start = j+1
-        i += 1
+        line = line.replace(u'\u2013', '\\twohyphens\n')
+        line = line.replace(u'\u2014', '\\threehyphens\n')
+        lines = line.split('\n')
+        new_body.extend(line.split('\n'))
+    document.body = new_body
+    # redefine the dash LICRs to use ligature dashes:
+    add_to_preamble(document, [r'\renewcommand{\textendash}{--}',
+                               r'\renewcommand{\textemdash}{---}'])


 def revert_noto(document):
@ -2228,7 +2225,7 @@ def revert_mathnumberingname(document):
        else:
            l = find_token(document.header, "\\use_default_options", 0)
            document.header.insert(l, "\\options reqno")
-    # add the math_number_before tag   
+    # add the math_number_before tag
    regexp = re.compile(r'(\\math_numbering_side default)')
    i = find_re(document.header, regexp, 0)
    if i != -1: