Fix lyx2lyx dash conversion and make it faster.

This commit is contained in:
Günter Milde 2018-01-24 17:38:19 +01:00
parent 62f32992ab
commit a151b274bf
3 changed files with 91 additions and 83 deletions

View File

@ -1557,10 +1557,11 @@ def convert_latexargs(document):
"theorems-chap-bytype", "theorems-chap", "theorems-named", "theorems-sec-bytype", "theorems-chap-bytype", "theorems-chap", "theorems-named", "theorems-sec-bytype",
"theorems-sec", "theorems-starred", "theorems-std", "todonotes"] "theorems-sec", "theorems-starred", "theorems-std", "todonotes"]
# Modules we need to take care of # Modules we need to take care of
caveat_modules = ["initials"] caveat_modules = ["initials"] # TODO: , "graphicboxes", "bicaption"]
# information about the relevant styles in caveat_modules (number of opt and req args) # information about the relevant styles in caveat_modules (number of opt and req args)
# use this if we get more caveat_modules. For now, use hard coding (see below). # use this if we get more caveat_modules. For now, use hard coding (see below).
# initials = [{'Layout' : 'Initial', 'opt' : 1, 'req' : 1}] # initials = [{'Layout' : 'Initial', 'opt' : 1, 'req' : 1}]
# graphicboxes = { ... }
# Is this a known safe layout? # Is this a known safe layout?
safe_layout = document.textclass in safe_layouts safe_layout = document.textclass in safe_layouts

View File

@ -37,7 +37,7 @@ from lyx2lyx_tools import (add_to_preamble, put_cmd_in_ert, get_ert,
from parser_tools import (check_token, del_complete_lines, from parser_tools import (check_token, del_complete_lines,
find_end_of_inset, find_end_of_layout, find_nonempty_line, find_re, find_end_of_inset, find_end_of_layout, find_nonempty_line, find_re,
find_token, find_token_backwards, get_containing_layout, find_token, find_token_backwards, get_containing_layout,
get_value, is_in_inset) get_containing_inset, get_value, is_in_inset)
#################################################################### ####################################################################
@ -622,41 +622,40 @@ def convert_dashes(document):
while i+1 < len(lines): while i+1 < len(lines):
i += 1 i += 1
line = lines[i] line = lines[i]
words = line.split() if "--" not in line:
if (len(words) > 1 and words[0] == "\\begin_inset"
and (words[1] in ["CommandInset", "ERT", "External", "Formula",
"FormulaMacro", "Graphics", "IPA", "listings"]
or line.endswith("Flex Code"))):
# must not replace anything in insets that store LaTeX contents in .lyx files
# (math and command insets without overridden read() and write() methods
# filtering out IPA makes Text::readParToken() more simple
# skip ERT as well since it is not needed there
# Flex Code is logical markup, typically rendered as typewriter
j = find_end_of_inset(lines, i)
if j == -1:
document.warning("Malformed LyX document: Can't find end of " +
words[1] + " inset at line " + str(i))
else:
i = j
continue
if lines[i] == "\\begin_layout LyX-Code":
j = find_end_of_layout(lines, i)
if j == -1:
document.warning("Malformed LyX document: "
"Can't find end of %s layout at line %d" % (words[1],i))
else:
i = j
continue continue
# skip label width string (bug 10243):
if line.startswith("\\labelwidthstring"): if line.startswith("\\labelwidthstring"):
# skip label width string (bug 10243)
continue continue
# Do not touch hyphens in some insets:
if "--" in line: try:
# We can have an arbitrary number of consecutive hyphens. value, start, end = get_containing_inset(lines, i)
# Replace as LaTeX does: First try emdash, then endash except TypeError:
line = line.replace("---", "\\threehyphens\n") # False means no (or malformed) containing inset
line = line.replace("--", "\\twohyphens\n") value, start, end = "no inset", -1, -1
lines[i:i+1] = line.splitlines() # We must not replace anything in insets that store LaTeX contents in .lyx files
# (math and command insets without overridden read() and write() methods).
# Filtering out IPA and ERT makes Text::readParToken() simpler.
# Flex Code is logical markup, typically rendered as typewriter.
if (value.split()[0] in ["CommandInset", "ERT", "External", "Formula",
"FormulaMacro", "Graphics", "IPA", "listings"]
or value in ["Flex Code", "Flex URL"]):
i = end
continue
try:
layout, start, end, j = get_containing_layout(lines, i)
except TypeError: # no (or malformed) containing layout
document.warning("Malformed LyX document: "
"Can't find layout at line %d" % i)
continue
if layout == "LyX-Code":
i = end
continue
# We can have an arbitrary number of consecutive hyphens.
# Replace as LaTeX does: First try emdash, then endash
line = line.replace("---", "\\threehyphens\n")
line = line.replace("--", "\\twohyphens\n")
lines[i:i+1] = line.splitlines()
# remove ligature breaks between dashes # remove ligature breaks between dashes
i = 1 i = 1
@ -672,40 +671,40 @@ def convert_dashes(document):
def revert_dashes(document): def revert_dashes(document):
""" """
Remove preamble code from 2.3->2.2 conversion.
Prevent ligatures of existing --- and --. Prevent ligatures of existing --- and --.
Revert \\twohyphens and \\threehyphens to -- and ---. Revert \\twohyphens and \\threehyphens to -- and ---.
Remove preamble code from 2.3->2.2 conversion.
""" """
del_complete_lines(document.preamble, del_complete_lines(document.preamble,
['% Added by lyx2lyx', ['% Added by lyx2lyx',
r'\renewcommand{\textendash}{--}', r'\renewcommand{\textendash}{--}',
r'\renewcommand{\textemdash}{---}']) r'\renewcommand{\textemdash}{---}'])
# Insert ligature breaks to prevent ligation of hyphens to dashes: # Insert ligature breaks to prevent ligation of hyphens to dashes:
lines = document.body lines = document.body
i = 0 i = 0
while i+1 < len(lines): while i+1 < len(lines):
i += 1 i += 1
line = lines[i] line = lines[i]
if "--" not in line:
continue
# skip label width string (bug 10243): # skip label width string (bug 10243):
if line.startswith("\\labelwidthstring"): if line.startswith("\\labelwidthstring"):
continue continue
# do not touch hyphens in some insets (cf. convert_dashes): # do not touch hyphens in some insets (cf. convert_dashes):
if line.startswith("\\begin_inset"): try:
try: value, start, end = get_containing_inset(lines, i)
if line.split()[1] in ["CommandInset", "ERT", "External", except TypeError:
"Formula", "FormulaMacro", "Graphics", # False means no (or malformed) containing inset
"IPA", "listings"]: value, start, end = "no inset", -1, -1
j = find_end_of_inset(lines, i) if (value.split()[0] in ["CommandInset", "ERT", "External", "Formula",
if j == -1: "FormulaMacro", "Graphics", "IPA", "listings"]
document.warning("Malformed LyX document: Can't find " or value == "Flex URL"):
"end of %s inset at line %d." % (itype, i)) i = end
continue continue
i = j line = line.replace("--", "-\\SpecialChar \\textcompwordmark{}\n-")
except IndexError: document.body[i:i+1] = line.split('\n')
continue
if "--" in line:
line = line.replace("--", "-\\SpecialChar \\textcompwordmark{}\n-")
document.body[i:i+1] = line.split('\n')
# Revert \twohyphens and \threehyphens: # Revert \twohyphens and \threehyphens:
i = 1 i = 1
while i < len(lines): while i < len(lines):

View File

@ -26,7 +26,7 @@ import sys, os
from parser_tools import (del_token, del_value, del_complete_lines, from parser_tools import (del_token, del_value, del_complete_lines,
find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset, find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset,
find_re, find_token, find_token_backwards, find_re, find_token, find_token_backwards, get_containing_inset,
get_containing_layout, get_bool_value, get_value, get_quoted_value) get_containing_layout, get_bool_value, get_value, get_quoted_value)
# find_tokens, find_token_exact, is_in_inset, # find_tokens, find_token_exact, is_in_inset,
# check_token, get_option_value # check_token, get_option_value
@ -1853,46 +1853,54 @@ def convert_dashligatures(document):
if use_dash_ligatures is None: if use_dash_ligatures is None:
# Look for dashes (Documents by LyX 2.1 or older have "\twohyphens\n" # Look for dashes (Documents by LyX 2.1 or older have "\twohyphens\n"
# or "\threehyphens\n" as interim representation for -- and ---.) # or "\threehyphens\n" as interim representation for -- and ---.)
has_literal_dashes = False lines = document.body
has_ligature_dashes = False has_literal_dashes = has_ligature_dashes = False
j = 0 i = j = 0
for i, line in enumerate(document.body): while i+1 < len(lines):
# Skip some document parts where dashes are not converted i += 1
if (i < j) or line.startswith("\\labelwidthstring"): line = lines[i]
# skip lines without any dashes:
if not re.search(u"[\u2013\u2014]|\\twohyphens|\\threehyphens", line):
continue continue
if line.startswith("\\begin_inset"): # skip label width string (see bug 10243):
try: if line.startswith("\\labelwidthstring"):
it = line.split()[1]
except IndexError:
continue
if (it in ["CommandInset", "ERT", "External", "Formula",
"FormulaMacro", "Graphics", "IPA", "listings"]
or line.endswith("Flex Code")):
j = find_end_of_inset(document.body, i)
if j == -1:
document.warning("Malformed LyX document: Can't "
"find end of %s inset at line %d." % (itype, i))
continue
if line == "\\begin_layout LyX-Code":
j = find_end_of_layout(document.body, i)
if j == -1:
document.warning("Malformed LyX document: "
"Can't find end of %s layout at line %d" % (words[1],i))
continue continue
# do not touch hyphens in some insets (cf. lyx_2_2.convert_dashes):
try:
value, start, end = get_containing_inset(lines, i)
except TypeError: # no containing inset
value, start, end = "no inset", -1, -1
if (value.split()[0] in
["CommandInset", "ERT", "External", "Formula",
"FormulaMacro", "Graphics", "IPA", "listings"]
or value == "Flex Code"):
i = end
continue
try:
layout, start, end, j = get_containing_layout(lines, i)
except TypeError: # no (or malformed) containing layout
document.warning("Malformed LyX document: "
"Can't find layout at line %d" % i)
continue
if layout == "LyX-Code":
i = end
continue
# literal dash followed by a word or no-break space: # literal dash followed by a word or no-break space:
if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line, if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line,
flags=re.UNICODE): flags=re.UNICODE):
has_literal_dashes = True has_literal_dashes = True
# ligature dash followed by word or no-break space on next line: # ligature dash followed by word or no-break space on next line:
if (re.search(r"(\\twohyphens|\\threehyphens)", line) and if (re.search(r"(\\twohyphens|\\threehyphens)", line) and
re.match(u"[\w\u00A0]", document.body[i+1], flags=re.UNICODE)): re.match(u"[\w\u00A0]", lines[i+1], flags=re.UNICODE)):
has_ligature_dashes = True has_ligature_dashes = True
if has_literal_dashes and has_ligature_dashes: if has_literal_dashes and has_ligature_dashes:
# TODO: insert a warning note in the document? # TODO: insert a warning note in the document?
document.warning('This document contained both literal and ' document.warning('This document contained both literal and '
'"ligature" dashes.\n Line breaks may have changed. ' '"ligature" dashes.\n Line breaks may have changed. '
'See UserGuide chapter 3.9.1 for details.') 'See UserGuide chapter 3.9.1 for details.')
elif has_literal_dashes: break
if has_literal_dashes:
use_dash_ligatures = False use_dash_ligatures = False
elif has_ligature_dashes: elif has_ligature_dashes:
use_dash_ligatures = True use_dash_ligatures = True