mirror of
https://git.lyx.org/repos/lyx.git
synced 2024-11-22 10:00:33 +00:00
Revert accented characters to InsetLaTeXAccent if possible.
The function convert_accent will be used when we get rid of InsetLatexAccent. git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@17017 a592a061-630c-0410-9148-cb99ea01b6c8
This commit is contained in:
parent
5d41611407
commit
d18412ce9a
@ -20,7 +20,9 @@
|
||||
""" Convert files to the file format generated by lyx 1.5"""
|
||||
|
||||
import re
|
||||
from parser_tools import find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
|
||||
import unicodedata
|
||||
|
||||
from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
|
||||
from LyX import get_encoding
|
||||
|
||||
|
||||
@ -839,6 +841,251 @@ def revert_caption(document):
|
||||
i = i + 1
|
||||
|
||||
|
||||
# Accents of InsetLaTeXAccent
|
||||
accent_map = {
|
||||
"`" : u'\u0300', # grave
|
||||
"'" : u'\u0301', # acute
|
||||
"^" : u'\u0302', # circumflex
|
||||
"~" : u'\u0303', # tilde
|
||||
"=" : u'\u0304', # macron
|
||||
"u" : u'\u0306', # breve
|
||||
"." : u'\u0307', # dot above
|
||||
"\"": u'\u0308', # diaresis
|
||||
"r" : u'\u030a', # ring above
|
||||
"H" : u'\u030b', # double acute
|
||||
"v" : u'\u030c', # caron
|
||||
"b" : u'\u0320', # minus sign below
|
||||
"d" : u'\u0323', # dot below
|
||||
"c" : u'\u0327', # cedilla
|
||||
"k" : u'\u0328', # ogonek
|
||||
"t" : u'\u0361' # tie. This is special: It spans two characters, but
|
||||
# only one is given as argument, so we don't need to
|
||||
# treat it differently.
|
||||
}
|
||||
|
||||
|
||||
# special accents of InsetLaTeXAccent without argument
|
||||
special_accent_map = {
|
||||
'i' : u'\u0131', # dotless i
|
||||
'j' : u'\u0237', # dotless j
|
||||
'l' : u'\u0142', # l with stroke
|
||||
'L' : u'\u0141' # L with stroke
|
||||
}
|
||||
|
||||
|
||||
# special accent arguments of InsetLaTeXAccent
|
||||
accented_map = {
|
||||
'\\i' : u'\u0131', # dotless i
|
||||
'\\j' : u'\u0237' # dotless j
|
||||
}
|
||||
|
||||
|
||||
def _convert_accent(accent, accented_char):
|
||||
type = accent
|
||||
char = accented_char
|
||||
if char == '':
|
||||
if type in special_accent_map:
|
||||
return special_accent_map[type]
|
||||
# a missing char is treated as space by LyX
|
||||
char = ' '
|
||||
elif type == 'q' and char in ['t', 'd', 'l', 'L']:
|
||||
# Special caron, only used with t, d, l and L.
|
||||
# It is not in the map because we convert it to the same unicode
|
||||
# character as the normal caron: \q{} is only defined if babel with
|
||||
# the czech or slovak language is used, and the normal caron
|
||||
# produces the correct output if the T1 font encoding is used.
|
||||
# For the same reason we never convert to \q{} in the other direction.
|
||||
type = 'v'
|
||||
elif char in accented_map:
|
||||
char = accented_map[char]
|
||||
elif (len(char) > 1):
|
||||
# We can only convert accents on a single char
|
||||
return ''
|
||||
a = accent_map.get(type)
|
||||
if a:
|
||||
return unicodedata.normalize("NFKC", "%s%s" % (char, a))
|
||||
return ''
|
||||
|
||||
|
||||
def convert_ertbackslash(body, i, ert, default_layout):
    r"""Expand `ert` into valid ERT code.

    Backslashes become '\backslash ' followed by a fresh line, and '\n'
    becomes an \end_layout/\begin_layout pair; everything else is copied
    verbatim.  The converted text is appended to body[i]; the (possibly
    advanced) line index i is returned.
    """
    for ch in ert:
        if ch == '\\':
            # A literal backslash: emit the marker and continue on a new,
            # initially empty body line.
            body[i] += '\\backslash '
            i += 1
            body.insert(i, '')
        elif ch == '\n':
            # A newline closes the current layout and opens a new one.
            body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
            i += 4
        else:
            body[i] += ch
    return i
def convert_accent(document):
    """Convert InsetLaTeXAccent commands in document.body to plain Unicode
    characters; unknown accents are wrapped in an ERT inset instead.
    """
    # The following forms are supported by LyX:
    # '\i \"{a}' (standard form, as written by LyX)
    # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
    # '\i \"{ }' (also accepted if the accented char is a space)
    # '\i \" a' (also accepted)
    # '\i \"' (also accepted)
    re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
    re_contents = re.compile(r'^([^\s{]+)(.*)$')
    re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
    i = 0
    while 1:
        i = find_re(document.body, re_wholeinset, i)
        if i == -1:
            return
        match = re_wholeinset.match(document.body[i])
        prefix = match.group(1)
        contents = match.group(3).strip()
        match = re_contents.match(contents)
        if match:
            # Strip first char (always \)
            accent = match.group(1)[1:]
            accented_contents = match.group(2).strip()
            match = re_accentedcontents.match(accented_contents)
            accented_char = match.group(1)
            converted = _convert_accent(accent, accented_char)
            if converted == '':
                # Normalize contents for the ERT fallback below.
                # BUGFIX: a stray trailing comma used to make this a
                # 1-tuple; the %-formatting below only worked by accident.
                contents = '%s{%s}' % (accent, accented_char)
            else:
                document.body[i] = '%s%s' % (prefix, converted)
                i += 1
                continue
        # Unknown accent: keep the original LaTeX source inside an ERT inset
        # so no information is lost.
        document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
        document.body[i] = prefix
        document.body[i+1:i+1] = ['\\begin_inset ERT',
                                  'status collapsed',
                                  '',
                                  '\\begin_layout %s' % document.default_layout,
                                  '',
                                  '',
                                  '']
        i = convert_ertbackslash(document.body, i + 7,
                                 '\\%s' % contents,
                                 document.default_layout)
        document.body[i+1:i+1] = ['\\end_layout',
                                  '',
                                  '\\end_inset']
        i += 3
def revert_accent(document):
    """Replace accented Unicode characters that are not representable in the
    target encoding with InsetLaTeXAccent commands ('\\i \\x{c}').
    """
    # Build the inverse lookups (unicode char/mark -> accent command letter).
    inverse_accent_map = {}
    for k in accent_map:
        inverse_accent_map[accent_map[k]] = k
    inverse_special_accent_map = {}
    for k in special_accent_map:
        inverse_special_accent_map[special_accent_map[k]] = k
    inverse_accented_map = {}
    for k in accented_map:
        inverse_accented_map[accented_map[k]] = k

    # Since LyX may insert a line break within a word we must combine all
    # words before unicode normalization.
    # We do this only if the next line starts with an accent, otherwise we
    # would create things like '\begin_inset ERTstatus'.
    numberoflines = len(document.body)
    for i in range(numberoflines-1):
        if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
            continue
        if (document.body[i+1][0] in inverse_accent_map):
            # the last character of this line and the first of the next line
            # form probably a surrogate pair.
            # Pull characters up until the next word boundary.
            while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
                document.body[i] += document.body[i+1][0]
                document.body[i+1] = document.body[i+1][1:]

    # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
    # This is needed to catch all accented characters.
    for i in range(numberoflines):
        # Unfortunately we have a mixture of unicode strings and plain strings,
        # because we never use u'xxx' for string literals, but 'xxx'.
        # Therefore we may have to try two times to normalize the data.
        try:
            document.body[i] = unicodedata.normalize("NFKD", document.body[i])
        except TypeError:
            # Plain (byte) string: decode from utf-8 first (Python 2 only).
            document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))

    # Replace accented characters with InsetLaTeXAccent
    # Do not convert characters that can be represented in the chosen
    # encoding.
    encoding_stack = [get_encoding(document.language, document.inputencoding, 248)]
    lang_re = re.compile(r"^\\lang\s(\S+)")
    for i in range(len(document.body)):

        if document.inputencoding == "auto" or document.inputencoding == "default":
            # Track the encoding of the current line
            result = lang_re.match(document.body[i])
            if result:
                language = result.group(1)
                if language == "default":
                    encoding_stack[-1] = document.encoding
                else:
                    # Look up the language's default encoding (index 3 of
                    # the lang table entry).
                    from lyx2lyx_lang import lang
                    encoding_stack[-1] = lang[language][3]
                continue
            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                # New layout inherits the current encoding.
                encoding_stack.append(encoding_stack[-1])
                continue
            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                del encoding_stack[-1]
                continue

        for j in range(len(document.body[i])):
            # dotless i and dotless j are both in special_accent_map and can
            # occur as an accented character, so we need to test that the
            # following character is no accent
            if (document.body[i][j] in inverse_special_accent_map and
                (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
                accent = document.body[i][j]
                try:
                    # Representable in the current encoding? Then keep it.
                    dummy = accent.encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as new line
                    # NOTE(review): assigning a string to a list slice splices
                    # it in one character per body line — confirm this
                    # per-character split is intended (vs. [rest] as one line).
                    if j < len(document.body[i]) - 1:
                        document.body[i+1:i+1] = document.body[i][j+1:]
                    # Delete the accented character
                    # NOTE(review): [:j-1] also drops the character before
                    # position j — looks like a possible off-by-one; confirm.
                    if j > 0:
                        document.body[i] = document.body[i][:j-1]
                    else:
                        document.body[i] = u''
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                    break
            elif j > 0 and document.body[i][j] in inverse_accent_map:
                # After NFKD, body[i][j] is a combining mark and body[i][j-1]
                # the base character it attaches to.
                accented_char = document.body[i][j-1]
                if accented_char == ' ':
                    # Conform to LyX output
                    accented_char = ''
                elif accented_char in inverse_accented_map:
                    accented_char = inverse_accented_map[accented_char]
                accent = document.body[i][j]
                try:
                    # Try the precomposed character in the current encoding.
                    dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as new line
                    # NOTE(review): same per-character list splice as above.
                    if j < len(document.body[i]) - 1:
                        document.body[i+1:i+1] = document.body[i][j+1:]
                    # Delete the accented characters
                    # NOTE(review): [:j-2] drops one character more than the
                    # base+mark pair at j-1..j — possible off-by-one; confirm.
                    if j > 1:
                        document.body[i] = document.body[i][:j-2]
                    else:
                        document.body[i] = u''
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                    break
    # Normalize to "Normal form C" (NFC, pre-composed characters) again
    for i in range(numberoflines):
        document.body[i] = unicodedata.normalize("NFKC", document.body[i])
|
||||
|
||||
##
|
||||
# Conversion hub
|
||||
#
|
||||
@ -867,7 +1114,7 @@ revert = [[257, []],
|
||||
[251, [revert_commandparams]],
|
||||
[250, [revert_cs_label]],
|
||||
[249, []],
|
||||
[248, [revert_utf8]],
|
||||
[248, [revert_accent, revert_utf8]],
|
||||
[247, [revert_booktabs]],
|
||||
[246, [revert_font_settings]],
|
||||
[245, [revert_framed]]]
|
||||
|
Loading…
Reference in New Issue
Block a user