Revert accented characters to InsetLaTeXAccent if possible.

The function convert_accent will be used when we get rid of InsetLaTeXAccent.


git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@17017 a592a061-630c-0410-9148-cb99ea01b6c8
Georg Baum 2007-02-01 19:28:55 +00:00
parent 5d41611407
commit d18412ce9a

@@ -20,7 +20,9 @@
""" Convert files to the file format generated by lyx 1.5"""
import re
from parser_tools import find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
import unicodedata
from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
from LyX import get_encoding
@@ -839,6 +841,251 @@ def revert_caption(document):
        i = i + 1

# Accents of InsetLaTeXAccent
accent_map = {
    "`" : u'\u0300', # grave
    "'" : u'\u0301', # acute
    "^" : u'\u0302', # circumflex
    "~" : u'\u0303', # tilde
    "=" : u'\u0304', # macron
    "u" : u'\u0306', # breve
    "." : u'\u0307', # dot above
    "\"": u'\u0308', # diaeresis
    "r" : u'\u030a', # ring above
    "H" : u'\u030b', # double acute
    "v" : u'\u030c', # caron
    "b" : u'\u0320', # minus sign below
    "d" : u'\u0323', # dot below
    "c" : u'\u0327', # cedilla
    "k" : u'\u0328', # ogonek
    "t" : u'\u0361'  # tie. This is special: It spans two characters, but
                     # only one is given as argument, so we don't need to
                     # treat it differently.
}

# special accents of InsetLaTeXAccent without argument
special_accent_map = {
    'i' : u'\u0131', # dotless i
    'j' : u'\u0237', # dotless j
    'l' : u'\u0142', # l with stroke
    'L' : u'\u0141'  # L with stroke
}

# special accent arguments of InsetLaTeXAccent
accented_map = {
    '\\i' : u'\u0131', # dotless i
    '\\j' : u'\u0237'  # dotless j
}
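
For illustration (not part of the diff): each value in accent_map is a
combining character that attaches to the preceding base character, and
unicodedata can fuse such a pair into one precomposed character. A minimal
sketch:

    import unicodedata
    # u'a' + U+0308 (combining diaeresis) composes to U+00E4 (ä)
    assert unicodedata.normalize("NFKC", u'a' + u'\u0308') == u'\xe4'
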
def _convert_accent(accent, accented_char):
    type = accent
    char = accented_char
    if char == '':
        if type in special_accent_map:
            return special_accent_map[type]
        # a missing char is treated as space by LyX
        char = ' '
    elif type == 'q' and char in ['t', 'd', 'l', 'L']:
        # Special caron, only used with t, d, l and L.
        # It is not in the map because we convert it to the same unicode
        # character as the normal caron: \q{} is only defined if babel with
        # the Czech or Slovak language is used, and the normal caron
        # produces the correct output if the T1 font encoding is used.
        # For the same reason we never convert to \q{} in the other direction.
        type = 'v'
    elif char in accented_map:
        char = accented_map[char]
    elif len(char) > 1:
        # We can only convert accents on a single char
        return ''
    a = accent_map.get(type)
    if a:
        return unicodedata.normalize("NFKC", "%s%s" % (char, a))
    return ''
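
A few illustrative calls (a sketch, not part of the diff; outputs assume the
maps above):

    _convert_accent('"', 'a')   # u'\xe4' (ä), composed via NFKC
    _convert_accent('i', '')    # u'\u0131' (dotless i, no argument needed)
    _convert_accent('q', 't')   # u'\u0165' (ť): special caron remapped to 'v'
    _convert_accent('v', 'ab')  # '' (accents on multi-char arguments are rejected)
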
def convert_ertbackslash(body, i, ert, default_layout):
    r"""Convert backslashes and '\n' into valid ERT code, append the converted
    text to body[i] and return the (maybe incremented) line index i"""
    for c in ert:
        if c == '\\':
            body[i] = body[i] + '\\backslash '
            i = i + 1
            body.insert(i, '')
        elif c == '\n':
            body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
            i = i + 4
        else:
            body[i] = body[i] + c
    return i
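
Hypothetical usage (not part of the diff; 'Standard' stands in for a default
layout name): a backslash in the ERT string becomes '\backslash ' and the
following character lands on a fresh body line:

    body = ['']
    i = convert_ertbackslash(body, 0, '\\"', 'Standard')
    # body == ['\\backslash ', '"'] and i == 1
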
def convert_accent(document):
    # The following forms are supported by LyX:
    # '\i \"{a}' (standard form, as written by LyX)
    # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
    # '\i \"{ }' (also accepted if the accented char is a space)
    # '\i \" a' (also accepted)
    # '\i \"' (also accepted)
    re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
    re_contents = re.compile(r'^([^\s{]+)(.*)$')
    re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
    i = 0
    while 1:
        i = find_re(document.body, re_wholeinset, i)
        if i == -1:
            return
        match = re_wholeinset.match(document.body[i])
        prefix = match.group(1)
        contents = match.group(3).strip()
        match = re_contents.match(contents)
        if match:
            # Strip first char (always \)
            accent = match.group(1)[1:]
            accented_contents = match.group(2).strip()
            match = re_accentedcontents.match(accented_contents)
            accented_char = match.group(1)
            converted = _convert_accent(accent, accented_char)
            if converted == '':
                # Normalize contents
                contents = '%s{%s}' % (accent, accented_char)
            else:
                document.body[i] = '%s%s' % (prefix, converted)
                i += 1
                continue
        document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
        document.body[i] = prefix
        document.body[i+1:i+1] = ['\\begin_inset ERT',
                                  'status collapsed',
                                  '',
                                  '\\begin_layout %s' % document.default_layout,
                                  '',
                                  '',
                                  '']
        i = convert_ertbackslash(document.body, i + 7,
                                 '\\%s' % contents,
                                 document.default_layout)
        document.body[i+1:i+1] = ['\\end_layout',
                                  '',
                                  '\\end_inset']
        i += 3
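
To make the matching concrete, here is the outer regex applied to a
hypothetical body line in which the inset runs to the end of the line (a
sketch, not part of the diff):

    import re
    re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
    m = re_wholeinset.match('Gef\\i \\"{a}')
    # m.group(1) == 'Gef' (prefix), m.group(3) == '\\"{a}' (accent contents);
    # convert_accent would rewrite this line to u'Gef\xe4'
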
def revert_accent(document):
    inverse_accent_map = {}
    for k in accent_map:
        inverse_accent_map[accent_map[k]] = k
    inverse_special_accent_map = {}
    for k in special_accent_map:
        inverse_special_accent_map[special_accent_map[k]] = k
    inverse_accented_map = {}
    for k in accented_map:
        inverse_accented_map[accented_map[k]] = k

    # Since LyX may insert a line break within a word we must combine all
    # words before unicode normalization.
    # We do this only if the next line starts with an accent, otherwise we
    # would create things like '\begin_inset ERTstatus'.
    numberoflines = len(document.body)
    for i in range(numberoflines-1):
        if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
            continue
        if document.body[i+1][0] in inverse_accent_map:
            # the last character of this line and the first of the next line
            # probably form a base character + combining accent pair
            while len(document.body[i+1]) > 0 and document.body[i+1][0] != ' ':
                document.body[i] += document.body[i+1][0]
                document.body[i+1] = document.body[i+1][1:]

    # Normalize to NFKD (compatibility decomposition).
    # This is needed to catch all accented characters.
    for i in range(numberoflines):
        # Unfortunately we have a mixture of unicode strings and plain strings,
        # because we never use u'xxx' for string literals, but 'xxx'.
        # Therefore we may have to try two times to normalize the data.
        try:
            document.body[i] = unicodedata.normalize("NFKD", document.body[i])
        except TypeError:
            document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))

    # Replace accented characters with InsetLaTeXAccent.
    # Do not convert characters that can be represented in the chosen
    # encoding.
    encoding_stack = [get_encoding(document.language, document.inputencoding, 248)]
    lang_re = re.compile(r"^\\lang\s(\S+)")
    for i in range(len(document.body)):
        if document.inputencoding == "auto" or document.inputencoding == "default":
            # Track the encoding of the current line
            result = lang_re.match(document.body[i])
            if result:
                language = result.group(1)
                if language == "default":
                    encoding_stack[-1] = document.encoding
                else:
                    from lyx2lyx_lang import lang
                    encoding_stack[-1] = lang[language][3]
                continue
            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                encoding_stack.append(encoding_stack[-1])
                continue
            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                del encoding_stack[-1]
                continue
        for j in range(len(document.body[i])):
            # dotless i and dotless j are both in special_accent_map and can
            # occur as an accented character, so we need to test that the
            # following character is no accent
            if (document.body[i][j] in inverse_special_accent_map and
                (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
                accent = document.body[i][j]
                try:
                    dummy = accent.encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as a new line
                    if j < len(document.body[i]) - 1:
                        document.body.insert(i+1, document.body[i][j+1:])
                    # Delete the accented character
                    document.body[i] = document.body[i][:j]
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
                    break
            elif j > 0 and document.body[i][j] in inverse_accent_map:
                accented_char = document.body[i][j-1]
                if accented_char == ' ':
                    # Conform to LyX output
                    accented_char = ''
                elif accented_char in inverse_accented_map:
                    accented_char = inverse_accented_map[accented_char]
                accent = document.body[i][j]
                try:
                    dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as a new line
                    if j < len(document.body[i]) - 1:
                        document.body.insert(i+1, document.body[i][j+1:])
                    # Delete the base character and the combining accent
                    document.body[i] = document.body[i][:j-1]
                    # Finally add the InsetLaTeXAccent
                    document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                    break

    # Normalize to NFKC (pre-composed characters) again
    for i in range(numberoflines):
        document.body[i] = unicodedata.normalize("NFKC", document.body[i])
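
A minimal sketch of the decomposition step this function relies on
(illustrative only, not part of the diff):

    import unicodedata
    # NFKD splits a precomposed character into base + combining accent,
    # which is exactly what the inverse maps above are keyed on:
    assert unicodedata.normalize("NFKD", u'\xe4') == u'a' + u'\u0308'
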
##
# Conversion hub
#
@@ -867,7 +1114,7 @@ revert = [[257, []],
          [251, [revert_commandparams]],
          [250, [revert_cs_label]],
          [249, []],
          [248, [revert_utf8]],
          [248, [revert_accent, revert_utf8]],
          [247, [revert_booktabs]],
          [246, [revert_font_settings]],
          [245, [revert_framed]]]