From d18412ce9a8c3865a824055a37aae6b7f7e2e46a Mon Sep 17 00:00:00 2001
From: Georg Baum
Date: Thu, 1 Feb 2007 19:28:55 +0000
Subject: [PATCH] revert accented characters to InsetLatexAccent if possible.

The function convert_accent will be used when we get rid of InsetLatexAccent.

git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@17017 a592a061-630c-0410-9148-cb99ea01b6c8
---
 lib/lyx2lyx/lyx_1_5.py | 256 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 254 insertions(+), 2 deletions(-)

diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py
index 68f71cc18b..3bc8a048b0 100644
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -20,7 +20,9 @@
 """ Convert files to the file format generated by lyx 1.5"""
 
 import re
-from parser_tools import find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
+import unicodedata
+
+from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
 from LyX import get_encoding
 
 
@@ -839,6 +841,256 @@ def revert_caption(document):
         i = i + 1
 
 
+# Accents of InsetLaTeXAccent
+accent_map = {
+    "`" : u'\u0300', # grave
+    "'" : u'\u0301', # acute
+    "^" : u'\u0302', # circumflex
+    "~" : u'\u0303', # tilde
+    "=" : u'\u0304', # macron
+    "u" : u'\u0306', # breve
+    "." : u'\u0307', # dot above
+    "\"": u'\u0308', # diaeresis
+    "r" : u'\u030a', # ring above
+    "H" : u'\u030b', # double acute
+    "v" : u'\u030c', # caron
+    "b" : u'\u0320', # minus sign below
+    "d" : u'\u0323', # dot below
+    "c" : u'\u0327', # cedilla
+    "k" : u'\u0328', # ogonek
+    "t" : u'\u0361'  # tie. This is special: It spans two characters, but
+                     # only one is given as argument, so we don't need to
+                     # treat it differently.
+}
+
+
+# special accents of InsetLaTeXAccent without argument
+special_accent_map = {
+    'i' : u'\u0131', # dotless i
+    'j' : u'\u0237', # dotless j
+    'l' : u'\u0142', # l with stroke
+    'L' : u'\u0141'  # L with stroke
+}
+
+
+# special accent arguments of InsetLaTeXAccent
+accented_map = {
+    '\\i' : u'\u0131', # dotless i
+    '\\j' : u'\u0237'  # dotless j
+}
+
+
+def _convert_accent(accent, accented_char):
+    """Return the unicode text for the LaTeX accent command `accent` applied
+    to `accented_char`, or '' if the combination cannot be converted."""
+    kind = accent
+    char = accented_char
+    if char == '':
+        if kind in special_accent_map:
+            return special_accent_map[kind]
+        # a missing char is treated as space by LyX
+        char = ' '
+    elif kind == 'q' and char in ['t', 'd', 'l', 'L']:
+        # Special caron, only used with t, d, l and L.
+        # It is not in the map because we convert it to the same unicode
+        # character as the normal caron: \q{} is only defined if babel with
+        # the czech or slovak language is used, and the normal caron
+        # produces the correct output if the T1 font encoding is used.
+        # For the same reason we never convert to \q{} in the other direction.
+        kind = 'v'
+    elif char in accented_map:
+        char = accented_map[char]
+    elif (len(char) > 1):
+        # We can only convert accents on a single char
+        return ''
+    a = accent_map.get(kind)
+    if a:
+        return unicodedata.normalize("NFC", "%s%s" % (char, a))
+    return ''
+
+
+def convert_ertbackslash(body, i, ert, default_layout):
+    r"""Convert backslashes and '\n' into valid ERT code, append the converted
+    text to body[i] and return the (maybe incremented) line index i"""
+
+    for c in ert:
+        if c == '\\':
+            body[i] = body[i] + '\\backslash '
+            i = i + 1
+            body.insert(i, '')
+        elif c == '\n':
+            body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
+            i = i + 4
+        else:
+            body[i] = body[i] + c
+    return i
+
+
+def convert_accent(document):
+    r"""Replace each InsetLaTeXAccent ('\i ...') in document.body by the
+    matching unicode character, or by ERT if it cannot be converted."""
+    # The following forms are supported by LyX:
+    # '\i \"{a}' (standard form, as written by LyX)
+    # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
+    # '\i \"{ }' (also accepted if the accented char is a space)
+    # '\i \" a' (also accepted)
+    # '\i \"' (also accepted)
+    re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
+    re_contents = re.compile(r'^([^\s{]+)(.*)$')
+    re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
+    i = 0
+    while True:
+        i = find_re(document.body, re_wholeinset, i)
+        if i == -1:
+            return
+        match = re_wholeinset.match(document.body[i])
+        prefix = match.group(1)
+        contents = match.group(3).strip()
+        match = re_contents.match(contents)
+        if match:
+            # Strip first char (always \)
+            accent = match.group(1)[1:]
+            accented_contents = match.group(2).strip()
+            match = re_accentedcontents.match(accented_contents)
+            if match:
+                accented_char = match.group(1)
+                converted = _convert_accent(accent, accented_char)
+                if converted != '':
+                    document.body[i] = '%s%s' % (prefix, converted)
+                    i += 1
+                    continue
+                # Normalize contents
+                contents = '%s{%s}' % (accent, accented_char)
+            else:
+                # Nested or unbalanced braces: keep the argument verbatim
+                contents = '%s%s' % (accent, accented_contents)
+        document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
+        document.body[i] = prefix
+        document.body[i+1:i+1] = ['\\begin_inset ERT',
+                                  'status collapsed',
+                                  '',
+                                  '\\begin_layout %s' % document.default_layout,
+                                  '',
+                                  '',
+                                  '']
+        i = convert_ertbackslash(document.body, i + 7,
+                                 '\\%s' % contents,
+                                 document.default_layout)
+        document.body[i+1:i+1] = ['\\end_layout',
+                                  '',
+                                  '\\end_inset']
+        i += 3
+
+
+def revert_accent(document):
+    """Replace accented characters in document.body that cannot be encoded
+    in the encoding in use by InsetLaTeXAccent insets."""
+    inverse_accent_map = {}
+    for k in accent_map:
+        inverse_accent_map[accent_map[k]] = k
+    inverse_special_accent_map = {}
+    for k in special_accent_map:
+        inverse_special_accent_map[special_accent_map[k]] = k
+    inverse_accented_map = {}
+    for k in accented_map:
+        inverse_accented_map[accented_map[k]] = k
+
+    # Since LyX may insert a line break within a word we must combine all
+    # words before unicode normalization.
+    # We do this only if the next line starts with an accent, otherwise we
+    # would create things like '\begin_inset ERTstatus'.
+    numberoflines = len(document.body)
+    for i in range(numberoflines-1):
+        if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
+            continue
+        if (document.body[i+1][0] in inverse_accent_map):
+            # the last character of this line and the first of the next line
+            # probably form a combining character sequence.
+            while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
+                document.body[i] += document.body[i+1][0]
+                document.body[i+1] = document.body[i+1][1:]
+
+    # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
+    # This is needed to catch all accented characters.
+    for i in range(numberoflines):
+        # Unfortunately we have a mixture of unicode strings and plain strings,
+        # because we never use u'xxx' for string literals, but 'xxx'.
+        # Therefore we may have to try two times to normalize the data.
+        try:
+            document.body[i] = unicodedata.normalize("NFD", document.body[i])
+        except TypeError:
+            document.body[i] = unicodedata.normalize("NFD", unicode(document.body[i], 'utf-8'))
+
+    # Replace accented characters with InsetLaTeXAccent
+    # Do not convert characters that can be represented in the chosen
+    # encoding.
+    encoding_stack = [get_encoding(document.language, document.inputencoding, 248)]
+    lang_re = re.compile(r"^\\lang\s(\S+)")
+    # The body grows while we split lines, so re-read its length every time.
+    i = -1
+    while i + 1 < len(document.body):
+        i += 1
+
+        if document.inputencoding == "auto" or document.inputencoding == "default":
+            # Track the encoding of the current line
+            result = lang_re.match(document.body[i])
+            if result:
+                language = result.group(1)
+                if language == "default":
+                    encoding_stack[-1] = document.encoding
+                else:
+                    from lyx2lyx_lang import lang
+                    encoding_stack[-1] = lang[language][3]
+                continue
+            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
+                encoding_stack.append(encoding_stack[-1])
+                continue
+            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
+                del encoding_stack[-1]
+                continue
+
+        for j in range(len(document.body[i])):
+            # dotless i and dotless j are both in special_accent_map and can
+            # occur as an accented character, so we need to test that the
+            # following character is no accent
+            if (document.body[i][j] in inverse_special_accent_map and
+                (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
+                accent = document.body[i][j]
+                try:
+                    accent.encode(encoding_stack[-1])
+                except UnicodeEncodeError:
+                    # Insert the rest of the line as new line
+                    if j < len(document.body[i]) - 1:
+                        document.body.insert(i+1, document.body[i][j+1:])
+                    # Delete the accented character
+                    document.body[i] = document.body[i][:j]
+                    # Finally add the InsetLaTeXAccent
+                    document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
+                    break
+            elif j > 0 and document.body[i][j] in inverse_accent_map:
+                accented_char = document.body[i][j-1]
+                if accented_char == ' ':
+                    # Conform to LyX output
+                    accented_char = ''
+                elif accented_char in inverse_accented_map:
+                    accented_char = inverse_accented_map[accented_char]
+                accent = document.body[i][j]
+                try:
+                    unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
+                except UnicodeEncodeError:
+                    # Insert the rest of the line as new line
+                    if j < len(document.body[i]) - 1:
+                        document.body.insert(i+1, document.body[i][j+1:])
+                    # Delete the accented characters
+                    document.body[i] = document.body[i][:j-1]
+                    # Finally add the InsetLaTeXAccent
+                    document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
+                    break
+    # Normalize to "Normal form C" (NFC, pre-composed characters) again
+    for i in range(len(document.body)):
+        document.body[i] = unicodedata.normalize("NFC", document.body[i])
+
+
 ##
 # Conversion hub
 #
@@ -867,7 +1119,7 @@ revert =  [[257, []],
            [251, [revert_commandparams]],
            [250, [revert_cs_label]],
            [249, []],
-           [248, [revert_utf8]],
+           [248, [revert_accent, revert_utf8]],
            [247, [revert_booktabs]],
            [246, [revert_font_settings]],
            [245, [revert_framed]]]