mirror of
https://git.lyx.org/repos/lyx.git
synced 2025-01-18 13:40:19 +00:00
214 lines
7.0 KiB
Python
214 lines
7.0 KiB
Python
|
#! /usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
# file convert_kmap.py
|
||
|
# This file is part of LyX, the document processor.
|
||
|
# Licence details can be found in the file COPYING.
|
||
|
|
||
|
# author Georg Baum
|
||
|
|
||
|
# Full author contact details are available in file CREDITS
|
||
|
|
||
|
# This script converts a kmap file from LaTeX commands to unicode characters
|
||
|
# The kmap file is read and written in utf8 encoding
|
||
|
|
||
|
|
||
|
import os, re, string, sys, unicodedata
|
||
|
|
||
|
def usage(prog_name):
    """Return the two-line usage message for the program called prog_name."""
    file_form = "Usage: %s unicodesymbolsfile inputfile outputfile\n" % prog_name
    pipe_form = "or %s unicodesymbolsfile <inputfile >outputfile" % prog_name
    return file_form + pipe_form
|
||
|
|
||
|
|
||
|
def error(message):
    """Write message plus a newline to stderr, then exit with status 1."""
    stream = sys.stderr
    stream.write(message + '\n')
    raise SystemExit(1)
|
||
|
|
||
|
|
||
|
def trim_eol(line):
    """Return line without its trailing end-of-line marker.

    A trailing CR LF, a lone LF or a lone CR is removed; a line without
    any terminator (last line of a file with no final EOL) is returned
    unchanged.  Works for both text and byte strings.
    """
    # Pick terminators of the same type as the line so that the
    # comparisons are meaningful for bytes as well as for text.
    if isinstance(line, bytes):
        cr, lf = b'\r', b'\n'
    else:
        cr, lf = u'\r', u'\n'
    # Only strip two characters for a real CR LF pair.  Looking at the
    # second to last character alone would truncate an unterminated line
    # such as "a\rb".
    if line.endswith(cr + lf):
        return line[:-2]
    if line.endswith((cr, lf)):
        return line[:-1]
    # file with no EOL in last line
    return line
|
||
|
|
||
|
|
||
|
def read(input):
    """Read all lines from input and return them without line endings.

    input is any object with a readline() method.  Lines delivered as
    bytes are decoded as UTF-8; lines that are already text (for example
    from a text-mode stream) are used as they are.
    """
    lines = []
    while True:
        line = input.readline()
        if not line:
            break
        # Decode first so that the end-of-line trimming always operates
        # on text, and so that text streams are not decoded twice.
        if isinstance(line, bytes):
            line = line.decode('utf8')
        lines.append(trim_eol(line))
    return lines
|
||
|
|
||
|
|
||
|
def escape(word):
    """Escape a word for LyXLex.

    Backslash, double quote and hash characters are prefixed with a
    backslash.  If the result contains whitespace or a comma anywhere,
    it is additionally wrapped in double quotes.
    """
    pieces = []
    for c in word:
        if c == '\\' or c == '"' or c == '#':
            pieces.append(u'\\')
        pieces.append(c)
    retval = u''.join(pieces)
    # search(), not match(): the separator may occur at any position in
    # the word, not only at its start.
    if re.search(r'\s|,', retval):
        return u'"%s"' % retval
    return retval
|
||
|
|
||
|
|
||
|
def unescape(word):
    """Undo LyXLex escaping of a word.

    A pair of enclosing double quotes is dropped, and every backslash
    that is followed by another character is removed so that only the
    following character is kept.  A backslash in last position is kept.
    """
    quoted = len(word) > 1 and word[0] == '"' and word[-1] == '"'
    if quoted:
        body = word[1:-1]
    else:
        body = word
    end = len(body)
    pieces = []
    pos = 0
    while pos < end:
        # Skip the escape character itself, but only when something
        # follows it.
        if body[pos] == '\\' and pos < end - 1:
            pos += 1
        pieces.append(body[pos])
        pos += 1
    return u''.join(pieces)
|
||
|
|
||
|
|
||
|
def readsymbols(input):
    """Build the symbol list from the unicodesymbols file.

    input is any object with a readline() method.  Every line that is
    not empty and does not start with '#' is split on whitespace into a
    token list.  The second token is unescaped, and a first token of the
    form 0x... is converted to an integer code point.  Lines delivered
    as bytes are decoded as UTF-8 first.  Some hardcoded symbols are
    appended at the end.  Returns the list of token lists.
    """
    symbols = []
    while True:
        line = input.readline()
        if not line:
            break
        # Work on text so that the comparisons with '#' and "0x" below
        # behave the same for binary and text streams.
        if isinstance(line, bytes):
            line = line.decode('utf8')
        tokens = trim_eol(line).split()
        if len(tokens) > 0 and tokens[0][0] != '#':
            if len(tokens) > 1:
                tokens[1] = unescape(tokens[1])
            if tokens[0][0:2] == "0x":
                tokens[0] = int(tokens[0][2:], 16)
            symbols.append(tokens)
    # special cases from .cdef files (e.g. duplicates with different commands)
    # U+00A0 is NO-BREAK SPACE (U+00A1 is the inverted exclamation mark)
    symbols.append([0x00a0, '\\nobreakspace'])
    symbols.append([0x00a7, '\\S'])
    symbols.append([0x00a9, '\\copyright'])
    symbols.append([0x00b1, '$\\pm$'])
    symbols.append([0x00b5, '$\\mu$'])
    symbols.append([0x00b7, '$\\cdot$'])
    symbols.append([0x00b9, '$\\mathonesuperior$'])
    symbols.append([0x00d7, '$\\times$'])
    symbols.append([0x00d7, '\\times'])
    symbols.append([0x00f7, '\\div'])
    symbols.append([0x20ac, '\\euro'])
    # special caron, see lib/lyx2lyx/lyx_1_5.py for an explanation
    symbols.append([0x030c, '\\q', '', 'combining'])
    return symbols
|
||
|
|
||
|
|
||
|
def write(output, lines):
    """Write lines to output as UTF-8 with native line endings.

    output is a binary stream, or a text stream that exposes its binary
    layer as a .buffer attribute (such as sys.stdout on Python 3).
    lines is an iterable of text strings without line terminators.
    """
    # Always write bytes to the binary layer.  This way os.linesep is
    # emitted verbatim and is not newline-translated a second time by a
    # text-mode stream.
    stream = getattr(output, 'buffer', output)
    eol = os.linesep.encode('ascii')
    for line in lines:
        stream.write(line.encode('utf8') + eol)
|
||
|
|
||
|
|
||
|
def translate_symbol(unicodesymbols, symbol, try_combining = True):
    """Translate a symbol from LaTeX to unicode.

    unicodesymbols is a list of entries as built by readsymbols():
    entry[0] is the code point, entry[1] the LaTeX command, and an
    optional entry[3] holds flags (checked for the word 'combining').
    symbol is the LaTeX command to translate.  If try_combining is
    false, only direct (non-combining) matches are considered.

    Returns the translated text, symbol itself if it is a single
    character, or '' if no translation was found.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr() covers all code points
    re_combining = re.compile(r'^[^a-zA-Z]')

    def is_combining(entry):
        return len(entry) > 3 and entry[3].find('combining') >= 0

    if len(symbol) == 1:
        return symbol
    for entry in unicodesymbols:
        # Play safe and don't try combining symbols (not sure if this is
        # needed).  Entries that consist of one token only carry no
        # command and are skipped.
        if len(entry) > 1 and entry[1] == symbol and not is_combining(entry):
            return to_char(entry[0])
    if not try_combining:
        return ''
    # no direct match, see whether this is a combining sequence
    for entry in unicodesymbols:
        if not is_combining(entry):
            continue
        command = entry[1]
        if not symbol.startswith(command):
            continue
        # Test whether this is really a combining sequence, e.g.
        # \d{o}, and not a symbol like \dh that shares the
        # beginning with a combining symbol
        # NOTE(review): the remainder must start with a non-letter, so
        # \"o is not accepted here although \"{o} is -- confirm that
        # this is intended.
        translated = symbol[len(command):]
        if translated == '' or not re_combining.match(translated):
            continue
        # Really a combining sequence
        if len(translated) > 1 and translated[0] == '{' and translated[-1] == '}':
            # Strip braces from things like \d{o}
            translated = translated[1:-1]
        else:
            # for some strange reason \'\i does not get correctly
            # combined, so we try \'{\i} which has an entry in
            # unicodesymbols
            combined = translate_symbol(unicodesymbols, u'%s{%s}' % (command, translated))
            if combined != '':
                return combined
        if len(translated) > 1:
            # The base character may be a symbol itself, e.g \"{\i}
            translated = translate_symbol(unicodesymbols, translated, False)
        # Play safe and only translate combining sequences with
        # one base character
        if len(translated) == 1 and (command != '\\q' or translated in ['t', 'd', 'l', 'L']):
            return unicodedata.normalize("NFKC", translated + to_char(entry[0]))
        # we found a combining character, but could not convert the
        # argument to a single character
        return ''
    return ''
|
||
|
|
||
|
|
||
|
def convert(lines, unicodesymbols):
    """Translate the symbols of all kmap entries in lines to unicode.

    lines is modified in place.  For every \\kmap and \\kxmod line,
    whether active or commented out with '#', the symbol argument is
    replaced by its escaped unicode translation.  Lines whose symbol
    cannot be translated keep their original text.
    """
    # convert both commented and active entries
    kmap_pattern = re.compile(r'^(#?\s*\\kmap\s+\S+\s+)([^\s]+)(.*)$')
    kxmod_pattern = re.compile(r'^(#?\s*\\kxmod\s+\S+\s+\S+\s+)([^\s]+)(.*)$')
    for index, text in enumerate(lines):
        found = kmap_pattern.match(text) or kxmod_pattern.match(text)
        if not found:
            continue
        prefix, raw_symbol, suffix = found.groups()
        symbol = unescape(raw_symbol)
        if len(symbol) > 2 and symbol.endswith('{}'):
            # The unicodesymbols file does not include the trailing delimiter {}
            symbol = symbol[:-2]
        translated = translate_symbol(unicodesymbols, symbol)
        if translated == '':
            replacement = raw_symbol
        else:
            replacement = escape(translated)
        lines[index] = u'%s%s%s' % (prefix, replacement, suffix)
|
||
|
|
||
|
|
||
|
def main(argv):
    """Convert a kmap file according to the command line in argv.

    argv[1] names the unicodesymbols file.  With two more arguments,
    argv[2] is the input file and argv[3] the output file; with none,
    stdin and stdout are used.  Any other argument count prints the
    usage message and exits through error().  Returns 0 on success.
    """

    # Open files
    if len(argv) == 2:
        # Text-mode standard streams expose their binary layer as
        # .buffer; use it where present so that both modes of operation
        # hand byte streams to read() and write().
        infile = getattr(sys.stdin, 'buffer', sys.stdin)
        outfile = getattr(sys.stdout, 'buffer', sys.stdout)
        owns_streams = False
    elif len(argv) == 4:
        infile = open(argv[2], 'rb')
        outfile = open(argv[3], 'wb')
        owns_streams = True
    else:
        error(usage(argv[0]))
        return 1  # not reached, error() exits

    try:
        # Do the real work
        symbolfile = open(argv[1], 'rb')
        try:
            symbols = readsymbols(symbolfile)
        finally:
            symbolfile.close()
        lines = read(infile)
        convert(lines, symbols)
        write(outfile, lines)
        outfile.flush()
    finally:
        # Close files, but only those opened here (never stdin/stdout)
        if owns_streams:
            infile.close()
            outfile.close()

    return 0
|
||
|
|
||
|
|
||
|
# Run the converter when executed as a script and use the return value
# of main() as the exit status of the process.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
|