mirror of
https://git.lyx.org/repos/lyx.git
synced 2025-01-15 20:50:56 +00:00
127 lines
4.5 KiB
Python
127 lines
4.5 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
# file html2latexwrapper.py
|
||
|
# This file is part of LyX, the document processor.
|
||
|
# Licence details can be found in the file COPYING.
|
||
|
|
||
|
# author Georg Baum
|
||
|
|
||
|
# Full author contact details are available in file CREDITS
|
||
|
|
||
|
# Usage:
|
||
|
# html2latexwrapper.py <converter> <from file> <to file>
|
||
|
|
||
|
# This script will call <converter> -s <from file> > <to file>
|
||
|
# and add a \usepackage{inputenc} line if needed.
|
||
|
|
||
|
|
||
|
import os, string, sys, re
|
||
|
|
||
|
from lyxpreview_tools import error, run_command
|
||
|
|
||
|
|
||
|
def usage(prog_name):
    '''Return the one-line command synopsis, with prog_name as the program name.'''
    template = "Usage: %s <converter> <from file> <to file>"
    return template % prog_name
|
||
|
|
||
|
|
||
|
def get_encoding(from_file_name):
    '''Read the encoding from a HTML or XHTML file

    The file is scanned line by line for an XML declaration
    (<?xml ... encoding="...">) or a <meta ... charset=...> tag at the
    start of a line (leading whitespace is allowed). Both the HTML5 form
    <meta charset="..."> and the older
    <meta http-equiv=... content="text/html; charset=..."> form are
    recognized, with single or double quotes.

    Returns the declared encoding name in lower case, or '' if no
    declaration is found or the file cannot be read.
    '''
    # Local import: io.open accepts an explicit encoding on both
    # Python 2 and Python 3.
    import io

    regexpxml = re.compile(r"^\s*<\?xml\s+.*?encoding\s*=\s*[\"']([^\"']+)[\"']",
                           re.IGNORECASE)
    regexptype = re.compile(r"^\s*<meta\s+.*?charset\s*=\s*[\"']?([^\"'\s;>]+)",
                            re.IGNORECASE)
    try:
        # The encoding is not known yet, so decode as latin-1: it maps every
        # byte to a character and therefore never fails, and the declaration
        # itself is plain ASCII. Decoding with the locale encoding would
        # raise on exactly those files whose encoding we need to detect.
        with io.open(from_file_name, 'r', encoding='latin-1') as from_file:
            for line in from_file:
                m = regexpxml.match(line) or regexptype.match(line)
                if m:
                    return m.group(1).lower()
    except (IOError, OSError):
        # Best effort: an unreadable file is treated as "encoding unknown".
        pass
    return ''
|
||
|
|
||
|
|
||
|
def _inputenc_name(html_enc):
    '''Translate a lower-case HTML encoding name to the inputenc option name.

    Returns '' if html_enc is empty or has no known inputenc equivalent.
    '''
    # The HTML encodings were taken from
    # http://www.iana.org/assignments/character-sets/character-sets.xml.
    # Only those with inputenc support were added, and only the most
    # important aliases.
    # Encodings that have the same name in HTML (may be as an alias) and inputenc
    same_enc = ('cp437', 'cp850', 'cp852', 'cp855', 'cp858', 'cp862', 'cp865', 'cp866',
                'cp1250', 'cp1251', 'cp1252', 'cp1255', 'cp1256', 'cp1257',
                'koi8-r', 'koi8-u', 'pt154', 'pt254',
                'latin1', 'latin2', 'latin3', 'latin4', 'latin5', 'latin9', 'latin10')
    # Translation table from HTML encoding names to inputenc encoding names
    encodings = {'utf-8' : 'utf8', 'csutf8' : 'utf8',
                 'iso-8859-1' : 'latin1', 'cp819' : 'latin1',
                 'iso-8859-2' : 'latin2',
                 'iso-8859-3' : 'latin3',
                 'iso-8859-4' : 'latin4',
                 'iso-8859-5' : 'iso88595', 'cyrillic' : 'iso88595',
                 'iso-8859-6' : '8859-6', 'arabic' : '8859-6',
                 'iso-8859-7' : 'iso-8859-7', 'greek' : 'iso-8859-7',
                 'iso-8859-8' : '8859-8', 'hebrew' : '8859-8',
                 'iso-8859-9' : 'latin5',
                 'iso-8859-13' : 'l7xenc',
                 'iso-8859-15' : 'latin9',
                 'iso-8859-16' : 'latin10',
                 'ibm437' : 'cp437',
                 'ibm850' : 'cp850',
                 'ibm852' : 'cp852',
                 'ibm855' : 'cp855',
                 'ibm858' : 'cp858',
                 'ibm862' : 'cp862',
                 'ibm865' : 'cp865',
                 'ibm866' : 'cp866',
                 'ibm1250' : 'cp1250',
                 'ibm1251' : 'cp1251',
                 'ibm1255' : 'cp1255',
                 'ibm1256' : 'cp1256',
                 'ibm1257' : 'cp1257',
                 'macintosh' : 'applemac', 'mac' : 'applemac', 'csmacintosh' : 'applemac'}
    if html_enc in encodings:
        return encodings[html_enc]
    if html_enc in same_enc:
        return html_enc
    return ''


def _loads_encoding_package(lines):
    '''Return True if a line loads inputenc, CJK or CJKutf8 with usepackage.'''
    # The optional [...] argument may hold any number of characters, and the
    # package alternatives are grouped so that the alternation stays inside
    # the braces instead of splitting the whole pattern.
    regexp = re.compile(r'^\s*\\usepackage\s*(\[[^]]*\])?\s*\{(inputenc|CJK|CJKutf8)\}')
    return any(regexp.match(line) for line in lines)


def main(argv):
    '''Convert a HTML file to LaTeX and declare its input encoding.

    argv is [<program name>, <converter>, <from file>, <to file>].
    Runs "<converter> -s <from file>", writes the converter output to
    <to file>, and inserts a usepackage[...]{inputenc} line after each line
    starting with the documentclass command when the HTML file declares an
    encoding that inputenc supports and the output does not already load
    inputenc, CJK or CJKutf8.

    Returns the first element of the pair returned by run_command().
    '''
    # Parse and manipulate the command line arguments.
    if len(argv) != 4:
        # NOTE(review): error() is presumably expected not to return;
        # confirm in lyxpreview_tools.
        error(usage(argv[0]))

    converter = argv[1]
    from_file_name = argv[2]
    to_file_name = argv[3]

    # Run gnuhtml2latex
    # NOTE(review): converter and file name are interpolated unquoted; verify
    # how run_command() treats paths that contain spaces.
    cmd = '%s -s %s' % (converter, from_file_name)
    (ret, output) = run_command(cmd, False)

    # Determine encoding of HTML file and map it to the inputenc name
    enc = _inputenc_name(get_encoding(from_file_name).replace('iso_8859', 'iso-8859'))

    # Read conversion result
    lines = output.split('\n')

    # Do not add the inputenc call if inputenc or CJK is already loaded
    add_inputenc = enc != '' and not _loads_encoding_package(lines)

    # Write output file and insert inputenc call if needed
    with open(to_file_name, 'wt') as to_file:
        for line in lines:
            to_file.write(line + '\n')
            if add_inputenc and line.startswith('\\documentclass'):
                to_file.write('\\usepackage[%s]{inputenc}\n' % enc)

    return ret
|
||
|
|
||
|
|
||
|
# Script entry point: hand the complete command line to main().
# NOTE(review): the value returned by main() is discarded here, so it does
# not become the process exit status - confirm that this is intended.
if __name__ == "__main__":
    command_line = sys.argv
    main(command_line)
|