lyx_mirror/lib/scripts/html2latexwrapper.py
Richard Heck f9de88c53c Remove the shebang line from our python scripts, per Jose's suggestion.
Fedora ships these chmod 644 and has never seen a problem. The advantage
to doing this is that it better controls what version of python we are
using to launch the script, and it will reveal if we're somehow somewhere
not controlling that properly.
2014-01-28 12:52:16 -05:00

127 lines
4.5 KiB
Python

# -*- coding: utf-8 -*-
# file html2latexwrapper.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.
# author Georg Baum
# Full author contact details are available in file CREDITS
# Usage:
# html2latexwrapper.py <converter> <from file> <to file>
# This script will call <converter> -s <from file> > <to file>
# and add a \usepackage{inputenc} line if needed.
import os, string, sys, re
from lyxpreview_tools import error, run_command
def usage(prog_name):
    '''Build the one-line command synopsis shown on wrong invocation.

    prog_name is the name under which the script was started; it is
    substituted into the synopsis and the finished string is returned.
    '''
    synopsis = "Usage: %s <converter> <from file> <to file>"
    return synopsis % prog_name
def get_encoding(from_file_name):
    '''Read the encoding from a HTML or XHTML file.

    The file is scanned line by line for either an XML declaration
    (<?xml ... encoding="...">) or a <meta ... charset="..."> tag at the
    start of a line. The first declaration found wins.

    Returns the declared encoding name in lower case, or an empty string
    if no declaration was found or the file could not be read.
    '''
    regexpxml = re.compile(r'^\s?<\?xml\s+.*?encoding\s*=\s*"([^"]+)"', re.IGNORECASE)
    regexptype = re.compile(r'^\s?<meta\s+.*?charset\s*=\s*"([^"]+)"', re.IGNORECASE)
    try:
        # The context manager closes the file on every exit path, including
        # the early return and a read/decode error in the middle of the file.
        with open(from_file_name, 'rt') as from_file:
            # Iterate lazily: the declaration is normally in the first few
            # lines, so there is no need to load the whole file.
            for line in from_file:
                m = regexpxml.match(line) or regexptype.match(line)
                if m:
                    return m.group(1).lower()
    except (IOError, OSError, ValueError):
        # Detection is best effort: an unreadable file, an invalid file name
        # or undecodable content (UnicodeError is a ValueError) simply means
        # "encoding unknown". Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    return ''
def _inputenc_name(html_enc):
    '''Translate a lower-case HTML encoding name to an inputenc option.

    Returns the inputenc encoding name, or an empty string if html_enc is
    empty or has no known inputenc counterpart.
    '''
    # The HTML encodings were taken from
    # http://www.iana.org/assignments/character-sets/character-sets.xml.
    # Only those with inputenc support were added, and only the most
    # important aliases.
    # Encodings that have the same name in HTML (may be as an alias) and inputenc
    same_enc = ('cp437', 'cp850', 'cp852', 'cp855', 'cp858', 'cp862', 'cp865', 'cp866',
                'cp1250', 'cp1251', 'cp1252', 'cp1255', 'cp1256', 'cp1257',
                'koi8-r', 'koi8-u', 'pt154', 'pt254',
                'latin1', 'latin2', 'latin3', 'latin4', 'latin5', 'latin9', 'latin10')
    # Translation table from HTML encoding names to inputenc encoding names
    encodings = {'utf-8' : 'utf8', 'csutf8' : 'utf8',
                 'iso-8859-1' : 'latin1', 'cp819' : 'latin1',
                 'iso-8859-2' : 'latin2',
                 'iso-8859-3' : 'latin3',
                 'iso-8859-4' : 'latin4',
                 'iso-8859-5' : 'iso88595', 'cyrillic' : 'iso88595',
                 'iso-8859-6' : '8859-6', 'arabic' : '8859-6',
                 'iso-8859-7' : 'iso-8859-7', 'greek' : 'iso-8859-7',
                 'iso-8859-8' : '8859-8', 'hebrew' : '8859-8',
                 'iso-8859-9' : 'latin5',
                 'iso-8859-13' : 'l7xenc',
                 'iso-8859-15' : 'latin9',
                 'iso-8859-16' : 'latin10',
                 'ibm437' : 'cp437',
                 'ibm850' : 'cp850',
                 'ibm852' : 'cp852',
                 'ibm855' : 'cp855',
                 'ibm858' : 'cp858',
                 'ibm862' : 'cp862',
                 'ibm865' : 'cp865',
                 'ibm866' : 'cp866',
                 'ibm1250' : 'cp1250',
                 'ibm1251' : 'cp1251',
                 'ibm1255' : 'cp1255',
                 'ibm1256' : 'cp1256',
                 'ibm1257' : 'cp1257',
                 'macintosh' : 'applemac', 'mac' : 'applemac', 'csmacintosh' : 'applemac'}
    if html_enc in encodings:
        return encodings[html_enc]
    if html_enc in same_enc:
        return html_enc
    return ''


def _loads_encoding_package(lines):
    '''Tell whether one of lines loads inputenc, CJK or CJKutf8.'''
    # The optional argument is "[" + one or more non-"]" characters + "]",
    # and the three package names are grouped inside the braces. Without the
    # grouping the alternation would split the whole pattern, and with a
    # one-character option class "[utf8]" would never match, so an existing
    # inputenc call would go undetected and be duplicated.
    regexp = re.compile(r'^\s*\\usepackage\s*(\[[^]]+\])?\s*\{(inputenc|CJK|CJKutf8)\}')
    return any(regexp.match(line) for line in lines)


def main(argv):
    '''Run the converter on a HTML file and write the LaTeX result.

    argv must be [<program>, <converter>, <from file>, <to file>].
    The converter is called as "<converter> -s <from file>"; its output is
    written to <to file>. If the HTML file declares an encoding that
    inputenc supports and the output does not already load inputenc or
    CJK, a \\usepackage[<encoding>]{inputenc} line is inserted directly
    after each line that starts with \\documentclass.

    Returns the status value reported by run_command for the converter.
    '''
    # Parse and manipulate the command line arguments.
    if len(argv) != 4:
        # NOTE(review): error() is presumably expected to terminate the
        # script - confirm in lyxpreview_tools.
        error(usage(argv[0]))

    converter = argv[1]
    from_file_name = argv[2]
    to_file_name = argv[3]

    # Run gnuhtml2latex
    # NOTE(review): the file name is inserted unquoted; if run_command hands
    # the string to a shell, names with spaces or shell metacharacters will
    # break - verify against run_command before changing the quoting.
    cmd = '%s -s %s' % (converter, from_file_name)
    (ret, output) = run_command(cmd, False)

    # Determine encoding of HTML file; some files spell it "iso_8859-x"
    html_enc = get_encoding(from_file_name).replace('iso_8859', 'iso-8859')
    enc = _inputenc_name(html_enc)

    # Read conversion result
    lines = output.split('\n')

    # Do not add the inputenc call if inputenc or CJK is already loaded
    add_inputenc = (enc != '') and not _loads_encoding_package(lines)

    # Write output file and insert inputenc call if needed. The context
    # manager makes sure the file is closed even if a write fails.
    with open(to_file_name, 'wt') as to_file:
        for line in lines:
            to_file.write(line + '\n')
            if add_inputenc and line.startswith('\\documentclass'):
                to_file.write('\\usepackage[%s]{inputenc}\n' % enc)

    return ret
# Run the wrapper only when this file is executed as a script, not when it
# is imported. The value returned by main() is not used as the exit status.
if __name__ == "__main__":
    main(sys.argv)