Work around gnuhtml2latex encoding bug

gnuhtml2latex does not handle encodings at all. Therefore the result is not
imported correctly by tex2lyx if the HTML file is encoded in anything else
than ascii or latin1 (the default of tex2lyx). The simple wrapper script
loads inputenc if needed. It may not be possible to compile the result with
LaTeX (e.g. if utf8 is used), but for running tex2lyx it will work just fine.
This commit is contained in:
Georg Baum 2013-04-12 21:35:50 +02:00
parent 45ae6a8aee
commit 8c8e19bb1c
4 changed files with 155 additions and 21 deletions

View File

@ -2043,6 +2043,7 @@ dist_scripts_DATA += \
scripts/fig2pdftex.py \
scripts/fig2pstex.py \
scripts/fig_copy.py \
scripts/html2latexwrapper.py \
scripts/include_bib.py \
scripts/layout2layout.py \
scripts/legacy_lyxpreview2ppm.py \

View File

@ -679,12 +679,15 @@ def checkConverterEntries():
checkProg('a Sweave -> R/S code converter', ['Rscript --verbose --no-save --no-restore $$s/scripts/lyxstangle.R $$i $$e $$r'],
rc_entry = [ r'\converter sweave r "%%" ""' ])
#
checkProg('a knitr -> R/S code converter', ['Rscript --verbose --no-save --no-restore $$s/scripts/lyxknitr.R $$p$$i $$p$$o $$e $$r tangle'],
checkProg('a knitr -> R/S code converter', ['Rscript --verbose --no-save --no-restore $$s/scripts/lyxknitr.R $$p$$i $$p$$o $$e $$r tangle'],
rc_entry = [ r'\converter knitr r "%%" ""' ])
#
checkProg('an HTML -> LaTeX converter', ['html2latex $$i', 'gnuhtml2latex -s $$i > $$o',
checkProg('an HTML -> LaTeX converter', ['html2latex $$i', 'gnuhtml2latex',
'htmltolatex -input $$i -output $$o', 'htmltolatex.jar -input $$i -output $$o'],
rc_entry = [ r'\converter html latex "%%" ""' ])
rc_entry = [ r'\converter html latex "%%" ""', \
r'\converter html latex "python -tt $$s/scripts/html2latexwrapper.py %% $$i $$o" ""', \
r'\converter html latex "%%" ""', \
r'\converter html latex "%%" ""', '' ])
#
checkProg('an MS Word -> LaTeX converter', ['wvCleanLatex $$i $$o'],
rc_entry = [ r'\converter word latex "%%" ""' ])
@ -801,14 +804,13 @@ def checkConverterEntries():
\converter fig pstex "python -tt $$s/scripts/fig2pstex.py $$i $$o" ""''')
#
checkProg('a TIFF -> PS converter', ['tiff2ps $$i > $$o'],
rc_entry = [ r'\converter tiff eps "%%" ""', ''])
rc_entry = [ r'\converter tiff eps "%%" ""'])
#
checkProg('a TGIF -> EPS/PPM converter', ['tgif'],
rc_entry = [
r'''\converter tgif eps "tgif -print -color -eps -stdout $$i > $$o" ""
\converter tgif png "tgif -print -color -png -o $$d $$i" ""
\converter tgif pdf6 "tgif -print -color -pdf -stdout $$i > $$o" ""''',
''])
\converter tgif pdf6 "tgif -print -color -pdf -stdout $$i > $$o" ""'''])
#
checkProg('a WMF -> EPS converter', ['metafile2eps $$i $$o', 'wmf2eps -o $$o $$i'],
rc_entry = [ r'\converter wmf eps "%%" ""'])
@ -817,10 +819,10 @@ def checkConverterEntries():
rc_entry = [ r'\converter emf eps "%%" ""'])
# Only define a converter to pdf6 for graphics
checkProg('an EPS -> PDF converter', ['epstopdf'],
rc_entry = [ r'\converter eps pdf6 "epstopdf --outfile=$$o $$i" ""', ''])
rc_entry = [ r'\converter eps pdf6 "epstopdf --outfile=$$o $$i" ""'])
#
checkProg('an EPS -> PNG converter', ['convert $$i $$o'],
rc_entry = [ r'\converter eps png "%%" ""', ''])
rc_entry = [ r'\converter eps png "%%" ""'])
#
# no agr -> pdf6 converter, since the pdf library used by gracebat is not
# free software and therefore not compiled in in many installations.
@ -831,14 +833,12 @@ def checkConverterEntries():
r'''\converter agr eps "gracebat -hardcopy -printfile $$o -hdevice EPS $$i 2>/dev/null" ""
\converter agr png "gracebat -hardcopy -printfile $$o -hdevice PNG $$i 2>/dev/null" ""
\converter agr jpg "gracebat -hardcopy -printfile $$o -hdevice JPEG $$i 2>/dev/null" ""
\converter agr ppm "gracebat -hardcopy -printfile $$o -hdevice PNM $$i 2>/dev/null" ""''',
''])
\converter agr ppm "gracebat -hardcopy -printfile $$o -hdevice PNM $$i 2>/dev/null" ""'''])
#
checkProg('a Dot -> Image converter', ['dot'],
rc_entry = [
r'''\converter dot eps "dot -Teps $$i -o $$o" ""
\converter dot png "dot -Tpng $$i -o $$o" ""''',
''])
\converter dot png "dot -Tpng $$i -o $$o" ""'''])
#
checkProg('a Dia -> PNG converter', ['dia -e $$o -t png $$i'],
rc_entry = [ r'\converter dia png "%%" ""'])
@ -866,8 +866,7 @@ def checkConverterEntries():
checkProg('a spreadsheet -> latex converter', ['ssconvert'],
rc_entry = [ r'''\converter gnumeric latex "ssconvert --export-type=Gnumeric_html:latex $$i $$o" ""
\converter oocalc latex "ssconvert --export-type=Gnumeric_html:latex $$i $$o" ""
\converter excel latex "ssconvert --export-type=Gnumeric_html:latex $$i $$o" ""''',
''])
\converter excel latex "ssconvert --export-type=Gnumeric_html:latex $$i $$o" ""'''])
path, lilypond = checkProg('a LilyPond -> EPS/PDF/PNG converter', ['lilypond'])
if (lilypond != ''):
@ -918,7 +917,7 @@ def checkConverterEntries():
logger.info('+ found LilyPond-book, but could not extract version number.')
#
checkProg('a Noteedit -> LilyPond converter', ['noteedit --export-lilypond $$i'],
rc_entry = [ r'\converter noteedit lilypond "%%" ""', ''])
rc_entry = [ r'\converter noteedit lilypond "%%" ""' ])
#
# Currently, lyxpak outputs a gzip compressed tar archive on *nix
# and a zip archive on Windows.

View File

@ -0,0 +1,127 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# file html2latexwrapper.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.
# author Georg Baum
# Full author contact details are available in file CREDITS
# Usage:
# html2latexwrapper.py <converter> <from file> <to file>
# This script will call <converter> -s <from file> > <to file>
# and add a \usepackage{inputenc} line if needed.
import os, string, sys, re
from lyxpreview_tools import error, run_command
def usage(prog_name):
    '''Build the one-line command line synopsis shown for wrong invocations.

    prog_name is substituted for the script name at the start of the text.
    '''
    synopsis = "Usage: %s <converter> <from file> <to file>"
    return synopsis % prog_name
def get_encoding(from_file_name):
    '''Read the encoding from a HTML or XHTML file.

    The file is scanned line by line for an XML declaration
    (<?xml ... encoding="..."?>) or a <meta> tag that carries a charset.
    The first encoding name found is returned in lower case.

    Returns '' if no encoding declaration is found or if the file cannot
    be read or decoded: detection is best effort and must not abort the
    conversion.
    '''
    # Allow any amount of leading whitespace and both quote styles.
    regexpxml = re.compile(r'''^\s*<\?xml\s+.*?encoding\s*=\s*["']([^"']+)["']''',
                           re.IGNORECASE)
    # Covers <meta charset="utf-8"> as well as the HTML 4 form
    # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">,
    # in which the charset value is not directly preceded by a quote.
    regexptype = re.compile(r'''^\s*<meta\s+.*?charset\s*=\s*["']?([^"'\s;/>]+)''',
                            re.IGNORECASE)
    try:
        # The with statement closes the file on every exit path.
        with open(from_file_name, 'rt') as from_file:
            # Iterate lazily: the declaration is near the top, so there is
            # no need to read (and decode) the whole file first.
            for line in from_file:
                m = regexpxml.match(line) or regexptype.match(line)
                if m:
                    return m.group(1).lower()
    except (IOError, OSError, UnicodeError):
        # Unreadable or undecodable file: treat as "encoding unknown".
        pass
    return ''
def _inputenc_name(html_enc):
    '''Translate a (lower case) HTML encoding name to an inputenc option.

    Returns '' if the encoding is empty or has no known inputenc equivalent.
    '''
    html_enc = html_enc.replace('iso_8859', 'iso-8859')
    # The HTML encodings were taken from
    # http://www.iana.org/assignments/character-sets/character-sets.xml.
    # Only those with inputenc support were added, and only the most
    # important aliases.
    # List of encodings that have the same name in HTML (may be as an alias)
    # and inputenc
    same_enc = ['cp437', 'cp850', 'cp852', 'cp855', 'cp858', 'cp862', 'cp865', 'cp866',
                'cp1250', 'cp1251', 'cp1252', 'cp1255', 'cp1256', 'cp1257',
                'koi8-r', 'koi8-u', 'pt154', 'pt254',
                'latin1', 'latin2', 'latin3', 'latin4', 'latin5', 'latin9', 'latin10']
    # Translation table from HTML encoding names to inputenc encoding names
    encodings = {'utf-8' : 'utf8', 'csutf8' : 'utf8',
                 'iso-8859-1' : 'latin1', 'cp819' : 'latin1',
                 'iso-8859-2' : 'latin2',
                 'iso-8859-3' : 'latin3',
                 'iso-8859-4' : 'latin4',
                 'iso-8859-5' : 'iso88595', 'cyrillic' : 'iso88595',
                 'iso-8859-6' : '8859-6', 'arabic' : '8859-6',
                 'iso-8859-7' : 'iso-8859-7', 'greek' : 'iso-8859-7',
                 'iso-8859-8' : '8859-8', 'hebrew' : '8859-8',
                 'iso-8859-9' : 'latin5',
                 'iso-8859-13' : 'l7xenc',
                 'iso-8859-15' : 'latin9',
                 'iso-8859-16' : 'latin10',
                 'ibm437' : 'cp437',
                 'ibm850' : 'cp850',
                 'ibm852' : 'cp852',
                 'ibm855' : 'cp855',
                 'ibm858' : 'cp858',
                 'ibm862' : 'cp862',
                 'ibm865' : 'cp865',
                 'ibm866' : 'cp866',
                 'ibm1250' : 'cp1250',
                 'ibm1251' : 'cp1251',
                 'ibm1255' : 'cp1255',
                 'ibm1256' : 'cp1256',
                 'ibm1257' : 'cp1257',
                 'macintosh' : 'applemac', 'mac' : 'applemac', 'csmacintosh' : 'applemac'}
    if html_enc in encodings:
        return encodings[html_enc]
    if html_enc in same_enc:
        return html_enc
    return ''


def _loads_encoding_package(lines):
    '''Tell whether any line already loads inputenc, CJK or CJKutf8.'''
    # The optional argument is "[" + anything but "]" + "]", and the
    # alternation is confined to the package name inside the braces.
    regexp = re.compile(r'^\s*\\usepackage\s*(\[[^]]*\])?\s*\{(inputenc|CJK|CJKutf8)\}')
    for line in lines:
        if regexp.match(line):
            return True
    return False


def main(argv):
    '''Run the converter and write its output, adding inputenc if needed.

    argv must be [<script>, <converter>, <from file>, <to file>].
    "<converter> -s <from file>" is executed and its standard output is
    written to <to file>. If the HTML file declares an encoding known to
    inputenc and the converter output does not already load inputenc, CJK
    or CJKutf8, a \\usepackage[<enc>]{inputenc} line is inserted directly
    after the \\documentclass line.

    Returns the status reported by run_command for the converter.
    '''
    # Parse and manipulate the command line arguments.
    if len(argv) != 4:
        # NOTE(review): error() presumably terminates the script -- confirm
        # in lyxpreview_tools.
        error(usage(argv[0]))

    converter = argv[1]
    from_file_name = argv[2]
    to_file_name = argv[3]

    # Run gnuhtml2latex
    # NOTE(review): the file name is inserted unquoted into the command
    # string; names containing spaces or shell metacharacters may break it.
    cmd = '%s -s %s' % (converter, from_file_name)
    (ret, output) = run_command(cmd, False)

    # Determine encoding of HTML file and the matching inputenc option
    enc = _inputenc_name(get_encoding(from_file_name))

    # Read conversion result
    lines = output.split('\n')

    # Do not add the inputenc call if inputenc or CJK is already loaded
    add_inputenc = (enc != '') and not _loads_encoding_package(lines)

    # Write output file and insert inputenc call if needed
    to_file = open(to_file_name, 'wt')
    try:
        for line in lines:
            to_file.write(line + '\n')
            if add_inputenc and line.startswith('\\documentclass'):
                to_file.write('\\usepackage[%s]{inputenc}\n' % enc)
                # Load the package once only.
                add_inputenc = False
    finally:
        to_file.close()

    return ret
if __name__ == "__main__":
    # Hand the status returned by main() to the caller as the exit status,
    # so that a failing converter is not reported as success.
    sys.exit(main(sys.argv))

View File

@ -116,14 +116,21 @@ def find_exe_or_terminate(candidates):
return exe
def run_command_popen(cmd):
def run_command_popen(cmd, stderr2stdout):
if os.name == 'nt':
unix = False
else:
unix = True
pipe = subprocess.Popen(cmd, shell=unix, close_fds=unix, stdin=subprocess.PIPE, \
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
cmd_stdout = pipe.communicate()[0]
if stderr2stdout:
pipe = subprocess.Popen(cmd, shell=unix, close_fds=unix, stdin=subprocess.PIPE, \
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
cmd_stdout = pipe.communicate()[0]
else:
pipe = subprocess.Popen(cmd, shell=unix, close_fds=unix, stdin=subprocess.PIPE, \
stdout=subprocess.PIPE, universal_newlines=True)
(cmd_stdout, cmd_stderr) = pipe.communicate()
if cmd_stderr:
sys.stderr.write(cmd_stderr)
cmd_status = pipe.returncode
global debug
@ -182,12 +189,12 @@ def run_command_win32(cmd):
return 0, data
def run_command(cmd):
def run_command(cmd, stderr2stdout = True):
progress("Running %s" % cmd)
if use_win32_modules:
return run_command_win32(cmd)
else:
return run_command_popen(cmd)
return run_command_popen(cmd, stderr2stdout)
def get_version_info():