lyx_mirror/lib/scripts/html2latexwrapper.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-

# file html2latexwrapper.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.

# author Georg Baum

# Full author contact details are available in file CREDITS

# Usage:
# html2latexwrapper.py <converter> <from file> <to file>

# This script will call <converter> -s <from file> > <to file>
# and add a \usepackage{inputenc} line if needed.


import os, string, sys, re

from lyxpreview_tools import error, run_command


def usage(prog_name):
    '''Build the command-line synopsis for the script called prog_name.'''
    template = "Usage: %s <converter> <from file> <to file>"
    return template % prog_name


def get_encoding(from_file_name):
    '''Read the encoding from a HTML or XHTML file.

    The file is scanned line by line.  The first line that starts (after at
    most one whitespace character) with an XML declaration carrying
    encoding="..." or with a <meta ...> tag carrying charset="..." decides
    the result.  Only double-quoted values are recognized.

    Returns the declared encoding name in lower case, or '' if no
    declaration is found or the file cannot be opened, read or decoded.
    '''
    regexpxml = re.compile(r'^\s?<\?xml\s+.*?encoding\s*=\s*"([^"]+)"', re.IGNORECASE)
    regexptype = re.compile(r'^\s?<meta\s+.*?charset\s*=\s*"([^"]+)"', re.IGNORECASE)
    try:
        # The with block closes the file on every exit path, including the
        # early return and read/decode errors.
        with open(from_file_name, 'rt') as from_file:
            for line in from_file:
                m = regexpxml.match(line) or regexptype.match(line)
                if m:
                    return m.group(1).lower()
    except (IOError, OSError, ValueError):
        # Best effort only: an unreadable or undecodable file (UnicodeError
        # is a ValueError) simply means "encoding unknown".
        pass
    return ''


def _inputenc_name(html_enc):
    '''Translate a lower-case HTML encoding name to its inputenc name.

    Returns '' if html_enc is empty or has no known inputenc equivalent.
    '''
    # The HTML encodings were taken from http://www.iana.org/assignments/character-sets/character-sets.xml.
    # Only those with inputenc support were added, and only the most important aliases.
    # Encodings that have the same name in HTML (may be as an alias) and inputenc
    same_enc = ('cp437', 'cp850', 'cp852', 'cp855', 'cp858', 'cp862', 'cp865', 'cp866',
                'cp1250', 'cp1251', 'cp1252', 'cp1255', 'cp1256', 'cp1257',
                'koi8-r', 'koi8-u', 'pt154', 'pt254',
                'latin1', 'latin2', 'latin3', 'latin4', 'latin5', 'latin9', 'latin10')
    # Translation table from HTML encoding names to inputenc encoding names
    encodings = {'utf-8' : 'utf8', 'csutf8' : 'utf8',
                 'iso-8859-1' : 'latin1', 'cp819' : 'latin1',
                 'iso-8859-2' : 'latin2',
                 'iso-8859-3' : 'latin3',
                 'iso-8859-4' : 'latin4',
                 'iso-8859-5' : 'iso88595', 'cyrillic' : 'iso88595',
                 'iso-8859-6' : '8859-6', 'arabic' : '8859-6',
                 'iso-8859-7' : 'iso-8859-7', 'greek' : 'iso-8859-7',
                 'iso-8859-8' : '8859-8', 'hebrew' : '8859-8',
                 'iso-8859-9' : 'latin5',
                 'iso-8859-13' : 'l7xenc',
                 'iso-8859-15' : 'latin9',
                 'iso-8859-16' : 'latin10',
                 'ibm437' : 'cp437',
                 'ibm850' : 'cp850',
                 'ibm852' : 'cp852',
                 'ibm855' : 'cp855',
                 'ibm858' : 'cp858',
                 'ibm862' : 'cp862',
                 'ibm865' : 'cp865',
                 'ibm866' : 'cp866',
                 'ibm1250' : 'cp1250',
                 'ibm1251' : 'cp1251',
                 'ibm1255' : 'cp1255',
                 'ibm1256' : 'cp1256',
                 'ibm1257' : 'cp1257',
                 'macintosh' : 'applemac', 'mac' : 'applemac', 'csmacintosh' : 'applemac'}
    if html_enc in encodings:
        return encodings[html_enc]
    if html_enc in same_enc:
        return html_enc
    return ''


def _loads_input_encoding(lines):
    '''Tell whether a line in lines loads inputenc, CJK or CJKutf8.'''
    # The package names are grouped so that the alternation applies to the
    # name only, and the optional argument may hold any number of characters.
    regexp = re.compile(r'^\s*\\usepackage\s*(\[[^\]]*\])?\s*\{(inputenc|CJK|CJKutf8)\}')
    for line in lines:
        if regexp.match(line):
            return True
    return False


def main(argv):
    '''Run the converter and write its output, adding inputenc if needed.

    argv must be [<script name>, <converter>, <from file>, <to file>];
    otherwise the usage message is passed to error().  The command
    "<converter> -s <from file>" is run and its output is written to
    <to file>.  If the HTML file declares an encoding with an inputenc
    equivalent, and the output does not already load inputenc or CJK, a
    usepackage line for inputenc is inserted after the first line that
    starts with the documentclass command.

    Returns the status reported by run_command().
    '''
    # Parse and manipulate the command line arguments.
    if len(argv) != 4:
        # NOTE(review): presumably error() terminates the script -- confirm
        # in lyxpreview_tools; otherwise the indexing below would fail.
        error(usage(argv[0]))

    converter = argv[1]
    from_file_name = argv[2]
    to_file_name = argv[3]

    # Run gnuhtml2latex
    # NOTE(review): the file name is inserted unquoted; a name containing
    # spaces may be split if run_command() goes through a shell -- confirm.
    cmd = '%s -s %s' % (converter, from_file_name)
    (ret, output) = run_command(cmd, False)

    # Determine encoding of HTML file
    enc = _inputenc_name(get_encoding(from_file_name).replace('iso_8859', 'iso-8859'))

    # Read conversion result
    lines = output.split('\n')

    # Do not add the inputenc call if inputenc or CJK is already loaded
    add_inputenc = (enc != '') and not _loads_input_encoding(lines)

    # Write output file and insert inputenc call if needed
    with open(to_file_name, 'wt') as to_file:
        for line in lines:
            to_file.write(line + '\n')
            if add_inputenc and line.startswith('\\documentclass'):
                to_file.write('\\usepackage[%s]{inputenc}\n' % enc)
                # Load the package only once, even if another line happens
                # to start with the documentclass command.
                add_inputenc = False

    return ret


if __name__ == "__main__":
    # Script entry point: pass the raw command line to main().
    # NOTE(review): the value returned by main() is discarded here, so it is
    # not turned into the process exit status -- confirm this is intended.
    main(sys.argv)
Work around gnuhtml2latex encoding bug: gnuhtml2latex does not handle encodings at all. Therefore the result is not imported correctly by tex2lyx if the HTML file is encoded in anything else than ascii or latin1 (the default of tex2lyx). The simple wrapper script loads inputenc if needed. It may not be possible to compile the result with LaTeX (e.g. if utf8 is used), but for running tex2lyx it will work just fine. 2013-04-12 19:35:50 +00:00			`#! /usr/bin/env python`
			`# -*- coding: utf-8 -*-`

			`# file html2latexwrapper.py`
			`# This file is part of LyX, the document processor.`
			`# Licence details can be found in the file COPYING.`

			`# author Georg Baum`

			`# Full author contact details are available in file CREDITS`

			`# Usage:`
			`# html2latexwrapper.py <converter> <from file> <to file>`

			`# This script will call <converter> -s <from file> > <to file>`
			`# and add a \usepackage{inputenc} line if needed.`


			`import os, string, sys, re`

			`from lyxpreview_tools import error, run_command`


			`def usage(prog_name):`
			`return "Usage: %s <converter> <from file> <to file>" % prog_name`


			`def get_encoding(from_file_name):`
			`'''Read the encoding from a HTML or XHTML file'''`
			`try:`
			`from_file = open(from_file_name, 'rt')`
			`regexpxml = re.compile(r'^\s?<\?xml\s+.*?encoding\s*=\s*"([^"]+)"', re.IGNORECASE)`
			`regexptype = re.compile(r'^\s?<meta\s+.*?charset\s*=\s*"([^"]+)"', re.IGNORECASE)`
			`for line in from_file.readlines():`
			`m = regexpxml.match(line)`
			`if not m:`
			`m = regexptype.match(line)`
			`if m:`
			`from_file.close()`
			`return m.group(1).lower()`
			`from_file.close()`
			`except:`
			`pass`
			`return ''`


			`def main(argv):`
			`# Parse and manipulate the command line arguments.`
			`if len(argv) != 4:`
			`error(usage(argv[0]))`

			`converter = argv[1]`
			`from_file_name = argv[2]`
			`to_file_name = argv[3]`

			`# Run gnuhtml2latex`
			`cmd = '%s -s %s' % (converter, from_file_name)`
			`(ret, output) = run_command(cmd, False)`

			`# Determine encoding of HTML file`
			`enc = get_encoding(from_file_name).replace('iso_8859', 'iso-8859')`
			`# The HTML encodings were taken from http://www.iana.org/assignments/character-sets/character-sets.xml.`
			`# Only those with inputenc support were added, and only thge most important aliases.`
			`# List of encodings that have the same name in HTML (may be as an alias) and inputenc`
			`same_enc = ['cp437', 'cp850', 'cp852', 'cp855', 'cp858', 'cp862', 'cp865', 'cp866', \`
			`'cp1250', 'cp1251', 'cp1252', 'cp1255', 'cp1256', 'cp1257', \`
			`'koi8-r', 'koi8-u', 'pt154', 'pt254', \`
			`'latin1', 'latin2', 'latin3', 'latin4', 'latin5', 'latin9', 'latin10']`
			`# Translation table from HTML encoding names to inputenc encoding names`
			`encodings = {'utf-8' : 'utf8', 'csutf8' : 'utf8', \`
			`'iso-8859-1' : 'latin1', 'cp819' : 'latin1', \`
			`'iso-8859-2' : 'latin2', \`
			`'iso-8859-3' : 'latin3', \`
			`'iso-8859-4' : 'latin4', \`
			`'iso-8859-5' : 'iso88595', 'cyrillic' : 'iso88595', \`
			`'iso-8859-6' : '8859-6', 'arabic' : '8859-6', \`
			`'iso-8859-7' : 'iso-8859-7', 'greek' : 'iso-8859-7', \`
			`'iso-8859-8' : '8859-8', 'hebrew' : '8859-8', \`
			`'iso-8859-9' : 'latin5', \`
			`'iso-8859-13' : 'l7xenc', \`
			`'iso-8859-15' : 'latin9', \`
			`'iso-8859-16' : 'latin10', \`
			`'ibm437' : 'cp437', \`
			`'ibm850' : 'cp850', \`
			`'ibm852' : 'cp852', \`
			`'ibm855' : 'cp855', \`
			`'ibm858' : 'cp858', \`
			`'ibm862' : 'cp862', \`
			`'ibm865' : 'cp865', \`
			`'ibm866' : 'cp866', \`
			`'ibm1250' : 'cp1250', \`
			`'ibm1251' : 'cp1251', \`
			`'ibm1255' : 'cp1255', \`
			`'ibm1256' : 'cp1256', \`
			`'ibm1257' : 'cp1257', \`
			`'macintosh' : 'applemac', 'mac' : 'applemac', 'csmacintosh' : 'applemac'}`
			`if enc != '':`
			`if enc in encodings.keys():`
			`enc = encodings[enc]`
			`elif enc not in same_enc:`
			`enc = ''`

			`# Read conversion result`
			`lines = output.split('\n')`

			`# Do not add the inputenc call if inputenc or CJK is already loaded`
			`add_inputenc = (enc != '')`
			`if add_inputenc:`
			`regexp = re.compile(r'^\s?\\usepackage\s?(\[[^]+]\])?\s?{(inputenc)|(CJK)|(CJKutf8)}')`
			`for line in lines:`
			`if regexp.match(line):`
			`add_inputenc = False`
			`break`

			`# Write output file and insert inputenc call if needed`
			`to_file = open(to_file_name, 'wt')`
			`for line in lines:`
			`to_file.write(line + '\n')`
			`if add_inputenc and line.find('\\documentclass') == 0:`
			`to_file.write('\\usepackage[%s]{inputenc}\n' % enc)`
			`to_file.close()`

			`return ret`


			`if __name__ == "__main__":`
			`main(sys.argv)`