lyx_mirror/lib/lyx2lyx/lyx_1_5.py
Georg Baum c9bbc5c1dd Output encoding messages only with a higher warning level
git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@17290 a592a061-630c-0410-9148-cb99ea01b6c8
2007-02-21 12:33:51 +00:00

1271 lines
50 KiB
Python

# This file is part of lyx2lyx
# -*- coding: utf-8 -*-
# Copyright (C) 2006 José Matos <jamatos@lyx.org>
# Copyright (C) 2004-2006 Georg Baum <Georg.Baum@post.rwth-aachen.de>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
""" Convert files to the file format generated by lyx 1.5"""
import re
import unicodedata
from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
from LyX import get_encoding
####################################################################
# Private helper functions
def find_end_of_inset(lines, i):
    r"""Find the end of an inset, where lines[i] is included.

    Thin wrapper around find_end_of() for the \begin_inset / \end_inset
    token pair; its result is returned unchanged.
    """
    opening_token = "\\begin_inset"
    closing_token = "\\end_inset"
    return find_end_of(lines, i, opening_token, closing_token)
def find_end_of_layout(lines, i):
    r"""Find the end of a layout, where lines[i] is included.

    Thin wrapper around find_end_of() for the \begin_layout / \end_layout
    token pair; its result is returned unchanged.
    """
    opening_token = "\\begin_layout"
    closing_token = "\\end_layout"
    return find_end_of(lines, i, opening_token, closing_token)
# End of helper functions
####################################################################
##
# Notes: Framed/Shaded
#
def revert_framed(document):
    """Turn every framed or shaded note inset into a plain note inset."""
    note_variants = ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"]
    pos = find_tokens(document.body, note_variants, 0)
    while pos != -1:
        # Drop the Framed/Shaded qualifier, keep the inset itself.
        document.body[pos] = "\\begin_inset Note"
        pos = find_tokens(document.body, note_variants, pos + 1)
##
# Fonts
#
# For every value of the old \fontscheme setting, the font that is used for
# the roman family ...
roman_fonts = {
    'default':  'default',
    'ae':       'ae',
    'times':    'times',
    'palatino': 'palatino',
    'helvet':   'default',
    'avant':    'default',
    'newcent':  'newcent',
    'bookman':  'bookman',
    'pslatex':  'times',
}

# ... for the sans serif family ...
sans_fonts = {
    'default':  'default',
    'ae':       'default',
    'times':    'default',
    'palatino': 'default',
    'helvet':   'helvet',
    'avant':    'avant',
    'newcent':  'default',
    'bookman':  'default',
    'pslatex':  'helvet',
}

# ... and for the typewriter family.
typewriter_fonts = {
    'default':  'default',
    'ae':       'default',
    'times':    'default',
    'palatino': 'default',
    'helvet':   'default',
    'avant':    'default',
    'newcent':  'default',
    'bookman':  'default',
    'pslatex':  'courier',
}
def convert_font_settings(document):
    r"""Replace the \fontscheme header line by the per-family font settings.

    An empty or unknown scheme is reported through document.warning() and
    treated as 'default'.  Nothing is changed if \fontscheme is missing.
    """
    pos = find_token_exact(document.header, "\\fontscheme", 0)
    if pos == -1:
        document.warning("Malformed LyX document: Missing `\\fontscheme'.")
        return

    scheme = get_value(document.header, "\\fontscheme", pos, pos + 1)
    if scheme == '':
        document.warning("Malformed LyX document: Empty `\\fontscheme'.")
        scheme = 'default'
    if scheme not in roman_fonts:
        document.warning("Malformed LyX document: Unknown `\\fontscheme' `%s'." % scheme)
        scheme = 'default'

    # The single \fontscheme line is expanded into eight header lines.
    new_settings = [
        '\\font_roman %s' % roman_fonts[scheme],
        '\\font_sans %s' % sans_fonts[scheme],
        '\\font_typewriter %s' % typewriter_fonts[scheme],
        '\\font_default_family default',
        '\\font_sc false',
        '\\font_osf false',
        '\\font_sf_scale 100',
        '\\font_tt_scale 100',
    ]
    document.header[pos:pos+1] = new_settings
def revert_font_settings(document):
    r"""Collapse the per-family font settings back into a \fontscheme line.

    All \font_* header lines are removed.  If the roman/sans/typewriter
    combination corresponds to one of the old font schemes, that scheme is
    written; otherwise \fontscheme is set to 'default' and the fonts are
    selected through commands appended to the preamble.  Settings that
    cannot be expressed in the old format are reported with
    document.warning().
    """
    i = 0
    insert_line = -1
    fonts = {'roman': 'default', 'sans': 'default', 'typewriter': 'default'}
    for family in 'roman', 'sans', 'typewriter':
        name = '\\font_%s' % family
        i = find_token_exact(document.header, name, i)
        if i == -1:
            document.warning("Malformed LyX document: Missing `%s'." % name)
            i = 0
        else:
            # \fontscheme goes where the first font setting was found.
            if insert_line < 0:
                insert_line = i
            fonts[family] = get_value(document.header, name, i, i + 1)
            del document.header[i]

    # (header token, default value, warn if the value differs from the
    # default because the old format cannot express it)
    scalar_settings = (
        ('\\font_default_family', 'default', False),
        ('\\font_sc', 'false', True),
        ('\\font_osf', 'false', False),
        ('\\font_sf_scale', '100', True),
        ('\\font_tt_scale', '100', True),
    )
    values = {}
    for name, default, unsupported in scalar_settings:
        i = find_token_exact(document.header, name, i)
        if i == -1:
            document.warning("Malformed LyX document: Missing `%s'." % name)
            values[name] = default
            # Restart from the top, as the family loop above does, instead
            # of passing -1 as the start index of the next search.
            i = 0
        else:
            values[name] = get_value(document.header, name, i, i + 1)
            del document.header[i]
        if unsupported and values[name] != default:
            document.warning("Conversion of '%s' not yet implemented." % name)
    font_default_family = values['\\font_default_family']
    font_osf = values['\\font_osf']

    # NOTE(review): insert_line is still -1 if no family setting was found;
    # list.insert(-1, ...) then puts the line before the last header entry.
    for font_scheme in roman_fonts.keys():
        if (roman_fonts[font_scheme] == fonts['roman'] and
            sans_fonts[font_scheme] == fonts['sans'] and
            typewriter_fonts[font_scheme] == fonts['typewriter']):
            document.header.insert(insert_line, '\\fontscheme %s' % font_scheme)
            if font_default_family != 'default':
                document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family)
            if font_osf == 'true':
                document.warning("Ignoring `\\font_osf = true'")
            return

    # No old scheme matches: select the fonts in the preamble instead.
    font_scheme = 'default'
    document.header.insert(insert_line, '\\fontscheme %s' % font_scheme)
    if fonts['roman'] == 'cmr':
        document.preamble.append('\\renewcommand{\\rmdefault}{cmr}')
        if font_osf == 'true':
            document.preamble.append('\\usepackage{eco}')
            font_osf = 'false'
    if fonts['roman'] in ('lmodern', 'charter', 'utopia', 'beraserif', 'ccfonts', 'chancery'):
        document.preamble.append('\\usepackage{%s}' % fonts['roman'])
    if fonts['sans'] in ('cmss', 'lmss', 'cmbr'):
        document.preamble.append('\\renewcommand{\\sfdefault}{%s}' % fonts['sans'])
    # This used to iterate over the characters of the string 'berasans',
    # so the package was never loaded.
    if fonts['sans'] in ('berasans',):
        document.preamble.append('\\usepackage{%s}' % fonts['sans'])
    if fonts['typewriter'] in ('cmtt', 'lmtt', 'cmtl'):
        document.preamble.append('\\renewcommand{\\ttdefault}{%s}' % fonts['typewriter'])
    if fonts['typewriter'] in ('courier', 'beramono', 'luximono'):
        document.preamble.append('\\usepackage{%s}' % fonts['typewriter'])
    if font_default_family != 'default':
        document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family)
    if font_osf == 'true':
        document.warning("Ignoring `\\font_osf = true'")
def revert_booktabs(document):
    """Remove the booktabs flag and the extra row spacing from all tables.

    We remove the booktabs flag or everything else will become a mess.
    """
    re_features = re.compile(r'^<features.* booktabs="true".*>$')
    re_row = re.compile(r'^<row.*space="[^"]+".*>$')
    re_tspace = re.compile(r'\s+topspace="[^"]+"')
    re_bspace = re.compile(r'\s+bottomspace="[^"]+"')
    re_ispace = re.compile(r'\s+interlinespace="[^"]+"')
    i = 0
    while 1:
        i = find_token(document.body, "\\begin_inset Tabular", i)
        if i == -1:
            return
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            document.warning("Malformed LyX document: Could not find end of tabular.")
            # Step past this inset; without this the same line would be
            # found again and the loop would never terminate.
            i = i + 1
            continue
        for k in range(i, j):
            if re_features.search(document.body[k]):
                document.warning("Converting 'booktabs' table to normal table.")
                document.body[k] = document.body[k].replace(' booktabs="true"', '')
            if re_row.search(document.body[k]):
                document.warning("Removing extra row space.")
                document.body[k] = re_tspace.sub('', document.body[k])
                document.body[k] = re_bspace.sub('', document.body[k])
                document.body[k] = re_ispace.sub('', document.body[k])
        i = i + 1
def convert_multiencoding(document, forward):
    """ Fix files with multiple encodings.

    Files with an inputencoding of "auto" or "default" and multiple languages
    where at least two languages have different default encodings are encoded
    in multiple encodings for file formats < 249. These files are incorrectly
    read and written (as if the whole file was in the encoding of the main
    language).
    This is not true for files written by CJK-LyX, they are always in the
    locale encoding.

    This function
    - converts from fake unicode values to true unicode if forward is true, and
    - converts from true unicode values to fake unicode if forward is false.

    document.encoding must be set to the old value (format 248) in both cases.

    We do this here and not in LyX.py because it is far easier to do the
    necessary parsing in modern formats than in ancient ones.
    """
    if document.cjk_encoding != '':
        return
    if document.inputencoding != "auto" and document.inputencoding != "default":
        return

    # The last entry is the encoding in effect for the current line; one
    # entry is pushed per open layout.
    encoding_stack = [document.encoding]
    lang_re = re.compile(r"^\\lang\s(\S+)")
    for i in range(len(document.body)):
        lang_match = lang_re.match(document.body[i])
        if lang_match:
            language = lang_match.group(1)
            if language == "default":
                new_encoding = document.encoding
                document.warning("Resetting encoding from %s to %s." % (encoding_stack[-1], new_encoding), 3)
            else:
                from lyx2lyx_lang import lang
                new_encoding = lang[language][3]
                document.warning("Setting encoding from %s to %s." % (encoding_stack[-1], new_encoding), 3)
            encoding_stack[-1] = new_encoding
        elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
            document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
            encoding_stack.append(encoding_stack[-1])
        elif find_token(document.body, "\\end_layout", i, i + 1) == i:
            document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
            del encoding_stack[-1]

        line_encoding = encoding_stack[-1]
        if line_encoding == document.encoding:
            continue
        if forward:
            # The line was read as if it was encoded in document.encoding:
            # get the 8bit string back and decode it correctly.
            wrong, right = document.encoding, line_encoding
        else:
            # Produce the 8bit string in the correct encoding and turn it
            # into the fake unicode that is written with document.encoding.
            wrong, right = line_encoding, document.encoding
        raw = document.body[i].encode(wrong)
        document.body[i] = raw.decode(right)
def convert_utf8(document):
    """Switch the document encoding to UTF-8.

    The body is first fixed by convert_multiencoding() in forward
    direction, while document.encoding still holds the old value.
    """
    convert_multiencoding(document, forward=True)
    new_encoding = "utf8"
    document.encoding = new_encoding
def revert_utf8(document):
    r"""Set the document encoding to the value matching \inputencoding.

    A missing or "utf8" \inputencoding header setting becomes "auto".
    document.inputencoding and document.encoding are then updated (the
    latter through get_encoding() for format 248) before the body is
    converted back with convert_multiencoding().
    """
    token = "\\inputencoding"
    fallback = "\\inputencoding auto"
    pos = find_token(document.header, token, 0)
    if pos == -1:
        document.header.append(fallback)
    elif get_value(document.header, token, pos) == "utf8":
        document.header[pos] = fallback
    document.inputencoding = get_value(document.header, token, 0)
    document.encoding = get_encoding(document.language,
                                     document.inputencoding,
                                     248,
                                     document.cjk_encoding)
    convert_multiencoding(document, forward=False)
def revert_cs_label(document):
    r"""Remove the status flag ('show_label') of charstyle insets.

    For each CharStyle inset the first following line starting with
    'show_label' is deleted.  A warning is issued if \begin_layout, or the
    end of the document, is reached before such a line is found.
    """
    i = 0
    while 1:
        i = find_token(document.body, "\\begin_inset CharStyle", i)
        if i == -1:
            return
        # Search for a line starting with 'show_label'.
        # If it is not there, stop with a warning message.
        i = i + 1
        while i < len(document.body):
            if document.body[i][:10] == "show_label":
                del document.body[i]
                break
            elif document.body[i][:13] == "\\begin_layout":
                document.warning("Malformed LyX document: Missing 'show_label'.")
                break
            i = i + 1
        else:
            # Ran off the end of the document (truncated inset); this used
            # to raise an IndexError.
            document.warning("Malformed LyX document: Missing 'show_label'.")
            return
        i = i + 1
def convert_bibitem(document):
    r""" Convert

    \bibitem [option]{argument}

    to

    \begin_inset LatexCommand bibitem
    label "option"
    key "argument"

    \end_inset

    This must be called after convert_commandparams.
    Lines starting with \bibitem that cannot be parsed are left unchanged
    and reported with document.warning().
    """
    regex = re.compile(r'\S+\s*(\[[^\[\{]*\])?(\{[^}]*\})')
    i = 0
    while 1:
        i = find_token(document.body, "\\bibitem", i)
        if i == -1:
            break
        match = regex.match(document.body[i])
        if match is None:
            # No {argument} on this line: skip it instead of failing with
            # an AttributeError on the missing match object.
            document.warning("Malformed LyX document: Cannot parse `%s'." % document.body[i])
            i = i + 1
            continue
        option = match.group(1)
        argument = match.group(2)
        lines = ['\\begin_inset LatexCommand bibitem']
        if option is not None:
            # [1:-1] strips the surrounding brackets resp. braces.
            lines.append('label "%s"' % option[1:-1].replace('"', '\\"'))
        lines.append('key "%s"' % argument[1:-1].replace('"', '\\"'))
        lines.append('')
        lines.append('\\end_inset')
        document.body[i:i+1] = lines
        i = i + 1
# Parameter names of the command insets:
#     command : [option1, option2, argument]
# An empty name means that the command does not have this parameter.
commandparams_info = {
    "bibitem":         ["label", "", "key"],
    "bibtex":          ["options", "btprint", "bibfiles"],
    "hfill":           ["", "", ""],
    "index":           ["", "", "name"],
    "printindex":      ["", "", "name"],
    "label":           ["", "", "name"],
    "tableofcontents": ["", "", "type"],
    "htmlurl":         ["name", "", "target"],
    "url":             ["name", "", "target"],
}
# All citation commands use the same parameter names.
for _command in ("cite", "citet", "citep", "citealt", "citealp",
                 "citeauthor", "citeyear", "citeyearpar",
                 "citet*", "citep*", "citealt*", "citealp*", "citeauthor*",
                 "Citet", "Citep", "Citealt", "Citealp", "Citeauthor",
                 "Citet*", "Citep*", "Citealt*", "Citealp*", "Citeauthor*",
                 "citefield", "citetitle", "cite*"):
    commandparams_info[_command] = ["after", "before", "key"]
# So do all cross reference commands.
for _command in ("eqref", "pageref", "prettyref", "ref", "vpageref", "vref"):
    commandparams_info[_command] = ["name", "", "reference"]
del _command
def convert_commandparams(document):
    r""" Convert

    \begin_inset LatexCommand \cmdname[opt1][opt2]{arg}
    \end_inset

    to

    \begin_inset LatexCommand cmdname
    name1 "opt1"
    name2 "opt2"
    name3 "arg"
    \end_inset

    name1, name2 and name3 can be different for each command; they are
    taken from commandparams_info.  Parameters a command does not have are
    dropped with a warning.  A command that is not listed in
    commandparams_info is treated as a command without parameters.
    """
    # \begin_inset LatexCommand bibitem was not the official version (see
    # convert_bibitem()), but could be read in, so we convert it here, too.
    i = 0
    while 1:
        i = find_token(document.body, "\\begin_inset LatexCommand", i)
        if i == -1:
            break
        # Everything after "\begin_inset LatexCommand " (26 characters).
        command = document.body[i][26:].strip()
        if command == "":
            document.warning("Malformed LyX document: Missing LatexCommand name.")
            i = i + 1
            continue

        # The following parser is taken from the original InsetCommandParams::scanCommand
        name = ""
        option1 = ""
        option2 = ""
        argument = ""
        state = "WS"
        # Used to handle things like \command[foo[bar]]{foo{bar}}
        nestdepth = 0
        b = 0
        for c in command:
            # The command name ends at the first blank, '[' or '{'.
            if state == "CMDNAME" and c in (' ', '[', '{'):
                state = "WS"
            # A closing delimiter either ends the current parameter or
            # closes a nested group inside it.
            if ((state == "OPTION" and c == ']') or
                (state == "SECOPTION" and c == ']') or
                (state == "CONTENT" and c == '}')):
                if nestdepth == 0:
                    state = "WS"
                else:
                    nestdepth = nestdepth - 1
            if ((state == "OPTION" and c == '[') or
                (state == "SECOPTION" and c == '[') or
                (state == "CONTENT" and c == '{')):
                nestdepth = nestdepth + 1
            if state == "CMDNAME":
                name += c
            elif state == "OPTION":
                option1 += c
            elif state == "SECOPTION":
                option2 += c
            elif state == "CONTENT":
                argument += c
            elif state == "WS":
                if c == '\\':
                    state = "CMDNAME"
                elif c == '[' and b != ']':
                    state = "OPTION"
                    nestdepth = 0 # Just to be sure
                elif c == '[' and b == ']':
                    # '[' directly after ']' starts the second option.
                    state = "SECOPTION"
                    nestdepth = 0 # Just to be sure
                elif c == '{':
                    state = "CONTENT"
                    nestdepth = 0 # Just to be sure
            # Remember the previous character.
            b = c

        # Now we have parsed the command, output the parameters
        info = commandparams_info.get(name)
        if info is None:
            # Looking up an unknown command used to raise a KeyError.
            document.warning("Unknown LatexCommand `%s', treating it as a command without parameters." % name)
            info = ["", "", ""]
        lines = ["\\begin_inset LatexCommand %s" % name]
        parsed = ((option1, info[0], "option"),
                  (option2, info[1], "second option"),
                  (argument, info[2], "argument"))
        for value, pname, what in parsed:
            if value == "":
                continue
            if pname == "":
                document.warning("Ignoring invalid %s `%s' of command `%s'." % (what, value, name))
            else:
                lines.append('%s "%s"' % (pname, value.replace('"', '\\"')))
        document.body[i:i+1] = lines
        i = i + 1
def revert_commandparams(document):
    r""" Convert

    \begin_inset LatexCommand cmdname
    name1 "opt1"
    name2 "opt2"
    name3 "arg"
    \end_inset

    back to

    \begin_inset LatexCommand \cmdname[opt1][opt2]{arg}
    \end_inset

    The parameter names are taken from commandparams_info.  bibitem insets
    are written as a plain `\bibitem [label]{key}' line.  Insets without a
    name or without \end_inset are left unchanged and reported with
    document.warning().
    """
    regex = re.compile(r'(\S+)\s+(.+)')
    i = 0
    while 1:
        i = find_token(document.body, "\\begin_inset LatexCommand", i)
        if i == -1:
            break
        fields = document.body[i].split()
        if len(fields) < 3:
            document.warning("Malformed LyX document: Missing LatexCommand name.")
            i = i + 1
            continue
        name = fields[2]
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # Without this check body[i:j+1] is an empty slice and i is
            # reset to 0, so the loop would never terminate.
            document.warning("Malformed LyX document: Could not find end of command inset %s." % name)
            i = i + 1
            continue
        info = commandparams_info.get(name)
        if info is None:
            # Looking up an unknown command used to raise a KeyError.
            document.warning("Unknown command inset `%s', treating it as a command without parameters." % name)
            info = ["", "", ""]
        preview_line = ""
        option1 = ""
        option2 = ""
        argument = ""
        for k in range(i + 1, j):
            match = regex.match(document.body[k])
            if match:
                pname = match.group(1)
                pvalue = match.group(2)
                if pname == "preview":
                    preview_line = document.body[k]
                elif info[0] != "" and pname == info[0]:
                    option1 = pvalue.strip('"').replace('\\"', '"')
                elif info[1] != "" and pname == info[1]:
                    option2 = pvalue.strip('"').replace('\\"', '"')
                elif info[2] != "" and pname == info[2]:
                    argument = pvalue.strip('"').replace('\\"', '"')
            elif document.body[k].strip() != "":
                document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
        if name == "bibitem":
            if option1 == "":
                lines = ["\\bibitem {%s}" % argument]
            else:
                lines = ["\\bibitem [%s]{%s}" % (option1, argument)]
        else:
            if option1 == "":
                if option2 == "":
                    lines = ["\\begin_inset LatexCommand \\%s{%s}" % (name, argument)]
                else:
                    # An empty first option is needed to mark the second one.
                    lines = ["\\begin_inset LatexCommand \\%s[][%s]{%s}" % (name, option2, argument)]
            else:
                if option2 == "":
                    lines = ["\\begin_inset LatexCommand \\%s[%s]{%s}" % (name, option1, argument)]
                else:
                    lines = ["\\begin_inset LatexCommand \\%s[%s][%s]{%s}" % (name, option1, option2, argument)]
            if preview_line != "":
                lines.append(preview_line)
            lines.append('')
            lines.append('\\end_inset')
        document.body[i:j+1] = lines
        # Continue right behind the replacement.  The old inset may have
        # been longer than the new one, so j + 1 could skip following lines.
        i = i + len(lines)
def revert_nomenclature(document):
    r"""Convert nomenclature entries to ERT.

    Each nomenclature command inset is replaced by an ERT inset containing
    \nomenclature[prefix]{symbol}{description} (the prefix is omitted if it
    is empty).  If at least one entry was found, the nomencl package and
    \makenomenclature are added to the preamble unless the package line is
    already there.
    """
    regex = re.compile(r'(\S+)\s+(.+)')
    i = 0
    use_nomencl = 0
    while 1:
        i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i)
        if i == -1:
            break
        use_nomencl = 1
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # Without this check the ERT would be inserted in front of the
            # inset and the same inset would be found again forever.
            document.warning("Malformed LyX document: Could not find end of nomenclature inset.")
            i = i + 1
            continue
        symbol = ""
        description = ""
        prefix = ""
        for k in range(i + 1, j):
            match = regex.match(document.body[k])
            if match:
                name = match.group(1)
                value = match.group(2)
                if name == "preview":
                    # The preview setting has no counterpart in ERT.
                    pass
                elif name == "symbol":
                    symbol = value.strip('"').replace('\\"', '"')
                elif name == "description":
                    description = value.strip('"').replace('\\"', '"')
                elif name == "prefix":
                    prefix = value.strip('"').replace('\\"', '"')
            elif document.body[k].strip() != "":
                document.warning("Ignoring unknown contents `%s' in nomenclature inset." % document.body[k])
        if prefix == "":
            command = 'nomenclature{%s}{%s}' % (symbol, description)
        else:
            command = 'nomenclature[%s]{%s}{%s}' % (prefix, symbol, description)
        ert = ['\\begin_inset ERT',
               'status collapsed',
               '',
               '\\begin_layout %s' % document.default_layout,
               '',
               '',
               '\\backslash',
               command,
               '\\end_layout',
               '',
               '\\end_inset']
        document.body[i:j+1] = ert
        i = i + len(ert)
    if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
        document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
        document.preamble.append('\\makenomenclature')
def revert_printnomenclature(document):
    r"""Convert printnomenclature insets to ERT.

    Each printnomenclature command inset is replaced by an ERT inset
    containing \printnomenclature, with the label width as optional
    argument if one is set.  If at least one inset was found, the nomencl
    package and \makenomenclature are added to the preamble unless the
    package line is already there.
    """
    regex = re.compile(r'(\S+)\s+(.+)')
    i = 0
    use_nomencl = 0
    while 1:
        i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i)
        if i == -1:
            break
        use_nomencl = 1
        j = find_end_of_inset(document.body, i + 1)
        if j == -1:
            # Without this check the ERT would be inserted in front of the
            # inset and the same inset would be found again forever.
            document.warning("Malformed LyX document: Could not find end of printnomenclature inset.")
            i = i + 1
            continue
        labelwidth = ""
        for k in range(i + 1, j):
            match = regex.match(document.body[k])
            if match:
                name = match.group(1)
                value = match.group(2)
                if name == "preview":
                    # The preview setting has no counterpart in ERT.
                    pass
                elif name == "labelwidth":
                    labelwidth = value.strip('"').replace('\\"', '"')
            elif document.body[k].strip() != "":
                document.warning("Ignoring unknown contents `%s' in printnomenclature inset." % document.body[k])
        # This used to emit \nomenclature, which is the command for a
        # single entry and not the one that prints the list.
        if labelwidth == "":
            command = 'printnomenclature{}'
        else:
            command = 'printnomenclature[%s]' % labelwidth
        ert = ['\\begin_inset ERT',
               'status collapsed',
               '',
               '\\begin_layout %s' % document.default_layout,
               '',
               '',
               '\\backslash',
               command,
               '\\end_layout',
               '',
               '\\end_inset']
        document.body[i:j+1] = ert
        i = i + len(ert)
    if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
        document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
        document.preamble.append('\\makenomenclature')
def convert_esint(document):
    r"""Add the \use_esint setting to the header.

    The new line is inserted in front of \cite_engine with the value 0
    (0 is off, 1 is auto, 2 is on).
    """
    anchor = find_token(document.header, "\\cite_engine", 0)
    if anchor != -1:
        document.header.insert(anchor, '\\use_esint 0')
        return
    document.warning("Malformed LyX document: Missing `\\cite_engine'.")
def revert_esint(document):
    r"""Remove the \use_esint setting from the header.

    If the setting was 2 (on), the esint package is loaded in the preamble
    instead.
    """
    i = find_token(document.header, "\\use_esint", 0)
    if i == -1:
        document.warning("Malformed LyX document: Missing `\\use_esint'.")
        return
    fields = document.header[i].split()
    del document.header[i]
    # 0 is off, 1 is auto, 2 is on.
    # The value is read as a string; comparing it with the integer 2 was
    # never true, so the package was never added.
    if len(fields) > 1 and fields[1] == '2':
        document.preamble.append('\\usepackage{esint}')
def revert_clearpage(document):
    r"""Replace every \clearpage line by an ERT inset (clearpage -> ERT)."""
    pos = find_token(document.body, "\\clearpage", 0)
    while pos != -1:
        ert = ['\\begin_inset ERT',
               'status collapsed',
               '',
               '\\begin_layout %s' % document.default_layout,
               '',
               '',
               '\\backslash',
               'clearpage',
               '\\end_layout',
               '',
               '\\end_inset']
        document.body[pos:pos+1] = ert
        pos = find_token(document.body, "\\clearpage", pos + 1)
def revert_cleardoublepage(document):
    r"""Replace every \cleardoublepage line by an ERT inset
    (cleardoublepage -> ERT)."""
    pos = find_token(document.body, "\\cleardoublepage", 0)
    while pos != -1:
        ert = ['\\begin_inset ERT',
               'status collapsed',
               '',
               '\\begin_layout %s' % document.default_layout,
               '',
               '',
               '\\backslash',
               'cleardoublepage',
               '\\end_layout',
               '',
               '\\end_inset']
        document.body[pos:pos+1] = ert
        pos = find_token(document.body, "\\cleardoublepage", pos + 1)
def convert_lyxline(document):
    r"""Remove the fontsize commands in front of \lyxline.

    A `\size <name>' line is deleted if the next \lyxline is found exactly
    two lines after it.  All other \size lines are left alone.
    """
    # The problem is: The old \lyxline definition doesn't use the fontsize
    # to change the line thickness. The new definition does, so that imported
    # \lyxlines would have a different line thickness. The eventual fontsize
    # command before \lyxline is therefore removed to get the same output.
    fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize",
                 "large", "Large", "LARGE", "huge", "Huge"]
    for fontsize in fontsizes:
        size_token = "\\size " + fontsize
        i = 0
        while 1:
            i = find_token(document.body, size_token, i)
            if i == -1:
                break
            k = find_token(document.body, "\\lyxline", i)
            if k == -1:
                # No \lyxline is left behind this position.
                break
            # the corresponding fontsize command is always 2 lines before the \lyxline
            if k == i + 2:
                del document.body[i]
            else:
                # This size command belongs to ordinary text.  Keep looking;
                # stopping here would miss every later \lyxline that uses
                # the same size.
                i = i + 1
def revert_encodings(document):
    r"""Set new encodings to auto.

    A missing \inputencoding header setting, or one that names one of the
    encodings listed below, becomes "auto".  document.inputencoding is
    updated from the header afterwards.
    """
    new_encodings = ("8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852",
                     "cp855", "cp858", "cp862", "cp865", "cp866", "cp1250",
                     "cp1252", "cp1256", "cp1257", "latin10", "pt254", "tis620-0")
    token = "\\inputencoding"
    pos = find_token(document.header, token, 0)
    if pos == -1:
        document.header.append("\\inputencoding auto")
    elif get_value(document.header, token, pos) in new_encodings:
        document.header[pos] = "\\inputencoding auto"
    document.inputencoding = get_value(document.header, token, 0)
def convert_caption(document):
    """Convert caption layouts to caption insets.

    The Caption paragraph becomes a paragraph in the default layout that
    contains a Caption inset, which in turn holds the old paragraph
    contents in a default layout paragraph.
    """
    pos = find_token(document.body, "\\begin_layout Caption", 0)
    while pos != -1:
        end = find_end_of_layout(document.body, pos)
        if end == -1:
            document.warning("Malformed LyX document: Missing `\\end_layout'.")
            return
        # Close the inner paragraph and the inset in front of the old
        # \end_layout, which now closes the outer paragraph.  This is done
        # first so that pos stays valid.
        document.body[end:end] = ["\\end_layout", "", "\\end_inset", "", ""]
        plain_layout = "\\begin_layout %s" % document.default_layout
        document.body[pos:pos+1] = [plain_layout,
                                    "\\begin_inset Caption", "",
                                    plain_layout]
        pos = find_token(document.body, "\\begin_layout Caption", pos + 1)
def revert_caption(document):
    """Convert caption insets to caption layouts.

    This assumes that the text class has a caption style.
    """
    def only_empty_lines(first, stop):
        "True if document.body[first:stop] contains nothing but empty lines."
        for n in range(first, stop):
            if document.body[n] != "":
                return False
        return True

    i = 0
    while 1:
        i = find_token(document.body, "\\begin_inset Caption", i)
        if i == -1:
            return

        # We either need to delete the previous \begin_layout line, or we
        # need to end the previous layout if this inset is not in the first
        # position of the paragraph.
        layout_before = find_token_backwards(document.body, "\\begin_layout", i)
        if layout_before == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        layout_line = document.body[layout_before]
        if only_empty_lines(layout_before + 1, i):
            del document.body[layout_before:i]
            i = layout_before
        else:
            document.body[i:i] = ["\\end_layout", ""]
            i = i + 2

        # Find start of layout in the inset and end of inset
        j = find_token(document.body, "\\begin_layout", i)
        if j == -1:
            document.warning("Malformed LyX document: Missing `\\begin_layout'.")
            return
        k = find_end_of_inset(document.body, i)
        if k == -1:
            document.warning("Malformed LyX document: Missing `\\end_inset'.")
            return

        # We either need to delete the following \end_layout line, or we need
        # to restart the old layout if this inset is not at the paragraph end.
        layout_after = find_token(document.body, "\\end_layout", k)
        if layout_after == -1:
            document.warning("Malformed LyX document: Missing `\\end_layout'.")
            return
        if only_empty_lines(k + 1, layout_after):
            del document.body[k+1:layout_after+1]
        else:
            document.body[k+1:k+1] = [layout_line, ""]

        # delete \begin_layout and \end_inset and replace \begin_inset with
        # "\begin_layout Caption". This works because we can only have one
        # paragraph in the caption inset: The old \end_layout will be recycled.
        # The deletions run from back to front so that j and i stay valid.
        for obsolete in (k, j):
            del document.body[obsolete]
            if document.body[obsolete] == "":
                del document.body[obsolete]
        document.body[i] = "\\begin_layout Caption"
        if document.body[i+1] == "":
            del document.body[i+1]
        i = i + 1
# Accents of InsetLaTeXAccent: LaTeX accent command -> combining character
accent_map = {
    "`":  u'\u0300',  # grave
    "'":  u'\u0301',  # acute
    "^":  u'\u0302',  # circumflex
    "~":  u'\u0303',  # tilde
    "=":  u'\u0304',  # macron
    "u":  u'\u0306',  # breve
    ".":  u'\u0307',  # dot above
    "\"": u'\u0308',  # diaeresis
    "r":  u'\u030a',  # ring above
    "H":  u'\u030b',  # double acute
    "v":  u'\u030c',  # caron
    "b":  u'\u0320',  # minus sign below
    "d":  u'\u0323',  # dot below
    "c":  u'\u0327',  # cedilla
    "k":  u'\u0328',  # ogonek
    # The tie is special: It spans two characters, but only one is given as
    # argument, so we don't need to treat it differently.
    "t":  u'\u0361',  # tie
}

# special accents of InsetLaTeXAccent without argument
special_accent_map = {
    'i': u'\u0131',  # dotless i
    'j': u'\u0237',  # dotless j
    'l': u'\u0142',  # l with stroke
    'L': u'\u0141',  # L with stroke
}

# special accent arguments of InsetLaTeXAccent
accented_map = {
    '\\i': u'\u0131',  # dotless i
    '\\j': u'\u0237',  # dotless j
}
def _convert_accent(accent, accented_char):
    """Return the unicode text for an InsetLaTeXAccent.

    accent is the accent command without the backslash, accented_char the
    argument of the command.  An empty string is returned if the
    combination cannot be converted.
    """
    kind = accent
    base = accented_char
    if base == '':
        if kind in special_accent_map:
            return special_accent_map[kind]
        # a missing char is treated as space by LyX
        base = ' '
    elif kind == 'q' and base in ('t', 'd', 'l', 'L'):
        # Special caron, only used with t, d, l and L.
        # It is not in the map because we convert it to the same unicode
        # character as the normal caron: \q{} is only defined if babel with
        # the czech or slovak language is used, and the normal caron
        # produces the correct output if the T1 font encoding is used.
        # For the same reason we never convert to \q{} in the other direction.
        kind = 'v'
    elif base in accented_map:
        base = accented_map[base]
    elif len(base) > 1:
        # We can only convert accents on a single char
        return ''

    combining = accent_map.get(kind)
    if not combining:
        return ''
    return unicodedata.normalize("NFKC", "%s%s" % (base, combining))
def convert_ertbackslash(body, i, ert, default_layout):
    r"""Append ert to body[i] as valid ERT code.

    Every backslash becomes a `\backslash ' token followed by a new line,
    and every '\n' starts a new paragraph with layout default_layout.  All
    other characters are appended to the current line.
    Returns the (maybe incremented) line index i.
    """
    new_paragraph = ['\\end_layout', '',
                     '\\begin_layout %s' % default_layout, '']
    for char in ert:
        if char == '\n':
            # Continue on the empty line that ends the inserted lines.
            body[i+1:i+1] = new_paragraph
            i += len(new_paragraph)
        elif char == '\\':
            body[i] += '\\backslash '
            i += 1
            body.insert(i, '')
        else:
            body[i] += char
    return i
def convert_accent(document):
    r"""Convert InsetLaTeXAccent to unicode characters.

    The inset is replaced by the result of _convert_accent().  If the
    accent cannot be converted, it is put into an ERT inset instead and a
    warning is issued.
    """
    # The following forms are supported by LyX:
    # '\i \"{a}' (standard form, as written by LyX)
    # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
    # '\i \"{ }' (also accepted if the accented char is a space)
    # '\i \" a' (also accepted)
    # '\i \"' (also accepted)
    re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
    re_contents = re.compile(r'^([^\s{]+)(.*)$')
    re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
    i = 0
    while 1:
        i = find_re(document.body, re_wholeinset, i)
        if i == -1:
            return
        match = re_wholeinset.match(document.body[i])
        prefix = match.group(1)
        contents = match.group(3).strip()
        match = re_contents.match(contents)
        if match:
            # Strip first char (always \)
            accent = match.group(1)[1:]
            accented_contents = match.group(2).strip()
            match = re_accentedcontents.match(accented_contents)
            if match is None:
                # The argument contains nested braces.  Put it into ERT
                # unchanged; this used to fail with an AttributeError.
                contents = accent + accented_contents
            else:
                accented_char = match.group(1)
                converted = _convert_accent(accent, accented_char)
                if converted != '':
                    document.body[i] = '%s%s' % (prefix, converted)
                    i += 1
                    continue
                # Normalize contents.  (A stray trailing comma used to turn
                # this into a tuple.)
                contents = '%s{%s}' % (accent, accented_char)
        document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
        document.body[i] = prefix
        document.body[i+1:i+1] = ['\\begin_inset ERT',
                                  'status collapsed',
                                  '',
                                  '\\begin_layout %s' % document.default_layout,
                                  '',
                                  '',
                                  '']
        # i + 7 is the last (empty) line inserted above.
        i = convert_ertbackslash(document.body, i + 7,
                                 '\\%s' % contents,
                                 document.default_layout)
        document.body[i+1:i+1] = ['\\end_layout',
                                  '',
                                  '\\end_inset']
        i += 3
def revert_accent(document):
    r"""Convert accented characters back to InsetLaTeXAccent.

    Only characters that cannot be represented in the encoding of the
    reverted document are replaced by `\i \<accent>{<char>}'.  The inset
    ends its line: the text that followed the character is moved to a new
    line behind it.
    """
    inverse_accent_map = {}
    for k in accent_map:
        inverse_accent_map[accent_map[k]] = k
    inverse_special_accent_map = {}
    for k in special_accent_map:
        inverse_special_accent_map[special_accent_map[k]] = k
    inverse_accented_map = {}
    for k in accented_map:
        inverse_accented_map[accented_map[k]] = k

    # Since LyX may insert a line break within a word we must combine all
    # words before unicode normalization.
    # We do this only if the next line starts with an accent, otherwise we
    # would create things like '\begin_inset ERTstatus'.
    numberoflines = len(document.body)
    for i in range(numberoflines - 1):
        if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
            continue
        if document.body[i+1][0] in inverse_accent_map:
            # The last character of this line and the first of the next line
            # probably belong together (base character + combining accent):
            # move the rest of the word up.
            while len(document.body[i+1]) > 0 and document.body[i+1][0] != ' ':
                document.body[i] += document.body[i+1][0]
                document.body[i+1] = document.body[i+1][1:]

    # Decompose all characters. This is needed to catch all accented
    # characters.
    # NOTE(review): the original comments speak of NFD and NFC, but the
    # code uses the compatibility forms NFKD and NFKC - confirm which one
    # is intended.
    for i in range(numberoflines):
        # Unfortunately we have a mixture of unicode strings and plain strings,
        # because we never use u'xxx' for string literals, but 'xxx'.
        # Therefore we may have to try two times to normalize the data.
        try:
            document.body[i] = unicodedata.normalize("NFKD", document.body[i])
        except TypeError:
            document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))

    # Replace accented characters with InsetLaTeXAccent
    # Do not convert characters that can be represented in the chosen
    # encoding.
    encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
    lang_re = re.compile(r"^\\lang\s(\S+)")
    # NOTE(review): convert_multiencoding() tracks the encoding only if
    # cjk_encoding is empty, here the test is the other way round - confirm.
    track_encoding = ((document.inputencoding == "auto" or document.inputencoding == "default")
                      and document.cjk_encoding != '')
    # The body grows while we work on it, so its length has to be checked
    # in every iteration.
    i = 0
    while i < len(document.body):
        if track_encoding:
            # Track the encoding of the current line
            result = lang_re.match(document.body[i])
            if result:
                language = result.group(1)
                if language == "default":
                    encoding_stack[-1] = document.encoding
                else:
                    from lyx2lyx_lang import lang
                    encoding_stack[-1] = lang[language][3]
                i += 1
                continue
            elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
                encoding_stack.append(encoding_stack[-1])
                i += 1
                continue
            elif find_token(document.body, "\\end_layout", i, i + 1) == i:
                del encoding_stack[-1]
                i += 1
                continue
        line = document.body[i]
        for j in range(len(line)):
            # dotless i and dotless j are both in special_accent_map and can
            # occur as an accented character, so we need to test that the
            # following character is no accent
            if (line[j] in inverse_special_accent_map and
                (j == len(line) - 1 or line[j+1] not in inverse_accent_map)):
                accent = line[j]
                try:
                    accent.encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as ONE new line (a slice
                    # assignment of the string would add one line per
                    # character).
                    if j < len(line) - 1:
                        document.body.insert(i + 1, line[j+1:])
                    # Keep everything in front of the special character
                    # and add the InsetLaTeXAccent
                    document.body[i] = line[:j] + "\\i \\%s{}" % inverse_special_accent_map[accent]
                    break
            elif j > 0 and line[j] in inverse_accent_map:
                accented_char = line[j-1]
                if accented_char == ' ':
                    # Conform to LyX output
                    accented_char = ''
                elif accented_char in inverse_accented_map:
                    accented_char = inverse_accented_map[accented_char]
                accent = line[j]
                try:
                    unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
                except UnicodeEncodeError:
                    # Insert the rest of the line as ONE new line
                    if j < len(line) - 1:
                        document.body.insert(i + 1, line[j+1:])
                    # Keep everything in front of the accented character
                    # (base character at j-1, accent at j) and add the
                    # InsetLaTeXAccent
                    document.body[i] = line[:j-1] + "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
                    break
        i += 1

    # Compose the characters again, including the lines added above
    for i in range(len(document.body)):
        document.body[i] = unicodedata.normalize("NFKC", document.body[i])
def normalize_font_whitespace(document):
    """ Before format 259 the font changes were ignored if a
    whitespace was the first or last character in the sequence, this function
    transfers the whitespace outside.

    document.body is edited in place.  Nothing is done unless
    document.backend is "latex".

    A font change on the first body line (no previous line) or on the last
    body line (no following line) is left alone instead of wrapping around
    to the other end of the body or raising IndexError."""

    if document.backend != "latex":
        return

    lines = document.body

    # Font properties handled here, mapped to the value that resets each one.
    char_properties = {"\\series": "default",
                       "\\emph": "default",
                       "\\color": "none",
                       "\\shape": "default",
                       "\\bar": "default",
                       "\\family": "default"}
    # Properties currently set to a non-reset value in the current paragraph.
    changes = {}

    i = 0
    while i < len(lines):
        words = lines[i].split()

        if len(words) > 0 and words[0] == "\\begin_layout":
            # a new paragraph resets all font changes
            changes.clear()

        elif len(words) > 1 and words[0] in char_properties:
            # we have a font change
            prop = words[0]
            reset_line = "%s %s" % (prop, char_properties[prop])
            if char_properties[prop] == words[1]:
                # property gets reset
                changes.pop(prop, None)
                defaultproperty = True
            else:
                # property gets set
                changes[prop] = words[1]
                defaultproperty = False

            # We need to explicitly reset all changed properties if we find
            # a space below, because LyX 1.4 would output the space after
            # closing the previous change and before starting the new one,
            # and closing a font change means to close all properties, not
            # just the changed one.

            if i > 0 and lines[i-1].endswith(" "):
                # a space before the font change
                lines[i-1] = lines[i-1][:-1]
                added_lines = _font_space_lines(changes, char_properties, prop)
                if defaultproperty:
                    # Property is reset in lines[i], so add the new stuff
                    # afterwards
                    lines[i+1:i+1] = added_lines
                else:
                    # Reset property for the space
                    added_lines.insert(0, reset_line)
                    lines[i:i] = added_lines
                # Skip over the lines that were just inserted
                i = i + len(added_lines)

            elif (i + 1 < len(lines) and lines[i+1].startswith(" ")
                  and (len(changes) > 0 or not defaultproperty)):
                # a space after the font change
                if lines[i+1] == " " and i + 2 < len(lines):
                    next_words = lines[i+2].split()
                    if len(next_words) > 0 and next_words[0] == prop:
                        # a single blank with a property different from the
                        # previous and the next line must not be changed
                        i = i + 2
                        continue
                lines[i+1] = lines[i+1][1:]
                added_lines = _font_space_lines(changes, char_properties, prop)
                # Reset property for the space
                added_lines.insert(0, reset_line)
                lines[i:i] = added_lines
                # Skip over the lines that were just inserted
                i = i + len(added_lines)

        i = i + 1


def _font_space_lines(changes, char_properties, skip):
    """Return the lines that emit a single space outside all font changes.

    The result is: one reset line for every property in changes other than
    skip, then the " " line, then one line restoring each of those
    properties to its value in changes.  Both groups are listed in reverse
    iteration order of changes, matching the order produced by repeatedly
    inserting at a fixed position."""
    others = [k for k in changes if k != skip]
    others.reverse()
    resets = ["%s %s" % (k, char_properties[k]) for k in others]
    restores = ["%s %s" % (k, changes[k]) for k in others]
    return resets + [" "] + restores
def revert_utf8x(document):
    """ Replace the utf8x input encoding by utf8.

    If the header has no \\inputencoding line, "\\inputencoding auto" is
    appended.  Afterwards document.inputencoding is re-read from the header. """
    pos = find_token(document.header, "\\inputencoding", 0)
    if pos != -1:
        current = get_value(document.header, "\\inputencoding", pos)
        if current == "utf8x":
            document.header[pos] = "\\inputencoding utf8"
    else:
        # No encoding line at all: add one with the value "auto"
        document.header.append("\\inputencoding auto")
    # Keep the cached attribute in sync with the (possibly changed) header
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
def convert_changes(document):
    """ Set \\output_changes to false when \\tracking_changes is false.

    A warning is issued via document.warning() and the header is left
    untouched if either of the two header lines is missing. """
    tracking_at = find_token(document.header, '\\tracking_changes', 0)
    if tracking_at == -1:
        document.warning("Malformed lyx document: Missing '\\tracking_changes'.")
        return

    output_at = find_token(document.header, '\\output_changes', 0)
    if output_at == -1:
        document.warning("Malformed lyx document: Missing '\\output_changes'.")
        return

    tracking_off = get_value(document.header, "\\tracking_changes", tracking_at) == "false"
    output_on = get_value(document.header, "\\output_changes", output_at) == "true"
    if tracking_off and output_on:
        # Only this one combination is rewritten
        document.header[output_at] = "\\output_changes false"
def revert_ascii(document):
    """ Replace the ascii input encoding by auto.

    If the header has no \\inputencoding line, "\\inputencoding auto" is
    appended.  Afterwards document.inputencoding is re-read from the header. """
    line_no = find_token(document.header, "\\inputencoding", 0)
    missing = (line_no == -1)
    if missing:
        document.header.append("\\inputencoding auto")
    elif get_value(document.header, "\\inputencoding", line_no) == "ascii":
        document.header[line_no] = "\\inputencoding auto"
    # Keep the cached attribute in sync with the (possibly changed) header
    document.inputencoding = get_value(document.header, "\\inputencoding", 0)
##
# Conversion hub
#
# Version strings associated with this module (its docstring says it targets
# the file format generated by lyx 1.5).
# NOTE(review): presumably matched by the lyx2lyx driver against the version
# found in a document -- confirm against the caller.
supported_versions = ["1.5.0","1.5"]
# Forward conversion table.  Each entry is [format number, [functions]];
# entries are listed in ascending order from 246 to 262 and an empty list
# means no function is registered for that number.
# NOTE(review): presumably the driver calls the listed functions, in order,
# with the document to reach that format number -- confirm against the caller.
convert = [[246, []],
[247, [convert_font_settings]],
[248, []],
[249, [convert_utf8]],
[250, []],
[251, []],
[252, [convert_commandparams, convert_bibitem]],
[253, []],
[254, [convert_esint]],
[255, []],
[256, []],
[257, [convert_caption]],
[258, [convert_lyxline]],
[259, [convert_accent, normalize_font_whitespace]],
[260, []],
[261, [convert_changes]],
[262, []]]
# Backward conversion table, same [format number, [functions]] layout, listed
# in descending order from 261 to 245.
# NOTE(review): presumably the counterpart of `convert` used when going back
# to an older format -- confirm against the caller.
revert = [[261, [revert_ascii]],
[260, []],
[259, [revert_utf8x]],
[258, []],
[257, []],
[256, [revert_caption]],
[255, [revert_encodings]],
[254, [revert_clearpage, revert_cleardoublepage]],
[253, [revert_esint]],
[252, [revert_nomenclature, revert_printnomenclature]],
[251, [revert_commandparams]],
[250, [revert_cs_label]],
[249, []],
[248, [revert_accent, revert_utf8]],
[247, [revert_booktabs]],
[246, [revert_font_settings]],
[245, [revert_framed]]]
if __name__ == "__main__":
    # Running this file directly does nothing; it only provides conversion
    # functions and the convert/revert tables.
    pass