From 166420d02ccb073dc32ab5cd0bc466de54aa36bb Mon Sep 17 00:00:00 2001
From: Georg Baum <baum@lyx.org>
Date: Sun, 12 Jun 2016 21:21:15 +0200
Subject: [PATCH] Make lyx2lyx infrastructure python3 ready

The LyX class now works with python 3. Certain file format conversions may
still fail (convert_multiencoding() is a likely candidate), but these will
need to be fixed in the individual modules.
---
 lib/lyx2lyx/LyX.py     | 135 ++++++++++++++++++++++++++++++++---------
 lib/lyx2lyx/lyx_1_5.py |   1 -
 lib/lyx2lyx/lyx_1_6.py |   4 --
 lib/lyx2lyx/lyx_2_2.py |  10 +--
 4 files changed, 108 insertions(+), 42 deletions(-)

diff --git a/lib/lyx2lyx/LyX.py b/lib/lyx2lyx/LyX.py
index d98d765667..6cbc6ffffa 100644
--- a/lib/lyx2lyx/LyX.py
+++ b/lib/lyx2lyx/LyX.py
@@ -28,6 +28,7 @@ import locale
 import sys
 import re
 import time
+import io
 
 try:
     import lyx2lyx_version
@@ -37,6 +38,10 @@ except: # we are running from build directory so assume the last version
 
 default_debug__ = 2
 
+# Provide support for both python 2 and 3
+PY2 = sys.version_info[0] == 2
+# End of code to support for both python 2 and 3
+
 ####################################################################
 # Private helper functions
 
@@ -158,6 +163,17 @@ def trim_eol(line):
         return line[:-1]
 
 
+def trim_eol_binary(line):
+    """Remove end of line char(s) from a bytes line.
+
+    Byte-string counterpart of trim_eol(): compare one-byte slices, since
+    indexing bytes yields ints under python 3 and a slice never equals an int.
+    """
+    if line[-1:] not in (b'\n', b'\r'):
+        return line  # May happen for the last line of a document
+    return line[:-2] if line[-2:-1] == b'\r' else line[:-1]
+
+
 def get_encoding(language, inputencoding, format, cjk_encoding):
     " Returns enconding of the lyx file"
     if format > 248:
@@ -200,7 +216,8 @@ class LyX_base:
         error: the name of the error file, if empty use the standard error.
         debug: debug level, O means no debug, as its value increases be more verbose.
         """
-        self.choose_io(input, output)
+        self.choose_input(input)
+        self.output = output
 
         if error:
             self.err = open(error, "w")
@@ -281,25 +298,41 @@ class LyX_base:
         self.body parts, from self.input."""
 
         # First pass: Read header to determine file encoding
+        # If we are running under python3 then all strings are binary in this
+        # pass. In some cases we need to convert binary to unicode in order to
+        # use our parser tools. Since we do not know the true encoding yet we
+        # use latin1. This works since a) the parts we are interested in are
+        # pure ASCII (a subset of latin1) and b) in contrast to pure ASCII or
+        # utf8, any 8-bit byte string can be decoded using latin1.
         while True:
             line = self.input.readline()
             if not line:
                 # eof found before end of header
                 self.error("Invalid LyX file: Missing body.")
 
-            line = trim_eol(line)
-            if check_token(line, '\\begin_preamble'):
+            if PY2:
+                line = trim_eol(line)
+                decoded = line
+            else:
+                line = trim_eol_binary(line)
+                decoded = line.decode('latin1')
+            if check_token(decoded, '\\begin_preamble'):
                 while 1:
                     line = self.input.readline()
                     if not line:
                         # eof found before end of header
                         self.error("Invalid LyX file: Missing body.")
 
-                    line = trim_eol(line)
-                    if check_token(line, '\\end_preamble'):
+                    if PY2:
+                        line = trim_eol(line)
+                        decoded = line
+                    else:
+                        line = trim_eol_binary(line)
+                        decoded = line.decode('latin1')
+                    if check_token(decoded, '\\end_preamble'):
                         break
 
-                    if line.split()[:0] in ("\\layout",
+                    if decoded.split()[:0] in ("\\layout",
                                             "\\begin_layout", "\\begin_body"):
 
                         self.warning("Malformed LyX file:"
@@ -309,33 +342,48 @@ class LyX_base:
 
                     self.preamble.append(line)
 
-            if check_token(line, '\\end_preamble'):
+            if check_token(decoded, '\\end_preamble'):
                 continue
 
             line = line.strip()
             if not line:
                 continue
 
-            if line.split()[0] in ("\\layout", "\\begin_layout",
+            if decoded.split()[0] in ("\\layout", "\\begin_layout",
                                    "\\begin_body", "\\begin_deeper"):
                 self.body.append(line)
                 break
 
             self.header.append(line)
 
-        i = find_token(self.header, '\\textclass', 0)
+        if PY2:
+            i = find_token(self.header, '\\textclass', 0)
+        else:
+            i = find_token(self.header, b'\\textclass', 0)
         if i == -1:
             self.warning("Malformed LyX file: Missing '\\textclass'.")
-            i = find_token(self.header, '\\lyxformat', 0) + 1
-            self.header[i:i] = ['\\textclass article']
+            if PY2:
+                i = find_token(self.header, '\\lyxformat', 0) + 1
+                self.header[i:i] = ['\\textclass article']
+            else:
+                i = find_token(self.header, b'\\lyxformat', 0) + 1
+                self.header[i:i] = [b'\\textclass article']
 
-        self.textclass = get_value(self.header, "\\textclass", 0)
-        self.backend = get_backend(self.textclass)
-        self.format  = self.read_format()
-        self.language = get_value(self.header, "\\language", 0,
-                                  default = "english")
-        self.inputencoding = get_value(self.header, "\\inputencoding",
-                                       0, default = "auto")
+        if PY2:
+            self.textclass = get_value(self.header, "\\textclass", 0,
+                                       default = "")
+            self.language = get_value(self.header, "\\language", 0,
+                                      default = "english")
+            self.inputencoding = get_value(self.header, "\\inputencoding", 0,
+                                           default = "auto")
+        else:
+            self.textclass = get_value(self.header, b"\\textclass", 0,
+                                       default = b"")
+            self.language = get_value(self.header, b"\\language", 0,
+                                      default = b"english").decode('ascii')
+            self.inputencoding = get_value(self.header, b"\\inputencoding", 0,
+                                           default = b"auto").decode('ascii')
+        self.format = self.read_format()
         self.encoding = get_encoding(self.language,
                                      self.inputencoding, self.format,
                                      self.cjk_encoding)
@@ -344,6 +392,7 @@ class LyX_base:
         # Second pass over header and preamble, now we know the file encoding
         # Do not forget the textclass (Debian bug #700828)
         self.textclass = self.textclass.decode(self.encoding)
+        self.backend = get_backend(self.textclass)
         for i in range(len(self.header)):
             self.header[i] = self.header[i].decode(self.encoding)
         for i in range(len(self.preamble)):
@@ -361,6 +410,7 @@ class LyX_base:
 
     def write(self):
         " Writes the LyX file to self.output."
+        self.choose_output(self.output)
         self.set_version()
         self.set_format()
         self.set_textclass()
@@ -375,29 +425,53 @@ class LyX_base:
             header = self.header
 
         for line in header + [''] + self.body:
-            self.output.write(line.encode(self.encoding)+"\n")
+            self.output.write(line+u"\n")
 
 
-    def choose_io(self, input, output):
-        """Choose input and output streams, dealing transparently with
+    def choose_output(self, output):
+        """Choose output streams dealing transparently with
         compressed files."""
 
-        if output:
-            self.output = open(output, "wb")
+        # This is a bit complicated, because we need to be compatible with both
+        # python 2 and python 3. Therefore we handle the encoding here, not when
+        # writing individual lines, and may need up to three layered file-like
+        # interfaces.
+        if self.compressed:
+            if output:
+                outputfileobj = open(output, 'wb')
+            else:
+                # We cannot use stdout directly since it needs text, not bytes, in python 3
+                outputfileobj = os.fdopen(sys.stdout.fileno(), 'wb')
+            # We cannot use gzip.open() since it is not supported by python 2
+            zipbuffer = gzip.GzipFile(mode='wb', fileobj=outputfileobj)
+            # We do not want to use different newlines on different OSes inside zipped files
+            self.output = io.TextIOWrapper(zipbuffer, encoding=self.encoding, newline='\n')
         else:
-            self.output = sys.stdout
+            if output:
+                self.output = io.open(output, 'w', encoding=self.encoding)
+            else:
+                self.output = io.open(sys.stdout.fileno(), 'w', encoding=self.encoding)
 
+
+    def choose_input(self, input):
+        """Choose input stream, dealing transparently with
+        compressed files."""
+
+        # Since we do not know the encoding yet we need to read the input as
+        # bytes in binary mode, and convert later to unicode.
         if input and input != '-':
             self.dir = os.path.dirname(os.path.abspath(input))
             try:
                 gzip.open(input).readline()
                 self.input = gzip.open(input)
-                self.output = gzip.GzipFile(mode="wb", fileobj=self.output)
+                self.compressed = True
             except:
-                self.input = open(input)
+                self.input = open(input, 'rb')
+                self.compressed = False
         else:
             self.dir = ''
-            self.input = sys.stdin
+            self.input = os.fdopen(sys.stdin.fileno(), 'rb')
+            self.compressed = False
 
 
     def lyxformat(self, format):
@@ -470,11 +544,14 @@ class LyX_base:
     def read_format(self):
         " Read from the header the fileformat of the present LyX file."
         for line in self.header:
-            result = fileformat.match(line)
+            if PY2:
+                result = fileformat.match(line)
+            else:
+                result = fileformat.match(line.decode('ascii'))
             if result:
                 return self.lyxformat(result.group(1))
         else:
-            self.error("Invalid LyX File.")
+            self.error("Invalid LyX File: Missing format.")
         return None
 
 
diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py
index 97adec9098..afacd26164 100644
--- a/lib/lyx2lyx/lyx_1_5.py
+++ b/lib/lyx2lyx/lyx_1_5.py
@@ -33,7 +33,6 @@ if not PY2:
     unichr = chr
 else:
     text_type = unicode
-    unichr = unichr
 # End of code to support for both python 2 and 3
 
 ####################################################################
diff --git a/lib/lyx2lyx/lyx_1_6.py b/lib/lyx2lyx/lyx_1_6.py
index f431d5fec1..3900dbf926 100644
--- a/lib/lyx2lyx/lyx_1_6.py
+++ b/lib/lyx2lyx/lyx_1_6.py
@@ -27,11 +27,7 @@ from parser_tools import find_token, find_end_of, find_tokens, get_value
 # Provide support for both python 2 and 3
 PY2 = sys.version_info[0] == 2
 if not PY2:
-    text_type = str
     unichr = chr
-else:
-    text_type = unicode
-    unichr = unichr
 # End of code to support for both python 2 and 3
 
 ####################################################################
diff --git a/lib/lyx2lyx/lyx_2_2.py b/lib/lyx2lyx/lyx_2_2.py
index 073f815cbb..519c8b5d9c 100644
--- a/lib/lyx2lyx/lyx_2_2.py
+++ b/lib/lyx2lyx/lyx_2_2.py
@@ -41,12 +41,6 @@ from parser_tools import find_token, find_token_backwards, find_re, \
 
 # Provide support for both python 2 and 3
 PY2 = sys.version_info[0] == 2
-if not PY2:
-    text_type = str
-    unichr = chr
-else:
-    text_type = unicode
-    unichr = unichr
 # End of code to support for both python 2 and 3
 
 ####################################################################
@@ -1164,8 +1158,8 @@ def convert_origin(document):
             origin = document.dir.replace('\\', '/') + '/'
         else:
             origin = os.path.join("/systemlyxdir", relpath).replace('\\', '/') + '/'
-        if os.name != 'nt':
-            origin = text_type(origin, sys.getfilesystemencoding())
+        if os.name != 'nt' and PY2:
+            origin = unicode(origin, sys.getfilesystemencoding())
     document.header[i:i] = ["\\origin " + origin]