# lyx_mirror/lib/scripts/docbook2epub.py

# file docbook2epub.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.
#
# \author Thibaut Cuvelier
#
# Full author contact details are available in file CREDITS

# Usage:
#   python docbook2epub.py java_binary saxon_path xsltproc_path xslt_path in.docbook in.orig.path out.epub


import glob
import os
import re
import shutil
import subprocess
import sys
import tempfile
import zipfile


def _parse_nullable_argument(arg):
    """Map the "no value" markers used on the command line to None.

    The empty string and the literal 'none' both yield None; any other value
    is returned unchanged.
    """
    if arg == '' or arg == 'none':
        return None
    return arg


class ImageRename:
    """Record describing where one image comes from and where it goes in the ePub.

    Attributes are plain strings:
      - opf_path: the path exactly as written in the generated package.opf;
      - local_path: the path of the image on the file system;
      - epub_path: the path of the image inside the ePub archive.
    """

    def __init__(self, opf_path, local_path, epub_path):
        self.opf_path, self.local_path, self.epub_path = opf_path, local_path, epub_path


class DocBookToEpub:
    """Convert a DocBook file into an ePub archive.

    The conversion runs an XSLT processor (xsltproc if configured, otherwise
    Saxon 6 through Java) on the DocBook ePub 3 style sheet, copies the images
    referenced by the generated package.opf into OEBPS/images/, rewrites the
    image paths accordingly, and finally zips the result.
    """

    def __init__(self, args=None):
        """Parse the arguments and prepare the temporary output folder.

        `args` has the same layout as sys.argv (and defaults to it):
        own path, Java binary, Saxon JAR, xsltproc binary, XSLT folder, input
        DocBook file, folder of the original document, output ePub file.
        The Java, Saxon, xsltproc and XSLT entries may be '' or 'none' to mean
        "not available". Exits the process with status 1 when the number of
        arguments is not exactly eight.
        """
        if args is None:
            args = sys.argv

        if len(args) != 8:
            print(f'Exactly eight arguments are expected, but {len(args)} found: {args}.')
            sys.exit(1)

        # Read from `args`, not sys.argv, so that an explicit argument list is honoured.
        self.own_path = args[0]
        self.java_path = _parse_nullable_argument(args[1])
        self.saxon_path = _parse_nullable_argument(args[2])
        self.xsltproc_path = _parse_nullable_argument(args[3])
        self.xslt_path = _parse_nullable_argument(args[4])
        self.input = args[5]
        self.input_path = args[6]
        self.output = args[7]
        # When the script is started by its bare file name, dirname() is empty; without the fallback to '.',
        # the folder would become '/../', i.e. the root of the file system.
        self.script_folder = (os.path.dirname(self.own_path) or '.') + '/../'

        print('Generating ePub with the following parameters:')
        print('    own_path: %s' % self.own_path)
        print('    java_path: %s' % self.java_path)
        print('    saxon_path: %s' % self.saxon_path)
        print('    xsltproc_path: %s' % self.xsltproc_path)
        print('    xslt_path: %s' % self.xslt_path)
        print('    input: %s' % self.input)
        print('    input_path: %s' % self.input_path)
        print('    output: %s' % self.output)

        # Precompute paths that will be used later.
        self.output_dir = tempfile.mkdtemp().replace('\\', '/')
        self.package_opf = self.output_dir + '/OEBPS/package.opf'
        print('Temporary output directory: %s' % self.output_dir)

        os.mkdir(self.output_dir + '/OEBPS')
        os.mkdir(self.output_dir + '/OEBPS/images')
        os.mkdir(self.output_dir + '/META-INF')
        print('Created the folder structure')

        if self.xslt_path is None:
            self.xslt = self.script_folder + 'docbook/epub3/chunk.xsl'
        else:
            self.xslt = self.xslt_path + '/epub3/chunk.xsl'
        print('XSLT style sheet to use:')
        print(self.xslt)

        if self.saxon_path is None:
            self.saxon_path = self.script_folder + 'scripts/saxon6.5.5.jar'

        # These will be filled during the execution of the script.
        self.renamed = None

    def gracefully_fail(self, reason):
        """Print `reason`, remove the temporary output folder, and exit with status 1."""
        print('docbook2epub fails: %s' % reason)
        shutil.rmtree(self.output_dir, ignore_errors=True)
        sys.exit(1)

    def start_xslt_transformation(self):
        """Run the XSLT processor that generates the ePub contents in the temporary folder.

        xsltproc is preferred when it is configured; otherwise Saxon 6 is started through Java.
        The script exits through gracefully_fail when no processor is available, when the
        processor cannot be started, or when it returns a non-zero exit code.
        """
        if self.xsltproc_path is not None:
            command = self.start_xslt_transformation_xsltproc()
            arguments = self._xsltproc_arguments()
        elif self.java_path is not None:
            command = self.start_xslt_transformation_saxon6()
            arguments = self._saxon6_arguments()
        else:
            self.gracefully_fail('no XSLT processor available')
            return  # Not reached: gracefully_fail exits.

        print('Command to execute:')
        print(command)

        # The processor is started from an argument list, without a shell: paths containing spaces, quotes,
        # dollars, or backticks are passed verbatim and no platform-specific quoting is required.
        try:
            return_code = subprocess.run(arguments).returncode
        except OSError as error:
            self.gracefully_fail('cannot start the XSLT processor: %s' % error)
            return  # Not reached: gracefully_fail exits.

        if return_code != 0:
            self.gracefully_fail('error from the XSLT processor')

        print('Generated ePub contents.')

    def _xsltproc_arguments(self):
        """Return the xsltproc invocation as an argument list."""
        return [self.xsltproc_path, '-stringparam', 'base.dir', self.output_dir, self.xslt, self.input]

    def _saxon6_arguments(self):
        """Return the Saxon 6 invocation (through Java) as an argument list."""
        return [self.java_path, '-jar', self.saxon_path, self.input, self.xslt, 'base.dir=%s' % self.output_dir]

    def start_xslt_transformation_xsltproc(self):
        """Return the xsltproc command line as a single, quoted string (used for logging)."""
        params = '-stringparam base.dir "' + self.output_dir + '"'
        return '"' + self.xsltproc_path + '" ' + params + ' "' + self.xslt + '" "' + self.input + '"'

    def start_xslt_transformation_saxon6(self):
        """Return the Saxon 6 command line as a single, quoted string (used for logging)."""
        params = 'base.dir=%s' % self.output_dir
        executable = '"' + self.java_path + '" -jar "' + self.saxon_path + '"'
        return executable + ' "' + self.input + '" "' + self.xslt + '" "' + params + '"'

    def get_images_from_package_opf(self):
        """Return the href of every image <item> listed in the generated package.opf.

        The file is scanned line by line; an <item> line whose href is missing or empty is skipped.
        An empty list is returned (after printing a warning) when package.opf does not exist.
        """
        images = []

        # Example in the OPF file:
        #     <item id="d436e1" href="D:/LyX/lib/images/buffer-view.svgz" media-type="image/SVGZ"/>
        # The XHTML files are also <item> tags:
        #     <item id="id-d0e2" href="index.xhtml" media-type="application/xhtml+xml"/>
        try:
            # Same encoding as the one used to rewrite the file in change_image_paths.
            with open(self.package_opf, encoding='utf8') as f:
                for line in f:
                    if '<item' not in line or 'media-type="image' not in line:
                        continue
                    _, marker, rest = line.partition('href="')
                    href = rest.split('"')[0] if marker else ''
                    # An empty path cannot be copied and would match everywhere when replacing paths.
                    if href:
                        images.append(href)
        except FileNotFoundError:
            print('The package.opf file was not found, probably due to a DocBook error. The ePub file will be corrupt.')

        return images

    def get_image_changes(self):
        """Return one ImageRename per image of package.opf.

        Each image is looked for first at the path written in the OPF file, then relative to the folder of
        the original document. `local_path` is '' when the image is found at neither place. The path in the
        archive is 'images/' followed by the base name of the image.
        """
        epub_folder = 'images/'

        changes = []
        for image in self.get_images_from_package_opf():
            # os.path.join works whether or not input_path ends with a separator.
            candidates = (image, os.path.join(self.input_path, image))
            file_system_path = next((path for path in candidates if os.path.exists(path)), '')

            # NOTE: two images with the same base name in different folders map to the same archive path.
            changes.append(ImageRename(image, file_system_path, epub_folder + os.path.basename(image)))
        return changes

    def change_image_paths(self, file):
        """Rewrite `file` in place, replacing each original image path by its path in the archive.

        All paths are replaced in a single pass, longest path first: a path that is contained in another
        one (fig.png and bigfig.png) cannot corrupt it, and already-replaced text is never replaced again.
        The file is left untouched when there is nothing to replace.
        """
        replacements = {change.opf_path: change.epub_path for change in self.renamed or [] if change.opf_path}
        if not replacements:
            return

        longest_first = sorted(replacements, key=len, reverse=True)
        pattern = re.compile('|'.join(re.escape(path) for path in longest_first))

        with open(file, encoding='utf8') as f:
            contents = f.read()

        with open(file, 'w', encoding='utf8') as f:
            f.write(pattern.sub(lambda match: replacements[match.group(0)], contents))

    def copy_images(self):
        """Copy the images into OEBPS/images/ and update their paths in the OPF and XHTML files.

        Images that could not be found on the file system are reported and skipped.
        """
        # Copy the assets to the OEBPS/images/. All paths are available in OEBPS/package.opf, but they must also be
        # changed in the XHTML files. Typically, the current paths are absolute.

        # First, get the mapping old file => file in the ePub archive.
        self.renamed = self.get_image_changes()

        # Then, transform all paths (both OPF and XHTML files). The OPF file may be missing after a DocBook
        # error: this has already been reported when looking for images.
        if os.path.exists(self.package_opf):
            self.change_image_paths(self.package_opf)
        for file in glob.glob(self.output_dir + '/OEBPS/*.xhtml'):
            self.change_image_paths(file)

        # Ensure that the destination path exists.
        os.makedirs(self.output_dir + '/OEBPS/images/', exist_ok=True)

        # Finally, actually copy the image files.
        for change in self.renamed:
            if not change.local_path:
                print('Image not found, it will be missing from the ePub: %s' % change.opf_path)
                continue
            shutil.copyfile(change.local_path, self.output_dir + '/OEBPS/' + change.epub_path)

    def create_zip_archive(self):
        """Zip the temporary folder into the output ePub file, then delete the temporary folder.

        The `mimetype` file, when present, is written first and uncompressed, as required for ePub
        containers; all the other files are deflated. The temporary folder is removed even when the
        creation of the archive fails.
        """
        try:
            with zipfile.ZipFile(self.output, 'w', zipfile.ZIP_DEFLATED) as archive:
                mimetype = os.path.join(self.output_dir, 'mimetype')
                if os.path.isfile(mimetype):
                    archive.write(mimetype, 'mimetype', compress_type=zipfile.ZIP_STORED)

                for folder, folders, filenames in os.walk(self.output_dir):
                    folders.sort()  # Deterministic traversal order.
                    for filename in sorted(filenames):
                        file = os.path.join(folder, filename)
                        name = os.path.relpath(file, self.output_dir)
                        if name != 'mimetype':  # Already written as the first entry.
                            archive.write(file, name)
        finally:
            shutil.rmtree(self.output_dir, ignore_errors=True)

        print('Generated ePub.')

    def transform(self):
        """Run the whole conversion: XSLT transformation, image handling, archive creation."""
        self.start_xslt_transformation()
        self.copy_images()
        self.create_zip_archive()


if __name__ == '__main__':
    # Executed as a script: build the converter from the command-line arguments, then run the conversion.
    converter = DocBookToEpub(sys.argv)
    converter.transform()
docbook2epub. 2021-01-28 03:57:57 +00:00			`# file docbook2epub.py`
			`# This file is part of LyX, the document processor.`
			`# Licence details can be found in the file COPYING.`
			`#`
			`# \author Thibaut Cuvelier`
			`#`
			`# Full author contact details are available in file CREDITS`

			`# Usage:`
ePub: try harder to find image files. 2021-02-19 19:59:20 +00:00			`# python docbook2epub.py java_binary saxon_path xsltproc_path xslt_path in.docbook in.orig.path out.epub`
docbook2epub. 2021-01-28 03:57:57 +00:00

ePub: copy images into the ePub archive. 2021-02-07 19:00:31 +00:00			`import glob`
docbook2epub. 2021-01-28 03:57:57 +00:00			`import os`
			`import shutil`
			`import sys`
			`import tempfile`
			`import zipfile`

ePub: refactor script as a series of small functions. 2021-02-07 05:53:43 +00:00
ePub: share a bit of code for parsing arguments. 2021-02-07 22:18:54 +00:00			`def _parse_nullable_argument(arg):`
			`return arg if arg != '' and arg != 'none' else None`


ePub: try harder to find image files. 2021-02-19 19:59:20 +00:00			`class ImageRename:`
			`def __init__(self, opf_path, local_path, epub_path):`
			`self.opf_path = opf_path`
			`self.local_path = local_path`
			`self.epub_path = epub_path`


ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`class DocBookToEpub:`
			`def __init__(self, args=None):`
			`if args is None:`
			`args = sys.argv`

ePub: try harder to find image files. 2021-02-19 19:59:20 +00:00			`if len(args) != 8:`
Update Python scripts to Python 3+ Remove support for Python 2 Use formatted strings where appropriated 2024-06-10 10:29:56 +00:00			`print(f'Exactly eight arguments are expected, only {len(args)} found: {args}.')`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`sys.exit(1)`

			`self.own_path = sys.argv[0]`
ePub: share a bit of code for parsing arguments. 2021-02-07 22:18:54 +00:00			`self.java_path = _parse_nullable_argument(sys.argv[1])`
			`self.saxon_path = _parse_nullable_argument(sys.argv[2])`
			`self.xsltproc_path = _parse_nullable_argument(sys.argv[3])`
			`self.xslt_path = _parse_nullable_argument(sys.argv[4])`
ePub: allow a user-specified Saxon. 2021-02-07 22:18:06 +00:00			`self.input = sys.argv[5]`
ePub: try harder to find image files. 2021-02-19 19:59:20 +00:00			`self.input_path = sys.argv[6]`
			`self.output = sys.argv[7]`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`self.script_folder = os.path.dirname(self.own_path) + '/../'`

			`print('Generating ePub with the following parameters:')`
docbook2epub: make the output of the parameters easier to read. 2024-11-11 22:44:05 +00:00			`print(' own_path: %s' % self.own_path)`
			`print(' java_path: %s' % self.java_path)`
			`print(' saxon_path: %s' % self.saxon_path)`
			`print(' xsltproc_path: %s' % self.xsltproc_path)`
			`print(' xslt_path: %s' % self.xslt_path)`
			`print(' input: %s' % self.input)`
			`print(' input_path: %s' % self.input_path)`
			`print(' output: %s' % self.output)`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00
			`# Precompute paths that will be used later.`
			`self.output_dir = tempfile.mkdtemp().replace('\\', '/')`
docbook2epub: create the OEBPS and META-INF folders that Saxon seems unable to create. 2024-11-11 22:43:34 +00:00			`self.package_opf = self.output_dir + '/OEBPS/package.opf'`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`print('Temporary output directory: %s' % self.output_dir)`

docbook2epub: create the OEBPS and META-INF folders that Saxon seems unable to create. 2024-11-11 22:43:34 +00:00			`os.mkdir(self.output_dir + '/OEBPS')`
			`os.mkdir(self.output_dir + '/OEBPS/images')`
			`os.mkdir(self.output_dir + '/META-INF')`
			`print('Created the folder structure')`

ePub: use None instead of an empty string. 2021-02-07 19:52:50 +00:00			`if self.xslt_path is None:`
ePub: add argument for external DocBook XSLT. 2021-02-07 19:51:25 +00:00			`self.xslt = self.script_folder + 'docbook/epub3/chunk.xsl'`
			`else:`
			`self.xslt = self.xslt_path + '/epub3/chunk.xsl'`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`print('XSLT style sheet to use:')`
			`print(self.xslt)`

ePub: allow a user-specified Saxon. 2021-02-07 22:18:06 +00:00			`if self.saxon_path is None:`
			`self.saxon_path = self.script_folder + 'scripts/saxon6.5.5.jar'`

ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`# These will be filled during the execution of the script.`
			`self.renamed = None`

			`def gracefully_fail(self, reason):`
			`print('docbook2epub fails: %s' % reason)`
			`shutil.rmtree(self.output_dir, ignore_errors=True)`
docbook2epub. 2021-01-28 03:57:57 +00:00			`sys.exit(1)`

ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`def start_xslt_transformation(self):`
			`command = None`
ePub: use None instead of an empty string. 2021-02-07 19:52:50 +00:00			`if self.xsltproc_path is not None:`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`command = self.start_xslt_transformation_xsltproc()`
ePub: use None instead of an empty string. 2021-02-07 19:52:50 +00:00			`elif self.java_path is not None:`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`command = self.start_xslt_transformation_saxon6()`

			`if command is None:`
			`self.gracefully_fail('no XSLT processor available')`

			`print('Command to execute:')`
			`print(command)`

			`quoted_command = command`
			`if os.name == 'nt':`
			`# On Windows, it is typical to have spaces in folder names, and that requires to wrap the whole command`
			`# in quotes. On Linux, this might create errors when starting the command.`
			`quoted_command = '"' + command + '"'`
			`# This could be simplified by using subprocess.run, but this requires Python 3.5.`

			`if os.system(quoted_command) != 0:`
			`self.gracefully_fail('error from the XSLT processor')`

			`print('Generated ePub contents.')`

			`def start_xslt_transformation_xsltproc(self):`
			`params = '-stringparam base.dir "' + self.output_dir + '"'`
			`return '"' + self.xsltproc_path + '" ' + params + ' "' + self.xslt + '" "' + self.input + '"'`

			`def start_xslt_transformation_saxon6(self):`
			`params = 'base.dir=%s' % self.output_dir`
ePub: allow a user-specified Saxon. 2021-02-07 22:18:06 +00:00			`executable = '"' + self.java_path + '" -jar "' + self.saxon_path + '"'`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`return executable + ' "' + self.input + '" "' + self.xslt + '" "' + params + '"'`

			`def get_images_from_package_opf(self):`
			`images = []`

			`# Example in the OPF file:`
			`# <item id="d436e1" href="D:/LyX/lib/images/buffer-view.svgz" media-type="image/SVGZ"/>`
			`# The XHTML files are also <item> tags:`
			`# <item id="id-d0e2" href="index.xhtml" media-type="application/xhtml+xml"/>`
			`try:`
Update Python scripts to Python 3+ Remove support for Python 2 Use formatted strings where appropriated 2024-06-10 10:29:56 +00:00			`with open(self.package_opf) as f:`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`for line in f.readlines():`
			`if '<item' in line and 'media-type="image' in line:`
			`images.append(line.split('href="')[1].split('"')[0])`
			`except FileNotFoundError:`
			`print('The package.opf file was not found, probably due to a DocBook error. The ePub file will be corrupt.')`

			`return images`

ePub: try harder to find image files. 2021-02-19 19:59:20 +00:00			`def get_image_changes(self):`
			`epub_folder = 'images/'`

			`changes = []`
			`for image in self.get_images_from_package_opf():`
			`if os.path.exists(image):`
			`file_system_path = image`
			`elif os.path.exists(self.input_path + image):`
			`file_system_path = self.input_path + image`
			`else:`
			`file_system_path = ''`

			`changes.append(ImageRename(image, file_system_path, epub_folder + os.path.basename(image)))`
			`return changes`

ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`def change_image_paths(self, file):`
			`# This could be optimised, as the same operation is performed a zillion times on many files:`
			`# https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s15.html`
Update Python scripts to Python 3+ Remove support for Python 2 Use formatted strings where appropriated 2024-06-10 10:29:56 +00:00			`with open(file, encoding='utf8') as f:`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`contents = list(f)`

			`with open(file, 'w', encoding='utf8') as f:`
			`for line in contents:`
ePub: try harder to find image files. 2021-02-19 19:59:20 +00:00			`for change in self.renamed:`
			`line = line.replace(change.opf_path, change.epub_path)`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`f.write(line)`

			`def copy_images(self):`
			`# Copy the assets to the OEBPS/images/. All paths are available in OEBPS/package.opf, but they must also be`
			`# changed in the XHTML files. Typically, the current paths are absolute.`

			`# First, get the mapping old file => file in the ePub archive.`
ePub: try harder to find image files. 2021-02-19 19:59:20 +00:00			`self.renamed = self.get_image_changes()`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00
			`# Then, transform all paths (both OPF and XHTML files).`
			`self.change_image_paths(self.output_dir + '/OEBPS/package.opf')`
			`for file in glob.glob(self.output_dir + '/OEBPS/*.xhtml'):`
			`self.change_image_paths(file)`

			`# Ensure that the destination path exists. OEBPS exists due to the DocBook-to-ePub transformation.`
			`if not os.path.exists(self.output_dir + '/OEBPS/images/'):`
			`os.mkdir(self.output_dir + '/OEBPS/images/')`

			`# Finally, actually copy the image files.`
ePub: try harder to find image files. 2021-02-19 19:59:20 +00:00			`for change in self.renamed:`
			`shutil.copyfile(change.local_path, self.output_dir + '/OEBPS/' + change.epub_path)`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00
			`def create_zip_archive(self):`
			`with zipfile.ZipFile(self.output, 'w', zipfile.ZIP_DEFLATED) as zip:`
			# Python 3.5 brings the `recursive` argument. For older versions, this trick is required...
			`# for file in glob.glob(output_dir + '/*/', recursive=True):`
			`for file in [os.path.join(dp, f) for dp, dn, filenames in os.walk(self.output_dir) for f in filenames]:`
			`zip.write(file, os.path.relpath(file, self.output_dir), compress_type=zipfile.ZIP_STORED)`

			`shutil.rmtree(self.output_dir)`
			`print('Generated ePub.')`

			`def transform(self):`
			`self.start_xslt_transformation()`
			`self.copy_images()`
			`self.create_zip_archive()`
ePub: refactor script as a series of small functions. 2021-02-07 05:53:43 +00:00

			`if __name__ == '__main__':`
ePub: restructure the script as a class. This allows to share a little bit more code and to avoid functions with many arguments. 2021-02-07 19:46:29 +00:00			`DocBookToEpub(sys.argv).transform()`