File: docbook2epub.py

package info (click to toggle)
lyx 2.5.0~RC2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 138,212 kB
  • sloc: cpp: 244,227; ansic: 106,398; xml: 72,791; python: 39,384; sh: 7,666; makefile: 6,586; pascal: 2,143; perl: 2,101; objc: 1,084; tcl: 163; sed: 16
file content (204 lines) | stat: -rw-r--r-- 8,051 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# file docbook2epub.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.
#
# \author Thibaut Cuvelier
#
# Full author contact details are available in file CREDITS

# Usage:
#   python docbook2epub.py java_binary saxon_path xsltproc_path xslt_path in.docbook in.orig.path out.epub


import glob
import os
import re
import shutil
import sys
import tempfile
import zipfile


def _parse_nullable_argument(arg):
    """Translate a command-line placeholder into None.

    An empty string and the literal 'none' both stand for "no value given" and yield None; any other argument
    is returned unchanged.
    """
    if arg == '' or arg == 'none':
        return None
    return arg


class ImageRename:
    """Record describing where one image comes from and where it goes in the ePub archive."""

    def __init__(self, opf_path, local_path, epub_path):
        # opf_path: the path as it is written in the generated OPF/XHTML files.
        # local_path: the path of the image file on the local file system.
        # epub_path: the path the image gets inside the ePub archive.
        self.opf_path, self.local_path, self.epub_path = opf_path, local_path, epub_path


class DocBookToEpub:
    """Convert a DocBook file into an ePub archive.

    The conversion runs an XSLT processor (xsltproc or Saxon 6 through Java) on the input, rewrites the image
    paths of the generated files, copies the images next to them, and zips the result. All intermediate files
    live in a temporary directory that is created by the constructor and removed once the archive has been
    written or when `gracefully_fail` is called.
    """

    # One complete <item ...> tag of the OPF manifest; the word boundary keeps <itemref> tags out.
    _manifest_item = re.compile(r'<item\b[^>]*>')
    # The value of the href attribute inside such a tag.
    _href_attribute = re.compile(r'\shref="([^"]*)"')

    def __init__(self, args=None):
        """Parse the arguments and prepare the temporary output folder structure.

        args: list of eight strings laid out like sys.argv (script path, java binary, saxon path, xsltproc
            path, XSLT path, input file, path of the original document, output file); defaults to sys.argv.
            An empty string or 'none' for one of the four tool/style-sheet paths means "not provided".

        Exits the process with status 1 when the number of arguments is not eight.
        """
        if args is None:
            args = sys.argv

        if len(args) != 8:
            print(f'Exactly eight arguments are expected, only {len(args)} found: {args}.')
            sys.exit(1)

        # Read everything from `args` (and not from sys.argv) so that an explicitly passed list is honoured.
        self.own_path = args[0]
        self.java_path = _parse_nullable_argument(args[1])
        self.saxon_path = _parse_nullable_argument(args[2])
        self.xsltproc_path = _parse_nullable_argument(args[3])
        self.xslt_path = _parse_nullable_argument(args[4])
        self.input = args[5]
        self.input_path = args[6]
        self.output = args[7]
        # Make the script path absolute first: for a bare file name, dirname() is empty and the folder would
        # otherwise resolve to '/../', i.e. the root of the file system.
        self.script_folder = os.path.dirname(os.path.abspath(self.own_path)) + '/../'

        print('Generating ePub with the following parameters:')
        print('    own_path: %s' % self.own_path)
        print('    java_path: %s' % self.java_path)
        print('    saxon_path: %s' % self.saxon_path)
        print('    xsltproc_path: %s' % self.xsltproc_path)
        print('    xslt_path: %s' % self.xslt_path)
        print('    input: %s' % self.input)
        print('    input_path: %s' % self.input_path)
        print('    output: %s' % self.output)

        # Precompute paths that will be used later.
        self.output_dir = tempfile.mkdtemp().replace('\\', '/')
        self.package_opf = self.output_dir + '/OEBPS/package.opf'
        print('Temporary output directory: %s' % self.output_dir)

        os.mkdir(self.output_dir + '/OEBPS')
        os.mkdir(self.output_dir + '/OEBPS/images')
        os.mkdir(self.output_dir + '/META-INF')
        print('Created the folder structure')

        # Without an explicit style-sheet folder, fall back to the one located relative to this script.
        if self.xslt_path is None:
            self.xslt = self.script_folder + 'docbook/epub3/chunk.xsl'
        else:
            self.xslt = self.xslt_path + '/epub3/chunk.xsl'
        print('XSLT style sheet to use:')
        print(self.xslt)

        if self.saxon_path is None:
            self.saxon_path = self.script_folder + 'scripts/saxon6.5.5.jar'

        # These will be filled during the execution of the script.
        self.renamed = None

    def gracefully_fail(self, reason):
        """Print `reason`, remove the temporary directory, and exit the process with status 1."""
        print('docbook2epub fails: %s' % reason)
        shutil.rmtree(self.output_dir, ignore_errors=True)
        sys.exit(1)

    def start_xslt_transformation(self):
        """Run the XSLT processor on the input; xsltproc is preferred over Saxon when both are available.

        Fails through `gracefully_fail` when no processor is configured or when the command returns a
        non-zero status.
        """
        command = None
        if self.xsltproc_path is not None:
            command = self.start_xslt_transformation_xsltproc()
        elif self.java_path is not None:
            command = self.start_xslt_transformation_saxon6()

        if command is None:
            self.gracefully_fail('no XSLT processor available')

        print('Command to execute:')
        print(command)

        quoted_command = command
        if os.name == 'nt':
            # On Windows, it is typical to have spaces in folder names, and that requires to wrap the whole command
            # in quotes. On Linux, this might create errors when starting the command.
            quoted_command = '"' + command + '"'
        # This could be simplified by using subprocess.run, but this requires Python 3.5.

        if os.system(quoted_command) != 0:
            self.gracefully_fail('error from the XSLT processor')

        print('Generated ePub contents.')

    def start_xslt_transformation_xsltproc(self):
        """Return the xsltproc command line (a string) that writes its output into the temporary directory."""
        params = '-stringparam base.dir "' + self.output_dir + '"'
        return '"' + self.xsltproc_path + '" ' + params + ' "' + self.xslt + '" "' + self.input + '"'

    def start_xslt_transformation_saxon6(self):
        """Return the Java/Saxon command line (a string) that writes its output into the temporary directory."""
        params = 'base.dir=%s' % self.output_dir
        executable = '"' + self.java_path + '" -jar "' + self.saxon_path + '"'
        return executable + ' "' + self.input + '" "' + self.xslt + '" "' + params + '"'

    def get_images_from_package_opf(self):
        """Return the href of every image item of OEBPS/package.opf, in document order.

        The hrefs are returned exactly as they are written in the file, so that they can later be searched for
        in the generated files. Returns an empty list (after printing a warning) when the file does not exist.
        """
        # Example in the OPF file:
        #     <item id="d436e1" href="D:/LyX/lib/images/buffer-view.svgz" media-type="image/SVGZ"/>
        # The XHTML files are also <item> tags:
        #     <item id="id-d0e2" href="index.xhtml" media-type="application/xhtml+xml"/>
        try:
            # Same encoding as in change_image_paths, which rewrites this very file.
            with open(self.package_opf, encoding='utf8') as f:
                manifest = f.read()
        except FileNotFoundError:
            print('The package.opf file was not found, probably due to a DocBook error. The ePub file will be corrupt.')
            return []

        # Work tag by tag instead of line by line: the layout of the file depends on the XSLT processor, so
        # several items may share one line and one item may span several lines.
        images = []
        for item in self._manifest_item.findall(manifest):
            if 'media-type="image' not in item:
                continue
            href = self._href_attribute.search(item)
            if href is not None:
                images.append(href.group(1))
        return images

    def get_image_changes(self):
        """Return one ImageRename per image of the OPF file.

        Each image is looked up first as written in the OPF file, then relative to the folder of the original
        document. The local path is an empty string when the image is found at neither place. In the archive,
        every image goes to images/ under its base name.
        """
        epub_folder = 'images/'

        changes = []
        for image in self.get_images_from_package_opf():
            # os.path.join works whether or not input_path ends with a path separator.
            relative_to_input = os.path.join(self.input_path, image)
            if os.path.exists(image):
                file_system_path = image
            elif os.path.exists(relative_to_input):
                file_system_path = relative_to_input
            else:
                file_system_path = ''

            changes.append(ImageRename(image, file_system_path, epub_folder + os.path.basename(image)))
        return changes

    def change_image_paths(self, file):
        """Replace, in the UTF-8 file `file`, every original image path by its path within the archive.

        Uses the mapping stored in self.renamed. The file is left untouched when there is nothing to replace.
        """
        # An empty original path would match between any two characters: ignore such entries.
        mapping = {change.opf_path: change.epub_path for change in self.renamed if change.opf_path}
        if not mapping:
            return

        # Replace all paths in one single pass, trying the longest ones first. Successive str.replace calls
        # would corrupt a path that contains another one (for instance 'fig.png' within 'sub/myfig.png').
        longest_first = sorted(mapping, key=len, reverse=True)
        pattern = re.compile('|'.join(re.escape(path) for path in longest_first))

        with open(file, encoding='utf8') as f:
            contents = f.read()

        with open(file, 'w', encoding='utf8') as f:
            f.write(pattern.sub(lambda match: mapping[match.group(0)], contents))

    def copy_images(self):
        """Rewrite the image paths in the generated files and copy the images into OEBPS/images/.

        Fails through `gracefully_fail` when an image listed in the OPF file cannot be found on disk.
        """
        # Copy the assets to the OEBPS/images/. All paths are available in OEBPS/package.opf, but they must also be
        # changed in the XHTML files. Typically, the current paths are absolute.

        # First, get the mapping old file => file in the ePub archive.
        self.renamed = self.get_image_changes()

        # Then, transform all paths (both OPF and XHTML files).
        self.change_image_paths(self.output_dir + '/OEBPS/package.opf')
        for file in glob.glob(self.output_dir + '/OEBPS/*.xhtml'):
            self.change_image_paths(file)

        # Ensure that the destination path exists. OEBPS exists due to the DocBook-to-ePub transformation.
        os.makedirs(self.output_dir + '/OEBPS/images/', exist_ok=True)

        # Finally, actually copy the image files.
        for change in self.renamed:
            if not change.local_path:
                # The image was not found: stop with a clear message and a clean temporary directory instead
                # of letting shutil.copyfile raise on an empty source path.
                self.gracefully_fail('image file not found: %s' % change.opf_path)
            shutil.copyfile(change.local_path, self.output_dir + '/OEBPS/' + change.epub_path)

    def create_zip_archive(self):
        """Zip the content of the temporary directory into self.output, then remove that directory."""
        # Python 3.5 brings the `recursive` argument. For older versions, this trick is required...
        # for file in glob.glob(output_dir + '/**/*', recursive=True):
        files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(self.output_dir) for f in filenames]
        entries = [(file, os.path.relpath(file, self.output_dir)) for file in files]
        # An ePub container must start with its `mimetype` entry: when there is such a file at the top level,
        # move it to the front. The sort is stable, the order of the other files does not change.
        entries.sort(key=lambda entry: entry[1] != 'mimetype')

        with zipfile.ZipFile(self.output, 'w', zipfile.ZIP_DEFLATED) as archive:
            for file, name_in_archive in entries:
                # compress_type takes precedence over the default of the archive: entries are stored as is.
                archive.write(file, name_in_archive, compress_type=zipfile.ZIP_STORED)

        shutil.rmtree(self.output_dir)
        print('Generated ePub.')

    def transform(self):
        """Run the whole conversion: XSLT transformation, image handling, then creation of the archive."""
        self.start_xslt_transformation()
        self.copy_images()
        self.create_zip_archive()


if __name__ == '__main__':
    DocBookToEpub(sys.argv).transform()