1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
|
# file docbook2epub.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.
#
# \author Thibaut Cuvelier
#
# Full author contact details are available in file CREDITS
# Usage:
# python docbook2epub.py java_binary saxon_path xsltproc_path xslt_path in.docbook in.orig.path out.epub
import glob
import os
import shutil
import sys
import tempfile
import zipfile
def _parse_nullable_argument(arg):
return arg if arg != '' and arg != 'none' else None
class ImageRename:
    """Describe how one image is relocated when it is packed into the ePub."""

    def __init__(self, opf_path, local_path, epub_path):
        # opf_path:   path of the image as it is written in package.opf.
        # local_path: where the image can be read from on the file system.
        # epub_path:  path of the image inside the ePub, relative to OEBPS/.
        self.opf_path, self.local_path, self.epub_path = opf_path, local_path, epub_path
class DocBookToEpub:
    """Convert a DocBook file into an ePub archive.

    The conversion is driven by ``transform`` and has three steps: run the
    DocBook ``epub3/chunk.xsl`` style sheet into a temporary directory, move
    the referenced images into ``OEBPS/images/`` (rewriting the paths in the
    generated files), then zip the temporary directory into the output file.
    """

    def __init__(self, args=None):
        """Parse the arguments and create the temporary output tree.

        ``args`` has the same layout as ``sys.argv`` (which is used when
        ``args`` is None): the script path followed by java_binary,
        saxon_path, xsltproc_path, xslt_path, the DocBook input file, the path
        of the original document, and the ePub file to write. The first four
        may be ``''`` or ``'none'`` to mean "not provided".

        Exits the process with status 1 when ``args`` does not hold exactly
        eight entries.
        """
        if args is None:
            args = sys.argv

        if len(args) != 8:
            print(f'Exactly eight arguments are expected, only {len(args)} found: {args}.')
            sys.exit(1)

        # Read every value from ``args`` (not ``sys.argv``), so that an explicitly passed list is honoured.
        self.own_path = args[0]
        self.java_path = _parse_nullable_argument(args[1])
        self.saxon_path = _parse_nullable_argument(args[2])
        self.xsltproc_path = _parse_nullable_argument(args[3])
        self.xslt_path = _parse_nullable_argument(args[4])
        self.input = args[5]
        self.input_path = args[6]
        self.output = args[7]
        # When the script is started without a directory component, dirname() is empty; fall back to '.' so that
        # the parent folder is not resolved from the file-system root.
        self.script_folder = (os.path.dirname(self.own_path) or '.') + '/../'

        print('Generating ePub with the following parameters:')
        print(' own_path: %s' % self.own_path)
        print(' java_path: %s' % self.java_path)
        print(' saxon_path: %s' % self.saxon_path)
        print(' xsltproc_path: %s' % self.xsltproc_path)
        print(' xslt_path: %s' % self.xslt_path)
        print(' input: %s' % self.input)
        print(' input_path: %s' % self.input_path)
        print(' output: %s' % self.output)

        # Precompute paths that will be used later.
        self.output_dir = tempfile.mkdtemp().replace('\\', '/')
        self.package_opf = self.output_dir + '/OEBPS/package.opf'
        print('Temporary output directory: %s' % self.output_dir)

        os.mkdir(self.output_dir + '/OEBPS')
        os.mkdir(self.output_dir + '/OEBPS/images')
        os.mkdir(self.output_dir + '/META-INF')
        print('Created the folder structure')

        if self.xslt_path is None:
            self.xslt = self.script_folder + 'docbook/epub3/chunk.xsl'
        else:
            self.xslt = self.xslt_path + '/epub3/chunk.xsl'
        print('XSLT style sheet to use:')
        print(self.xslt)

        if self.saxon_path is None:
            self.saxon_path = self.script_folder + 'scripts/saxon6.5.5.jar'

        # These will be filled during the execution of the script.
        self.renamed = None

    def gracefully_fail(self, reason):
        """Report ``reason``, delete the temporary directory and exit with status 1."""
        print('docbook2epub fails: %s' % reason)
        shutil.rmtree(self.output_dir, ignore_errors=True)
        sys.exit(1)

    def start_xslt_transformation(self):
        """Run the XSLT processor that generates the ePub contents.

        xsltproc is preferred when its path is known; otherwise Saxon 6 is
        started through Java. The process exits (see ``gracefully_fail``) when
        no processor is available or when the processor reports an error.
        """
        command = None
        if self.xsltproc_path is not None:
            command = self.start_xslt_transformation_xsltproc()
        elif self.java_path is not None:
            command = self.start_xslt_transformation_saxon6()

        if command is None:
            self.gracefully_fail('no XSLT processor available')

        print('Command to execute:')
        print(command)

        quoted_command = command
        if os.name == 'nt':
            # On Windows, it is typical to have spaces in folder names, and that requires to wrap the whole command
            # in quotes. On Linux, this might create errors when starting the command.
            quoted_command = '"' + command + '"'
        # This could be simplified by using subprocess.run, but this requires Python 3.5.
        if os.system(quoted_command) != 0:
            self.gracefully_fail('error from the XSLT processor')

        print('Generated ePub contents.')

    def start_xslt_transformation_xsltproc(self):
        """Return the xsltproc command line (a single shell string)."""
        params = '-stringparam base.dir "' + self.output_dir + '"'
        return '"' + self.xsltproc_path + '" ' + params + ' "' + self.xslt + '" "' + self.input + '"'

    def start_xslt_transformation_saxon6(self):
        """Return the Java/Saxon 6 command line (a single shell string)."""
        params = 'base.dir=%s' % self.output_dir
        executable = '"' + self.java_path + '" -jar "' + self.saxon_path + '"'
        return executable + ' "' + self.input + '" "' + self.xslt + '" "' + params + '"'

    def get_images_from_package_opf(self):
        """Return the ``href`` of every image ``<item>`` listed in package.opf.

        A missing package.opf is reported on standard output and yields an
        empty list.
        """
        images = []

        # Example in the OPF file:
        #     <item id="d436e1" href="D:/LyX/lib/images/buffer-view.svgz" media-type="image/SVGZ"/>
        # The XHTML files are also <item> tags:
        #     <item id="id-d0e2" href="index.xhtml" media-type="application/xhtml+xml"/>
        try:
            # Read as UTF-8, the encoding used by change_image_paths to rewrite this very file.
            with open(self.package_opf, encoding='utf8') as f:
                for line in f:
                    # Lines without an href attribute cannot be parsed and are ignored instead of raising.
                    if '<item' in line and 'media-type="image' in line and 'href="' in line:
                        images.append(line.split('href="')[1].split('"')[0])
        except FileNotFoundError:
            print('The package.opf file was not found, probably due to a DocBook error. The ePub file will be corrupt.')

        return images

    def get_image_changes(self):
        """Return one ``ImageRename`` per image referenced in package.opf.

        Each image is looked up first at the path given in the OPF file, then
        relative to ``input_path``. When neither exists, ``local_path`` is the
        empty string. Inside the ePub, every image goes to ``images/`` under
        its base name.
        """
        epub_folder = 'images/'

        changes = []
        for image in self.get_images_from_package_opf():
            # os.path.join only inserts a separator when input_path does not already end with one.
            relative_to_input = os.path.join(self.input_path, image)
            if os.path.exists(image):
                file_system_path = image
            elif os.path.exists(relative_to_input):
                file_system_path = relative_to_input
            else:
                file_system_path = ''

            changes.append(ImageRename(image, file_system_path, epub_folder + os.path.basename(image)))
        return changes

    def change_image_paths(self, file):
        """Rewrite ``file`` in place, replacing each original image path by its path inside the ePub."""
        # This could be optimised, as the same operation is performed a zillion times on many files:
        # https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s15.html
        with open(file, encoding='utf8') as f:
            contents = list(f)

        with open(file, 'w', encoding='utf8') as f:
            for line in contents:
                for change in self.renamed:
                    line = line.replace(change.opf_path, change.epub_path)
                f.write(line)

    def copy_images(self):
        """Move the images into the ePub tree and update the references to them.

        Images that could not be found on the file system are reported and
        skipped; their references are still rewritten.
        """
        # Copy the assets to the OEBPS/images/. All paths are available in OEBPS/package.opf, but they must also be
        # changed in the XHTML files. Typically, the current paths are absolute.

        # First, get the mapping old file => file in the ePub archive.
        self.renamed = self.get_image_changes()

        # Then, transform all paths (both OPF and XHTML files).
        self.change_image_paths(self.output_dir + '/OEBPS/package.opf')
        for file in glob.glob(self.output_dir + '/OEBPS/*.xhtml'):
            self.change_image_paths(file)

        # Ensure that the destination path exists. OEBPS exists due to the DocBook-to-ePub transformation.
        os.makedirs(self.output_dir + '/OEBPS/images/', exist_ok=True)

        # Finally, actually copy the image files.
        for change in self.renamed:
            if not change.local_path:
                # get_image_changes found no file for this image: copying '' would raise.
                print('Image not found, it will be missing from the ePub: %s' % change.opf_path)
                continue
            shutil.copyfile(change.local_path, self.output_dir + '/OEBPS/' + change.epub_path)

    def create_zip_archive(self):
        """Zip the temporary directory into the output file, then delete the directory.

        If the directory holds a ``mimetype`` file at its root, it is written
        first and uncompressed, as ePub readers expect; every other file is
        deflated.
        """
        files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(self.output_dir) for f in filenames]

        with zipfile.ZipFile(self.output, 'w', zipfile.ZIP_DEFLATED) as archive:
            mimetype = os.path.join(self.output_dir, 'mimetype')
            if os.path.isfile(mimetype):
                archive.write(mimetype, 'mimetype', compress_type=zipfile.ZIP_STORED)

            for file in files:
                archive_name = os.path.relpath(file, self.output_dir)
                if archive_name == 'mimetype':
                    continue  # Already written as the first entry.
                archive.write(file, archive_name)

        shutil.rmtree(self.output_dir)
        print('Generated ePub.')

    def transform(self):
        """Run the whole conversion: XSLT transformation, image handling, packaging."""
        self.start_xslt_transformation()
        self.copy_images()
        self.create_zip_archive()
if __name__ == '__main__':
    # Script entry point: convert using the arguments given on the command line.
    converter = DocBookToEpub(sys.argv)
    converter.transform()
|