File: preprocess.py

package info (click to toggle)
cppreference-doc 20170409-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 259,868 kB
  • sloc: xml: 570,184; python: 1,923; php: 520; makefile: 167; sh: 25; cpp: 9; ansic: 9
file content (323 lines) | stat: -rwxr-xr-x 11,125 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
#!/usr/bin/env python3

#   Copyright (C) 2011, 2012  Povilas Kanapickas <povilas@radix.lt>
#
#   This file is part of cppreference-doc
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see http://www.gnu.org/licenses/.

import argparse
import fnmatch
from lxml import etree
import re
import os
import sys
import shutil
import urllib.parse
from xml_utils import xml_escape, xml_unescape

def rmtree_if_exists(dir):
    """Recursively delete *dir* when it is an existing directory.

    Paths that do not exist (or are not directories) are left alone and no
    error is raised for them.
    """
    if not os.path.isdir(dir):
        return
    shutil.rmtree(dir)

def move_dir_contents_to_dir(srcdir, dstdir):
    """Move every entry directly inside *srcdir* into *dstdir*.

    Entries keep their names; *srcdir* itself is left in place (empty).
    """
    for entry in os.listdir(srcdir):
        source = os.path.join(srcdir, entry)
        target = os.path.join(dstdir, entry)
        shutil.move(source, target)

def rearrange_archive(root):
    """Reshape the raw site mirror under *root* (i.e. output/reference).

    Layout before:
        {root}/en.cppreference.com/w/        : html
        {root}/en.cppreference.com/mwiki/    : data
        {root}/en.cppreference.com/          : data
        ... (other languages)
        {root}/upload.cppreference.com/mwiki/ : data

    Layout after:
        {root}/common/ : all common data
        {root}/en/     : html for en
        ... (other languages)
    """
    common_dir = os.path.join(root, 'common')
    upload_dir = os.path.join(root, 'upload.cppreference.com')

    # start from a clean common/ directory seeded with the uploaded data
    rmtree_if_exists(common_dir)
    shutil.move(os.path.join(root, 'upload.cppreference.com/mwiki'), common_dir)
    shutil.rmtree(upload_dir)

    font_names = (
        'DejaVuSansMonoCondensed60.ttf',
        'DejaVuSansMonoCondensed75.ttf',
    )

    for lang in ("en",):
        site_dir = os.path.join(root, lang + ".cppreference.com/")
        wiki_html_dir = site_dir + "w/"
        wiki_data_dir = site_dir + "mwiki/"

        if os.path.isdir(wiki_html_dir):
            shutil.move(wiki_html_dir, os.path.join(root, lang))

        if os.path.isdir(wiki_data_dir):
            # the skin files should be the same for all languages, so
            # everything can be merged into the common directory
            move_dir_contents_to_dir(wiki_data_dir, common_dir)

        # the custom fonts live next to the wiki directories; copy them too
        for font_name in font_names:
            shutil.copy(os.path.join(site_dir, font_name), common_dir)

        # whatever is still left in the per-language mirror is not needed
        shutil.rmtree(site_dir)

    # drop the XML export of the wiki source
    xml_exports = fnmatch.filter(os.listdir(root), 'cppreference-export*.xml')
    for xml_name in xml_exports:
        os.remove(os.path.join(root, xml_name))

def add_file_to_rename_map(rename_map, dir, fn, new_fn):
    """Append ``(dir, fn, new_fn)`` to *rename_map* if ``dir/fn`` is a file.

    When the file does not exist an error line is printed and *rename_map*
    is left unchanged.
    """
    path = os.path.join(dir, fn)
    if os.path.isfile(path):
        rename_map.append((dir, fn, new_fn))
        return
    print("ERROR: Not renaming '{0}' because path does not exist".format(path))

def find_files_to_be_renamed(root):
    """Build the rename map for the archive rooted at *root*.

    The rename map is a list of ``(directory, source_name, dest_name)``
    tuples. It lists files that must be renamed so that the archive can be
    stored on Windows filesystems, which do not support certain characters
    in file names.

    Exits the process with status 1 if a ``load.php?...`` file does not match
    any of the known loader modules.
    """
    rename_map = []

    files_rename = []           # general files to be renamed
    files_loader = []           # files served by load.php. These should map to
                                # consistent and short file names because we
                                # modify some of them later in the pipeline

    for dirpath, dirnames, filenames in os.walk(root):
        filenames_loader = set(fnmatch.filter(filenames, 'load.php[?]*'))
        # match any filenames with '?"*' characters
        filenames_rename = set(fnmatch.filter(filenames, '*[?"*]*'))

        # don't process load.php files in general rename handler
        filenames_rename -= filenames_loader

        # sets have no stable iteration order; sort so that the resulting
        # rename map does not change from run to run
        files_loader.extend((dirpath, fn) for fn in sorted(filenames_loader))
        files_rename.extend((dirpath, fn) for fn in sorted(filenames_rename))

    for dirpath, orig_fn in files_rename:
        # drop the query string, then replace the remaining bad characters
        fn = re.sub(r'\?.*', '', orig_fn)
        fn = fn.replace('"', '_q_')
        fn = fn.replace('*', '_star_')
        add_file_to_rename_map(rename_map, dirpath, orig_fn, fn)

    # map loader names to more recognizable names; the first matching
    # pattern wins, so the order of this table matters
    loader_names = [
        ("modules=site&only=scripts", "site_scripts.js"),
        ("modules=site&only=styles", "site_modules.css"),
        ("modules=skins.*&only=scripts", "skin_scripts.js"),
        ("modules=startup&only=scripts", "startup_scripts.js"),
        ("modules=.*ext.*&only=styles", "ext.css"),
    ]
    for dirpath, fn in files_loader:
        for pattern, candidate in loader_names:
            if re.search(pattern, fn):
                new_fn = candidate
                break
        else:
            print("Loader file " + fn + " does not match any known files")
            sys.exit(1)

        add_file_to_rename_map(rename_map, dirpath, fn, new_fn)

    # rename filenames that conflict on case-insensitive filesystems
    # TODO: perform this automatically
    add_file_to_rename_map(rename_map, os.path.join(root, 'en/cpp/numeric/math'), 'NAN.html', 'NAN.2.html')
    add_file_to_rename_map(rename_map, os.path.join(root, 'en/c/numeric/math'), 'NAN.html', 'NAN.2.html')
    return rename_map

def rename_files(rename_map):
    """Apply every ``(directory, old_name, new_name)`` entry of *rename_map*.

    Each rename is announced on stdout before the file is moved.
    """
    for entry in rename_map:
        directory, current_name, target_name = entry
        source = os.path.join(directory, current_name)
        destination = os.path.join(directory, target_name)
        print("Renaming '{0}' to \n         '{1}'".format(source, destination))
        shutil.move(source, destination)

def find_html_files(root):
    """Return the paths of all ``*.html`` files found anywhere below *root*.

    These are the files that need to be preprocessed.
    """
    return [
        os.path.join(dirpath, name)
        for dirpath, _subdirs, names in os.walk(root)
        for name in fnmatch.filter(names, '*.html')
    ]

def fix_relative_link(rename_map, target):
    """Rewrite a relative link so that it matches the rearranged archive.

    rename_map -- list of ``(directory, old_name, new_name)`` tuples; every
                  occurrence of ``old_name`` in the link is replaced with
                  ``new_name`` (the directory is not consulted).
    target     -- value of a ``src``/``href`` attribute.

    Links containing ``http://`` or ``https://`` are returned unchanged.
    Everything else is unquoted, adjusted and percent-quoted again; ``#`` is
    kept literal so that fragment identifiers keep working.
    """
    if 'http://' in target or 'https://' in target:
        return target

    # file names on disk are not percent-encoded, so decode before matching
    target = urllib.parse.unquote(target)
    for _, fn, new_fn in rename_map:
        target = target.replace(fn, new_fn)
    target = target.replace('../../upload.cppreference.com/mwiki/', '../common/')
    target = target.replace('../mwiki/', '../common/')
    # strip query strings from .php/.css references (raw string: '\.' and
    # '\?' are not valid escapes in a normal string literal)
    target = re.sub(r'(\.php|\.css)\?.*', r'\1', target)
    target = urllib.parse.quote(target)
    target = target.replace('%23', '#')
    return target

def has_class(el, classes_to_check):
    """Return True if *el* carries at least one of the given CSS classes.

    el               -- object whose ``get('class')`` returns the class
                        attribute value or None.
    classes_to_check -- iterable of class names; a single class name given
                        as a plain string is accepted as well.
    """
    value = el.get('class')
    if value is None:
        return False
    # a bare string would otherwise be iterated character by character
    if isinstance(classes_to_check, str):
        classes_to_check = [classes_to_check]
    # class names may be separated by any whitespace, not only single spaces
    classes = value.split()
    return any(cl in classes for cl in classes_to_check)

def preprocess_html_file(root, fn, rename_map):
    """Clean the HTML file *fn* in place.

    root       -- root of the archive (not used by this function; kept so
                  that the signature stays stable for callers).
    fn         -- path of the HTML file; it is parsed and then overwritten.
    rename_map -- list of ``(directory, old_name, new_name)`` tuples that is
                  passed to fix_relative_link() for every src/href.

    Parser diagnostics are printed to stdout as "HTML WARN" lines.
    """
    parser = etree.HTMLParser()
    html = etree.parse(fn, parser)

    # remove non-printable elements
    for el in html.xpath('//*'):
        if has_class(el, ['noprint', 'editsection']) or el.get('id') == 'toc':
            # a single combined test: an element matching both conditions
            # must only be removed once
            parent = el.getparent()
            if parent is not None:
                parent.remove(el)

    # remove see also links between C and C++ documentations
    for el in html.xpath('//tr[@class]'):
        if not has_class(el, ['t-dcl-list-item']):
            continue

        child_divs = el.xpath('.//td/div[@class]')
        if not any(has_class(div, ['t-dcl-list-see']) for div in child_divs):
            continue

        # remove preceding separator, if any
        prev = el.getprevious()
        if prev is not None:
            sep_tds = prev.xpath('.//td[@class]')
            if any(has_class(td, ['t-dcl-list-sep']) for td in sep_tds):
                prev.getparent().remove(prev)

        el.getparent().remove(el)

    # remove "See also" headings that are left with nothing below them
    for el in html.xpath('//h3'):
        if len(el.xpath(".//span[@id = 'See_also']")) == 0:
            continue

        next_el = el.getnext()
        if next_el is None:
            el.getparent().remove(el)
            continue

        if next_el.tag != 'table':
            continue

        if not has_class(next_el, ['t-dcl-list-begin']):
            continue

        # the table still has rows, so the section is not empty
        if len(next_el.xpath('.//tr')) > 0:
            continue

        el.getparent().remove(el)
        next_el.getparent().remove(next_el)

    # remove external links to unused resources
    for el in html.xpath('/html/head/link'):
        if el.get('rel') in ['alternate', 'search', 'edit', 'EditURI']:
            el.getparent().remove(el)

    # remove Google Analytics scripts
    for el in html.xpath('/html/body/script'):
        src = el.get('src')
        if src is not None and 'google-analytics.com/ga.js' in src:
            el.getparent().remove(el)
        elif el.text is not None and ('google-analytics.com/ga.js' in el.text or 'pageTracker' in el.text):
            el.getparent().remove(el)

    # apply changes to links caused by file renames; an element may carry
    # both attributes, in which case both are rewritten
    for el in html.xpath('//*[@src or @href]'):
        for attr in ('src', 'href'):
            link = el.get(attr)
            if link is not None:
                el.set(attr, fix_relative_link(rename_map, link))

    for err in parser.error_log:
        print("HTML WARN: {0}".format(err))
    text = etree.tostring(html, encoding=str, method="html")

    with open(fn, "w", encoding='utf-8') as f:
        f.write(text)

def preprocess_css_file(fn):
    """Rewrite the CSS file *fn* in place for use in the offline archive.

    Font URLs pointing one directory up are changed to point next to the
    stylesheet, and ``nth-child(1)`` is replaced with ``first-child``.
    The file is read and written as UTF-8.
    """
    with open(fn, "r", encoding='utf-8') as f:
        text = f.read()

    # note that query string is not used in css files

    text = text.replace('../DejaVuSansMonoCondensed60.ttf', 'DejaVuSansMonoCondensed60.ttf')
    text = text.replace('../DejaVuSansMonoCondensed75.ttf', 'DejaVuSansMonoCondensed75.ttf')

    # QT Help viewer doesn't understand nth-child
    text = text.replace('nth-child(1)', 'first-child')

    with open(fn, "w", encoding='utf-8') as f:
        f.write(text)

def main():
    """Command-line entry point: copy --src to --dst and preprocess the copy.

    Steps: copy the raw website mirror, rearrange its directory layout,
    rename files with problematic names, clean every HTML file, append the
    contents of ``preprocess-css.css`` (looked up in the current working
    directory) to the site stylesheet and finally clean the CSS files.
    """
    parser = argparse.ArgumentParser(prog='preprocess.py')
    # both options are mandatory: without them the paths below would be None
    parser.add_argument('--src', type=str, required=True,
                        help='Source directory where raw website copy resides')
    parser.add_argument('--dst', type=str, required=True,
                        help='Destination folder to put preprocessed archive to')
    args = parser.parse_args()

    root = args.dst
    src = args.src

    # the destination is deleted before copying, so it must never be the
    # source directory itself
    if os.path.realpath(src) == os.path.realpath(root):
        parser.error('--src and --dst must refer to different directories')

    # copy the source tree
    rmtree_if_exists(root)
    shutil.copytree(src, root)

    rearrange_archive(root)

    rename_map = find_files_to_be_renamed(root)
    rename_files(rename_map)

    # clean the html files
    for fn in find_html_files(root):
        preprocess_html_file(root, fn, rename_map)

    # append css modifications
    site_css = os.path.join(root, 'common/site_modules.css')

    with open("preprocess-css.css", "r", encoding='utf-8') as f:
        css_app = f.read()
    with open(site_css, "a", encoding='utf-8') as f:
        f.write(css_app)

    # clean the css files
    for fn in [site_css, os.path.join(root, 'common/ext.css')]:
        preprocess_css_file(fn)


# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()