File: genattrs.py

package info (click to toggle)
html5-parser 0.4.9-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 1,764 kB
  • sloc: ansic: 32,441; python: 2,055; makefile: 13
file content (100 lines) | stat: -rwxr-xr-x 3,471 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import os
import re
import subprocess

from lxml import html

# Absolute path of this script; main() changes into its directory so that the
# relative "src/..." output paths are resolved next to the script.
self_path = os.path.abspath(__file__)
# Banner placed at the top of every generated header file.
HEADER = '''\
// Do not edit
// Generated by genattrs.py

'''


def generate_attr_headers(attrs):
    """Write the string, enum and size header files for *attrs*.

    Three files are (re)created under ``src/``, each starting with HEADER:

    * ``attr_strings.h`` -- one quoted attribute name per line.
    * ``attr_enum.h`` -- one ``HTML_ATTR_<NAME>`` enumerator per line, where
      ``<NAME>`` is the upper-cased name with ``-`` and ``:`` turned into ``_``.
    * ``attr_sizes.h`` -- the length of every name on a single line.

    Args:
        attrs: iterable of attribute name strings, emitted in iteration order.
    """
    def emit(handle, text):
        # All output files are opened in binary mode, so encode explicitly.
        handle.write(text.encode('utf-8'))

    with open("src/attr_strings.h", "wb") as strings_h, \
            open("src/attr_enum.h", "wb") as enum_h, \
            open("src/attr_sizes.h", "wb") as sizes_h:
        emit(strings_h, HEADER)
        emit(enum_h, HEADER)
        emit(sizes_h, HEADER)
        for name in attrs:
            # C identifiers cannot contain '-' or ':'.
            ident = name.upper().replace('-', '_').replace(':', '_')
            emit(strings_h, '"%s",\n' % name)
            emit(enum_h, 'HTML_ATTR_%s,\n' % ident)
            emit(sizes_h, '%d, ' % len(name))
        # The sizes are written on one line; terminate it.
        sizes_h.write(b'\n')


def generate_attr_perfect_hash(attrs, repetitions=400):
    """Generate ``src/attr_perf.h`` from gperf's perfect hash over *attrs*.

    The attribute names are piped, one per line, to
    ``gperf -LANSI-C -H attr_hash -m<repetitions> /dev/stdin``.  The gperf
    output is then post-processed:

    * Everything from one line above ``in_word_set`` onwards is removed from
      the emitted code; it is only searched for the ``wordlist[]`` initialiser.
    * Each ``wordlist[]`` entry becomes an ``HTML_ATTR_<NAME>`` identifier in a
      new ``HTML_ATTR_MAP`` array; empty entries become ``HTML_ATTR_LAST``.
    * Everything up to the last line starting with ``attr_hash`` is replaced
      by HEADER followed by ``static inline unsigned int``.

    Args:
        attrs: iterable of attribute name strings.
        repetitions: value passed to gperf's ``-m`` option.

    Raises:
        SystemExit: with gperf's exit status if gperf fails, or with a message
            if ``in_word_set`` or ``wordlist[]`` cannot be found in its output.
    """
    cmd = ['gperf', '-LANSI-C', '-H', 'attr_hash',
           '-m{}'.format(repetitions), '/dev/stdin']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE)
    stdout = p.communicate('\n'.join(attrs).encode('utf-8'))[0]
    # communicate() has already waited for the process, so returncode is set.
    if p.returncode != 0:
        raise SystemExit(p.returncode)
    raw = stdout.decode('utf-8').splitlines()
    for i, line in enumerate(raw):
        if line.startswith('in_word_set'):
            break
    else:
        raise SystemExit('Failed to find in_word_set()')
    # Cut one line above the match as well.
    # NOTE(review): presumably that line holds the function's return type --
    # confirm against actual gperf output.
    # Clamp at 0 so a match on the very first line cannot yield a negative
    # (i.e. from-the-end) slice index.
    cut = max(i - 1, 0)
    lines = raw[:cut]
    tail = '\n'.join(raw[cut:])
    # Raw string: \[ \] and \s are regex escapes, not string escapes.
    wordlist = re.search(r"wordlist\[\]\s+=\s+{(.*?)}", tail, re.DOTALL)
    if wordlist is None:
        raise SystemExit('Failed to find wordlist')
    wordlist = [w.strip().replace('"', '') for w in wordlist.group(1).split(',')]
    # Empty slots in gperf's table map to the HTML_ATTR_LAST sentinel.
    attrlist = ["\tHTML_ATTR_" + (w.upper().replace('-', '_').replace(':', '_') if w else 'LAST')
                for w in wordlist]
    processed = '\n'.join(lines) + '\n\n'
    processed += 'static const HTMLAttr HTML_ATTR_MAP[] = {\n%s\n};' % '\n,'.join(attrlist)
    # Greedy match with DOTALL: drops everything before the last line that
    # starts with attr_hash and substitutes our own banner and declaration.
    processed = re.sub(
        r'.+^attr_hash',
        HEADER + 'static inline unsigned int\nattr_hash',
        processed,
        flags=re.DOTALL | re.MULTILINE)
    with open('src/attr_perf.h', 'wb') as f:
        f.write(processed.encode('utf-8'))
        f.write(b'\n')


def _parse_html_file(path):
    """Read the file at *path* and return its lxml-parsed root element."""
    # Use a context manager so the file handle is closed promptly.
    with open(path, 'rb') as f:
        return html.fromstring(f.read())


def get_attr_names(html_attrs_path='/t/Attributes',
                   svg_attrs_path='/t/Attribute'):
    """Yield attribute names scraped from two saved reference pages.

    Names may be yielded more than once; callers should de-duplicate.

    Args:
        html_attrs_path: local copy of
            https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
        svg_attrs_path: local copy of
            https://developer.mozilla.org/en-US/docs/Web/SVG/Attribute

    Yields:
        Attribute names with surrounding whitespace stripped.

    Raises:
        IndexError: if the expected table or heading is missing from a page.
        IOError/OSError: if a page file cannot be read.
    """
    # HTML Attributes: first <code> of the first <td> in each table row.
    root = _parse_html_file(html_attrs_path)
    table = root.xpath('//table[@class="standard-table"]/tbody')[0]
    for tr in table.findall('tr'):
        td = tr.find('td')
        code = td.find('code') if td is not None else None
        if code is None:
            # Row without a <td>/<code> cell: nothing to extract.
            continue
        attr = code.text
        # Names containing '*' are skipped.
        if attr and '*' not in attr:
            yield attr.strip()
    # SVG Attributes: link texts in the lists of the first <div> following
    # the "SVG_Attributes" heading.
    root = _parse_html_file(svg_attrs_path)
    h2 = root.xpath('//h2[@id="SVG_Attributes"]')[0]
    for ul in h2.xpath('following-sibling::div[1]/ul'):
        for attr in ul.xpath('./li/code/a/text()'):
            yield attr.strip()


def main():
    """Regenerate all attribute header files next to this script."""
    # Output paths are relative ("src/..."), so run from the script's folder.
    script_dir = os.path.dirname(self_path)
    os.chdir(script_dir)
    # De-duplicate the scraped names and add the one extra attribute, then
    # sort so that the generated files have a stable order.
    names = set(get_attr_names())
    names.add('data-reactid')
    ordered = sorted(names)
    generate_attr_headers(ordered)
    generate_attr_perfect_hash(ordered)


# Regenerate the headers only when run as a script, not when imported.
if __name__ == '__main__':
    main()