File: entities.py

package info (click to toggle)
firefox 149.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 4,767,760 kB
  • sloc: cpp: 7,416,064; javascript: 6,752,859; ansic: 3,774,850; python: 1,250,473; xml: 641,578; asm: 439,191; java: 186,617; sh: 56,634; makefile: 18,856; objc: 13,092; perl: 12,763; pascal: 5,960; yacc: 4,583; cs: 3,846; lex: 1,720; ruby: 1,002; php: 436; lisp: 258; awk: 105; sql: 66; sed: 53; csh: 10; exp: 6
file content (101 lines) | stat: -rw-r--r-- 2,707 bytes parent folder | download | duplicates (27)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import json

import html5lib


def parse(path="html5ents.xml"):
    """Parse the entity reference document at *path* with html5lib.

    Args:
        path: Location of the document to read; defaults to
            ``html5ents.xml`` in the current working directory.

    Returns:
        The tree produced by ``html5lib.parse`` using the ``lxml``
        treebuilder.
    """
    # Parsing consumes the whole stream before returning, so the file can be
    # closed as soon as the tree has been built instead of leaking the handle.
    with open(path) as source:
        return html5lib.parse(source, treebuilder="lxml")


def entity_table(tree):
    """Build a mapping of entity name to replacement characters.

    Every ``tr`` found directly under a ``tbody`` in the XHTML namespace
    contributes one entry: the concatenated text of its first child is the
    entity name, and the ``.text`` of its second child holds the
    whitespace-separated code points.
    """
    xhtml_ns = {"h": "http://www.w3.org/1999/xhtml"}
    table = {}
    for row in tree.xpath("//h:tbody/h:tr", namespaces=xhtml_ns):
        raw_name = "".join(row[0].xpath(".//text()"))
        table[entity_name(raw_name)] = entity_characters(row[1].text)
    return table


def entity_name(inp):
    """Return *inp* with leading and trailing whitespace removed."""
    cleaned = inp.strip()
    return cleaned


def entity_characters(inp):
    """Turn a whitespace-separated list of code points into a string.

    Each non-empty token of *inp* is converted with
    ``codepoint_to_character`` and the results are concatenated in order.
    """
    characters = []
    for token in inp.split():
        if token:
            characters.append(codepoint_to_character(token))
    return "".join(characters)


def codepoint_to_character(inp):
    """Convert a ``U+XXXX`` style code point label into its character.

    Args:
        inp: Text whose first two characters are a prefix (such as ``U+``)
            followed by the code point in hexadecimal, up to eight digits.

    Returns:
        The text string containing the character for that code point.

    Raises:
        UnicodeDecodeError: If the digits do not form a valid escape.
    """
    # Skip the two-character prefix and left-pad to the eight hex digits a
    # "\U" escape needs, so the digit count no longer has to be exactly five.
    escape = "\\U" + inp[2:].zfill(8)
    # Go through ASCII bytes before decoding: on Python 3 only bytes have
    # .decode(), while on Python 2 the extra encode step is harmless.
    return escape.encode("ascii").decode("unicode-escape")


def make_tests_json(entities):
    """Return the JSON-serialisable test document for *entities*.

    The result is a dict with a single ``"tests"`` key whose value is the
    list of test cases, one per tuple returned by ``make_test_list``.
    """
    cases = []
    for case in make_test_list(entities):
        cases.append(make_test(*case))
    return {"tests": cases}


def make_test(name, characters, good):
    """Assemble one test case dict for the entity reference ``&name``.

    The dict carries a human-readable description, the input text (the
    name prefixed with an ampersand) and the expected output.
    """
    test = {}
    test["description"] = test_description(name, good)
    test["input"] = "&%s" % name
    test["output"] = test_expected(name, characters, good)
    return test


def test_description(name, good):
    with_semicolon = name.endswith(";")
    semicolon_text = {True: "with a semi-colon",
                      False: "without a semi-colon"}[with_semicolon]
    if good:
        text = "Named entity: %s %s" % (name, semicolon_text)
    else:
        text = "Bad named entity: %s %s" % (name, semicolon_text)
    return text


def test_expected(name, characters, good):
    rv = []
    if not good or not name.endswith(";"):
        rv.append("ParseError")
    rv.append(["Character", characters])
    return rv


def make_test_list(entities):
    """Return the sorted ``(name, characters, good)`` tuples to test.

    Every entity yields a good case. An entity whose name ends in a
    semicolon and has no shorter prefix in *entities* also yields a bad
    case: the name without the semicolon, expected to pass through as
    literal ``&name`` text.
    """
    cases = []
    for full_name, characters in entities.items():
        if full_name.endswith(";") and not subentity_exists(full_name, entities):
            bare_name = full_name[:-1]
            cases.append((bare_name, "&" + bare_name, False))
        cases.append((full_name, characters, True))
    cases.sort()
    return cases


def subentity_exists(entity_name, entities):
    """Report whether any proper, non-empty prefix of the name is an entity.

    Prefixes are probed from longest to shortest; the full name itself and
    the empty string are never checked.
    """
    return any(entity_name[:length] in entities
               for length in range(len(entity_name) - 1, 0, -1))


def make_entities_code(entities):
    """Render *entities* as Python source defining an ``entities`` dict.

    Args:
        entities: Mapping of entity name to its replacement characters.

    Returns:
        Source text for a dict literal with one line per entity, sorted by
        name. Values are written as ``u"..."`` literals with non-ASCII
        characters escaped and double quotes backslash-escaped.
    """
    lines = []
    for name in sorted(entities):
        # encode() yields bytes on Python 3; decode back to text so that
        # replace() and %-formatting work on str instead of raising
        # TypeError or embedding a b'...' repr in the output.
        escaped = (entities[name]
                   .encode("unicode-escape")
                   .decode("ascii")
                   .replace("\"", "\\\""))
        lines.append("    \"%s\": u\"%s\"," % (name, escaped))
    return """entities = {
%s
}""" % "\n".join(lines)


def main():
    """Regenerate the entity test file and the entity constants module.

    Parses the default entity document, then writes the test cases as
    indented JSON to ``namedEntities.test`` and the generated dict source to
    ``entities_constants.py``, both in the current working directory.
    """
    entities = entity_table(parse())
    tests_json = make_tests_json(entities)
    # Use context managers so each output is flushed and closed
    # deterministically rather than whenever the handle is collected.
    with open("namedEntities.test", "w") as tests_file:
        json.dump(tests_json, tests_file, indent=4)
    code = make_entities_code(entities)
    with open("entities_constants.py", "w") as code_file:
        code_file.write(code)


# Regenerate the output files only when run as a script, not on import.
if __name__ == "__main__":
    main()