File: unicode2rstsubs.py

package info (click to toggle)
python-docutils 0.22%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 11,448 kB
  • sloc: python: 53,302; lisp: 14,475; xml: 1,807; javascript: 1,032; makefile: 102; sh: 96
file content (217 lines) | stat: -rwxr-xr-x 7,275 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#! /usr/bin/env python3
# $Id: unicode2rstsubs.py 10045 2025-03-09 01:02:23Z aa-turner $
# Author: David Goodger <goodger@python.org>
# Copyright: This program has been placed in the public domain.

"""
unicode2rstsubs.py -- produce character entity files (reStructuredText
substitutions) from the W3C master unicode.xml file.

This program extracts character entity and entity set information from a
unicode.xml file and produces multiple reStructuredText files (in the current
directory) containing substitutions.  Entity sets are from ISO 8879 & ISO
9573-13 (combined), MathML, and HTML4.  One or two files are produced for each
entity set; a second file with a "-wide.rst" suffix is produced if there are
wide-Unicode characters in the set.

The input file, unicode.xml, is maintained as part of the MathML 2
Recommendation XML source, and is available from
<https://www.w3.org/2003/entities/xml/>.
"""

from __future__ import annotations

import os
import re
import sys
from xml.parsers.expat import ParserCreate

# ``TYPE_CHECKING`` is defined locally as False, so the annotation-only
# imports below are never executed when the script runs.
# NOTE(review): static type checkers are expected to treat a variable with
# this name as true and so still see the imports -- confirm for the checker
# in use.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import BinaryIO, NoReturn, TextIO
    from xml.parsers.expat import XMLParserType


# Usage line written to stderr by ``usage()``; the ``%s`` placeholder is
# filled with the program name.
usage_msg = """Usage: %s [unicode.xml]\n"""


def usage(prog: str, status: int = 0, msg: str | None = None) -> NoReturn:
    """
    Write the usage line (and optionally `msg`) to stderr, then exit.

    `prog` is the program name interpolated into the usage line, `status`
    is the process exit status, and `msg`, if non-empty, is written on its
    own line after the usage line.  This function never returns.
    """
    parts = [usage_msg % prog]
    if msg:
        parts.append(msg + '\n')
    for part in parts:
        sys.stderr.write(part)
    sys.exit(status)


def main(argv: list[str] | None = None) -> None:
    """
    Command-line entry point.

    `argv` is the full argument vector including the program name; it
    defaults to ``sys.argv``.  At most one argument is accepted: the path
    of the input file, defaulting to ``unicode.xml`` in the current
    directory.

    Exits via `usage()` with status 2 if too many arguments are given and
    with status 1 if the input path is not an existing file.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) > 2:
        usage(argv[0], 2,
              'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
    inpath = argv[1] if len(argv) == 2 else 'unicode.xml'
    if not os.path.isfile(inpath):
        usage(argv[0], 1, 'No such file: "%s".' % inpath)
    # Binary mode: the XML parser consumes bytes.  The ``with`` block
    # guarantees the input file is closed, even if processing fails.
    with open(inpath, mode='rb') as infile:
        process(infile)


def process(infile: BinaryIO) -> None:
    """
    Parse the unicode.xml data in `infile` and write the substitution files.

    `infile` is a file object opened in binary mode.  The entities are first
    grouped by entity set, then every collected set is written out.
    """
    extractor = CharacterEntitySetExtractor(infile)
    extractor.group()
    extractor.write_sets()


class CharacterEntitySetExtractor:

    """
    Extracts character entity information from unicode.xml file, groups it by
    entity set, and writes out reStructuredText substitution files.

    Usage: instantiate with a binary file object, call `group()` to parse
    the input, then `write_sets()` to write the output files into the
    current directory.

    The three expat callbacks dispatch on the element name: for an element
    ``foo`` the optional methods ``foo_start(name, attributes)``,
    ``foo_end(name)`` and ``foo_data(data)`` are called if they exist.
    """

    unwanted_entity_sets = ['stix',     # unknown, buggy set
                            'predefined']

    header = """\
.. This data file has been placed in the public domain.
.. Derived from the Unicode character mappings available from
   <https://www.w3.org/2003/entities/xml/>.
   Processed by unicode2rstsubs.py, part of Docutils:
   <https://docutils.sourceforge.io>.
"""

    def __init__(self, infile: BinaryIO) -> None:
        self.infile = infile
        """Input unicode.xml file."""

        self.parser: XMLParserType = self.setup_parser()
        """XML parser."""

        self.elements: list[str] = []
        """Stack of element names.  Last is current element."""

        self.sets: dict[str, dict[str, str]] = {}
        """Mapping of charent set name to set dict."""

        self.charid: str | None = None
        """Current character's "id" attribute value."""

        self.descriptions: dict[str, str] = {}
        """Mapping of character ID to description."""

    def setup_parser(self) -> XMLParserType:
        """Return a new expat parser wired to this object's handlers."""
        parser = ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler
        parser.CharacterDataHandler = self.CharacterDataHandler
        return parser

    def group(self) -> None:
        """Parse the input file, collecting entities and descriptions."""
        self.parser.ParseFile(self.infile)

    def StartElementHandler(
        self, name: str, attributes: dict[str, str],
    ) -> None:
        """Push `name` on the element stack; call ``<name>_start`` if any."""
        self.elements.append(name)
        handler = getattr(self, name + '_start', None)
        if handler is not None:
            handler(name, attributes)

    def EndElementHandler(self, name: str) -> None:
        """Pop `name` off the element stack; call ``<name>_end`` if any."""
        # Sanity check: the end tag must close the current element.
        assert self.elements[-1] == name, (
            'unknown end-tag %r (%r)' % (name, self.elements[-1]))
        self.elements.pop()
        handler = getattr(self, name + '_end', None)
        if handler is not None:
            handler(name)

    def CharacterDataHandler(self, data: str) -> None:
        """Pass `data` to ``<current element>_data`` if such a method exists."""
        handler = getattr(self, self.elements[-1] + '_data', None)
        if handler is not None:
            handler(data)

    def character_start(self, name: str, attributes: dict[str, str]) -> None:
        """Remember the "id" of the <character> element being parsed."""
        self.charid = attributes['id']

    def entity_start(self, name: str, attributes: dict[str, str]) -> None:
        """
        Record an <entity> element in its entity set.

        The entity's "id" attribute is mapped to the current character ID.
        Entities of unwanted sets are ignored.  An entity name that is
        already mapped to a different character in the same set trips an
        assertion.
        """
        set_ = self.entity_set_name(attributes['set'])
        if not set_:
            return
        # Defensive only: `entity_set_name()` registers every name it
        # returns in `self.sets`.
        if set_ not in self.sets:
            print('bad set: %r' % set_)
            return
        entity = attributes['id']
        assert (entity not in self.sets[set_]
                or self.sets[set_][entity] == self.charid
                ), ('sets[%r][%r] == %r (!= %r)'
                    % (set_, entity, self.sets[set_][entity], self.charid))
        self.sets[set_][entity] = self.charid

    def description_data(self, data: str) -> None:
        """Append `data` to the description of the current character."""
        # Appending (not assigning) because the handler may be called
        # several times for one element.
        self.descriptions[self.charid] = (
            self.descriptions.get(self.charid, '') + data)

    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
    """Pattern to strip ISO numbers off the beginning of set names."""

    def entity_set_name(self, name: str) -> str | None:
        """
        Return lowcased and standard-number-free entity set name.
        Return ``None`` for unwanted entity sets and for names the
        pattern does not match (e.g. an empty name).

        As a side effect, an empty set dict is registered in `self.sets`
        for every name that is returned.
        """
        match = self.entity_set_name_pat.match(name)
        if match is None:
            return None
        name = match.group(1).lower()
        if name in self.unwanted_entity_sets:
            return None
        self.sets.setdefault(name, {})
        return name

    def write_sets(self) -> None:
        """Write the output file(s) for every collected set, in name order."""
        for set_name in sorted(self.sets):
            self.write_set(set_name)

    def write_set(self, set_name: str, wide: bool = False) -> None:
        """
        Write the substitution file for the entity set `set_name`.

        The file is created in the current directory as "<set_name>.rst",
        or "<set_name>-wide.rst" if `wide` is true.  If the non-wide pass
        skipped any wide-Unicode character, the method calls itself once
        more with ``wide=True`` to produce the second file.

        Raises `KeyError` if `set_name` is not a collected set.
        """
        outname = set_name + ('-wide.rst' if wide else '.rst')
        # Look the set up before opening the file, so that an unknown set
        # name does not leave an empty output file behind.
        set_ = self.sets[set_name]
        # Sort case-insensitively, with the original name as tie-breaker.
        entities = sorted((e.lower(), e) for e in set_)
        longest = max((len(e) for e in set_), default=0)
        has_wide = False
        # The ``with`` block closes the file before the wide pass starts.
        with open(outname, 'w', encoding='ascii') as outfile:
            print('writing file "%s"' % outname)
            outfile.write(self.header + '\n')
            for _, entity_name in entities:
                if self.write_entity(set_, set_name, entity_name, outfile,
                                     longest, wide):
                    has_wide = True
        if has_wide and not wide:
            self.write_set(set_name, wide=True)

    def write_entity(
        self,
        set_: dict[str, str],
        set_name: str,
        entity_name: str,
        outfile: TextIO,
        longest: int,
        wide: bool = False,
    ) -> bool:
        """
        Write the substitution definition for `entity_name` to `outfile`.

        The character ID is read as one prefix character followed by
        hyphen-separated hexadecimal code points.  `longest` is the length
        of the longest entity name in the set and is used to align the
        output columns.  `set_name` is not used here.

        Unless `wide` is true, an entity with a code point above U+FFFF is
        not written and ``True`` is returned.  In all other cases the line
        is written and ``False`` is returned.

        Raises `KeyError` if no description was collected for the character.
        """
        charid = set_[entity_name]
        codes = charid[1:].split('-')
        if not wide and any(int(code, 16) > 0xFFFF for code in codes):
            return True         # wide-Unicode character
        outfile.write('.. %-*s unicode:: %s .. %s\n'
                      % (longest + 2, '|' + entity_name + '|',
                         ' '.join('U+%s' % code for code in codes),
                         self.descriptions[charid]))
        return False


# Script entry point.  ``main()`` returns None, so a run that reaches the
# end exits with status 0; usage errors exit earlier, from ``usage()``.
if __name__ == '__main__':
    sys.exit(main())