File: doctype.py

package info (click to toggle)
python-kajiki 1.0.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 700 kB
  • sloc: python: 4,098; makefile: 115
file content (227 lines) | stat: -rw-r--r-- 7,376 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import re


class DocumentTypeDeclaration:
    """Represents a http://en.wikipedia.org/wiki/Document_Type_Declaration

    This is used to look up the details of a DTD from its string form.
    DTDs can be registered in :attr:`.by_uri` and can then be looked up
    using the :meth:`.matching` method:

    >>> from kajiki.doctype import DocumentTypeDeclaration
    >>> dtd = DocumentTypeDeclaration(
    ...     "html4transitional",
    ...     "-//W3C//DTD HTML 4.01 Transitional//EN",
    ...     "http://www.w3.org/TR/html4/loose.dtd",
    ...     rendering_mode="html",
    ... )
    >>> dtd.uri
    'http://www.w3.org/TR/html4/loose.dtd'
    >>> DocumentTypeDeclaration.by_uri["http://www.w3.org/TR/html4/loose.dtd"] = dtd
    >>> match = DocumentTypeDeclaration.matching(
    ...     '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
    ...     '"http://www.w3.org/TR/html4/loose.dtd">'
    ... )
    >>> match.name
    'html4transitional'

    DocumentTypeDeclaration is used by :class:`kajiki.xml_template._Compiler`
    to detect the document doctype and tune generated template (for example
    by deciding if tags closed inline are allowed or not).
    """

    def __init__(
        self,
        name,
        fpi="",
        uri="",
        rendering_mode="xml",
        root_element="html",
        kind="PUBLIC",
    ):
        """Create a declaration and pre-compile the regex used to match it.

        *name* is a short identifier for the DTD (e.g. ``"xhtml1strict"``).
        *fpi* is the Formal Public Identifier.
        *uri* is the system identifier (the DTD location).
        *rendering_mode* is stored unchanged on the instance.
        *root_element* is the element name written right after ``<!DOCTYPE``.
        *kind* must be ``"PUBLIC"``, ``"SYSTEM"`` or an empty string;
        any other value fails the assertion below.

        Empty (falsy) *kind*, *fpi* and *uri* values are simply left out of
        the generated declaration string.
        """
        self.name = name
        self.fpi = fpi
        self.uri = uri
        self.rendering_mode = rendering_mode
        self.root_element = root_element
        assert kind in (  # noqa: S101
            "PUBLIC",
            "SYSTEM",
            "",
        ), '*kind* can be either "PUBLIC", "SYSTEM", or empty.'
        self.kind = kind
        self._cached_str = None

        # Build the pattern from the canonical string: every single space may
        # be matched by any run of whitespace, and everything else must match
        # literally.  ``re.escape`` is applied per token so that *all* regex
        # metacharacters are neutralised (e.g. the "+" in "XHTML+RDFa" or a
        # "?" in a URI), not just ".", "[" and "]".
        self.regex = re.compile(
            r"\s+".join(re.escape(token) for token in str(self).split(" ")),
            flags=re.IGNORECASE,
        )

    def __str__(self):
        """Return the declaration as ``<!DOCTYPE root [kind] ["fpi"] ["uri"]>``.

        The string is computed once and cached on the instance.
        """
        if self._cached_str is None:
            parts = ["<!DOCTYPE", self.root_element]
            if self.kind:
                parts.append(self.kind)
            if self.fpi:
                parts.append('"' + self.fpi + '"')
            if self.uri:
                parts.append('"' + self.uri + '"')
            self._cached_str = " ".join(parts) + ">"
        return self._cached_str

    by_uri = {}  # We store the public DTDs here.  # noqa: RUF012

    @classmethod
    def matching(cls, dtd_string):
        """Looks up the known DTDs and returns the instance that matches the
        provided dtd_string.

        Registered declarations are tried in the iteration order of
        :attr:`.by_uri` and the first one whose regex matches at the start
        of *dtd_string* is returned.  Matching ignores case and accepts any
        run of whitespace wherever the canonical string has a space.
        Returns ``None`` when no registered declaration matches.
        """
        for dtd in cls.by_uri.values():
            if dtd.regex.match(dtd_string):
                return dtd
        return None

    REGEX = re.compile(r"<!DOCTYPE[^>]+>")  # This matches any DTD.


# Register the well-known DTDs in DocumentTypeDeclaration.by_uri, keyed by
# each declaration's ``uri`` attribute.  The two HTML5-style doctypes carry
# no FPI and no kind; they end up under the keys "" and None respectively.
for dtd in (
    DocumentTypeDeclaration(name="html5", rendering_mode="html5", kind=""),
    DocumentTypeDeclaration(name="xhtml5", uri=None, kind=""),
    # XHTML family (default "xml" rendering mode).
    DocumentTypeDeclaration(
        name="xhtml1transitional",
        fpi="-//W3C//DTD XHTML 1.0 Transitional//EN",
        uri="http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
    ),
    DocumentTypeDeclaration(
        name="xhtml1strict",
        fpi="-//W3C//DTD XHTML 1.0 Strict//EN",
        uri="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",
    ),
    DocumentTypeDeclaration(
        name="xhtml1rdfa",
        fpi="-//W3C//DTD XHTML+RDFa 1.0//EN",
        uri="http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd",
    ),
    DocumentTypeDeclaration(
        name="xhtml11",
        fpi="-//W3C//DTD XHTML 1.1//EN",
        uri="http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",
    ),
    DocumentTypeDeclaration(
        name="xhtml1frameset",
        fpi="-//W3C//DTD XHTML 1.0 Frameset//EN",
        uri="http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",
    ),
    DocumentTypeDeclaration(
        name="xhtmlbasic11",
        fpi="-//W3C//DTD XHTML Basic 1.1//EN",
        uri="http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd",
    ),
    DocumentTypeDeclaration(
        name="xhtmlmobile12",
        fpi="-//WAPFORUM//DTD XHTML Mobile 1.2//EN",
        uri="http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd",
    ),
    # HTML 4.01 family ("html" rendering mode).
    DocumentTypeDeclaration(
        name="html4transitional",
        fpi="-//W3C//DTD HTML 4.01 Transitional//EN",
        uri="http://www.w3.org/TR/html4/loose.dtd",
        rendering_mode="html",
    ),
    DocumentTypeDeclaration(
        name="html4strict",
        fpi="-//W3C//DTD HTML 4.01//EN",
        uri="http://www.w3.org/TR/html4/strict.dtd",
        rendering_mode="html",
    ),
    DocumentTypeDeclaration(
        name="html4frameset",
        fpi="-//W3C//DTD HTML 4.01 Frameset//EN",
        uri="http://www.w3.org/TR/html4/frameset.dtd",
        rendering_mode="html",
    ),
    # Not registered (kept for reference):
    # html3='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">',
    # html2='<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">',
):
    DocumentTypeDeclaration.by_uri[dtd.uri] = dtd

# Matches an ``<?xml ...?>`` declaration (non-greedy up to the first "?>").
XML_DECLARATION = re.compile(r"<\?xml .*?\?>")


def extract_dtd(markup):
    """Find and strip the first DOCTYPE declaration from *markup*.

    Returns a 3-tuple ``(dtd_string, position, markup_without_the_DTD)``:

    * when a DOCTYPE is present, *dtd_string* is its text, *position* is
      the index where it started and the third item is *markup* with that
      declaration removed;
    * when there is none, *dtd_string* is an empty string, the markup is
      returned unchanged and *position* is the end of a leading
      ``<?xml ...?>`` declaration if there is one, otherwise ``0``.

    >>> markup = (
    ...     "<!DOCTYPE HTML PUBLIC "
    ...     '"-//W3C//DTD HTML 4.01 Transitional//EN" '
    ...     '"http://www.w3.org/TR/html4/loose.dtd">'
    ...     '''<html>
    ...     <head>
    ...     ...
    ...     </head>
    ...     <body>
    ...     ...
    ...     </body>
    ...     </html>'''
    ... )
    >>> import kajiki.doctype
    >>> dtd, dtd_pos, markup_without_dtd = kajiki.doctype.extract_dtd(markup)
    >>> print(dtd)  # doctest: +NORMALIZE_WHITESPACE
    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        "http://www.w3.org/TR/html4/loose.dtd">
    >>> print(dtd_pos)
    0
    >>> print(markup_without_dtd)  # doctest: +NORMALIZE_WHITESPACE
    <html>
    <head>
    ...
    </head>
    <body>
    ...
    </body>
    </html>

    >>> markup = '<?xml version="1.0"?><!DOCTYPE html><html><body></body></html>'
    >>> dtd, dtd_pos, markup_without_dtd = kajiki.doctype.extract_dtd(markup)
    >>> print(dtd)
    <!DOCTYPE html>
    >>> print(dtd_pos)
    21
    >>> print(markup_without_dtd)
    <?xml version="1.0"?><html><body></body></html>

    >>> markup = '<?xml version="1.0"?><html><head></head><body></body></html>'
    >>> dtd, dtd_pos, markup_without_dtd = kajiki.doctype.extract_dtd(markup)
    >>> print(dtd)
    <BLANKLINE>
    >>> print(dtd_pos)
    21
    >>> print(markup_without_dtd == markup)
    True

    >>> markup = '<?xml version="1.0" encoding="UTF-8"?><html><body></body></html>'
    >>> dtd, dtd_pos, markup_without_dtd = kajiki.doctype.extract_dtd(markup)
    >>> print(dtd)
    <BLANKLINE>
    >>> print(dtd_pos)
    38
    >>> print(markup_without_dtd == markup)
    True
    """
    dtd_match = DocumentTypeDeclaration.REGEX.search(markup)
    if dtd_match is not None:
        dtd_string = dtd_match.group()
        # Drop only the first occurrence, i.e. the declaration just found.
        stripped = markup.replace(dtd_string, "", 1)
        return dtd_string, dtd_match.start(), stripped

    # No DOCTYPE: report where one would go.  Nothing may precede an
    # <?xml ...?> declaration, so a prospective DTD belongs right after it.
    xml_decl = XML_DECLARATION.match(markup)
    insert_at = xml_decl.end() if xml_decl else 0
    return "", insert_at, markup