File: lib.py

package info (click to toggle)
utidylib 0.10-3
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 216 kB
  • sloc: python: 428; makefile: 148; sh: 41
file content (376 lines) | stat: -rw-r--r-- 11,088 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
from __future__ import annotations

import ctypes
import io
import os
import os.path
import weakref
from abc import ABC, abstractmethod
from errno import ENOMEM
from typing import (
    TYPE_CHECKING,
    Any,
    BinaryIO,
    Callable,
    ClassVar,
    Mapping,
    TypeVar,
)

from tidy.error import InvalidOptionError, OptionArgError

if TYPE_CHECKING:
    # Aliases used only inside annotations.  They do not exist at runtime,
    # which is fine because ``from __future__ import annotations`` at the top
    # of this file keeps annotations from being evaluated.

    # A single option value as accepted from callers.
    OPTION_TYPE = str | int | bool | None
    # Option name mapped to option value.
    OPTION_DICT_TYPE = dict[str, OPTION_TYPE]

# Candidate names for the tidy shared library.  Loader tries them in order
# with ctypes.CDLL and keeps the first one that can be opened.
LIBNAMES = (
    # Linux
    "libtidy.so",
    # macOS
    "libtidy.dylib",
    # Windows
    "tidy",
    # Cygwin
    "cygtidy-0-99-0",
    # Linux, full soname
    "libtidy-0.99.so.0",
    # Linux, full soname including minor/patch version
    "libtidy-0.99.so.0.0.0",
    # HTML tidy
    "libtidy.so.5",
    # Linux, HTML tidy v5.8
    "libtidy.so.58",
    # Debian changed soname
    "libtidy.so.5deb1",
    # Windows?
    "libtidy",
    # Windows?
    "tidylib",
)


class Loader:
    """
    ctypes.CDLL wrapper.

    I am a trivial wrapper that eliminates the need for tidy.tidyFoo,
    so you can just access tidy.Foo.
    """

    def __init__(self, libnames: tuple[str, ...] | None = None) -> None:
        self.lib: ctypes.CDLL
        self.libnames: tuple[str, ...] = libnames or LIBNAMES

        # Add package directory to search path
        os.environ["PATH"] = "".join(
            (os.path.dirname(__file__), os.pathsep, os.environ["PATH"]),
        )

        # Add full path to a library
        lib_path = os.environ.get("TIDY_LIBRARY_FULL_PATH")
        if lib_path:
            self.libnames = (lib_path, *self.libnames)

        # Try loading library
        for libname in self.libnames:
            try:
                self.lib = ctypes.CDLL(libname)
                break
            except OSError:
                continue
        else:
            # Fail in case we could not load it
            raise OSError("Couldn't find libtidy, please make sure it is installed.")

        # Adjust some types
        self.Create.restype = ctypes.POINTER(ctypes.c_void_p)
        self.LibraryVersion.restype = ctypes.c_char_p

    def __getattr__(self, name: str) -> Any:  # noqa: ANN401
        return getattr(self.lib, "tidy%s" % name)


# Module-wide binding to libtidy.  Instantiating Loader here means the shared
# library is located and loaded as a side effect of importing this module; an
# OSError propagates out of the import when no candidate can be loaded.
_tidy = Loader()


# ctypes prototype of the byte-output callback: returns a C int and receives
# (C int handle, C char byte).
# NOTE(review): presumably mirrors tidy's TidyPutByteFunc; confirm the return
# type and the sinkData argument type against the tidy headers in use.
_putByteFunction = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_char)


# Callback handed to TidyLib (stored in every _OutputSink by _Sink)
@_putByteFunction
def putByte(handle: int, char: bytes) -> int:
    """
    Lookup sink by handle and call its putByte method.

    :param handle: key of the target sink in ``sinkfactory``
    :param char: the byte to append; ctypes passes a ``c_char`` callback
                 argument as a ``bytes`` object of length one
    :return: always 0
    """
    sinkfactory[handle].putByte(char)
    return 0


class _OutputSink(ctypes.Structure):
    """ctypes structure describing an output sink; passed by reference to tidy."""

    # NOTE(review): presumably mirrors tidy's TidyOutputSink struct; confirm
    # the field types against the tidy headers in use.
    _fields_ = (
        # Integer handle identifying the Python-side sink.
        ("sinkData", ctypes.c_int),
        # Callback that receives the output bytes.
        ("putByte", _putByteFunction),
    )


class _Sink:
    """In-memory byte collector plus the ctypes struct that describes it."""

    def __init__(self, handle: int) -> None:
        # Bytes received so far.
        self._data = io.BytesIO()
        # Struct wired to the module-level putByte callback.
        sink_struct = _OutputSink()
        sink_struct.putByte = putByte
        self.struct = sink_struct
        # Key under which this sink is registered.
        self.handle = handle

    def putByte(self, byte: bytes) -> None:
        """Append ``byte`` to the collected data."""
        buffer = self._data
        buffer.write(byte)

    def getvalue(self) -> bytes:
        """Return everything collected so far."""
        collected = self._data.getvalue()
        return collected


class ReportItem:
    """
    Error report item as returned by tidy.

    Two message shapes are understood::

        line <line number> column <column number> - <Severity>: <message>
        <Severity>: <message>

    Anything that does not match the first shape is handled as the second,
    so unexpected text never makes construction fail.

    :param err: one line of tidy's diagnostic output
    """

    severities: ClassVar[dict[str, str]] = {
        "W": "Warning",
        "E": "Error",
        "C": "Config",
        "D": "Document",
    }

    def __init__(self, err: str) -> None:
        self.err: str = err  #: Whole error message as returned by tidy
        self.full_severity: str  #: Full severity string
        self.severity: str  #: D, W, E or C indicating severity
        self.message: str  #: Error message itself
        self.line: int | None  #: Line where error was fired (can be None)
        self.col: int | None  #: Column where error was fired (can be None)

        # Expected fields of the located form:
        #   0:"line" 1:<line> 2:"column" 3:<column> 4:"-" 5:<Severity:> 6:<message>
        fields = err.split(" ", 6) if err.startswith("line") else []
        position = None
        if len(fields) >= 6 and fields[5]:
            try:
                position = int(fields[1]), int(fields[3])
            except ValueError:
                # Starts with "line" but carries no numeric position; treat
                # it as a plain message below.
                position = None

        if position is not None:
            self.line, self.col = position
            self.full_severity = fields[5]
            self.severity = fields[5][0]  # W, E or C
            self.message = fields[6] if len(fields) > 6 else ""
        else:
            # partition() copes with text that contains no space at all,
            # where indexing the result of split() would raise IndexError.
            head, _, rest = err.partition(" ")
            self.full_severity = head
            self.severity = head[:1]
            self.message = rest
            self.line = None
            self.col = None

    def get_severity(self) -> str:
        """
        Return the human readable severity.

        Known one-letter codes are looked up in :attr:`severities`; anything
        else falls back to the severity word from the message itself, without
        its trailing colon.
        """
        label = self.severities.get(self.severity)
        if label is None:
            label = self.full_severity.strip().rstrip(":")
        return label

    def __str__(self) -> str:
        if self.line:
            return (
                f"line {self.line} col {self.col} - "
                f"{self.get_severity()}: {self.message}"
            )
        return f"{self.get_severity()}: {self.message}"

    def __repr__(self) -> str:
        # Escape single quotes so the text can sit inside the quoted form.
        escaped = str(self).replace("'", "\\'")
        return f"{self.__class__.__name__}('{escaped}')"


# Key and value type parameters used by the generic FactoryDict.
K = TypeVar("K")
V = TypeVar("V")


class FactoryDict(ABC, dict, Mapping[K, V]):
    """
    Dictionary that hands out its own entries.

    New items come from :meth:`create`; plain ``d[key] = value`` assignment
    is rejected so that the subclass stays in charge of the keys.
    """

    @abstractmethod
    def create(self) -> V:
        """Generate a new item; subclasses supply the implementation."""
        raise NotImplementedError

    def _setitem(self, name: K, value: V) -> None:
        # Internal way around the blocked __setitem__ below.
        super().__setitem__(name, value)

    def __setitem__(self, _: K, __: V) -> None:
        message = "Use create() to get a new object"
        raise TypeError(message)


class SinkFactory(FactoryDict[int, _Sink]):
    """Registry of sinks keyed by an integer handle."""

    def __init__(self) -> None:
        super().__init__()
        # Handle that the next created sink will receive.
        self.lastsink: int = 0

    def create(self) -> _Sink:
        """Create a sink, register it under a fresh handle and return it."""
        handle = self.lastsink
        new_sink = _Sink(handle)
        # The handle travels through the struct so that the callback can find
        # the sink again.
        new_sink.struct.sinkData = handle
        self._setitem(handle, new_sink)
        self.lastsink = handle + 1
        return new_sink


# Module-wide sink registry: the putByte callback looks sinks up here by
# handle, and Document creates and removes its error sink through it.
sinkfactory = SinkFactory()


class Document:
    """
    Document object as returned by :func:`parseString` or :func:`parse`.

    Holds a tidy document handle (``cdoc``) together with the sink that
    collects the diagnostics tidy emits for it.

    :param options: tidy options to apply; underscores in option names are
                    replaced by dashes before they are handed to tidy.
    :raises OptionArgError: tidy reported a missing or malformed option value.
    :raises InvalidOptionError: tidy reported an unknown option.
    """

    def __init__(self, options: OPTION_DICT_TYPE) -> None:
        self.cdoc = _tidy.Create()
        self.options = options
        self.errsink = sinkfactory.create()
        _tidy.SetErrorSink(self.cdoc, ctypes.byref(self.errsink.struct))
        self._set_options()

    def _set_options(self) -> None:
        """Pass every option to tidy, raising on the first one it rejects."""
        for key, value in self.options.items():
            # this will flush out most argument type errors...
            if value is None:
                text = ""
            elif isinstance(value, bool):
                # tidy takes booleans as 0/1 rather than "True"/"False"
                text = str(int(value))
            else:
                text = str(value)

            _tidy.OptParseValue(
                self.cdoc,
                key.replace("_", "-").encode("utf-8"),
                text.encode("utf-8"),
            )

            # Parse the error sink once per option instead of once per check.
            reported = self.errors
            if not reported:
                continue
            last_message = reported[-1].message
            for prefix, exception_type in ERROR_MAP.items():
                if last_message.startswith(prefix):
                    raise exception_type(last_message)

    def __del__(self) -> None:
        # __init__ can fail before the sink exists, and the sink may already
        # be gone from the registry; a finalizer must cope with both.
        errsink = getattr(self, "errsink", None)
        if errsink is not None:
            sinkfactory.pop(errsink.handle, None)

    def write(self, stream: BinaryIO) -> None:
        """
        :param stream: Writable file like object.

        Writes document to the stream.
        """
        stream.write(self.getvalue())

    def get_errors(self) -> list[ReportItem]:
        """Return list of errors as a list of :class:`ReportItem`."""
        report = self.errsink.getvalue().decode("utf-8")
        stripped = (line.strip() for line in report.splitlines())
        return [ReportItem(line) for line in stripped if line]

    @property
    def errors(self) -> list[ReportItem]:
        """Diagnostics collected so far; same as :meth:`get_errors`."""
        return self.get_errors()

    def getvalue(self) -> bytes:
        """
        Raw bytes as returned by tidy.

        Starts with an 8 KiB buffer and retries once with the size tidy asks
        for when that is too small.
        """
        size = ctypes.c_uint(8192)
        string_buffer = ctypes.create_string_buffer(size.value)
        status = _tidy.SaveString(self.cdoc, string_buffer, ctypes.byref(size))
        if status == -ENOMEM:  # buffer too small, size now holds what is needed
            string_buffer = ctypes.create_string_buffer(size.value)
            status = _tidy.SaveString(self.cdoc, string_buffer, ctypes.byref(size))
        if status < 0:
            # Unexpected failure: the reported size cannot be relied on, so
            # keep the NUL-terminated view of whatever is in the buffer.
            return string_buffer.value
        # ``.value`` would stop at the first NUL byte and so truncate output
        # in encodings such as UTF-16; slice by the reported length instead.
        # NOTE(review): relies on tidySaveString storing the output length in
        # the size argument on success too - confirm for the tidy in use.
        return string_buffer.raw[: size.value]

    def gettext(self) -> str:
        """Unicode text for output returned by tidy."""
        output_encoding = self.options["output_encoding"]
        assert isinstance(output_encoding, str)
        return self.getvalue().decode(output_encoding)

    def __str__(self) -> str:
        return self.gettext()


# Leading text of a tidy diagnostic mapped to the exception class that
# Document._set_options raises when the latest diagnostic starts with it.
ERROR_MAP = {
    "missing or malformed argument for option: ": OptionArgError,
    "unknown option: ": InvalidOptionError,
}


class DocumentFactory(FactoryDict[weakref.ReferenceType, Document]):
    """
    Factory for :class:`Document` objects.

    Every created document is tracked through a weak reference mapped to its
    tidy handle, so that the handle can be released once the document has
    been garbage collected.
    """

    @staticmethod
    def load(
        doc: Document,
        arg: bytes,
        loader: Callable[[Document, bytes], int],
    ) -> None:
        """
        Feed ``arg`` to tidy and clean the document up on success.

        :param doc: document to load into
        :param arg: value handed to ``loader`` (file name or markup)
        :param loader: tidy parse function, called as ``loader(doc.cdoc, arg)``
        """
        status = loader(doc.cdoc, arg)
        # A negative status skips the clean-up pass.
        if status >= 0:
            _tidy.CleanAndRepair(doc.cdoc)

    def loadFile(self, doc: Document, filename: str) -> None:
        """Parse the file called ``filename`` into ``doc``."""
        self.load(doc, filename.encode("utf-8"), _tidy.ParseFile)

    def loadString(self, doc: Document, text: bytes) -> None:
        """Parse the markup in ``text`` into ``doc``."""
        self.load(doc, text, _tidy.ParseString)

    def create(self, **kwargs: OPTION_TYPE) -> Document:
        """
        Create an empty :class:`Document` configured with ``kwargs``.

        ``output_encoding`` and ``input_encoding`` default to the value of
        ``char_encoding``, or ``"utf8"`` when that is not given either.
        """
        enc = kwargs.get("char_encoding", "utf8")
        kwargs.setdefault("output_encoding", enc)
        kwargs.setdefault("input_encoding", enc)
        doc = Document(kwargs)
        # The weak reference doubles as the registry key and as the trigger
        # for releasing the tidy handle.
        ref = weakref.ref(doc, self.releaseDoc)
        FactoryDict._setitem(self, ref, doc.cdoc)  # noqa: SLF001
        return doc

    def parse(self, filename: str, **kwargs: OPTION_TYPE) -> Document:
        """
        Open and process filename as an HTML file.

        Returning a processed document object.

        :param kwargs: named options to pass to TidyLib for processing the
                       input file.
        :param filename: the name of a file to process
        :return: a :class:`Document` object

        """
        doc = self.create(**kwargs)
        self.loadFile(doc, filename)
        return doc

    def parseString(self, text: bytes | str, **kwargs: OPTION_TYPE) -> Document:
        """
        Use text as an HTML file.

        Returning a processed document object.

        :param kwargs: named options to pass to TidyLib for processing the
                       input file.
        :param text: the string to parse; ``str`` input is encoded with the
                     document's ``input_encoding`` first
        :return: a :class:`Document` object

        """
        doc = self.create(**kwargs)
        if isinstance(text, str):
            input_encoding = doc.options["input_encoding"]
            assert isinstance(input_encoding, str)
            text = text.encode(input_encoding)
        self.loadString(doc, text)
        return doc

    def releaseDoc(self, ref: weakref.ReferenceType) -> None:
        """
        Release the tidy handle of a collected document.

        Called as the weak reference callback.  The entry is removed from the
        registry as well; otherwise one dead reference per document ever
        created would pile up for the lifetime of the process.
        """
        cdoc = self.pop(ref, None)
        if cdoc is not None:
            _tidy.Release(cdoc)


# Shared factory instance; two of its bound methods are exposed as the
# module-level parse() and parseString() functions.
docfactory = DocumentFactory()
parse = docfactory.parse
parseString = docfactory.parseString


def getTidyVersion() -> str:
    """Return the version string reported by the loaded tidy library."""
    library = _tidy.lib
    raw_version = library.tidyLibraryVersion()
    assert isinstance(raw_version, bytes)
    return raw_version.decode()