File: tools.py

package info (click to toggle)
url-normalize 2.2.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 268 kB
  • sloc: python: 935; makefile: 16; sh: 8
file content (113 lines) | stat: -rw-r--r-- 2,564 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""URL normalization tools."""

from __future__ import annotations

import re
import unicodedata
from typing import NamedTuple
from urllib.parse import quote as quote_orig
from urllib.parse import unquote as unquote_orig
from urllib.parse import urlsplit, urlunsplit


class URL(NamedTuple):
    """Immutable record holding the individual pieces of a URL.

    Positional order of the fields: scheme, userinfo, host, port, path,
    query, fragment. Every field is a string.
    """

    # Scheme as returned by urlsplit, e.g. "http".
    scheme: str
    # Credentials including the trailing "@"; it is concatenated directly
    # in front of the host when the URL is rebuilt. "" when absent.
    userinfo: str
    # Host portion of the network location.
    host: str
    # Text following the host's ":" separator, without the colon itself;
    # "" when the URL has no port.
    port: str
    # Path, query and fragment as returned by urlsplit.
    path: str
    query: str
    fragment: str


def deconstruct_url(url: str) -> URL:
    """Transform the url into URL structure.

    Surrounding whitespace is stripped, the URL is split with ``urlsplit``
    and the network location is then divided into userinfo, host and port.

    Params:
        url : string : the URL

    Returns:
        URL : ``userinfo`` keeps its trailing "@" ("" when absent),
        ``host`` keeps the square brackets of an IP literal, and ``port``
        is the text after the host's ":" separator ("" when absent).

    """
    scheme, auth, path, query, fragment = urlsplit(url.strip())

    # Split on the LAST "@" (the same rule urllib applies when it derives
    # ``hostname``), so a stray "@" inside the credentials can never be
    # mistaken for the start of the host.
    credentials, at_sign, host_port = auth.rpartition("@")

    # A bracketed IP literal such as "[::1]" contains colons of its own.
    # Set it aside so that only a colon AFTER the closing bracket is
    # treated as the port separator.
    literal = ""
    if host_port.startswith("["):
        closing = host_port.find("]")
        if closing != -1:
            literal = host_port[: closing + 1]
            host_port = host_port[closing + 1 :]

    # Everything up to the first remaining ":" belongs to the host; the
    # rest (without that colon) is the port.
    host_tail, _, port = host_port.partition(":")

    return URL(
        scheme=scheme,
        userinfo=credentials + at_sign,
        host=literal + host_tail,
        port=port,
        path=path,
        query=query,
        fragment=fragment,
    )


def reconstruct_url(url: URL) -> str:
    """Assemble a URL string from its components.

    Params:
        url : URL object instance

    Returns:
        string : the URL produced by ``urlunsplit`` after the network
        location has been rebuilt from userinfo, host and port

    """
    # Network location = userinfo (may be empty or None) directly followed
    # by the host, plus ":port" only when a port is present.
    pieces = [url.userinfo or "", url.host]
    if url.port:
        pieces.extend((":", url.port))
    netloc = "".join(pieces)
    components = (url.scheme, netloc, url.path, url.query, url.fragment)
    return urlunsplit(components)


def force_unicode(string: str | bytes, charset: str = "utf-8") -> str:
    """Return the input as text, decoding it first when it is bytes.

    Params:
        string : str or bytes : the input value
        charset : str : optional : codec used to decode a bytes input

    Returns:
        str : a non-bytes input is returned unchanged; a bytes input is
        decoded with the "replace" error handler, so undecodable bytes
        are substituted rather than raising

    """
    if not isinstance(string, bytes):
        return string
    return string.decode(charset, errors="replace")


def unquote(string: str, charset: str = "utf-8") -> str:
    """Percent-decode a string and bring it to Unicode NFC form.

    Params:
        string : string to be unquoted
        charset : string : optional : codec handed to force_unicode and
            used for the final encode/decode round trip

    Returns:
        string : an unquoted and normalized string

    """
    decoded = force_unicode(unquote_orig(string), charset)
    composed = unicodedata.normalize("NFC", decoded)
    # Round-trip through ``charset``: the strict encode step raises if the
    # normalized text cannot be represented in that codec.
    return composed.encode(charset).decode(charset)


def quote(string: str, safe: str = "/") -> str:
    """Percent-encode a string.

    Thin wrapper that delegates to ``urllib.parse.quote``.

    Params:
        string : string to be quoted
        safe : string of characters that must be left unescaped

    Returns:
        string : quoted string

    """
    quoted = quote_orig(string, safe=safe)
    return quoted