File: url_normalize.py

package info (click to toggle)
url-normalize 2.2.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 268 kB
  • sloc: python: 935; makefile: 16; sh: 8
file content (81 lines) | stat: -rw-r--r-- 3,076 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""URL normalize main module.

Copyright (c) 2020 Nikolay Panov
This module is part of url-normalize package and is released under the MIT License:
https://opensource.org/licenses/MIT

"""

from __future__ import annotations

from .generic_url_cleanup import generic_url_cleanup
from .normalize_fragment import normalize_fragment
from .normalize_host import DEFAULT_CHARSET, normalize_host
from .normalize_path import normalize_path
from .normalize_port import normalize_port
from .normalize_query import normalize_query
from .normalize_scheme import DEFAULT_SCHEME, normalize_scheme
from .normalize_userinfo import normalize_userinfo
from .provide_url_domain import provide_url_domain
from .provide_url_scheme import provide_url_scheme
from .tools import deconstruct_url, reconstruct_url


def url_normalize(  # noqa: PLR0913
    url: str | None,
    *,  # Force keyword-only arguments
    charset: str = DEFAULT_CHARSET,
    default_scheme: str = DEFAULT_SCHEME,
    default_domain: str | None = None,
    filter_params: bool = False,
    param_allowlist: dict | list | None = None,
) -> str | None:
    """URI normalization routine.

    Sometimes you get an URL by a user that just isn't a real
    URL because it contains unsafe characters like ' ' and so on.
    This function can fix some of the problems in a similar way
    browsers handle data entered by the user:

    >>> url_normalize('http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
    'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    Params:
        url : str | None : URL to normalize
        charset : str : optional
            The target charset for the URL if the url was given as unicode string
        default_scheme : str : default scheme to use if none present
        default_domain : str | None : optional
            Default domain to use for absolute paths (starting with '/')
        filter_params : bool : optional
            Whether to filter non-allowlisted parameters (False by default)
        param_allowlist : dict | list | None : optional
            Override for the parameter allowlist

    Returns:
        str | None : a normalized url

    """
    if not url:
        return url
    url = provide_url_domain(url, default_domain)
    url = provide_url_scheme(url, default_scheme)
    url = generic_url_cleanup(url)
    url_elements = deconstruct_url(url)
    url_elements = url_elements._replace(
        scheme=normalize_scheme(url_elements.scheme),
        userinfo=normalize_userinfo(url_elements.userinfo),
        host=normalize_host(url_elements.host, charset),
        query=normalize_query(
            url_elements.query,
            host=url_elements.host,
            filter_params=filter_params,
            param_allowlist=param_allowlist,
        ),
        fragment=normalize_fragment(url_elements.fragment),
    )
    url_elements = url_elements._replace(
        port=normalize_port(url_elements.port, url_elements.scheme),
        path=normalize_path(url_elements.path, url_elements.scheme),
    )
    return reconstruct_url(url_elements)