File: normalize_host.py

package info (click to toggle)
url-normalize 2.2.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 268 kB
  • sloc: python: 935; makefile: 16; sh: 8
file content (42 lines) | stat: -rw-r--r-- 1,131 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""URL host normalization."""

from __future__ import annotations

import idna

from .tools import force_unicode

# Charset used by normalize_host when the caller does not supply one.
DEFAULT_CHARSET = "utf-8"


def normalize_host(host: str, charset: str = DEFAULT_CHARSET) -> str:
    """Normalize host part of the url.

    Lowercase the host, strip leading and trailing dots, drop empty labels
    (produced by consecutive dots) and convert each label to its ASCII form
    using IDNA2008 with UTS46 transitional processing. If IDNA2008 rejects
    a label, the whole host is encoded with Python's built-in "idna" codec
    instead.

    Params:
        host : string : url host, e.g., 'site.com'
        charset : string : charset handed to force_unicode for the input host

    Returns:
        string : normalized host data.

    Raises:
        UnicodeError : if the built-in "idna" codec fallback also rejects
            the host (e.g. a label that is too long).

    """
    host = force_unicode(host, charset)
    host = host.lower().strip(".")

    # Handle each label separately so mixed unicode/ascii domains work;
    # empty labels are discarded so that "a..b" normalizes like "a.b".
    labels = [label for label in host.split(".") if label]
    try:
        # IDNA output is always pure ASCII, so it is decoded as ASCII rather
        # than with the caller's charset (which may not be ASCII-compatible,
        # e.g. "utf-16").
        encoded = [
            idna.encode(label, uts46=True, transitional=True).decode("ascii")
            for label in labels
        ]
    except idna.IDNAError:
        # Fallback to the built-in codec if IDNA2008 processing fails. Use
        # the same filtered labels as the primary path so an empty interior
        # label does not make the fallback raise where the primary path
        # would have dropped it.
        return ".".join(labels).encode("idna").decode("ascii")
    return ".".join(encoded)