File: unescape.py

package info (click to toggle)
python-jsonpath 2.0.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,028 kB
  • sloc: python: 9,473; makefile: 6
file content (134 lines) | stat: -rw-r--r-- 3,973 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
r"""Replace `\uXXXX` escape sequences with Unicode code points."""

from typing import List
from typing import Tuple

from .exceptions import JSONPathSyntaxError
from .token import Token


def unescape_string(value: str, token: Token, quote: str) -> str:
    """Return `value` with backslash escape sequences decoded.

    Args:
        value: The string to decode.
        token: Passed through to the helpers, which attach it to any
            `JSONPathSyntaxError` they raise.
        quote: The quote character that is accepted after a backslash.

    Raises:
        JSONPathSyntaxError: If an escape sequence is malformed, or if an
            unescaped character has a code point of 0x1F or lower.
    """
    length = len(value)
    parts: List[str] = []
    pos = 0

    while pos < length:
        char = value[pos]

        if char != "\\":
            # Called for validation only; the return value is not needed
            # because the character is copied through unchanged.
            _string_from_codepoint(ord(char), token)
            parts.append(char)
            pos += 1
            continue

        # The helper receives the index just after the backslash and hands
        # back the index of the last character belonging to the escape.
        decoded, last = _decode_escape_sequence(value, pos + 1, token, quote)
        parts.append(decoded)
        pos = last + 1

    return "".join(parts)


def _decode_escape_sequence(
    value: str, index: int, token: Token, quote: str
) -> Tuple[str, int]:
    """Decode the escape sequence whose first character is at `index`.

    `index` is the position immediately after the backslash.

    Returns:
        A `(decoded, last)` tuple, where `decoded` is the replacement string
        and `last` is the index of the final character of the escape
        sequence in `value`.

    Raises:
        JSONPathSyntaxError: If `index` is past the end of `value`, or the
            character at `index` does not start a recognised escape.
    """
    try:
        escaped = value[index]
    except IndexError as err:
        raise JSONPathSyntaxError("incomplete escape sequence", token=token) from err

    # The active quote character is checked before anything else.
    if escaped == quote:
        return quote, index

    single_char_escapes = {
        "\\": "\\",
        "/": "/",
        "b": "\x08",
        "f": "\x0c",
        "n": "\n",
        "r": "\r",
        "t": "\t",
    }
    replacement = single_char_escapes.get(escaped)
    if replacement is not None:
        return replacement, index

    if escaped != "u":
        raise JSONPathSyntaxError(
            f"unknown escape sequence at index {token.index + index - 1}",
            token=token,
        )

    # Hex escapes span several characters, so the helper reports how far
    # it read.
    codepoint, last = _decode_hex_char(value, index, token)
    return _string_from_codepoint(codepoint, token), last


def _decode_hex_char(value: str, index: int, token: Token) -> Tuple[int, int]:
    """Decode a four-hex-digit escape, joining a surrogate pair if present.

    On entry `index` is the position of the `u` that follows the backslash.

    Returns:
        A `(codepoint, last)` tuple, where `last` is the index of the final
        hex digit consumed from `value`.

    Raises:
        JSONPathSyntaxError: If fewer than four characters follow the `u`,
            if the escape is a lone low surrogate, or if a high surrogate
            is not immediately followed by a low surrogate escape.
    """
    size = len(value)

    # Four characters must exist after the 'u'.
    if index + 4 >= size:
        raise JSONPathSyntaxError(
            f"incomplete escape sequence at index {token.index + index - 1}",
            token=token,
        )

    index += 1  # now the position of the first hex digit
    first = _parse_hex_digits(value[index : index + 4], token)

    if _is_low_surrogate(first):
        raise JSONPathSyntaxError(
            f"unexpected low surrogate at index {token.index + index - 1}",
            token=token,
        )

    if not _is_high_surrogate(first):
        # Ordinary code point: the last digit is three past the first.
        return (first, index + 3)

    # A high surrogate must be followed by a backslash, a 'u' and four more
    # characters, which is six characters beyond the first four digits.
    has_second_escape = index + 9 < size and value[index + 4 : index + 6] == "\\u"
    if not has_second_escape:
        raise JSONPathSyntaxError(
            f"incomplete escape sequence at index {token.index + index - 2}",
            token=token,
        )

    second = _parse_hex_digits(value[index + 6 : index + 10], token)

    if not _is_low_surrogate(second):
        raise JSONPathSyntaxError(
            f"unexpected codepoint at index {token.index + index + 4}",
            token=token,
        )

    # Each surrogate contributes its low ten bits to the combined code point.
    upper_bits = (first & 0x03FF) << 10
    lower_bits = second & 0x03FF
    return (0x10000 + (upper_bits | lower_bits), index + 9)


def _parse_hex_digits(digits: str, token: Token) -> int:
    """Return the integer value of the hexadecimal string `digits`.

    Only the ASCII characters `0-9`, `a-f` and `A-F` are accepted. An empty
    string yields 0.

    Args:
        digits: The characters to interpret as hexadecimal digits.
        token: Attached to the `JSONPathSyntaxError` raised on bad input.

    Raises:
        JSONPathSyntaxError: If any character is not an ASCII hex digit.
    """
    codepoint = 0
    # Walk characters rather than UTF-8 bytes. Encoding the string first
    # would raise UnicodeEncodeError for a lone surrogate, letting the wrong
    # exception type escape instead of JSONPathSyntaxError.
    for ch in digits:
        codepoint <<= 4
        if "0" <= ch <= "9":
            codepoint |= ord(ch) - ord("0")
        elif "A" <= ch <= "F":
            codepoint |= ord(ch) - ord("A") + 10
        elif "a" <= ch <= "f":
            codepoint |= ord(ch) - ord("a") + 10
        else:
            raise JSONPathSyntaxError(
                "invalid \\uXXXX escape sequence",
                token=token,
            )
    return codepoint


def _string_from_codepoint(codepoint: int, token: Token) -> str:
    """Return the character for `codepoint`.

    Raises:
        JSONPathSyntaxError: If `codepoint` is 0x1F or lower.
    """
    if codepoint > 0x1F:
        return chr(codepoint)
    raise JSONPathSyntaxError("invalid character", token=token)


def _is_high_surrogate(codepoint: int) -> bool:
    """Return `True` if `codepoint` lies in the range 0xD800 to 0xDBFF inclusive."""
    return 0xD800 <= codepoint <= 0xDBFF


def _is_low_surrogate(codepoint: int) -> bool:
    """Return `True` if `codepoint` lies in the range 0xDC00 to 0xDFFF inclusive."""
    return 0xDC00 <= codepoint <= 0xDFFF