File: _json_api.py

package info (click to toggle)
python-srsly 2.5.2-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,256 kB
  • sloc: python: 17,567; ansic: 1,434; sh: 12; makefile: 6
file content (209 lines) | stat: -rw-r--r-- 7,061 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
from typing import Union, Iterable, Sequence, Any, Optional, Iterator
import sys
import json as _builtin_json
import gzip

import ujson
from .util import force_path, force_string, FilePath, JSONInput, JSONOutput


def json_dumps(
    data: JSONInput, indent: Optional[int] = 0, sort_keys: bool = False
) -> str:
    """Serialize an object to a JSON string.

    data: The JSON-serializable data.
    indent (int): Number of spaces used to indent JSON.
    sort_keys (bool): Sort dictionary keys. Falls back to json module for now.
    RETURNS (str): The serialized string.
    """
    if not sort_keys:
        return ujson.dumps(data, indent=indent, escape_forward_slashes=False)
    # ujson has no sort_keys support, so fall back to the built-in json
    # module. There, indent=0 would still insert newlines, so map the
    # compact default 0 to None to keep the output on one line.
    return _builtin_json.dumps(
        data,
        indent=None if indent == 0 else indent,
        separators=(",", ":"),
        sort_keys=True,
    )


def json_loads(data: Union[str, bytes]) -> JSONOutput:
    """Deserialize unicode or bytes to a Python object.

    data (str / bytes): The data to deserialize.
    RETURNS: The deserialized Python object.
    """
    # ujson would silently parse the bare string "-" as the int 0, so
    # reject it up front with the error message ujson uses elsewhere.
    if data == "-":
        raise ValueError("Expected object or value")
    return ujson.loads(data)


def read_json(path: FilePath) -> JSONOutput:
    """Load JSON from file or standard input.

    path (FilePath): The file path. "-" for reading from stdin.
    RETURNS (JSONOutput): The loaded JSON content.
    """
    if path != "-":
        file_path = force_path(path)
        with file_path.open("r", encoding="utf8") as f:
            return ujson.load(f)
    # "-" means consume the whole of standard input instead of a file
    return ujson.loads(sys.stdin.read())


def read_gzip_json(path: FilePath) -> JSONOutput:
    """Load JSON from a gzipped file.

    path (FilePath): The file path.
    RETURNS (JSONOutput): The loaded JSON content.
    """
    with gzip.open(force_string(path), "r") as f:
        return ujson.load(f)


def read_gzip_jsonl(path: FilePath, skip: bool = False) -> Iterator[JSONOutput]:
    """Read a gzipped .jsonl file and yield contents line by line.
    Blank lines will always be skipped.

    path (FilePath): The file path.
    skip (bool): Skip broken lines and don't raise ValueError.
    YIELDS (JSONOutput): The unpacked, deserialized Python objects.
    """
    with gzip.open(force_path(path), "r") as f:
        yield from _yield_json_lines(f, skip=skip)


def write_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
    """Create a .json file and dump contents or write to standard
    output.

    path (FilePath): The file path. "-" for writing to stdout.
    data (JSONInput): The JSON-serializable data to output.
    indent (int): Number of spaces used to indent JSON.
    """
    serialized = json_dumps(data, indent=indent)
    if path == "-":
        # "-" means write to stdout instead of a file
        print(serialized)
        return
    output_path = force_path(path, require_exists=False)
    with output_path.open("w", encoding="utf8") as f:
        f.write(serialized)


def write_gzip_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
    """Create a .json.gz file and dump contents.

    path (FilePath): The file path.
    data (JSONInput): The JSON-serializable data to output.
    indent (int): Number of spaces used to indent JSON.
    """
    payload = json_dumps(data, indent=indent).encode("utf-8")
    with gzip.open(force_string(path), "w") as f:
        f.write(payload)


def write_gzip_jsonl(
    path: FilePath,
    lines: Iterable[JSONInput],
    append: bool = False,
    append_new_line: bool = True,
) -> None:
    """Create a .jsonl.gz file and dump contents.

    path (FilePath): The file path.
    lines (Sequence[JSONInput]): The JSON-serializable contents of each line.
    append (bool): Whether or not to append to the location. Appending to .gz files is generally not recommended, as it
        doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly
        compressed.
    append_new_line (bool): Whether or not to write a new line before appending
        to the file.
    """
    file_path = force_path(path, require_exists=False)
    with gzip.open(file_path, mode="a" if append else "w") as f:
        if append and append_new_line:
            # Separate the appended records from the existing content
            f.write(b"\n")
        encoded = [(json_dumps(entry) + "\n").encode("utf-8") for entry in lines]
        f.writelines(encoded)


def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]:
    """Read a .jsonl file or standard input and yield contents line by line.
    Blank lines will always be skipped.

    path (FilePath): The file path. "-" for reading from stdin.
    skip (bool): Skip broken lines and don't raise ValueError.
    YIELDS (JSONOutput): The loaded JSON contents of each line.
    """
    if path == "-":
        # "-" means stream records from stdin instead of a file
        yield from _yield_json_lines(sys.stdin, skip=skip)
        return
    file_path = force_path(path)
    with file_path.open("r", encoding="utf8") as f:
        yield from _yield_json_lines(f, skip=skip)


def write_jsonl(
    path: FilePath,
    lines: Iterable[JSONInput],
    append: bool = False,
    append_new_line: bool = True,
) -> None:
    """Create a .jsonl file and dump contents or write to standard output.

    path (FilePath): The file path. "-" for writing to stdout.
    lines (Sequence[JSONInput]): The JSON-serializable contents of each line.
    append (bool): Whether or not to append to the location.
    append_new_line (bool): Whether or not to write a new line before appending
        to the file.
    """
    if path == "-":
        # "-" means print each serialized record to stdout
        for entry in lines:
            print(json_dumps(entry))
        return
    file_path = force_path(path, require_exists=False)
    with file_path.open("a" if append else "w", encoding="utf-8") as f:
        if append and append_new_line:
            # Separate the appended records from the existing content
            f.write("\n")
        for entry in lines:
            f.write(json_dumps(entry) + "\n")


def is_json_serializable(obj: Any) -> bool:
    """Check if a Python object is JSON-serializable.

    obj: The object to check.
    RETURNS (bool): Whether the object is JSON-serializable.
    """
    # Callables are rejected up front to prevent infinite recursion during
    # the dump attempt below. (Deliberately hasattr, not callable(): an
    # instance-level __call__ attribute should also be rejected.)
    if hasattr(obj, "__call__"):
        return False
    try:
        ujson.dumps(obj)
    except (TypeError, OverflowError):
        return False
    return True


def _yield_json_lines(
    stream: Iterable[str], skip: bool = False
) -> Iterable[JSONOutput]:
    """Deserialize a stream of JSONL, yielding one object per non-blank line.

    stream (Iterable[str]): Lines of serialized JSON.
    skip (bool): Skip broken lines and don't raise ValueError.
    YIELDS (JSONOutput): The deserialized contents of each line.
    """
    # enumerate keeps the reported line number accurate: the previous
    # manual counter was only incremented at the bottom of the loop, so
    # every skipped blank/broken line made subsequent error messages
    # point at the wrong line.
    for line_no, line in enumerate(stream, 1):
        line = line.strip()
        if not line:
            # Blank lines are always skipped
            continue
        try:
            yield ujson.loads(line)
        except ValueError:
            if skip:
                continue
            raise ValueError(f"Invalid JSON on line {line_no}: {line}")