1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
|
from typing import Union, Iterable, Sequence, Any, Optional
import sys
import json as _builtin_json
import gzip
from . import ujson
from .util import force_path, force_string, FilePath, JSONInput, JSONOutput
def json_dumps(
    data: JSONInput, indent: Optional[int] = 0, sort_keys: bool = False
) -> str:
    """Serialize an object to a JSON string.

    data: The JSON-serializable data.
    indent (Optional[int]): Number of spaces used to indent JSON. Both 0 and
        None mean "no indentation".
    sort_keys (bool): Sort dictionary keys. Falls back to json module for now.
    RETURNS (str): The serialized string.
    """
    if sort_keys:
        # The built-in encoder only produces single-line output for
        # indent=None (indent=0 still inserts newlines), so map 0 to None.
        builtin_indent = None if indent == 0 else indent
        return _builtin_json.dumps(
            data, indent=builtin_indent, separators=(",", ":"), sort_keys=sort_keys
        )
    # The signature allows indent=None; pass ujson a plain integer so both
    # branches accept the same inputs (presumably ujson expects an int here).
    ujson_indent = 0 if indent is None else indent
    return ujson.dumps(data, indent=ujson_indent, escape_forward_slashes=False)
def json_loads(data: Union[str, bytes]) -> JSONOutput:
    """Deserialize unicode or bytes to a Python object.

    data (str / bytes): The data to deserialize.
    RETURNS: The deserialized Python object.
    RAISES (ValueError): If the input is a lone "-", given as str or bytes.
    """
    # Avoid transforming a lone '-' into the int '0'. Compare against a
    # literal of the matching type so bytes input is guarded as well.
    lone_minus = b"-" if isinstance(data, bytes) else "-"
    if data == lone_minus:
        raise ValueError("Expected object or value")
    return ujson.loads(data)
def read_json(path: FilePath) -> JSONOutput:
    """Read and parse JSON from a file, or from standard input.

    path (FilePath): The file path. "-" for reading from stdin.
    RETURNS (JSONOutput): The loaded JSON content.
    """
    from_stdin = path == "-"
    if not from_stdin:
        with force_path(path).open("r", encoding="utf8") as json_file:
            return ujson.load(json_file)
    # "-" selects standard input: read it in full, then parse the text.
    return ujson.loads(sys.stdin.read())
def read_gzip_json(path: FilePath) -> JSONOutput:
    """Read and parse JSON from a gzip-compressed file.

    path (FilePath): The file path.
    RETURNS (JSONOutput): The loaded JSON content.
    """
    with gzip.open(force_string(path), "r") as gz_file:
        content = ujson.load(gz_file)
    return content
def write_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
    """Serialize data as JSON and write it to a file or to standard output.

    path (FilePath): The file path. "-" for writing to stdout.
    data (JSONInput): The JSON-serializable data to output.
    indent (int): Number of spaces used to indent JSON.
    """
    # Serialize first, so nothing is opened if the data cannot be dumped.
    serialized = json_dumps(data, indent=indent)
    to_stdout = path == "-"
    if to_stdout:
        print(serialized)
        return
    target = force_path(path, require_exists=False)
    with target.open("w", encoding="utf8") as out_file:
        out_file.write(serialized)
def write_gzip_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
    """Serialize data as JSON and write it to a gzip-compressed file.

    path (FilePath): The file path.
    data (JSONInput): The JSON-serializable data to output.
    indent (int): Number of spaces used to indent JSON.
    """
    serialized = json_dumps(data, indent=indent)
    with gzip.open(force_string(path), "w") as gz_file:
        # The gzip handle is opened in binary mode, so encode explicitly.
        encoded = serialized.encode("utf-8")
        gz_file.write(encoded)
def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]:
    """Lazily read newline-delimited JSON from a file or standard input.
    Blank lines will always be skipped.

    path (FilePath): The file path. "-" for reading from stdin.
    skip (bool): Skip broken lines and don't raise ValueError.
    YIELDS (JSONOutput): The loaded JSON contents of each line.
    """
    from_stdin = path == "-"
    if from_stdin:
        yield from _yield_json_lines(sys.stdin, skip=skip)
        return
    # The file stays open only while the generator is being consumed.
    with force_path(path).open("r", encoding="utf8") as jsonl_file:
        yield from _yield_json_lines(jsonl_file, skip=skip)
def write_jsonl(
    path: FilePath,
    lines: Iterable[JSONInput],
    append: bool = False,
    append_new_line: bool = True,
) -> None:
    """Write one JSON document per line to a file or to standard output.

    path (FilePath): The file path. "-" for writing to stdout.
    lines (Iterable[JSONInput]): The JSON-serializable contents of each line.
    append (bool): Whether or not to append to the file instead of
        overwriting it.
    append_new_line (bool): Whether or not to write a new line before appending
        to the file.
    """
    to_stdout = path == "-"
    if to_stdout:
        for entry in lines:
            print(json_dumps(entry))
        return
    open_mode = "a" if append else "w"
    target = force_path(path, require_exists=False)
    with target.open(open_mode, encoding="utf-8") as out_file:
        if append and append_new_line:
            # Start appended records on a fresh line.
            out_file.write("\n")
        out_file.writelines(json_dumps(entry) + "\n" for entry in lines)
def is_json_serializable(obj: Any) -> bool:
    """Report whether a Python object can be dumped to JSON.

    obj: The object to check.
    RETURNS (bool): Whether the object is JSON-serializable.
    """
    # Objects exposing __call__ are rejected up front, without attempting a
    # dump, to prevent infinite recursions.
    if hasattr(obj, "__call__"):
        return False
    try:
        ujson.dumps(obj)
    except (TypeError, OverflowError):
        return False
    return True
def _yield_json_lines(
stream: Iterable[str], skip: bool = False
) -> Iterable[JSONOutput]:
line_no = 1
for line in stream:
line = line.strip()
if line == "":
continue
try:
yield ujson.loads(line)
except ValueError:
if skip:
continue
raise ValueError(f"Invalid JSON on line {line_no}: {line}")
line_no += 1
|