1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
import concurrent.futures
import os
import ssl
import sys
import urllib.request
from pathlib import Path
from typing import Optional
from urllib.error import HTTPError
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self
import yaml
# Absolute directory that contains this module.
TESTS_ROOT = Path(__file__).parent.resolve()
# Directory one level above TESTS_ROOT.
PROJECT_ROOT = TESTS_ROOT.parent
# Sub-directories of PROJECT_ROOT referenced by path.
RESOURCE_ROOT = PROJECT_ROOT.joinpath("resources")
SAMPLE_ROOT = PROJECT_ROOT.joinpath("sample-files")
def _get_data_from_url(url: str) -> bytes:
    """
    Fetch the raw bytes behind a URL, retrying on HTTP errors.

    Up to three attempts are made. Only ``HTTPError`` triggers a retry; any
    other exception raised by ``urllib.request.urlopen`` propagates
    immediately.

    Args:
        url: location to read from

    Returns:
        The complete response body.

    Raises:
        ValueError: if every attempt failed with an ``HTTPError``. The last
            ``HTTPError`` is attached as ``__cause__``.
    """
    # NOTE: this replaces the default HTTPS context factory of the ``ssl``
    # module for the whole process, i.e. certificate verification is turned
    # off for every later default HTTPS request as well. Kept as-is because
    # other code may rely on this side effect.
    ssl._create_default_https_context = ssl._create_unverified_context

    max_attempts = 3
    last_error: Optional[HTTPError] = None
    for _ in range(max_attempts):
        try:
            with urllib.request.urlopen(url) as response:  # noqa: S310
                return response.read()
        except HTTPError as exc:
            # Remember the failure so it is not lost if all attempts fail.
            last_error = exc
    raise ValueError(f"Unknown error handling {url}") from last_error
# TODO: Make keyword-only and drop name being optional.
def get_data_from_url(url: Optional[str] = None, name: Optional[str] = None) -> bytes:
    """
    Download a file from a URL and return its contents.

    The download is cached on disk under ``name`` so the file is not fetched
    more often than necessary. ``file://`` URLs are read directly and bypass
    the cache.

    This function is a last resort for PDF files where we are uncertain if
    we may add it for testing purposes to https://github.com/py-pdf/sample-files

    Args:
        url: location of the PDF file. If omitted, the file must already be
            present in the cache.
        name: unique name across all files; used as the cache file name

    Returns:
        Read file as bytes

    Raises:
        ValueError: if ``name`` is not given.
    """
    if name is None:
        raise ValueError("A name must always be specified")

    if os.getenv("GITHUB_JOB") is not None:
        # Relative to the current working directory when GITHUB_JOB is set.
        cache_dir = Path("tests", "pdf_cache").resolve()
    else:
        cache_dir = Path(__file__).parent / "pdf_cache"
    # ``exist_ok`` avoids a FileExistsError when several threads create the
    # directory at the same time; ``parents`` covers a missing parent folder.
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / name

    if url is not None:
        if url.startswith("file://"):
            # Local file: strip the scheme and normalise Windows separators.
            local_path = Path(url[7:].replace("\\", "/"))
            return local_path.read_bytes()
        if not cache_path.exists():
            cache_path.write_bytes(_get_data_from_url(url))
    return cache_path.read_bytes()
def _strip_position(line: str) -> str:
    """
    Drop the logger/location prefix from a captured log line.

    The message
        WARNING  pypdf._reader:_utils.py:364 Xref table not zero-indexed.
    becomes
        Xref table not zero-indexed.

    Everything up to and including the first ".py:" is discarded, then the
    first space-separated token of the remainder is discarded as well. A line
    without ".py:" yields an empty string. Any further ".py:" in the
    remainder is reduced to ".py".

    Args:
        line: the original line

    Returns:
        A line with stripped position
    """
    _, _, after_marker = line.partition(".py:")
    after_marker = after_marker.replace(".py:", ".py")
    _, _, message = after_marker.partition(" ")
    return message
def normalize_warnings(caplog_text: str) -> list[str]:
    """
    Split captured log text into lines and strip the position prefix of each.

    Args:
        caplog_text: the captured log output; surrounding whitespace is
            removed before splitting on newlines

    Returns:
        One entry per line, each processed by ``_strip_position``
    """
    raw_lines = caplog_text.strip().split("\n")
    return list(map(_strip_position, raw_lines))
def is_sublist(child_list, parent_list) -> bool:
    """
    Check if child_list is a sublist of parent_list, with respect to
    * elements order
    * elements repetition

    The elements of child_list do not have to be adjacent in parent_list:
    non-matching parent elements are skipped.

    Elements are compared using `==`

    Args:
        child_list: the elements to look for, in order
        parent_list: the elements to search in

    Returns:
        True if all elements of child_list occur in parent_list in the same
        order, each one matched by a separate parent element. An empty
        child_list is always contained.
    """
    # A single shared iterator: every match resumes scanning right after the
    # previously matched parent element. This avoids recursion (and its depth
    # limit) as well as repeated list slicing.
    remaining = iter(parent_list)
    return all(
        any(candidate == wanted for candidate in remaining)
        for wanted in child_list
    )
def read_yaml_to_list_of_dicts(yaml_file: Path) -> list[dict[str, str]]:
    """
    Parse a YAML file with ``yaml.safe_load``.

    Args:
        yaml_file: path of the YAML document to read

    Returns:
        The parsed document. It is expected to be a list of string-to-string
        mappings; the structure is not validated here.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(yaml_file, encoding="utf-8") as yaml_input:
        return yaml.safe_load(yaml_input)
def download_test_pdfs():
    """
    Fetch every file listed in "example_files.yaml" into the local cache.

    Run this before the tests are executed to ensure you have everything
    locally. This is especially important to avoid pytest timeouts.

    The downloads run on a pool of up to 20 worker threads and the function
    returns once all of them have finished. The results of the individual
    downloads are not inspected here.
    """
    listing = Path(__file__).parent / "example_files.yaml"
    entries = read_yaml_to_list_of_dicts(listing)
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as pool:
        pending = [
            pool.submit(get_data_from_url, entry["url"], name=entry["local_filename"])
            for entry in entries
        ]
        concurrent.futures.wait(pending)
class PILContext:
    """
    Context manager that lets PIL/Pillow load truncated images.

    On entry ``ImageFile.LOAD_TRUNCATED_IMAGES`` is set to True; on exit the
    value seen at entry is written back.
    """

    def __init__(self) -> None:
        # Placeholder until __enter__ records the actual previous setting.
        self._saved_load_truncated_images = False

    def __enter__(self) -> Self:
        from PIL import ImageFile  # noqa: PLC0415

        # Remember the current setting, then allow loading incomplete images.
        previous = ImageFile.LOAD_TRUNCATED_IMAGES
        self._saved_load_truncated_images = previous
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        return self

    def __exit__(self, type_, value, traceback) -> Optional[bool]:
        from PIL import ImageFile  # noqa: PLC0415

        ImageFile.LOAD_TRUNCATED_IMAGES = self._saved_load_truncated_images
        # None on error, so the exception keeps propagating; True otherwise.
        return None if type_ else True
|