File: __init__.py

package info (click to toggle)
pypdf 6.9.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 18,184 kB
  • sloc: python: 48,595; makefile: 35
file content (159 lines) | stat: -rw-r--r-- 4,600 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import concurrent.futures
import os
import ssl
import sys
import urllib.request
from pathlib import Path
from typing import Optional
from urllib.error import HTTPError

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self

import yaml

# Absolute path of the directory containing this file.
TESTS_ROOT = Path(__file__).parent.resolve()
# The directory one level above TESTS_ROOT.
PROJECT_ROOT = TESTS_ROOT.parent
# Sub-directories of PROJECT_ROOT.
RESOURCE_ROOT = PROJECT_ROOT.joinpath("resources")
SAMPLE_ROOT = PROJECT_ROOT.joinpath("sample-files")


def _get_data_from_url(url: str) -> bytes:
    """
    Fetch the content behind a URL, retrying on HTTP errors.

    Up to three attempts are made. Only ``HTTPError`` triggers a retry; any
    other exception propagates immediately.

    Args:
        url: location to download from

    Returns:
        The response body as bytes

    Raises:
        ValueError: if every attempt failed with an ``HTTPError``. The last
            ``HTTPError`` is attached as ``__cause__``.

    """
    # This replaces the default HTTPS context factory of the `ssl` module,
    # thus certificate verification is turned off for the whole process and
    # not only for this request.
    ssl._create_default_https_context = ssl._create_unverified_context
    last_error: Optional[HTTPError] = None
    for _ in range(3):
        try:
            with urllib.request.urlopen(url) as response:  # noqa: S310
                return response.read()
        except HTTPError as error:
            # Keep a reference: `error` gets unbound when leaving the handler.
            last_error = error
    # Same exception type and message as before, but the actual HTTP failure
    # is no longer lost.
    raise ValueError(f"Unknown error handling {url}") from last_error


# TODO: Make keyword-only and drop name being optional.
def get_data_from_url(url: Optional[str] = None, name: Optional[str] = None) -> bytes:
    """
    Download a File from a URL and return its contents.

    This function makes sure the PDF is not downloaded too often.
    This function is a last resort for PDF files where we are uncertain if
    we may add it for testing purposes to https://github.com/py-pdf/sample-files

    The download is stored inside a ``pdf_cache`` directory under ``name``.
    If that file already exists, it is returned without downloading again.
    URLs starting with ``file://`` are read directly and never cached.
    Without a URL, the file has to be in the cache already.

    Args:
        url: location of the PDF file
        name: unique name across all files

    Returns:
        Read File as bytes

    Raises:
        ValueError: if no name is given

    """
    if name is None:
        raise ValueError("A name must always be specified")

    if os.getenv("GITHUB_JOB") is not None:
        # Relative to the current working directory.
        cache_dir = Path("tests", "pdf_cache").resolve()
    else:
        cache_dir = Path(__file__).parent / "pdf_cache"
    # `exist_ok` instead of a separate `exists()` check: this function is
    # called from multiple threads by `download_test_pdfs`, and another thread
    # might create the directory between the check and the `mkdir()` call.
    cache_dir.mkdir(exist_ok=True)
    cache_path = cache_dir / name

    if url is not None:
        if url.startswith("file://"):
            local_path = url.removeprefix("file://").replace("\\", "/")
            return Path(local_path).read_bytes()
        if not cache_path.exists():
            cache_path.write_bytes(_get_data_from_url(url))
    return cache_path.read_bytes()


def _strip_position(line: str) -> str:
    """
    Remove the location information.

    The message
        WARNING  pypdf._reader:_utils.py:364 Xref table not zero-indexed.

    becomes
        Xref table not zero-indexed.

    Everything up to and including the first ".py:" and the following
    space-delimited token (the line number) is dropped. A line without
    ".py:" results in an empty string.

    Args:
        line: the original line

    Returns:
        A line with stripped position

    """
    # Split at the first ".py:" only, so that a message which itself contains
    # something like "foo.py:12" is kept verbatim.
    _, _, remainder = line.partition(".py:")
    # `remainder` starts with the line number; the message follows the first space.
    _, _, message = remainder.partition(" ")
    return message


def normalize_warnings(caplog_text: str) -> list[str]:
    """
    Split captured log text into lines without their position prefix.

    Leading and trailing whitespace of the whole text is removed before
    splitting at newlines, then each line is passed to `_strip_position`.

    Args:
        caplog_text: the captured log text

    Returns:
        One entry per line of the stripped text

    """
    raw_lines = caplog_text.strip().split("\n")
    return list(map(_strip_position, raw_lines))


def is_sublist(child_list, parent_list) -> bool:
    """
    Check if child_list is a sublist of parent_list, with respect to
    * elements order
    * elements repetition

    The matching elements do not have to be adjacent inside parent_list.
    An empty child_list is a sublist of everything.

    Elements are compared using `==`
    """
    # One shared iterator: each `any()` continues where the previous match
    # stopped, which enforces the order and consumes every parent element at
    # most once. Being iterative, this also works for inputs longer than the
    # recursion limit.
    candidates = iter(parent_list)
    return all(
        any(candidate == wanted for candidate in candidates)
        for wanted in child_list
    )


def read_yaml_to_list_of_dicts(yaml_file: Path) -> list[dict[str, str]]:
    """
    Parse a YAML file using `yaml.safe_load`.

    The parsed document is returned as-is; it is not validated against the
    annotated return type.

    Args:
        yaml_file: path of the YAML file to read

    Returns:
        The parsed content of the file

    """
    # Explicit encoding: otherwise the result depends on the locale of the
    # machine running the tests.
    with open(yaml_file, encoding="utf-8") as yaml_input:
        return yaml.safe_load(yaml_input)


def download_test_pdfs():
    """
    Run this before the tests are executed to ensure you have everything locally.

    This is especially important to avoid pytest timeouts.

    Every entry of ``example_files.yaml`` (located next to this file) is
    handed to `get_data_from_url` using a pool of up to 20 threads. The
    results of the futures are never retrieved, thus exceptions raised by
    individual downloads are not re-raised here.
    """
    listing_path = Path(__file__).parent / "example_files.yaml"
    entries = read_yaml_to_list_of_dicts(listing_path)

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as pool:
        pending = []
        for entry in entries:
            future = pool.submit(
                get_data_from_url, entry["url"], name=entry["local_filename"]
            )
            pending.append(future)
        concurrent.futures.wait(pending)


class PILContext:
    """
    Allow changing the PIL/Pillow configuration for some limited scope.

    While the context is active, ``ImageFile.LOAD_TRUNCATED_IMAGES`` is set to
    ``True``. The value it had on entering is restored on exit.
    """

    def __init__(self) -> None:
        # Value to restore on exit; replaced by the actual one on entering.
        self._saved_load_truncated_images = False

    def __enter__(self) -> Self:
        from PIL import ImageFile  # noqa: PLC0415

        previous_value = ImageFile.LOAD_TRUNCATED_IMAGES
        self._saved_load_truncated_images = previous_value
        # Allow loading incomplete images.
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        return self

    def __exit__(self, type_, value, traceback) -> Optional[bool]:
        from PIL import ImageFile  # noqa: PLC0415

        ImageFile.LOAD_TRUNCATED_IMAGES = self._saved_load_truncated_images
        # `None` in case of an error, which lets the exception propagate;
        # `True` otherwise.
        return None if type_ else True