1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613
|
"""Test the pypdf._utils module."""
import functools
import io
import re
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Callable
import pytest
import pypdf._utils
from pypdf._utils import (
File,
Version,
_get_max_pdf_version_header,
_human_readable_bytes,
check_if_whitespace_only,
classproperty,
deprecate_with_replacement,
deprecation_no_replacement,
format_iso8824_date,
mark_location,
matrix_multiply,
parse_iso8824_date,
read_block_backwards,
read_previous_line,
read_until_regex,
read_until_whitespace,
rename_kwargs,
skip_over_comment,
skip_over_whitespace,
)
from pypdf.errors import DeprecationError, PdfReadError, PdfStreamError
from . import is_sublist
@pytest.mark.parametrize(
    ("stream", "expected"),
    [
        # Streams that do not start with whitespace.
        (io.BytesIO(b"foo"), False),
        (io.BytesIO(b""), False),
        # Runs of spaces of different lengths (distinct cases, no duplicates).
        (io.BytesIO(b" "), True),
        (io.BytesIO(b"  "), True),
        # Spaces terminated by a newline, again with different run lengths.
        (io.BytesIO(b" \n"), True),
        (io.BytesIO(b"    \n"), True),
        # A lone form feed.
        (io.BytesIO(b"\f"), True),
    ],
)
def test_skip_over_whitespace(stream, expected):
    """Check the boolean returned by skip_over_whitespace for each stream."""
    assert skip_over_whitespace(stream) == expected
@pytest.mark.parametrize(
    ("value", "expected"),
    [
        # Values containing at least one non-whitespace byte.
        (b"foo", False),
        (b" a", False),
        (b" a\n b", False),
        # The empty value is expected to be reported as whitespace-only.
        (b"", True),
        # Runs of spaces of different lengths (distinct cases, no duplicates).
        (b" ", True),
        (b"  ", True),
        # Spaces terminated by a newline, again with different run lengths.
        (b" \n", True),
        (b"    \n", True),
        # A lone form feed.
        (b"\f", True),
    ],
)
def test_check_if_whitespace_only(value, expected):
    """Check that check_if_whitespace_only returns exactly the expected bool."""
    assert check_if_whitespace_only(value) is expected
def test_read_until_whitespace():
    """With ``maxchars=1`` only the first byte of ``b"foo"`` is returned."""
    source = io.BytesIO(b"foo")
    first_byte = read_until_whitespace(source, maxchars=1)
    assert first_byte == b"f"
@pytest.mark.parametrize(
    ("stream", "remainder"),
    [
        (io.BytesIO(b"% foobar\n"), b""),
        (io.BytesIO(b""), b""),
        (io.BytesIO(b" "), b" "),
        (io.BytesIO(b"% foo%\nbar"), b"bar"),
    ],
)
def test_skip_over_comment(stream, remainder):
    """After skip_over_comment, reading the stream yields ``remainder``."""
    skip_over_comment(stream)
    leftover = stream.read()
    assert leftover == remainder
def test_read_until_regex_premature_ending_name():
    """An empty stream gives back empty bytes."""
    empty_stream = io.BytesIO(b"")
    any_byte = re.compile(b".")
    assert read_until_regex(empty_stream, any_byte) == b""
def test_read_until_regex_match_in_first_chunk():
    """Match within the first 16-byte chunk."""
    source = io.BytesIO(b"hello world")
    space = re.compile(b" ")

    consumed = read_until_regex(source, space)

    assert consumed == b"hello"
    # The stream is left positioned at the delimiter itself.
    assert source.tell() == 5
def test_read_until_regex_match_in_second_chunk():
    """Match lands in the second chunk (past first 16 bytes)."""
    payload = b"0123456789abcdefghij"
    assert len(payload) == 20
    source = io.BytesIO(payload + b" rest")
    space = re.compile(b" ")

    consumed = read_until_regex(source, space)

    assert consumed == payload
    assert source.tell() == 20
def test_read_until_regex_match_at_chunk_boundary():
    """Delimiter sits exactly at byte 16 (first byte of second chunk)."""
    payload = b"0123456789abcdef"
    assert len(payload) == 16
    source = io.BytesIO(payload + b" after")
    space = re.compile(b" ")

    consumed = read_until_regex(source, space)

    assert consumed == payload
    assert source.tell() == 16
def test_read_until_regex_multi_byte_spanning_boundary():
    """Multi-byte regex pattern spans across a chunk boundary."""
    # "X" is byte 15 (last of the first chunk); "Y" is byte 16 (first of the second).
    payload = b"0123456789abcde"
    assert len(payload) == 15
    source = io.BytesIO(payload + b"XYafter")
    marker = re.compile(b"XY")

    consumed = read_until_regex(source, marker)

    assert consumed == payload
    assert source.tell() == 15
def test_read_until_regex_no_match_exhausted():
    """No match - stream is fully consumed and all data returned."""
    data = b"0123456789" * 10
    source = io.BytesIO(data)
    never_present = re.compile(b"ZZZ")

    consumed = read_until_regex(source, never_present)

    assert consumed == data
def test_read_until_regex_exponential_chunk_growth():
    """Verify correctness with long input that exercises chunk doubling."""
    payload = (b"0123456789abcdef" * 3125)[:50_000]
    assert len(payload) == 50_000
    source = io.BytesIO(payload + b"|done")
    pipe = re.compile(rb"\|")

    consumed = read_until_regex(source, pipe)

    assert consumed == payload
    assert source.tell() == 50_000
def test_read_until_regex_match_spanning_later_boundary():
    """Multi-byte match spanning a boundary after chunk size has grown."""
    # Chunk 1 is 16 bytes and chunk 2 is 32 bytes, i.e. 48 bytes after two reads.
    # "END" starts at offset 47, so it covers bytes 47-49 and crosses offset 48.
    payload = (b"0123456789abcdef" * 3)[:47]
    assert len(payload) == 47
    source = io.BytesIO(payload + b"ENDrest")
    marker = re.compile(b"END")

    consumed = read_until_regex(source, marker)

    assert consumed == payload
    assert source.tell() == 47
def test_read_until_regex_tail_overlap_is_fixed():
    """Tail overlap is 16 bytes regardless of chunk size growth.

    Chunk reads are 16, 32 and 64 bytes (112 in total), so the boundary
    between the second and the third chunk is at offset 48. A 16-byte
    pattern starting at offset 47 begins one byte before that boundary and
    continues into the third chunk. It can only be found if the tail kept
    from chunk 2 covers at least 16 bytes.
    """
    pattern = b"ABCDEFGHIJKLMNOP"  # 16 bytes
    assert len(pattern) == 16

    # The pattern occupies bytes 47-62, straddling the boundary at offset 48.
    payload = b"x" * 47
    source = io.BytesIO(payload + pattern + b"rest")
    literal = re.compile(re.escape(pattern))

    consumed = read_until_regex(source, literal)

    assert consumed == payload
    assert source.tell() == 47
@pytest.mark.parametrize(
    ("a", "b", "expected"),
    [
        # 1x1 times 1x1
        (((3,),), ((7,),), ((21,),)),
        # 1x2 times 2x1
        (((3, 7),), ((5,), (13,)), ((3 * 5.0 + 7 * 13,),)),
        # 2x1 times 1x2
        (((3,), (7,)), ((5, 13),), ((3 * 5, 3 * 13), (7 * 5, 7 * 13))),
    ],
)
def test_matrix_multiply(a, b, expected):
    """matrix_multiply returns the expected nested tuple for small matrices."""
    product = matrix_multiply(a, b)
    assert product == expected
def test_mark_location():
    """mark_location runs on a large stream and leaves its dump file behind.

    The dump file is removed in a ``finally`` block so that a failure inside
    ``mark_location`` does not leave a stray file in the working directory.
    """
    dump_file = Path("pypdf_pdfLocation.txt")
    stream = io.BytesIO(b"abde" * 6000)
    try:
        mark_location(stream)
        # The original cleanup relied on the file existing; keep that check explicit.
        assert dump_file.is_file()
    finally:
        dump_file.unlink(missing_ok=True)  # cleanup
def test_deprecate_no_replacement():
    """deprecate_no_replacement emits a DeprecationWarning with the exact text."""
    expected = "foo is deprecated and will be removed in pypdf 3.0.0."
    # ``match`` is treated as a regular expression; escape it so that the
    # dots only match literal dots instead of any character.
    with pytest.warns(
        expected_warning=DeprecationWarning,
        match=re.escape(expected),
    ):
        pypdf._utils.deprecate_no_replacement("foo", removed_in="3.0.0")
@pytest.mark.parametrize(
    ("dat", "pos", "to_read", "expected", "expected_pos"),
    [
        # Successful reads: (data, start offset, size, bytes read, end offset)
        (b"abc", 1, 0, b"", 1),
        (b"abc", 1, 1, b"a", 0),
        (b"abc", 2, 1, b"b", 1),
        (b"abc", 2, 2, b"ab", 0),
        (b"abc", 3, 1, b"c", 2),
        (b"abc", 3, 2, b"bc", 1),
        (b"abc", 3, 3, b"abc", 0),
        # ``expected`` of None marks the cases that must raise PdfStreamError.
        (b"", 0, 1, None, 0),
        (b"a", 0, 1, None, 0),
        (b"abc", 0, 10, None, 0),
    ],
)
def test_read_block_backwards(dat, pos, to_read, expected, expected_pos):
    """Read a block backwards and verify both the data and the final offset."""
    stream = io.BytesIO(dat)
    stream.seek(pos)

    if expected is None:
        with pytest.raises(PdfStreamError):
            read_block_backwards(stream, to_read)
    else:
        assert read_block_backwards(stream, to_read) == expected

    assert stream.tell() == expected_pos
def test_read_block_backwards_at_start():
    """read_previous_line on a stream positioned at offset 0 raises PdfStreamError."""
    stream = io.BytesIO(b"abc")
    with pytest.raises(PdfStreamError):
        read_previous_line(stream)
@pytest.mark.parametrize(
    ("dat", "pos", "expected", "expected_pos"),
    [
        (b"abc", 1, b"a", 0),
        (b"abc", 2, b"ab", 0),
        (b"abc", 3, b"abc", 0),
        (b"abc\n", 3, b"abc", 0),
        (b"abc\n", 4, b"", 3),
        (b"abc\n\r", 4, b"", 3),
        (b"abc\nd", 5, b"d", 3),
        # Several consecutive CR/LF bytes are skipped over.
        (b"abc\n\r\ndef", 9, b"def", 3),
    ],
    ids=list(range(8)),
)
def test_read_previous_line(dat, pos, expected, expected_pos):
    """Read the line before ``pos`` and verify the data and the final offset."""
    stream = io.BytesIO(dat)
    stream.seek(pos)

    line = read_previous_line(stream)

    assert line == expected
    assert stream.tell() == expected_pos
# For an unknown reason, errors are reported if these parameters are passed through pytest.
def test_read_previous_line2():
    """Run test_read_previous_line on inputs larger than the I/O buffer size."""
    span = 2 * io.DEFAULT_BUFFER_SIZE
    newlines = b"\n" * span
    filler = b"d" * span

    # Include a block full of newlines...
    test_read_previous_line(b"abc" + newlines + b"d", span + 4, b"d", 3)

    # Include a block full of non-newline characters
    test_read_previous_line(b"abc\n" + filler, span + 4, filler, 3)

    # Both
    test_read_previous_line(b"abcxyz" + newlines + filler, 2 * span + 6, filler, 6)
def test_get_max_pdf_version_header():
    """Two invalid headers raise a ValueError naming both of them."""
    with pytest.raises(ValueError) as excinfo:
        _get_max_pdf_version_header(b"", b"PDF-1.2")

    message = excinfo.value.args[0]
    assert message == "Neither b'' nor b'PDF-1.2' are proper headers"
def test_read_block_backwards_exception():
    """Asking for 7 bytes from offset 6 raises PdfReadError with a fixed message."""
    source = io.BytesIO(b"foobar")
    source.seek(6)

    with pytest.raises(PdfReadError) as excinfo:
        read_block_backwards(source, 7)

    message = excinfo.value.args[0]
    assert message == "Could not read malformed PDF file"
def test_deprecate_with_replacement():
    """deprecate_with_replacement warns with the exact deprecation text."""

    def foo() -> None:
        deprecate_with_replacement("foo", "bar", removed_in="4.3.2")

    expected = "foo is deprecated and will be removed in pypdf 4.3.2. Use bar instead."
    # ``match`` is a regular expression; escape it so the dots in the version
    # and at the sentence ends only match literal dots.
    with pytest.warns(DeprecationWarning, match=re.escape(expected)):
        foo()
def test_deprecation_no_replacement():
    """deprecation_no_replacement raises DeprecationError with the removal text."""

    def foo() -> None:
        deprecation_no_replacement("foo", removed_in="4.3.2")

    expected_pattern = r"foo is deprecated and was removed in pypdf 4\.3\.2\."
    with pytest.raises(DeprecationError, match=expected_pattern):
        foo()
def test_rename_kwargs():
    """rename_kwargs rejects old and new names together and warns on the old one."""

    def deprecation_bookmark_nofail(**aliases: str) -> Callable:
        """
        Decorator for deprecated term "bookmark".

        To be used for methods and function arguments
            outline_item = a bookmark
            outline = a collection of outline items.
        """

        def decoration(func: Callable) -> Any:  # type: ignore
            @functools.wraps(func)
            def wrapper(*args: Any, **kwargs: Any) -> Any:  # type: ignore
                rename_kwargs(func.__name__, kwargs, aliases, fail=False)
                return func(*args, **kwargs)

            return wrapper

        return decoration

    @deprecation_bookmark_nofail(old_param="new_param")
    def foo(old_param: int = 1, baz: int = 2, new_param: int = 1) -> None:
        pass

    expected_msg = (
        "foo received both old_param and new_param as an argument. "
        "old_param is deprecated. Use new_param instead."
    )
    expected_warning = "old_param is deprecated as an argument. Use new_param instead"

    # ``match`` is a regular expression; escape the messages so that "."
    # only matches a literal dot.
    with pytest.raises(TypeError, match=re.escape(expected_msg)):
        foo(old_param=12, new_param=13)

    with pytest.warns(DeprecationWarning, match=re.escape(expected_warning)):
        foo(old_param=12)
def test_rename_kwargs__stacklevel(tmp_path: Path) -> None:
    """The deprecation warning is attributed to the line calling the function.

    A small script is written to ``tmp_path`` and run in a subprocess. The
    warning on stderr must point at the ``foo(old_param=12)`` call, not at
    the decorator internals.
    """
    script = tmp_path / "script.py"
    # NOTE: the layout of this script matters. The assertion below expects the
    # ``foo(old_param=12)`` call to be on line 23 of the written file (the
    # string starts with an empty first line).
    script.write_text("""
import functools
import warnings

from pypdf._utils import rename_kwargs

def deprecation(**aliases: str):
    def decoration(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            rename_kwargs(func.__name__, kwargs, aliases, fail=False)
            return func(*args, **kwargs)

        return wrapper

    return decoration

@deprecation(old_param="new_param")
def foo(old_param: int = 1, baz: int = 2, new_param: int = 1) -> None:
    pass

warnings.simplefilter("always")
foo(old_param=12)
""")

    result = subprocess.run([sys.executable, script], capture_output=True, text=True)  # noqa: S603

    assert result.returncode == 0
    # The default warning format prints the source line indented by two spaces.
    assert result.stderr == (
        f"{script}:23: DeprecationWarning: old_param is deprecated as an argument. "
        f"Use new_param instead\n  foo(old_param=12)\n"
    )
@pytest.mark.parametrize(
    ("input_int", "expected_output"),
    [
        (123, "123 Byte"),
        (1234, "1.2 kB"),
        (123_456, "123.5 kB"),
        (1_234_567, "1.2 MB"),
        (1_234_567_890, "1.2 GB"),
        # Values beyond the GB range are still expressed in GB.
        (1_234_567_890_000, "1234.6 GB"),
    ],
)
def test_human_readable_bytes(input_int, expected_output):
    """_human_readable_bytes correctly transforms the integer to a string."""
    rendered = _human_readable_bytes(input_int)
    assert rendered == expected_output
def test_file_class():
    """File class can be instantiated and string representation is ok."""
    empty_file = File(name="image.png", data=b"")

    assert str(empty_file) == "File(name=image.png, data: 0 Byte)"

    # hash(b"") varies between CPython and PyPy
    expected_repr = f"File(name=image.png, data: 0 Byte, hash: {hash(b'')})"
    assert repr(empty_file) == expected_repr
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # With and without the "D:" prefix.
        ("D:20210318000756", "2021-03-18T00:07:56"),
        ("20210318000756", "2021-03-18T00:07:56"),
        # Truncated dates.
        ("D:2021", "2021-01-01T00:00:00"),
        ("D:202103", "2021-03-01T00:00:00"),
        ("D:20210304", "2021-03-04T00:00:00"),
        ("D:2021030402", "2021-03-04T02:00:00"),
        ("D:20210408054711", "2021-04-08T05:47:11"),
        # "Z" suffix variants.
        ("D:20210408054711Z", "2021-04-08T05:47:11+00:00"),
        ("D:20210408054711Z00", "2021-04-08T05:47:11+00:00"),
        ("D:20210408054711Z0000", "2021-04-08T05:47:11+00:00"),
        # Explicit UTC offsets.
        ("D:20210408075331+02'00'", "2021-04-08T07:53:31+02:00"),
        ("D:20210408075331-03'00'", "2021-04-08T07:53:31-03:00"),
    ],
)
def test_parse_datetime(text, expected):
    """parse_iso8824_date output, rendered as ISO text, starts with ``expected``."""
    parsed = parse_iso8824_date(text)
    rendered = parsed.isoformat() + parsed.strftime("%z")
    # Only the leading part is compared, as long as the expected string.
    assert rendered[: len(expected)] == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("", None),
        (None, None),
    ],
)
def test_parse_datetime_edge_cases(text, expected):
    """Empty string and None inputs give the expected value (None)."""
    parsed = parse_iso8824_date(text)
    assert parsed == expected
def test_parse_datetime_err():
    """A date containing a "T" separator is rejected; a plain date is naive."""
    malformed = "D:20210408T054711Z"
    with pytest.raises(ValueError) as excinfo:
        parse_iso8824_date(malformed)
    assert excinfo.value.args[0] == "Can not convert date: D:20210408T054711Z"

    # A date without any offset information carries no tzinfo.
    naive = parse_iso8824_date("D:20210408054711")
    assert naive.tzinfo is None
def test_format_iso8824_date():
    """Test format_iso8824_date function with timezone handling."""
    # (tzinfo, expected output) for the same wall-clock time.
    cases = [
        (None, "D:20210318120756"),
        (timezone.utc, "D:20210318120756+00'00'"),
        (timezone(timedelta(hours=2, minutes=30)), "D:20210318120756+02'30'"),
        (timezone(timedelta(hours=-5, minutes=-30)), "D:20210318120756-05'30'"),
    ]
    for tzinfo, expected in cases:
        moment = datetime(2021, 3, 18, 12, 7, 56, tzinfo=tzinfo)
        assert format_iso8824_date(moment) == expected
def test_format_iso8824_date_roundtrip():
    """Formatting and then parsing a datetime gives back an equal datetime."""
    zones = [
        None,  # naive
        timezone.utc,
        timezone(timedelta(hours=2, minutes=30)),
        timezone(timedelta(hours=-5, minutes=-30)),
    ]
    for tzinfo in zones:
        original = datetime(2021, 3, 18, 12, 7, 56, tzinfo=tzinfo)
        restored = parse_iso8824_date(format_iso8824_date(original))
        assert restored == original
def test_is_sublist():
    """Exercise the is_sublist test helper on a table of cases."""
    # (candidate, full list, expected result)
    cases = [
        # Basic checks:
        ([0, 1], [0, 1, 2], True),
        ([0, 2], [0, 1, 2], True),
        ([1, 2], [0, 1, 2], True),
        ([0, 3], [0, 1, 2], False),
        # Ensure order is checked:
        ([1, 0], [0, 1, 2], False),
        # Ensure duplicates are handled:
        ([0, 1, 1], [0, 1, 1, 2], True),
        ([0, 1, 1], [0, 1, 2], False),
        # Edge cases with empty lists:
        ([], [0, 1, 2], True),
        ([0, 1], [], False),
        # Self-sublist edge case:
        ([0, 1, 2], [0, 1, 2], True),
    ]
    for candidate, full, expected in cases:
        assert is_sublist(candidate, full) is expected
@pytest.mark.parametrize(
    ("left", "right", "is_less_than"),
    [
        ("1", "2", True),
        ("2", "1", False),
        ("1", "1", False),
        ("1.0", "1.1", True),
        ("1", "1.1", True),
        # Suffix on the left side
        ("1a", "2", True),
        ("2a", "1", False),
        ("1a", "1", False),
        ("1.0a", "1.1", True),
        # Not sure about this one, but it seems special enough that it
        # probably does not matter:
        ("1a", "1.1", False),
        # Suffix on the right side
        ("1", "2a", True),
        ("2", "1a", False),
        ("1", "1a", True),
        ("1.0", "1.1a", True),
        ("1", "1.1a", True),
        ("", "0.0.0", True),
        # Only the suffix differs ... hm, this may actually be wrong:
        ("1.0a", "1.0", False),
        ("1.0", "1.0a", True),
    ],
)
def test_version_compare(left, right, is_less_than):
    """``Version(left) < Version(right)`` yields exactly ``is_less_than``."""
    outcome = Version(left) < Version(right)
    assert outcome is is_less_than
def test_version_compare_equal_str():
    """A Version does not compare equal to the plain string it was built from."""
    assert Version("1.0") != "1.0"
def test_version_compare_lt_str():
    """Ordering a Version against a str raises ValueError with a fixed message."""
    version = Version("1.0")

    with pytest.raises(ValueError) as excinfo:
        version < "1.0"  # noqa: B015

    message = excinfo.value.args[0]
    assert message == "Version cannot be compared against <class 'str'>"
def test_bad_version():
    """A non-numeric version string parses to a single ``(0, "a")`` component."""
    parsed = Version("a")
    assert parsed.components == [(0, "a")]
def test_version_eq_hash():
    """Equal versions share a hash; different versions differ in both."""
    first = Version("1.0")
    same = Version("1.0")
    other = Version("1.1")

    assert first == same
    assert first != other

    assert hash(first) == hash(same)
    assert hash(first) != hash(other)
def test_classproperty():
    """classproperty values are readable from both the class and its instances."""

    class Container:
        @classproperty
        def value1(cls) -> int:  # noqa: N805
            return 42

        @classproperty
        def value2(cls) -> int:  # noqa: N805
            return 1337

        @classproperty
        def value3(cls) -> int:  # noqa: N805
            return 1

        # Replacing the getter must override the value returned above.
        @value3.getter
        def value3(cls) -> int:  # noqa: N805
            return 2

    expected_values = {"value1": 42, "value2": 1337, "value3": 2}

    # Access through the class itself.
    for attribute, expected in expected_values.items():
        assert getattr(Container, attribute) == expected

    # Access through a fresh instance each time.
    for attribute, expected in expected_values.items():
        assert getattr(Container(), attribute) == expected
|