1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
|
"""Test the pypdf_cmap module."""
from io import BytesIO
import pytest
from pypdf import PdfReader, PdfWriter
from pypdf._cmap import get_encoding, parse_bfchar, parse_bfrange
from pypdf._codecs import charset_encoding
from pypdf._font import Font
from pypdf.errors import LimitReachedError
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NullObject, StreamObject
from . import RESOURCE_ROOT, get_data_from_url
@pytest.mark.enable_socket
@pytest.mark.slow
@pytest.mark.parametrize(
    ("url", "name", "strict"),
    [
        # compute_space_width:
        (None, "tika-923406.pdf", False),
        # _parse_to_unicode_process_rg:
        (None, "tika-959173.pdf", False),
        (None, "tika-959173.pdf", True),
        # issue #1718:
        (None, "iss1718.pdf", False),
    ],
)
def test_text_extraction_slow(caplog, url: str, name: str, strict: bool):
    """Extract text from every page of each sample and expect an empty log."""
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes), strict=strict)
    for page in reader.pages:
        page.extract_text()
    # Any captured log record would make caplog.text non-empty.
    assert caplog.text == ""
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "strict"),
    [
        # bfchar_on_2_chars: issue #1293
        (None, "ASurveyofImageClassificationBasedTechniques.pdf", False),
        # L40, get_font_width_from_default
        (None, "tika-908104.pdf", False),
        # multiline_bfrange / regression test for issue #1285:
        (None, "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf", False),
        (None, "Giacalone.pdf", False),
    ],
)
def test_text_extraction_fast(caplog, url: str, name: str, strict: bool):
    """Text extraction over all pages raises nothing and logs nothing."""
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes), strict=strict)
    for page in reader.pages:
        page.extract_text()
    # Any captured log record would make caplog.text non-empty.
    assert caplog.text == ""
@pytest.mark.enable_socket
def test_parse_encoding_advanced_encoding_not_implemented(caplog):
    """Extracting text from the sample logs the 'not implemented' encoding message."""
    pdf_bytes = get_data_from_url(name="tika-957144.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))
    for page in reader.pages:
        page.extract_text()
    # The expected message names /WinAnsEncoding; the correctly spelled
    # encoding is /WinAnsiEncoding.
    expected_message = "Advanced encoding /WinAnsEncoding not implemented yet"
    assert expected_message in caplog.text
@pytest.mark.enable_socket
def test_ascii_charset():
    """Issue #1312: "/a" must not show up in the first page's extracted text."""
    pdf_bytes = get_data_from_url(name="ascii charset.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))
    first_page_text = reader.pages[0].extract_text()
    assert "/a" not in first_page_text
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "page_nb", "within_text"),
    [
        # An empty needle is contained in every string, so this case only
        # checks that extraction completes without raising.
        (None, "cmap1370.pdf", 0, ""),
        (None, "02voc.pdf", 2, "Document delineation and character sequence decoding"),
    ],
    ids=["iss1370", "iss1379"],
)
def test_text_extraction_of_specific_pages(
    url: str, name: str, page_nb: int, within_text
):
    """The text of page ``page_nb`` contains ``within_text``."""
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes))
    page_text = reader.pages[page_nb].extract_text()
    assert within_text in page_text
@pytest.mark.enable_socket
def test_iss1533():
    """Font /F of the first page maps character "\\x01" to "Ü"."""
    pdf_bytes = get_data_from_url(name="iss1533.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))
    reader.pages[0].extract_text()  # must not raise
    font_resource = reader.pages[0]["/Resources"]["/Font"]["/F"]
    font = Font.from_font_resource(font_resource)
    assert font.character_map["\x01"] == "Ü"
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "page_index", "within_text", "caplog_text"),
    [
        (None, "tstUCS2.pdf", 1, ["2 / 12", "S0490520090001", "于博"], ""),
        (
            None,
            "tst-GBK_EUC.pdf",
            0,
            ["NJA", "中华男科学杂志"],
            "Multiple definitions in dictionary at byte 0x5cb42 for key /MediaBox\n",
        ),
    ],
)
def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text):
    """Every expected snippet is in the page text and ``caplog_text`` is in the log."""
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes))
    page_text = reader.pages[page_index].extract_text()  # must not raise
    for snippet in within_text:
        assert snippet in page_text
    assert caplog_text in caplog.text
@pytest.mark.enable_socket
def test_latex():
    """The listed mathematical symbols are all present in the first page's text."""
    pdf_bytes = get_data_from_url(name="math_latex.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))
    page_text = reader.pages[0].extract_text()  # must not raise
    expected_symbols = ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×")
    for symbol in expected_symbols:
        assert symbol in page_text
    # Note: ϕ and φ actually seem to be swapped in LaTeX.
@pytest.mark.enable_socket
def test_unixxx_glyphs():
    """The listed strings are all present in the first page's text."""
    pdf_bytes = get_data_from_url(name="unixxx_glyphs.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))
    page_text = reader.pages[0].extract_text()  # must not raise
    expected_snippets = ("闫耀庭", "龚龑", "张江水", "1′′.2")
    for snippet in expected_snippets:
        assert snippet in page_text
@pytest.mark.enable_socket
def test_cmap_compute_space_width():
    """Issue 2137: text extraction of the first page must not raise."""
    # Original file URL:
    #   url = "https://arxiv.org/pdf/2005.05909.pdf"
    # The URL from the GitHub issue is too long to pass the code type check,
    # so the original arXiv URL is used instead:
    #   url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf"
    pdf_bytes = get_data_from_url(name="TextAttack_paper.pdf")
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    first_page.extract_text()  # must not raise
@pytest.mark.enable_socket
def test_tabs_in_cmap():
    """Issue #2173: text extraction of the first page must not raise."""
    pdf_bytes = get_data_from_url(name="iss2173.pdf")
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    first_page.extract_text()
@pytest.mark.enable_socket
def test_ignoring_non_put_entries():
    """Issue #2290: text extraction of the first page must not raise."""
    pdf_bytes = get_data_from_url(name="iss2290.pdf")
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    first_page.extract_text()
@pytest.mark.enable_socket
def test_eten_b5():
    """Issue #2356: the first page's extracted text starts with the expected heading."""
    # NOTE(review): "iss2290.pdf" is the same fixture name used by
    # test_ignoring_non_put_entries (issue #2290). Confirm this is really the
    # document for issue #2356; if this assertion fails, the fixture name is
    # the first thing to check.
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
    page_text = reader.pages[0].extract_text()
    # Assert the prefix explicitly: a bare `startswith(...)` expression is
    # evaluated and thrown away, so it would verify nothing.
    assert page_text.startswith("1/7 \n富邦新終身壽險")
def test_missing_entries_in_cmap():
    """
    Issue #2702: this issue is observed on damaged PDFs.

    The file from the issue was discarded for this test as too slow/long, so
    the same error is recreated from crazyones.pdf by pointing /ToUnicode of
    font /F1 at indirect object 99999999 (presumably absent from the file).
    """
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    page = reader.pages[0]
    to_unicode_ref = IndirectObject(99999999, 0, reader)
    page["/Resources"]["/Font"]["/F1"][NameObject("/ToUnicode")] = to_unicode_ref
    page.extract_text()  # must not raise
def test_null_missing_width():
    """For coverage of #2792: empty /Widths combined with a null /MissingWidth."""
    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
    first_page = writer.pages[0]
    font = first_page["/Resources"]["/Font"]["/F1"]
    font[NameObject("/Widths")] = ArrayObject()
    descriptor = font["/FontDescriptor"]
    descriptor[NameObject("/MissingWidth")] = NullObject()
    first_page.extract_text()  # must not raise
@pytest.mark.enable_socket
def test_unigb_utf16():
    """See #2812: the expected Chinese title is found on the second page."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/16767536/W020240105322424121296.pdf",
        name="iss2812.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    page_text = reader.pages[1].extract_text()
    assert "《中国能源展望 2060(2024 年版)》编写委员会" in page_text
@pytest.mark.enable_socket
def test_too_many_differences():
    """See #2836: the first page of the sample yields an empty string."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf",
        name="iss2836.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    page_text = reader.pages[0].extract_text()
    assert page_text == ""
@pytest.mark.enable_socket
def test_iss2925():
    """The expected English phrase is found in the text of the fourth page."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/17621508/2305.09315.pdf",
        name="iss2925.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    page_text = reader.pages[3].extract_text()
    assert "slicing on the PDG to extract the relevant contextual" in page_text
@pytest.mark.enable_socket
def test_iss2966():
    """Regression test for issue #2966: indirect objects in fonts."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/17904233/repro_out.pdf",
        name="iss2966.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    page_text = reader.pages[0].extract_text()
    assert "Lorem ipsum dolor sit amet" in page_text
@pytest.mark.enable_socket
def test_binascii_odd_length_string(caplog):
    """Tests for #2216: a broken line is skipped with a log message; text is still extracted."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18199642/iss2216.pdf",
        name="iss2216.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    page_text = first_page.extract_text()
    assert "\n(Many other theorems may\n" in page_text
    expected_log = "Skipping broken line b'143f 143f 10300': Odd-length string\n"
    assert expected_log in caplog.text
@pytest.mark.enable_socket
def test_standard_encoding(caplog):
    """Tests for #3156: text is extracted and no "Advanced encoding" message is logged."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18983503/standard-encoding.pdf",
        name="issue3156.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    page_text = first_page.extract_text()
    assert page_text == "Lorem ipsum"
    assert "Advanced encoding" not in caplog.text
@pytest.mark.enable_socket
def test_function_in_font_widths(caplog):
    """Tests for #3153: a non-numeric width is reported in the log; text is still extracted."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18945709/Marseille_pypdf_level_0.2._compressed.pdf",
        name="issue3153.pdf",
    )
    target_page = PdfReader(BytesIO(pdf_bytes)).pages[455]
    page_text = target_page.extract_text()
    assert "La vulnérabilité correspond aux conséquences potentielles" in page_text
    expected_log = "Expected numeric value for width, got {'/Bounds': [0.25, 0.25],"
    assert expected_log in caplog.text
def test_get_encoding__encoding_value_is_none():
    """A null /Encoding yields the /StandardEncoding table and an empty second mapping."""
    font_dict = DictionaryObject()
    font_dict[NameObject("/Encoding")] = NullObject()
    # Map byte codes 0..255 to the entries of the /StandardEncoding charset.
    standard_table = dict(zip(range(256), charset_encoding["/StandardEncoding"]))
    assert get_encoding(font_dict) == (standard_table, {})
def test_parse_bfchar(caplog):
    """A valid bfchar line is mapped; an odd-length hex target maps to "" and is logged."""
    map_dict = {}
    int_entry = []
    # The second line's target "1f310" has an odd number of hex digits.
    for raw_line in (b"057e 1337", b"056e 1f310"):
        parse_bfchar(line=raw_line, map_dict=map_dict, int_entry=int_entry)
    assert map_dict == {-1: 2, "ծ": "", "վ": "ጷ"}
    assert int_entry == [1406, 1390]
    assert caplog.messages == ["Got invalid hex string: Odd-length string (b'1f310')"]
def test_parse_bfrange__iteration_limit():
    """
    ``LimitReachedError`` is raised once a bfrange would exceed the /ToUnicode limit.

    The expected messages all report a limit of 100000. Five scenarios are
    covered: a single oversized range inside a real /ToUnicode stream, a range
    overflowing an almost-full ``int_entry``, an already overflowing
    ``int_entry``, a continued multi-line range, and a range given as a list.
    """
    writer = PdfWriter()

    to_unicode = StreamObject()
    to_unicode.set_data(
        b"beginbfrange\n"
        b"<00000000> <001FFFFF> <00000000>\n"
        b"endbfrange\n"
    )
    font_dict = DictionaryObject({
        NameObject("/Type"): NameObject("/Font"),
        NameObject("/Subtype"): NameObject("/Type1"),
        NameObject("/BaseFont"): NameObject("/Helvetica"),
        NameObject("/ToUnicode"): to_unicode,
    })
    font = writer._add_object(font_dict)

    page = writer.add_blank_page(width=100, height=100)
    font_resources = DictionaryObject({NameObject("/F1"): font.indirect_reference})
    page[NameObject("/Resources")] = DictionaryObject({NameObject("/Font"): font_resources})

    # Scenario 1: plain range (no list) that exceeds the limit on its own.
    with pytest.raises(
        LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 2097152 > 100000\.$",
    ):
        page.extract_text()

    # Scenario 2: plain range (no list). The range alone stays under the limit,
    # but together with the pre-filled dummy entries (simulating earlier calls)
    # the overall size does not.
    int_entry = [0] * 99_999
    map_dict = {}
    with pytest.raises(
        LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 165535 > 100000\.$",
    ):
        parse_bfrange(line=b"0000 FFFF 0000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
    assert map_dict == {-1: 2}

    # Scenario 3: one more existing entry, then a range with a single code.
    int_entry.append(1)
    map_dict = {}
    with pytest.raises(
        LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$",
    ):
        parse_bfrange(
            line=b"00000000 00000000 00000000",
            map_dict=map_dict,
            int_entry=int_entry,
            multiline_rg=None,
        )
    assert map_dict == {-1: 4}

    # Scenario 4: a multiline_rg is passed in; five entries are mapped before the error.
    int_entry = [0] * 99_995
    map_dict = {-1: 1}
    with pytest.raises(
        LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$",
    ):
        parse_bfrange(
            line=b"0020 0021 0022 0023 0024 0025 0026 2019",
            map_dict=map_dict,
            int_entry=int_entry,
            multiline_rg=(32, 251),
        )
    assert map_dict == {-1: 1, " ": " ", "!": "!", '"': '"', "#": "#", "$": "$"}

    # Scenario 5: no multiline_rg, but the targets are given as a list.
    int_entry = [0] * 99_995
    map_dict = {}
    with pytest.raises(
        LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$",
    ):
        parse_bfrange(
            line=b"01 8A [ FFFD FFFD FFFD FFFF FFAB AAAA BBBB",
            map_dict=map_dict,
            int_entry=int_entry,
            multiline_rg=None,
        )
    assert map_dict == {-1: 1, "\x01": "�", "\x02": "�", "\x03": "�", "\x04": "\uffff", "\x05": "ᆱ"}
def test_parse_bfchar__iteration_limit():
    """``parse_bfchar`` raises ``LimitReachedError`` and leaves ``map_dict`` empty."""
    # 99_995 existing entries plus the seven pairs on the line below give the
    # 100002 reported in the expected message.
    int_entry = [0] * 99_995
    map_dict = {}
    bfchar_line = b"0003 0020 0008 0025 0009 0026 000A 0027 000B 0028 000C 0029 000D 002A"
    with pytest.raises(
        LimitReachedError,
        match=r"^Maximum /ToUnicode size limit reached: 100002 > 100000\.$",
    ):
        parse_bfchar(line=bfchar_line, map_dict=map_dict, int_entry=int_entry)
    assert map_dict == {}
|