File: test_cmap.py

package info (click to toggle)
pypdf 6.9.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 18,184 kB
  • sloc: python: 48,595; makefile: 35
file content (424 lines) | stat: -rw-r--r-- 13,853 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
"""Test the pypdf_cmap module."""
from io import BytesIO

import pytest

from pypdf import PdfReader, PdfWriter
from pypdf._cmap import get_encoding, parse_bfchar, parse_bfrange
from pypdf._codecs import charset_encoding
from pypdf._font import Font
from pypdf.errors import LimitReachedError
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NullObject, StreamObject

from . import RESOURCE_ROOT, get_data_from_url


@pytest.mark.enable_socket
@pytest.mark.slow
@pytest.mark.parametrize(
    ("url", "name", "strict"),
    [
        # compute_space_width:
        (
            None,
            "tika-923406.pdf",
            False,
        ),
        # _parse_to_unicode_process_rg:
        (
            None,
            "tika-959173.pdf",
            False,
        ),
        (
            None,
            "tika-959173.pdf",
            True,
        ),
        # issue #1718:
        (
            None,
            "iss1718.pdf",
            False,
        ),
    ],
)
def test_text_extraction_slow(caplog, url: str, name: str, strict: bool):
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=strict)
    for page in reader.pages:
        page.extract_text()
    assert caplog.text == ""


@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "strict"),
    [
        # bfchar_on_2_chars: issue #1293
        (
            None,
            "ASurveyofImageClassificationBasedTechniques.pdf",
            False,
        ),
        # L40, get_font_width_from_default
        (
            None,
            "tika-908104.pdf",
            False,
        ),
        # multiline_bfrange / regression test for issue #1285:
        (
            None,
            "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf",
            False,
        ),
        (
            None,
            "Giacalone.pdf",
            False,
        ),
    ],
)
def test_text_extraction_fast(caplog, url: str, name: str, strict: bool):
    """Text extraction runs without exceptions or warnings"""
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=strict)
    for page in reader.pages:
        page.extract_text()
    assert caplog.text == ""


@pytest.mark.enable_socket
def test_parse_encoding_advanced_encoding_not_implemented(caplog):
    reader = PdfReader(BytesIO(get_data_from_url(name="tika-957144.pdf")))
    for page in reader.pages:
        page.extract_text()
    # The correctly spelled encoding is /WinAnsiEncoding
    assert "Advanced encoding /WinAnsEncoding not implemented yet" in caplog.text


@pytest.mark.enable_socket
def test_ascii_charset():
    # Issue #1312
    reader = PdfReader(BytesIO(get_data_from_url(name="ascii charset.pdf")))
    assert "/a" not in reader.pages[0].extract_text()


@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "page_nb", "within_text"),
    [
        (
            None,
            "cmap1370.pdf",
            0,
            "",
        ),
        (
            None,
            "02voc.pdf",
            2,
            "Document delineation and character sequence decoding",
        ),
    ],
    ids=["iss1370", "iss1379"],
)
def test_text_extraction_of_specific_pages(
    url: str, name: str, page_nb: int, within_text
):
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert within_text in reader.pages[page_nb].extract_text()


@pytest.mark.enable_socket
def test_iss1533():
    reader = PdfReader(BytesIO(get_data_from_url(name="iss1533.pdf")))
    reader.pages[0].extract_text()  # no error
    font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F"])
    assert font.character_map["\x01"] == "Ü"


@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "page_index", "within_text", "caplog_text"),
    [
        (
            None,
            "tstUCS2.pdf",
            1,
            ["2 / 12", "S0490520090001", "于博"],
            "",
        ),
        (
            None,
            "tst-GBK_EUC.pdf",
            0,
            ["NJA", "中华男科学杂志"],
            "Multiple definitions in dictionary at byte 0x5cb42 for key /MediaBox\n",
        ),
    ],
)
def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text):
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    extracted = reader.pages[page_index].extract_text()  # no error
    for contained in within_text:
        assert contained in extracted
    assert caplog_text in caplog.text


@pytest.mark.enable_socket
def test_latex():
    reader = PdfReader(BytesIO(get_data_from_url(name="math_latex.pdf")))
    txt = reader.pages[0].extract_text()  # no error
    for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"):
        assert pat in txt
    # actually the ϕ and φ seems to be crossed in latex


@pytest.mark.enable_socket
def test_unixxx_glyphs():
    reader = PdfReader(BytesIO(get_data_from_url(name="unixxx_glyphs.pdf")))
    txt = reader.pages[0].extract_text()  # no error
    for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"):
        assert pat in txt


@pytest.mark.enable_socket
def test_cmap_compute_space_width():
    # issue 2137
    # original file URL:
    # url = "https://arxiv.org/pdf/2005.05909.pdf"
    # URL from github issue is too long to pass code type check, use original arxiv URL instead
    # url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf")))
    reader.pages[0].extract_text()  # no error


@pytest.mark.enable_socket
def test_tabs_in_cmap():
    """Issue #2173"""
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2173.pdf")))
    reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_ignoring_non_put_entries():
    """Issue #2290"""
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
    reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_eten_b5():
    """Issue #2356"""
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
    reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險")


def test_missing_entries_in_cmap():
    """
    Issue #2702: this issue is observed on damaged pdfs
    use of this file in test has been discarded as too slow/long
    we will create the same error from crazyones
    """
    pdf_path = RESOURCE_ROOT / "crazyones.pdf"
    reader = PdfReader(pdf_path)
    p = reader.pages[0]
    p["/Resources"]["/Font"]["/F1"][NameObject("/ToUnicode")] = IndirectObject(
        99999999, 0, reader
    )
    p.extract_text()


def test_null_missing_width():
    """For coverage of #2792"""
    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
    page = writer.pages[0]
    ft = page["/Resources"]["/Font"]["/F1"]
    ft[NameObject("/Widths")] = ArrayObject()
    ft["/FontDescriptor"][NameObject("/MissingWidth")] = NullObject()
    page.extract_text()


@pytest.mark.enable_socket
def test_unigb_utf16():
    """Cf #2812"""
    url = (
        "https://github.com/user-attachments/files/16767536/W020240105322424121296.pdf"
    )
    name = "iss2812.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert "《中国能源展望 2060(2024 年版)》编写委员会" in reader.pages[1].extract_text()


@pytest.mark.enable_socket
def test_too_many_differences():
    """Cf #2836"""
    url = (
        "https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf"
    )
    name = "iss2836.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert reader.pages[0].extract_text() == ""


@pytest.mark.enable_socket
def test_iss2925():
    url = (
        "https://github.com/user-attachments/files/17621508/2305.09315.pdf"
    )
    name = "iss2925.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert "slicing on the PDG to extract the relevant contextual" in reader.pages[3].extract_text()


@pytest.mark.enable_socket
def test_iss2966():
    """Regression test for issue #2966: indirect objects in fonts"""
    url = (
        "https://github.com/user-attachments/files/17904233/repro_out.pdf"
    )
    name = "iss2966.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    assert "Lorem ipsum dolor sit amet" in reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_binascii_odd_length_string(caplog):
    """Tests for #2216"""
    url = "https://github.com/user-attachments/files/18199642/iss2216.pdf"
    name = "iss2216.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    page = reader.pages[0]
    assert "\n(Many other theorems may\n" in page.extract_text()
    assert "Skipping broken line b'143f   143f   10300': Odd-length string\n" in caplog.text


@pytest.mark.enable_socket
def test_standard_encoding(caplog):
    """Tests for #3156"""
    url = "https://github.com/user-attachments/files/18983503/standard-encoding.pdf"
    name = "issue3156.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    page = reader.pages[0]
    assert page.extract_text() == "Lorem ipsum"
    assert "Advanced encoding" not in caplog.text


@pytest.mark.enable_socket
def test_function_in_font_widths(caplog):
    """Tests for #3153"""
    url = "https://github.com/user-attachments/files/18945709/Marseille_pypdf_level_0.2._compressed.pdf"
    name = "issue3153.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    page = reader.pages[455]
    assert "La vulnérabilité correspond aux conséquences potentielles" in page.extract_text()
    assert "Expected numeric value for width, got {'/Bounds': [0.25, 0.25]," in caplog.text


def test_get_encoding__encoding_value_is_none():
    ft = DictionaryObject()
    ft[NameObject("/Encoding")] = NullObject()
    assert get_encoding(ft) == (
        dict(zip(range(256), charset_encoding["/StandardEncoding"])),
        {}
    )


def test_parse_bfchar(caplog):
    map_dict = {}
    int_entry = []
    parse_bfchar(line=b"057e   1337", map_dict=map_dict, int_entry=int_entry)
    parse_bfchar(line=b"056e   1f310", map_dict=map_dict, int_entry=int_entry)

    assert map_dict == {-1: 2, "ծ": "", "վ": "ጷ"}
    assert int_entry == [1406, 1390]
    assert caplog.messages == ["Got invalid hex string: Odd-length string (b'1f310')"]


def test_parse_bfrange__iteration_limit():
    writer = PdfWriter()

    to_unicode = StreamObject()
    to_unicode.set_data(
        b"beginbfrange\n"
        b"<00000000> <001FFFFF> <00000000>\n"
        b"endbfrange\n"
    )
    font = writer._add_object(DictionaryObject({
        NameObject("/Type"): NameObject("/Font"),
        NameObject("/Subtype"): NameObject("/Type1"),
        NameObject("/BaseFont"): NameObject("/Helvetica"),
        NameObject("/ToUnicode"): to_unicode,
    }))

    page = writer.add_blank_page(width=100, height=100)
    page[NameObject("/Resources")] = DictionaryObject({
        NameObject("/Font"): DictionaryObject({
            NameObject("/F1"): font.indirect_reference,
        })
    })

    # Case without list, exceeding list directly.
    with pytest.raises(
            expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 2097152 > 100000\.$"
    ):
        _ = page.extract_text()

    # Use a pre-filled dummy list to simulate multiple calls where the upper bound does
    # not overflow, but the overall size does. Case without list.
    int_entry = [0] * 99_999
    map_dict = {}
    with pytest.raises(
            expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 165535 > 100000\.$"
    ):
        _ = parse_bfrange(line=b"0000 FFFF 0000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
    assert map_dict == {-1: 2}

    # Exceeding from previous call.
    int_entry.append(1)
    map_dict = {}
    with pytest.raises(
            expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
    ):
        _ = parse_bfrange(line=b"00000000 00000000 00000000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
    assert map_dict == {-1: 4}

    # multiline_rg
    int_entry = [0] * 99_995
    map_dict = {-1: 1}
    with pytest.raises(
            expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
    ):
        _ = parse_bfrange(
            line=b"0020  0021  0022  0023  0024  0025  0026  2019",
            map_dict=map_dict, int_entry=int_entry, multiline_rg=(32, 251)
        )
    assert map_dict == {-1: 1, " ": " ", "!": "!", '"': '"', "#": "#", "$": "$"}

    # No multiline_rg, but list.
    int_entry = [0] * 99_995
    map_dict = {}
    with pytest.raises(
            expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
    ):
        _ = parse_bfrange(
            line=b"01 8A [ FFFD FFFD FFFD FFFF FFAB AAAA BBBB",
            map_dict=map_dict, int_entry=int_entry, multiline_rg=None
        )
    assert map_dict == {-1: 1, "\x01": "�", "\x02": "�", "\x03": "�", "\x04": "\uffff", "\x05": "ᆱ"}


def test_parse_bfchar__iteration_limit():
    int_entry = [0] * 99_995
    map_dict = {}
    with pytest.raises(
            expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100002 > 100000\.$"
    ):
        parse_bfchar(
            line=b"0003   0020   0008   0025   0009   0026   000A   0027   000B   0028   000C   0029   000D   002A",
            map_dict=map_dict, int_entry=int_entry,
        )
    assert map_dict == {}