File: test_cmap.py

package info (click to toggle)
pypdf2 2.12.1-3+deb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 27,144 kB
  • sloc: python: 28,767; makefile: 119; sh: 2
file content (119 lines) | stat: -rw-r--r-- 3,591 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from io import BytesIO

import pytest

from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadWarning

from . import get_pdf_from_url


@pytest.mark.external
@pytest.mark.slow
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs an external sample PDF"
)
def test_compute_space_width() -> None:
    """Extract text from every page of the govdocs1 sample ``923406.pdf``.

    The test makes no assertion on the extracted text; it passes when
    ``extract_text()`` completes on each page without raising.

    The test used to start with a bare ``return``, which left everything
    below it unreachable while the test still reported as passed. It is now
    skipped explicitly so that the test report shows it did not run.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf"
    name = "tika-923406.pdf"

    # NOTE(review): get_pdf_from_url is defined outside this file; presumably
    # it downloads `url` and caches the bytes under `name` -- confirm.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    for page in reader.pages:
        page.extract_text()


@pytest.mark.external
@pytest.mark.slow
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs an external sample PDF"
)
def test_parse_to_unicode_process_rg() -> None:
    """Extract text from the govdocs1 sample ``959173.pdf`` in both modes.

    The document is read twice: once with the reader's default settings and
    once with ``strict=True``. No assertion is made on the text; the test
    passes when every page is extracted without raising.

    The test used to start with a bare ``return``, which left everything
    below it unreachable while the test still reported as passed. It is now
    skipped explicitly so that the test report shows it did not run.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf"
    name = "tika-959173.pdf"

    # NOTE(review): get_pdf_from_url is defined outside this file; presumably
    # it downloads `url` and caches the bytes under `name` -- confirm.
    # First pass: default reader settings.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    for page in reader.pages:
        page.extract_text()

    # Second pass: same document, strict parsing.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=True)
    for page in reader.pages:
        page.extract_text()


@pytest.mark.external
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs an external sample PDF"
)
def test_parse_encoding_advanced_encoding_not_implemented() -> None:
    """Check that extracting ``957144.pdf`` emits the expected warning.

    While text is extracted from every page, at least one ``PdfReadWarning``
    whose message matches ``"Advanced encoding .* not implemented yet"``
    must be raised.

    The test used to start with a bare ``return``, so the ``pytest.warns``
    check was never evaluated while the test still reported as passed. It is
    now skipped explicitly so that the test report shows it did not run.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf"
    name = "tika-957144.pdf"

    # NOTE(review): get_pdf_from_url is defined outside this file; presumably
    # it downloads `url` and caches the bytes under `name` -- confirm.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    # The warning context wraps the whole loop: the warning may come from
    # any page, not necessarily the first one.
    with pytest.warns(PdfReadWarning, match="Advanced encoding .* not implemented yet"):
        for page in reader.pages:
            page.extract_text()


@pytest.mark.external
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs an external sample PDF"
)
def test_get_font_width_from_default() -> None:  # L40
    """Extract text from every page of the govdocs1 sample ``908104.pdf``.

    The test makes no assertion on the extracted text; it passes when
    ``extract_text()`` completes on each page without raising.

    The test used to start with a bare ``return``, which left everything
    below it unreachable while the test still reported as passed. It is now
    skipped explicitly so that the test report shows it did not run.
    """
    # NOTE(review): the "# L40" marker on the def line is kept from the
    # original; presumably a line reference into the module under test.
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf"
    name = "tika-908104.pdf"

    # NOTE(review): get_pdf_from_url is defined outside this file; presumably
    # it downloads `url` and caches the bytes under `name` -- confirm.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    for page in reader.pages:
        page.extract_text()


@pytest.mark.external
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs external sample PDFs"
)
def test_multiline_bfrange() -> None:
    """Non-regression test for iss_1285: extract text from two sample PDFs.

    No assertion is made on the text; the test passes when every page of
    both documents is extracted without raising.

    The test used to start with a bare ``return``, which left everything
    below it unreachable while the test still reported as passed. It is now
    skipped explicitly so that the test report shows it did not run.
    """
    samples = (
        (
            "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf",
            # The original reused "tika-908104.pdf" here, the same name that
            # test_get_font_width_from_default passes for a different URL.
            # NOTE(review): if get_pdf_from_url caches by `name` (defined
            # outside this file -- confirm), the shared name would make this
            # test read the wrong document, so a unique name is used.
            "iss1285-lean-times-peruvian-economy.pdf",
        ),
        (
            "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf",
            "Giacalone.pdf",
        ),
    )

    for url, name in samples:
        reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
        for page in reader.pages:
            page.extract_text()


@pytest.mark.external
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs an external sample PDF"
)
def test_bfchar_on_2_chars() -> None:
    """Non-regression test for iss #1293: extract text from every page.

    No assertion is made on the text; the test passes when
    ``extract_text()`` completes on each page without raising.

    The test used to start with a bare ``return``, which left everything
    below it unreachable while the test still reported as passed. It is now
    skipped explicitly so that the test report shows it did not run.
    """
    url = "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf"
    name = "ASurveyofImageClassificationBasedTechniques.pdf"

    # NOTE(review): get_pdf_from_url is defined outside this file; presumably
    # it downloads `url` and caches the bytes under `name` -- confirm.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    for page in reader.pages:
        page.extract_text()


@pytest.mark.external
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs an external sample PDF"
)
def test_ascii_charset() -> None:
    """Non-regression test for iss #1312.

    The text extracted from the first page of the sample must not contain
    the substring ``"/a"``.

    The test used to start with a bare ``return``, so the assertion was
    never evaluated while the test still reported as passed. It is now
    skipped explicitly so that the test report shows it did not run.
    """
    url = "https://github.com/py-pdf/PyPDF2/files/9472500/main.pdf"
    name = "ascii charset.pdf"

    # NOTE(review): get_pdf_from_url is defined outside this file; presumably
    # it downloads `url` and caches the bytes under `name` -- confirm.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    assert "/a" not in reader.pages[0].extract_text()


@pytest.mark.external
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs an external sample PDF"
)
def test_iss1370() -> None:
    """Extract text from the first page of ``cmap1370.pdf``.

    No assertion is made on the text; the test passes when
    ``extract_text()`` completes without raising.

    The test used to start with a bare ``return``, which left everything
    below it unreachable while the test still reported as passed. It is now
    skipped explicitly so that the test report shows it did not run.
    """
    url = "https://github.com/py-pdf/PyPDF2/files/9667138/cmap1370.pdf"
    name = "cmap1370.pdf"

    # NOTE(review): get_pdf_from_url is defined outside this file; presumably
    # it downloads `url` and caches the bytes under `name` -- confirm.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    reader.pages[0].extract_text()


@pytest.mark.external
@pytest.mark.skip(
    reason="disabled: body was behind an unconditional return; "
    "needs an external sample PDF"
)
def test_iss1379() -> None:
    """Extract text from the third page (index 2) of ``02voc.pdf``.

    No assertion is made on the text; the test passes when
    ``extract_text()`` completes without raising.

    The test used to start with a bare ``return``, which left everything
    below it unreachable while the test still reported as passed. It is now
    skipped explicitly so that the test report shows it did not run.
    """
    url = "https://github.com/py-pdf/PyPDF2/files/9712729/02voc.pdf"
    name = "02voc.pdf"

    # NOTE(review): get_pdf_from_url is defined outside this file; presumably
    # it downloads `url` and caches the bytes under `name` -- confirm.
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    reader.pages[2].extract_text()