File: test_mupdf_regressions.py

package info (click to toggle)
pymupdf 1.25.4%2Bds1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 98,632 kB
  • sloc: python: 43,379; ansic: 75; makefile: 6
file content (116 lines) | stat: -rw-r--r-- 3,771 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pymupdf
import os
import gentle_compare

scriptdir = os.path.abspath(os.path.dirname(__file__))


def test_707448():
    """Verify that sanitizing page contents leaves the page's words intact.

    Extracts the words of the first page of ``test-707448.pdf`` before and
    after ``clean_contents(sanitize=True)`` and requires the two extractions
    to be accepted as matching by ``gentle_compare``.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test-707448.pdf")
    first_page = pymupdf.open(pdf_path)[0]

    # Snapshot the words, clean the content stream, then snapshot again.
    before = first_page.get_text("words")
    first_page.clean_contents(sanitize=True)
    after = first_page.get_text("words")

    assert gentle_compare.gentle_compare(before, after)


def test_707673():
    """Verify that sanitizing page contents leaves the page's words intact.

    Fails starting with MuPDF v1.23.9.

    Fixed in:
    commit 779b8234529cb82aa1e92826854c7bb98b19e44b (golden/master)

    The word comparison is therefore expected to succeed from MuPDF 1.24.1
    onwards and to fail on earlier versions.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test-707673.pdf")
    first_page = pymupdf.open(pdf_path)[0]

    # Snapshot the words, clean the content stream, then snapshot again.
    before = first_page.get_text("words")
    first_page.clean_contents(sanitize=True)
    after = first_page.get_text("words")

    words_match = gentle_compare.gentle_compare(before, after)
    has_fix = pymupdf.mupdf_version_tuple >= (1, 24, 1)
    # With the fix the words must match; without it the regression must show.
    assert words_match if has_fix else not words_match


def test_707727():
    """Verify that sanitizing page contents leaves the rendered page unchanged.

    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707727

    Renders the first page of ``test_3362.pdf`` before and after
    ``clean_contents(sanitize=True)``, saves both pixmaps as PNG files for
    inspection, and checks the RMS difference against what the running MuPDF
    version is expected to produce.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_3362.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]

    pix0 = page.get_pixmap()
    page.clean_contents(sanitize=True)
    page = doc.reload_page(page)  # required to prevent re-use
    pix1 = page.get_pixmap()

    # `rms` must keep its name: it is part of the printed `rms=...` text.
    rms = gentle_compare.pixmaps_rms(pix0, pix1)
    print(f'{rms=}', flush=True)

    # Keep both renderings on disk so that a failure can be inspected.
    pix0.save(os.path.normpath(f'{__file__}/../../tests/test_707727_pix0.png'))
    pix1.save(os.path.normpath(f'{__file__}/../../tests/test_707727_pix1.png'))

    version = pymupdf.mupdf_version_tuple
    if version <= (1, 24, 1):
        # Old MuPDF: rendering is expected to change, and we expect warnings.
        assert rms != 0
        wt = pymupdf.TOOLS.mupdf_warnings()
        print(f"{wt=}")
        assert wt
    elif version < (1, 25, 2):
        assert rms == 0
    else:
        # New sanitising gives small fp rounding errors.
        assert rms < 0.05


def test_707721():
    """Confirm text extraction works for nested MCID with Type 3 fonts.

    PyMuPDF issue https://github.com/pymupdf/PyMuPDF/issues/3357
    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707721

    The test body is skipped (with a printed notice) on MuPDF versions older
    than 1.24.2, which are known to hang on the test file. Otherwise it
    requires that plain text extraction of the first page of
    ``test_3357.pdf`` returns a non-empty result.
    """
    if pymupdf.mupdf_version_tuple < (1, 24, 2):
        # The f-prefix is required so the actual MuPDF version is printed
        # instead of the literal "{pymupdf.mupdf_version}" placeholder.
        print(
            f"test_707721(): not running because MuPDF-{pymupdf.mupdf_version} known to hang."
        )
        return
    filename = os.path.join(scriptdir, "resources", "test_3357.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    text = page.get_text()
    assert text


def test_3376():
    """Check fix of MuPDF bug 707733.

    https://bugs.ghostscript.com/show_bug.cgi?id=707733
    PyMuPDF issue https://github.com/pymupdf/PyMuPDF/issues/3376

    The test file carries a redaction covering its first 3 words,
    "Table of Contents". Procedure:
    - extract all words of the page (sorted)
    - apply the redactions
    - extract the words once more
    - confirm the 3 redacted words are gone and the remaining words are
      unchanged (expected to hold from MuPDF 1.24.2 on, and to fail before).
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_3376.pdf")
    first_page = pymupdf.open(pdf_path)[0]

    all_words = first_page.get_text("words", sort=True)
    # Split into the words under the redaction and the words that must survive.
    redacted, surviving = all_words[:3], all_words[3:]
    # Item 4 of each word entry is joined and compared as the word's text.
    assert " ".join(word[4] for word in redacted) == "Table of Contents"

    first_page.apply_redactions()

    words_after = first_page.get_text("words", sort=True)

    unchanged = gentle_compare.gentle_compare(surviving, words_after)
    has_fix = pymupdf.mupdf_version_tuple >= (1, 24, 2)
    # With the fix the survivors must match; without it the bug must show.
    assert unchanged if has_fix else not unchanged