1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
|
import pymupdf
import os
import gentle_compare
# Absolute path of the directory containing this test module; the tests below
# look up their input PDFs in its "resources" subdirectory.
scriptdir = os.path.abspath(os.path.dirname(__file__))
def test_707448():
    """Sanitizing a page's contents must leave its extracted words intact."""
    pdf_path = os.path.join(scriptdir, "resources", "test-707448.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]

    # Snapshot the word list before and after cleaning the content stream.
    before = page.get_text("words")
    page.clean_contents(sanitize=True)
    after = page.get_text("words")

    unchanged = gentle_compare.gentle_compare(before, after)
    assert unchanged
def test_707673():
    """Sanitizing a page's contents should leave its extracted words intact.

    Fails starting with MuPDF v1.23.9.
    Fixed in:
        commit 779b8234529cb82aa1e92826854c7bb98b19e44b (golden/master)

    Accordingly, the word lists are expected to match only when running
    against MuPDF 1.24.1 or later, and to differ on older versions.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test-707673.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]

    # Snapshot the word list before and after cleaning the content stream.
    before = page.get_text("words")
    page.clean_contents(sanitize=True)
    after = page.get_text("words")

    same = gentle_compare.gentle_compare(before, after)
    has_fix = pymupdf.mupdf_version_tuple >= (1, 24, 1)
    # The comparison outcome must agree with whether the fix is present.
    assert bool(same) == has_fix
def test_707727():
    """Sanitizing a page's contents must not change how the page renders.

    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707727

    The page is rendered before and after cleaning and the two pixmaps are
    compared; the accepted difference depends on the MuPDF version.
    """
    version = pymupdf.mupdf_version_tuple
    pdf_path = os.path.join(scriptdir, "resources", "test_3362.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]

    pix0 = page.get_pixmap()
    page.clean_contents(sanitize=True)
    # Reloading the page is required to prevent re-use.
    page = doc.reload_page(page)
    pix1 = page.get_pixmap()

    rms = gentle_compare.pixmaps_rms(pix0, pix1)
    print(f'{rms=}', flush=1)

    # Write both renderings to disk so they can be inspected afterwards.
    for pix, target in (
        (pix0, f'{__file__}/../../tests/test_707727_pix0.png'),
        (pix1, f'{__file__}/../../tests/test_707727_pix1.png'),
    ):
        pix.save(os.path.normpath(target))

    if version <= (1, 24, 1):
        # Older MuPDF: the renderings differ ...
        assert rms != 0
        # ... and we expect warnings.
        wt = pymupdf.TOOLS.mupdf_warnings()
        print(f"{wt=}")
        assert wt
    elif version < (1, 25, 2):
        assert rms == 0
    else:
        # New sanitising gives small fp rounding errors.
        assert rms < 0.05
def test_707721():
    """Confirm text extraction works for nested MCID with Type 3 fonts.

    PyMuPDF issue https://github.com/pymupdf/PyMuPDF/issues/3357
    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707721

    The test returns early, without checking anything, on MuPDF versions
    below 1.24.2, which are known to hang on this file.
    """
    if pymupdf.mupdf_version_tuple < (1, 24, 2):
        # f-string so that the running MuPDF version is interpolated into
        # the message rather than printed as a literal placeholder.
        print(
            f"test_707721(): not running because MuPDF-{pymupdf.mupdf_version} known to hang."
        )
        return
    filename = os.path.join(scriptdir, "resources", "test_3357.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    # Any non-empty extracted text counts as success.
    ok = page.get_text()
    assert ok
def test_3376():
    """Check fix of MuPDF bug 707733.

    https://bugs.ghostscript.com/show_bug.cgi?id=707733
    PyMuPDF issue https://github.com/pymupdf/PyMuPDF/issues/3376

    Test file contains a redaction for the first 3 words: "Table of Contents".

    Test strategy:
    - extract all words (sorted)
    - apply redactions
    - extract words again
    - confirm: we now have 3 words less and remaining words are equal.

    The remaining words are expected to match only with MuPDF 1.24.2 or
    later; on older versions the comparison is expected to fail.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_3376.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]

    words_before = page.get_text("words", sort=True)
    # Split into the 3 words covered by the redaction and everything else.
    redacted, expected_rest = words_before[:3], words_before[3:]
    redacted_text = " ".join(word[4] for word in redacted)
    assert redacted_text == "Table of Contents"

    page.apply_redactions()
    words_after = page.get_text("words", sort=True)

    matches = gentle_compare.gentle_compare(expected_rest, words_after)
    if pymupdf.mupdf_version_tuple < (1, 24, 2):
        assert not matches
    else:
        assert matches
|