File: test_textsearch.py

package info (click to toggle)
pymupdf 1.25.4%2Bds1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 98,632 kB
  • sloc: python: 43,379; ansic: 75; makefile: 6
file content (52 lines) | stat: -rw-r--r-- 1,552 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""
"test_search1":
Search for some text on a PDF page, and compare content of returned hit
rectangle with the searched text.

"test_search2":
Text search with 'clip' parameter - clip rectangle contains two occurrences
of searched text. Confirm search locations are inside clip.
"""

import os

import pymupdf

# Test fixtures are PDF files in the "resources" folder next to this module.
scriptdir = os.path.abspath(os.path.dirname(__file__))
filename1, filename2, filename3 = (
    os.path.join(scriptdir, "resources", pdf_name)
    for pdf_name in ("2.pdf", "github_sample.pdf", "text-find-ligatures.pdf")
)


def test_search1():
    """Check that every search hit rectangle actually contains the needle.

    Searches the first page of ``filename1`` for "mupdf", requires at least
    one hit, and verifies that the text extracted from each returned
    rectangle contains the needle. The extracted text is lower-cased before
    the comparison.
    """
    needle = "mupdf"
    # The context manager closes the document even if an assertion fails.
    with pymupdf.open(filename1) as doc:
        page = doc[0]
        rlist = page.search_for(needle)
        assert rlist != [], f"no occurrence of {needle!r} found"
        for rect in rlist:
            assert needle in page.get_textbox(rect).lower()


def test_search2():
    """Check that a search restricted by ``clip`` only reports hits inside it.

    Searches the first page of ``filename2`` for "the" with a clip rectangle,
    expects exactly two hits, and verifies that every hit rectangle is
    contained in the clip rectangle.
    """
    needle = "the"
    clip = pymupdf.Rect(40.5, 228.31436157226562, 346.5226135253906, 239.5338592529297)
    # The context manager closes the document even if an assertion fails.
    with pymupdf.open(filename2) as doc:
        page = doc[0]
        rl = page.search_for(needle, clip=clip)
        assert len(rl) == 2, f"expected 2 hits inside clip, got {len(rl)}"
        for r in rl:
            assert r in clip, f"hit {r} lies outside clip {clip}"


def test_search3():
    """Ensure we find text whether or not it contains ligatures.

    Searches the first page of ``filename3`` for "flag" twice: once with
    ``TEXTFLAGS_SEARCH`` (expects two hits) and once with
    ``TEXT_PRESERVE_LIGATURES`` added (expects a single hit).
    """
    needle = "flag"
    # The context manager closes the document even if an assertion fails.
    with pymupdf.open(filename3) as doc:
        page = doc[0]
        hits = page.search_for(needle, flags=pymupdf.TEXTFLAGS_SEARCH)
        assert len(hits) == 2  # all occurrences found
        hits = page.search_for(
            needle, flags=pymupdf.TEXTFLAGS_SEARCH | pymupdf.TEXT_PRESERVE_LIGATURES
        )
        assert len(hits) == 1  # only found text without ligatures