File: test_word_delimiters.py

package info (click to toggle)
pymupdf 1.25.4+ds1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 98,632 kB
  • sloc: python: 43,379; ansic: 75; makefile: 6
file content (23 lines) | stat: -rw-r--r-- 756 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pymupdf
import string


def test_delimiters():
    """Test changing word delimiting characters.

    Inserts one line of text into a new page of an empty document and
    checks three things:

    * the default ``get_text("words")`` call yields the expected
      whitespace-separated words, punctuation included;
    * passing ``delimiters=string.punctuation`` yields a different list
      whose words no longer contain the punctuation;
    * a later call without ``delimiters`` returns the original list again.
    """
    text = "word1,word2 - word3. word4?word5."

    # Open the document as a context manager so it is closed even when one
    # of the assertions below fails.
    with pymupdf.open() as doc:
        page = doc.new_page()
        page.insert_text((50, 50), text)

        # Standard words extraction:
        # only spaces and line breaks start a new word
        words0 = [w[4] for w in page.get_text("words")]
        assert words0 == ["word1,word2", "-", "word3.", "word4?word5."]

        # Extract again, this time passing the ASCII punctuation
        # characters as delimiters.
        words1 = [w[4] for w in page.get_text("words", delimiters=string.punctuation)]
        assert words0 != words1
        assert " ".join(words1) == "word1 word2 word3 word4 word5"

        # Confirm that a call without delimiters still gives the original
        # extraction.
        assert [w[4] for w in page.get_text("words")] == words0