File: test_tesseract.py

package info (click to toggle)
pymupdf 1.25.4%2Bds1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 98,632 kB
  • sloc: python: 43,379; ansic: 75; makefile: 6
file content (152 lines) | stat: -rw-r--r-- 8,963 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import platform
import textwrap

import pymupdf

def test_tesseract():
    '''
    This checks that MuPDF has been built with tesseract support.

    By default we don't supply a valid `tessdata` directory, and just assert
    that attempting to use Tesseract raises the expected error (which checks
    that MuPDF is built with Tesseract support).

    But if TESSDATA_PREFIX is set in the environment, we assert that
    FzPage.get_textpage_ocr() succeeds.
    '''
    path = os.path.abspath( f'{__file__}/../resources/2.pdf')
    doc = pymupdf.open( path)
    page = doc[5]
    if hasattr(pymupdf, 'mupdf'):
        # rebased.
        if pymupdf.mupdf_version_tuple < (1, 25, 4):
            tail = 'OCR initialisation failed'
        else:
            tail = 'Tesseract language initialisation failed'
        e_expected = f'code=3: {tail}'
        if platform.system() == 'OpenBSD':
            # 2023-12-12: For some reason the SWIG catch code only catches
            # the exception as FzErrorBase.
            e_expected_type = pymupdf.mupdf.FzErrorBase
            print(f'OpenBSD workaround - expecting FzErrorBase, not FzErrorLibrary.')
        else:
            e_expected_type = pymupdf.mupdf.FzErrorLibrary
    else:
        # classic.
        e_expected = 'OCR initialisation failed'
        e_expected_type = None
    tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
    if tessdata_prefix:
        tp = page.get_textpage_ocr(full=True)
        print(f'test_tesseract(): page.get_textpage_ocr() succeeded')
    else:
        try:
            tp = page.get_textpage_ocr(full=True, tessdata='/foo/bar')
        except Exception as e:
            e_text = str(e)
            print(f'Received exception as expected.')
            print(f'{type(e)=}')
            print(f'{e_text=}')
            assert e_text == e_expected, f'Unexpected exception: {e_text!r}'
            if e_expected_type:
                print(f'{e_expected_type=}')
                assert type(e) == e_expected_type, f'{type(e)=} != {e_expected_type=}.'
        else:
            assert 0, f'Expected exception {e_expected!r}'
        rebased = hasattr(pymupdf, 'mupdf')
        if rebased:
            wt = pymupdf.TOOLS.mupdf_warnings()
            if pymupdf.mupdf_version_tuple < (1, 25, 4):
                assert wt == (
                        'UNHANDLED EXCEPTION!\n'
                        'library error: Tesseract initialisation failed'
                        )
            else:
                assert not wt
        

def test_3842b():
    # Check Tesseract failure when given a bogus languages.
    #
    # Note that Tesseract seems to output its own diagnostics.
    #
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf')
    with pymupdf.open(path) as document:
        page = document[6]
        try:
            partial_tp = page.get_textpage_ocr(flags=0, full=False, language='qwerty')
        except Exception as e:
            print(f'test_3842b(): received exception: {e}')
            if 'No tessdata specified and Tesseract is not installed' in str(e):
                pass
            else:
                if pymupdf.mupdf_version_tuple < (1, 25, 4):
                    assert 'OCR initialisation failed' in str(e)
                    wt = pymupdf.TOOLS.mupdf_warnings()
                    assert wt == 'UNHANDLED EXCEPTION!\nlibrary error: Tesseract initialisation failed\nUNHANDLED EXCEPTION!\nlibrary error: Tesseract initialisation failed', \
                            f'Unexpected {wt=}'
                else:
                    assert 'Tesseract language initialisation failed' in str(e)


def test_3842():
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf')
    with pymupdf.open(path) as document:
        page = document[6]
        try:
            partial_tp = page.get_textpage_ocr(flags=0, full=False)
        except Exception as e:
            print(f'test_3842(): received exception: {e}', flush=1)
            if 'No tessdata specified and Tesseract is not installed' in str(e):
                pass
            elif 'Tesseract language initialisation failed' in str(e):
                pass
            else:
                assert 0, f'Unexpected exception text: {str(e)=}'
        else:
            text = page.get_text(textpage=partial_tp)
            print()
            print(text)
            print(f'text:\n{text!r}')
            
            # 2024-11-29: This is the current incorrect output. We use
            # underscores for lines containing entirely whitespace (which
            # textwrap.dedent() unfortunately replaces with empty lines).
            text_expected = textwrap.dedent('''
                    NIST SP 800-223  
                    _
                    High-Performance Computing Security 
                    February 2024 
                    _
                    __
                    iii 
                    Table of Contents 
                    1. Introduction ...................................................................................................................................1 
                    2. HPC System Reference Architecture and Main Components ............................................................2 
                    2.1.1. Components of the High-Performance Computing Zone ............................................................. 3 
                    2.1.2. Components of the Data Storage Zone ........................................................................................ 4 
                    2.1.3. Parallel File System ....................................................................................................................... 4 
                    2.1.4. Archival and Campaign Storage .................................................................................................... 5 
                    2.1.5. Burst Buffer .................................................................................................................................. 5 
                    2.1.6. Components of the Access Zone .................................................................................................. 6 
                    2.1.7. Components of the Management Zone ....................................................................................... 6 
                    2.1.8. General Architecture and Characteristics .................................................................................... 6 
                    2.1.9. Basic Services ................................................................................................................................ 7 
                    2.1.10. Configuration Management ....................................................................................................... 7 
                    2.1.11. HPC Scheduler and Workflow Management .............................................................................. 7 
                    2.1.12. HPC Software .............................................................................................................................. 8 
                    2.1.13. User Software ............................................................................................................................. 8 
                    2.1.14. Site-Provided Software and Vendor Software ........................................................................... 8 
                    2.1.15. Containerized Software in HPC .................................................................................................. 9 
                    3. HPC Threat Analysis...................................................................................................................... 10 
                    3.2.1. Access Zone Threats ................................................................................................................... 11 
                    3.2.2. Management Zone Threats ........................................................................................................ 11 
                    3.2.3. High-Performance Computing Zone Threats .............................................................................. 12 
                    3.2.4. Data Storage Zone Threats ......................................................................................................... 12 
                    4. HPC Security Posture, Challenges, and Recommendations ............................................................. 14 
                    5. Conclusions .................................................................................................................................. 19 
                    ''',
                    )[1:].replace('_', ' ')
            print(f'text_expected:\n{text_expected!r}')
            assert text == text_expected