# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
import os
import subprocess
from os import fspath
from pathlib import Path
import pytest
from ocrmypdf import pdfinfo
from ocrmypdf._exec import tesseract
from ocrmypdf.exceptions import MissingDependencyError
from .conftest import check_ocrmypdf
# pylint: disable=redefined-outer-name
@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])
def test_skip_pages_does_not_replicate(resources, basename, outdir):
    """Check that the output keeps one image per page and the input page size.

    Runs OCRmyPDF on ``basename`` with the sandwich renderer, ``--force-ocr``
    and ``--tesseract-timeout 0``, then inspects the result with ``PdfInfo``.
    The assertion messages indicate the pages are expected to be skipped by
    OCR; the test verifies such pages are neither duplicated nor resized.
    """
    source_pdf = resources / basename
    result_pdf = outdir / basename
    cli_flags = (
        '--pdf-renderer',
        'sandwich',
        '--force-ocr',
        '--tesseract-timeout',
        '0',
    )
    check_ocrmypdf(source_pdf, result_pdf, *cli_flags)

    source_info = pdfinfo.PdfInfo(source_pdf)
    result_info = pdfinfo.PdfInfo(result_pdf)

    # Every output page must carry exactly one image.
    for out_page in result_info:
        assert len(out_page.images) == 1, "skipped page was replicated"

    # Page dimensions must match the corresponding input page exactly.
    for index, out_page in enumerate(result_info):
        in_page = source_info[index]
        assert out_page.width_inches == in_page.width_inches, "output resized"
        assert out_page.height_inches == in_page.height_inches, "output resized"
def test_content_preservation(resources, outpdf):
    """Check that ``masks.pdf`` still has several images on its first page.

    The file is processed with the sandwich renderer and
    ``--tesseract-timeout 0``; finding only one image (or none) on the first
    output page is reported as the masks having been rasterized.
    """
    masks_pdf = resources / 'masks.pdf'
    cli_flags = ('--pdf-renderer', 'sandwich', '--tesseract-timeout', '0')
    check_ocrmypdf(masks_pdf, outpdf, *cli_flags)

    first_page = pdfinfo.PdfInfo(outpdf)[0]
    image_count = len(first_page.images)
    assert image_count > 1, "masks were rasterized"
@pytest.mark.skipif(tesseract.version() > '5', reason="doesn't fool Tess 5")
def test_no_languages(tmp_path, monkeypatch):
    """Check that an empty ``tessdata`` directory raises MissingDependencyError.

    ``TESSDATA_PREFIX`` is pointed at a temporary directory that contains only
    an empty ``tessdata`` folder before ``tesseract.get_languages()`` is called.
    """
    # NOTE(review): the skip condition compares tesseract.version() with the
    # string '5'. If version() returns a plain str this is a lexicographic
    # comparison (e.g. '10.0' would sort below '5') - confirm the return type.
    empty_tessdata = tmp_path / 'tessdata'
    empty_tessdata.mkdir()
    prefix = fspath(tmp_path)
    monkeypatch.setenv('TESSDATA_PREFIX', prefix)

    with pytest.raises(MissingDependencyError):
        tesseract.get_languages()
def test_image_too_large_hocr(monkeypatch, resources, outdir):
    """Check hOCR generation when Tesseract fails with 'Image too large'.

    ``tesseract.run`` is replaced by a stub that always raises
    ``CalledProcessError`` carrying ``b'Image too large'`` as its output.
    ``generate_hocr`` must still leave an hOCR file that contains the
    ``name='ocr-capabilities'`` marker.
    """

    def fake_run(args, *, env=None, **kwargs):
        # Simulate the Tesseract process rejecting the image.
        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')

    monkeypatch.setattr(tesseract, 'run', fake_run)

    hocr_path = outdir / 'out.hocr'
    hocr_kwargs = {
        'input_file': resources / 'crom.png',
        'output_hocr': hocr_path,
        'output_text': outdir / 'out.txt',
        'languages': ['eng'],
        'engine_mode': None,
        'tessconfig': [],
        'timeout': 180.0,
        'pagesegmode': None,
        'thresholding': 0,
        'user_words': None,
        'user_patterns': None,
    }
    tesseract.generate_hocr(**hocr_kwargs)

    hocr_text = Path(hocr_path).read_text()
    assert "name='ocr-capabilities'" in hocr_text
def test_image_too_large_pdf(monkeypatch, resources, outdir):
    """Check PDF generation when Tesseract fails with 'Image too large'.

    ``tesseract.run`` is replaced by a stub that always raises
    ``CalledProcessError`` carrying ``b'Image too large'`` as its output.
    ``generate_pdf`` must then write ``'[skipped page]'`` to the text output
    and, except on Windows, leave a zero-byte PDF.
    """

    def fake_run(args, *, env=None, **kwargs):
        # Simulate the Tesseract process rejecting the image.
        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')

    monkeypatch.setattr(tesseract, 'run', fake_run)

    pdf_path = outdir / 'pdf.pdf'
    text_path = outdir / 'txt.txt'
    pdf_kwargs = {
        'input_file': resources / 'crom.png',
        'output_pdf': pdf_path,
        'output_text': text_path,
        'languages': ['eng'],
        'engine_mode': None,
        'tessconfig': [],
        'timeout': 180.0,
        'pagesegmode': None,
        'thresholding': 0,
        'user_words': None,
        'user_patterns': None,
    }
    tesseract.generate_pdf(**pdf_kwargs)

    assert Path(text_path).read_text() == '[skipped page]'
    if os.name != 'nt':  # different semantics
        assert Path(pdf_path).stat().st_size == 0
def test_timeout(caplog):
    """Check that ``tesseract.page_timedout`` logs a "took too long" message."""
    timeout_arg = 5
    tesseract.page_timedout(timeout_arg)

    captured = caplog.text
    assert "took too long" in captured
@pytest.mark.parametrize(
    'in_, logged',
    [
        (b'Tesseract Open Source', ''),
        (b'lots of diacritics blah blah', 'diacritics'),
        (b'Warning in pixReadMem', ''),
        (b'OSD: Weak margin', 'unsure about page orientation'),
        (b'Error in pixScanForForeground', ''),
        (b'Error in boxClipToRectangle', ''),
        (b'an unexpected error', 'an unexpected error'),
        (b'a dire warning', 'a dire warning'),
        (b'read_params_file something', 'read_params_file'),
        (b'an innocent message', 'innocent'),
        (b'\x7f\x7f\x80innocent unicode failure', 'innocent'),
    ],
)
def test_tesseract_log_output(caplog, in_, logged):
    """Check what ``tesseract_log_output`` logs for sample Tesseract output.

    Each case pairs raw bytes (``in_``) with a substring expected in the
    captured log (``logged``). An empty ``logged`` means that nothing at all
    may be logged at INFO level or above.
    """
    caplog.set_level(logging.INFO)
    tesseract.tesseract_log_output(in_)

    captured = caplog.text
    if not logged:
        # This input is expected to be filtered out entirely.
        assert captured == ''
        return
    assert logged in captured
def test_tesseract_log_output_raises(caplog):
    """Check that a 'parameter not found' message raises and is logged.

    ``tesseract_log_output`` must raise ``TesseractConfigError`` for the
    input ``b'parameter not found: moo'``, and the captured log must contain
    ``'not found'``.
    """
    with pytest.raises(tesseract.TesseractConfigError):
        tesseract.tesseract_log_output(b'parameter not found: moo')

    # Checked after the ``with`` block on purpose: a statement placed after
    # the raising call inside the block would never execute.
    assert 'not found' in caplog.text
|