File: tests_hocr.py

package info (click to toggle)
paperwork 2.2.5-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 166,660 kB
  • sloc: python: 44,775; makefile: 992; sh: 625; xml: 135
file content (93 lines) | stat: -rw-r--r-- 2,870 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
import unittest

import openpaperwork_core

# XHTML fixture used as the content of "paper.4.words" by the tests below.
# It holds a single "ocr_line" span (bbox 10 20 30 40) wrapping two
# "ocrx_word" spans: "ABC" (bbox 1 2 3 4) and "def" (bbox 5 6 7 8).
TEST_XML = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
 "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <meta http-equiv="content-type" content="text/html; charset=utf-8" />
    <title>OCR output</title>
</head>
<body>
<p>
    <span class="ocr_line" title="bbox 10 20 30 40">
        <span class="ocrx_word" title="bbox 1 2 3 4">ABC</span>
        <span class="ocrx_word" title="bbox 5 6 7 8">def</span>
    </span>
</p>
</body>
</html>
"""


class TestHocr(unittest.TestCase):
    """Tests for the "paperwork_backend.model.hocr" plugin on a fake filesystem.

    Each test stores TEST_XML as "paper.4.words" inside the document
    "some_doc" and then queries the plugin through the core. The tests
    expect that file to be served for page index 3, and expect None for
    page index 1, for which no file exists.
    """

    _DOC_URL = "file:///some_doc"
    _PAGE_WITH_TEXT = 3
    _PAGE_WITHOUT_FILE = 1

    def setUp(self):
        """Create a core with the fake filesystem and the hOCR plugin loaded."""
        self.core = openpaperwork_core.Core(auto_load_dependencies=True)
        self.core.load("openpaperwork_core.fs.fake")
        self.core.load("paperwork_backend.model.hocr")
        self.core.init()

        self.fs = self.core.get_by_name("openpaperwork_core.fs.fake")

    def _install_fixture(self):
        """Populate the fake filesystem with one document holding TEST_XML."""
        self.fs.fs = {
            "some_doc": {
                "paper.4.words": TEST_XML,
            },
        }

    def test_get_boxes(self):
        """Line and word boxes are parsed with their positions and content."""
        self._install_fixture()

        missing = self.core.call_success(
            "page_get_boxes_by_url", self._DOC_URL, self._PAGE_WITHOUT_FILE
        )
        self.assertIsNone(missing)

        boxes = self.core.call_success(
            "page_get_boxes_by_url", self._DOC_URL, self._PAGE_WITH_TEXT
        )
        # Fail with a readable message instead of a TypeError from list(None).
        self.assertIsNotNone(boxes)
        lines = list(boxes)

        self.assertEqual(len(lines), 1)
        line = lines[0]
        self.assertEqual(line.position, ((10, 20), (30, 40)))
        self.assertEqual(len(line.word_boxes), 2)

        first_word, second_word = line.word_boxes
        self.assertEqual(first_word.position, ((1, 2), (3, 4)))
        self.assertEqual(first_word.content, "ABC")
        self.assertEqual(second_word.position, ((5, 6), (7, 8)))
        self.assertEqual(second_word.content, "def")

    def test_get_text(self):
        """The page text contains both words, in order."""
        self._install_fixture()

        missing = self.core.call_success(
            "page_get_text_by_url", self._DOC_URL, self._PAGE_WITHOUT_FILE
        )
        self.assertIsNone(missing)

        txt = self.core.call_success(
            "page_get_text_by_url", self._DOC_URL, self._PAGE_WITH_TEXT
        )
        # Fail with a readable message instead of an AttributeError on None.
        self.assertIsNotNone(txt)

        # Only the words and their order matter here: flatten newlines,
        # collapse runs of spaces and drop the surrounding whitespace.
        txt = txt.replace("\n", " ")
        txt = re.sub(" +", " ", txt)
        txt = txt.strip()
        self.assertEqual(txt, "ABC def")

    def test_has_text(self):
        """page_has_text_by_url reports None, then True, then False."""
        self._install_fixture()

        self.assertIsNone(self.core.call_success(
            "page_has_text_by_url", self._DOC_URL, self._PAGE_WITHOUT_FILE
        ))
        # This assertion used to call "page_get_text_by_url", so the positive
        # case of "page_has_text_by_url" was never actually checked.
        self.assertTrue(self.core.call_success(
            "page_has_text_by_url", self._DOC_URL, self._PAGE_WITH_TEXT
        ))

        # Replace the page's boxes with an empty list: the page must then be
        # reported as having no text.
        self.core.call_all(
            "page_set_boxes_by_url", self._DOC_URL, self._PAGE_WITH_TEXT, []
        )
        self.assertFalse(self.core.call_success(
            "page_has_text_by_url", self._DOC_URL, self._PAGE_WITH_TEXT
        ))