1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
|
import re
import unittest
import openpaperwork_core
TEST_XML = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<title>OCR output</title>
</head>
<body>
<p>
<span class="ocr_line" title="bbox 10 20 30 40">
<span class="ocrx_word" title="bbox 1 2 3 4">ABC</span>
<span class="ocrx_word" title="bbox 5 6 7 8">def</span>
</span>
</p>
</body>
</html>
"""
class TestHocr(unittest.TestCase):
def setUp(self):
self.core = openpaperwork_core.Core(auto_load_dependencies=True)
self.core.load("openpaperwork_core.fs.fake")
self.core.load("paperwork_backend.model.hocr")
self.core.init()
self.fs = self.core.get_by_name("openpaperwork_core.fs.fake")
def test_get_boxes(self):
self.fs.fs = {
"some_doc": {
"paper.4.words": TEST_XML,
},
}
lines = self.core.call_success(
"page_get_boxes_by_url", "file:///some_doc", 1
)
self.assertIsNone(lines)
lines = list(
self.core.call_success(
"page_get_boxes_by_url", "file:///some_doc", 3
)
)
self.assertEqual(len(lines), 1)
self.assertEqual(lines[0].position, ((10, 20), (30, 40)))
self.assertEqual(len(lines[0].word_boxes), 2)
self.assertEqual(lines[0].word_boxes[0].position, ((1, 2), (3, 4)))
self.assertEqual(lines[0].word_boxes[0].content, "ABC")
self.assertEqual(lines[0].word_boxes[1].position, ((5, 6), (7, 8)))
self.assertEqual(lines[0].word_boxes[1].content, "def")
def test_get_text(self):
self.fs.fs = {
"some_doc": {
"paper.4.words": TEST_XML,
},
}
lines = self.core.call_success(
"page_get_text_by_url", "file:///some_doc", 1
)
self.assertIsNone(lines)
txt = self.core.call_success(
"page_get_text_by_url", "file:///some_doc", 3
).replace("\n", " ")
txt = re.sub(" +", " ", txt)
txt = txt.strip()
self.assertEqual(txt, "ABC def")
def test_has_text(self):
self.fs.fs = {
"some_doc": {
"paper.4.words": TEST_XML,
},
}
self.assertIsNone(self.core.call_success(
"page_has_text_by_url", "file:///some_doc", 1
))
self.assertTrue(self.core.call_success(
"page_get_text_by_url", "file:///some_doc", 3
))
self.core.call_all("page_set_boxes_by_url", "file:///some_doc", 3, [])
self.assertFalse(self.core.call_success(
"page_has_text_by_url", "file:///some_doc", 3
))
|