File: cuneiform.py

package info (click to toggle)
python-pyocr 0.3.0-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 680 kB
  • ctags: 252
  • sloc: python: 1,235; sh: 40; makefile: 7
file content (153 lines) | stat: -rw-r--r-- 4,057 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python
'''
cuneiform.py is a wrapper for Cuneiform

USAGE:
 > from PIL import Image
 > from cuneiform import image_to_string
 > print(image_to_string(Image.open('test.png')))
 > print(image_to_string(Image.open('test-european.jpg'), lang='fra'))

COPYRIGHT:
PyOCR is released under the GPL v3.
Copyright (c) Samuel Hoffstaetter, 2009
Copyright (c) Jerome Flesch, 2011-2012
https://github.com/jflesch/python-tesseract#readme
'''

import codecs
from io import BytesIO
import os
import re
import subprocess
import tempfile

from . import builders
from . import util


# Name (or full path) of the Cuneiform executable.
# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
CUNEIFORM_CMD = 'cuneiform'

# Candidate locations of the Cuneiform data directory.
# NOTE(review): not referenced by the code visible in this module.
CUNEIFORM_DATA_POSSIBLE_PATHS = [
    "/usr/local/share/cuneiform",
    "/usr/share/cuneiform",
]

# Prefix of the line, in the output of "cuneiform -l", that lists the languages
LANGUAGES_LINE_PREFIX = "Supported languages: "
# Anything that is not a lowercase ASCII letter separates two language codes
LANGUAGES_SPLIT_RE = re.compile("[^a-z]")
# Raw string: "\w" and "\d" are not valid escapes in a regular string literal.
# The dots are escaped so that only a real "x.y.z" version number matches.
VERSION_LINE_RE = re.compile(r"Cuneiform for \w+ (\d+)\.(\d+)\.(\d+)")

# Names exported by "from <this module> import *".
# temp_file() and cleanup() are defined below but deliberately not listed.
__all__ = [
    'can_detect_orientation',
    'get_available_builders',
    'get_available_languages',
    'get_name',
    'get_version',
    'image_to_string',
    'is_available',
    'CuneiformError',
]


def can_detect_orientation():
    ''' Always returns False: this wrapper offers no orientation detection '''
    supported = False
    return supported


def get_name():
    ''' Returns the display name of this OCR tool '''
    tool_name = "Cuneiform"
    return tool_name


def get_available_builders():
    '''
    Returns the builder classes usable with this tool, as a new list:
    builders.TextBuilder first, then builders.WordBoxBuilder.
    '''
    supported = [builders.TextBuilder]
    supported.append(builders.WordBoxBuilder)
    return supported


class CuneiformError(Exception):
    '''
    Exception carrying a status code and a message.

    Attributes:
        status --- status given to the constructor
        message --- message given to the constructor
        args --- the tuple (status, message)
    '''

    def __init__(self, status, message):
        # Passing both values to the base class makes args == (status, message)
        super(CuneiformError, self).__init__(status, message)
        self.message = message
        self.status = status


def temp_file(suffix):
    '''
    Creates a named temporary file.

    Its name starts with "cuneiform_" and ends with the given suffix.
    Returns the object built by tempfile.NamedTemporaryFile().
    '''
    options = {'prefix': 'cuneiform_', 'suffix': suffix}
    return tempfile.NamedTemporaryFile(**options)


def cleanup(filename):
    '''
    Best-effort removal of the given file.

    Any OSError raised by os.remove() (for instance because the file does
    not exist) is silently ignored. Always returns None.
    '''
    try:
        os.remove(filename)
    except OSError:
        # Nothing to do: the removal is only attempted, never required
        return None
    return None


def image_to_string(image, lang=None, builder=None):
    '''
    Runs Cuneiform on an image and returns what the builder reads from its
    output file.

    Arguments:
        image --- image to analyze. It is converted to RGB and sent to
            Cuneiform as PNG data on stdin, so it must provide convert() and
            save() (the module docstring uses PIL images).
        lang --- language passed to Cuneiform with "-l". If None, the option
            is omitted.
        builder --- object providing file_extensions, cuneiform_args and
            read_file(). Defaults to builders.TextBuilder().

    Returns:
        Whatever builder.read_file() returns.

    Raises:
        CuneiformError --- if Cuneiform exits with a non-zero status. The
            exception carries the exit status and the combined
            stdout/stderr of the process.
    '''
    if builder is None:
        builder = builders.TextBuilder()

    # Cuneiform writes its result in this file; it is deleted when the
    # 'with' block is left.
    with temp_file(builder.file_extensions[0]) as output_file:
        cmd = [CUNEIFORM_CMD]
        if lang is not None:
            cmd += ["-l", lang]
        cmd += builder.cuneiform_args
        cmd += ["-o", output_file.name]
        cmd += ["-"]  # stdin

        img_data = BytesIO()
        image = image.convert("RGB")
        image.save(img_data, format="png")

        proc = subprocess.Popen(cmd,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        # communicate() feeds stdin and drains stdout at the same time, then
        # closes the pipes and waits for the process. Writing the whole image
        # before reading anything could deadlock once the pipe buffers fill.
        raw_output = proc.communicate(img_data.getvalue())[0]
        retcode = proc.returncode
        if retcode:
            # Diagnostics only: never let an undecodable byte hide the error
            output = raw_output.decode('utf-8', errors='replace')
            raise CuneiformError(retcode, output)
        with codecs.open(output_file.name, 'r', encoding='utf-8',
                         errors='replace') as file_desc:
            results = builder.read_file(file_desc)
        return results


def is_available():
    '''
    Returns the result of util.is_on_path() for the configured Cuneiform
    command (CUNEIFORM_CMD).
    '''
    found = util.is_on_path(CUNEIFORM_CMD)
    return found


def get_available_languages():
    '''
    Returns the languages listed by "cuneiform -l".

    Only the output lines starting with LANGUAGES_LINE_PREFIX are used. The
    rest of each such line is split with LANGUAGES_SPLIT_RE and the empty
    pieces are dropped.

    Returns:
        A list of strings, in the order they appear in the output. The list
        is empty if no line has the expected prefix.
    '''
    proc = subprocess.Popen([CUNEIFORM_CMD, "-l"], stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    # communicate() reads everything, closes the pipe and waits for the
    # process, so no file descriptor is left open.
    raw_output = proc.communicate()[0]
    # Tolerant decoding: a stray non-UTF-8 byte must not abort the listing
    output = raw_output.decode('utf-8', errors='replace')

    languages = []
    for line in output.split("\n"):
        if not line.startswith(LANGUAGES_LINE_PREFIX):
            continue
        names = line[len(LANGUAGES_LINE_PREFIX):]
        languages.extend(
            name for name in LANGUAGES_SPLIT_RE.split(names) if name != ""
        )
    return languages


def get_version():
    '''
    Returns the version of Cuneiform, read from the output of the command
    when it is run without arguments.

    Returns:
        A tuple of 3 integers taken from the first output line matching
        VERSION_LINE_RE, or None if no line matches.
    '''
    proc = subprocess.Popen([CUNEIFORM_CMD], stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    # communicate() reads everything, closes the pipe and waits for the
    # process, so no file descriptor is left open.
    raw_output = proc.communicate()[0]
    # Tolerant decoding: a stray non-UTF-8 byte must not abort the lookup
    output = raw_output.decode('utf-8', errors='replace')

    for line in output.split("\n"):
        m = VERSION_LINE_RE.match(line)
        if m is None:
            # Not the version line. The match must be checked before
            # groups() is called, otherwise this raises AttributeError.
            continue
        g = m.groups()
        return (int(g[0]), int(g[1]), int(g[2]))
    return None