1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
|
#!/usr/bin/env python
'''
cuneiform.py is a wrapper for Cuneiform
USAGE:
> from PIL import Image
> from cuneiform import image_to_string
> print(image_to_string(Image.open('test.png')))
> print(image_to_string(Image.open('test-european.jpg'), lang='fra'))
COPYRIGHT:
PyOCR is released under the GPL v3.
Copyright (c) Samuel Hoffstaetter, 2009
Copyright (c) Jerome Flesch, 2011-2012
https://github.com/jflesch/python-tesseract#readme
'''
import codecs
from io import BytesIO
import os
import re
import subprocess
import tempfile
from . import builders
from . import util
# Name (or full path) of the Cuneiform executable started by this module.
# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
CUNEIFORM_CMD = 'cuneiform'

# NOTE(review): presumably the directories where Cuneiform's data files may
# be installed; nothing in the visible part of this module reads this list.
CUNEIFORM_DATA_POSSIBLE_PATHS = [
    "/usr/local/share/cuneiform",
    "/usr/share/cuneiform",
]

# Start of the line, in the output of "cuneiform -l", that carries the
# language list.
LANGUAGES_LINE_PREFIX = "Supported languages: "

# Anything that is not a lowercase ASCII letter separates two language codes.
LANGUAGES_SPLIT_RE = re.compile("[^a-z]")

# Raw string so that "\w" and "\d" are not treated as (invalid) string
# escapes, and escaped dots so that only a literal "." separates the three
# version numbers.
VERSION_LINE_RE = re.compile(r"Cuneiform for \w+ (\d+)\.(\d+)\.(\d+)")
# Names exported by "from <this module> import *". Every entry is defined
# in this module.
__all__ = [
'can_detect_orientation',
'get_available_builders',
'get_available_languages',
'get_name',
'get_version',
'image_to_string',
'is_available',
'CuneiformError',
]
def can_detect_orientation():
    '''
    Tells whether this wrapper offers orientation detection.

    Returns:
        Always False.
    '''
    orientation_supported = False
    return orientation_supported
def get_name():
    '''
    Gives the display name of the tool wrapped by this module.

    Returns:
        The string "Cuneiform".
    '''
    tool_name = "Cuneiform"
    return tool_name
def get_available_builders():
    '''
    Lists the builder classes advertised by this module.

    Returns:
        A new list holding builders.TextBuilder and builders.WordBoxBuilder,
        in that order.
    '''
    available = []
    available.append(builders.TextBuilder)
    available.append(builders.WordBoxBuilder)
    return available
class CuneiformError(Exception):
    '''
    Error carrying a status code and a message.

    image_to_string() raises it with the return code of the Cuneiform
    process and the output captured from it.
    '''

    def __init__(self, status, message):
        '''
        Arguments:
            status --- status code stored as self.status
            message --- text stored as self.message
        '''
        super(CuneiformError, self).__init__(message)
        self.status, self.message = status, message
        # Replaces the args set by the base initializer so that both the
        # status and the message are exposed.
        self.args = (status, message)
def temp_file(suffix):
    '''
    Creates a named temporary file for this module.

    Arguments:
        suffix --- suffix given to the file name

    Returns:
        The tempfile.NamedTemporaryFile object; its name starts with
        "cuneiform_" and ends with the given suffix.
    '''
    options = {'prefix': 'cuneiform_', 'suffix': suffix}
    return tempfile.NamedTemporaryFile(**options)
def cleanup(filename):
    '''
    Best-effort removal of the given file.

    Any OSError raised by the removal (for instance because the file does
    not exist) is silently ignored.

    Arguments:
        filename --- path of the file to delete
    '''
    try:
        os.unlink(filename)
    except OSError:
        return
def image_to_string(image, lang=None, builder=None):
    '''
    Runs Cuneiform on an image and returns what the builder reads back.

    The image is converted to RGB, encoded as PNG and sent to the Cuneiform
    process on its standard input. Cuneiform writes its result to a
    temporary file, which is then handed to builder.read_file().

    Arguments:
        image --- object providing convert() and save(), e.g. a PIL Image
            as in the module usage example
        lang --- language passed to Cuneiform with "-l"; None leaves the
            option out
        builder --- object providing file_extensions, cuneiform_args and
            read_file(); defaults to builders.TextBuilder()

    Returns:
        Whatever builder.read_file() returns.

    Raises:
        CuneiformError --- if Cuneiform exits with a non-zero status; the
            error carries the status and the captured output
    '''
    if builder is None:
        builder = builders.TextBuilder()

    with temp_file(builder.file_extensions[0]) as output_file:
        cmd = [CUNEIFORM_CMD]
        if lang is not None:
            cmd += ["-l", lang]
        cmd += builder.cuneiform_args
        cmd += ["-o", output_file.name]
        cmd += ["-"]  # stdin

        img_data = BytesIO()
        image.convert("RGB").save(img_data, format="png")

        proc = subprocess.Popen(cmd,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        # communicate() feeds stdin and drains stdout at the same time, so
        # neither side can block on a full pipe buffer; it also closes the
        # pipes and waits for the process to exit.
        raw_output, _ = proc.communicate(img_data.getvalue())
        if proc.returncode:
            # Tolerant decoding: undecodable bytes in the diagnostics must
            # not hide the actual failure behind a UnicodeDecodeError.
            raise CuneiformError(proc.returncode,
                                 raw_output.decode('utf-8', 'replace'))

        with codecs.open(output_file.name, 'r', encoding='utf-8',
                         errors='replace') as file_desc:
            return builder.read_file(file_desc)
def is_available():
    '''
    Tells whether the Cuneiform command can be found.

    Returns:
        The result of util.is_on_path() for CUNEIFORM_CMD.
    '''
    found = util.is_on_path(CUNEIFORM_CMD)
    return found
def get_available_languages():
    '''
    Asks Cuneiform which languages it supports.

    Runs CUNEIFORM_CMD with "-l" and parses every output line starting
    with LANGUAGES_LINE_PREFIX.

    Returns:
        A list of language codes, in the order they appear in the output;
        empty if no matching line is found.
    '''
    proc = subprocess.Popen([CUNEIFORM_CMD, "-l"], stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    # communicate() reads everything, waits for the process and closes the
    # pipe, so no file descriptor is left open behind us.
    raw_output = proc.communicate()[0]
    # Language codes are plain ASCII; undecodable bytes elsewhere in the
    # output should not make the whole call fail.
    output = raw_output.decode('utf-8', 'replace')

    languages = []
    for line in output.split("\n"):
        if not line.startswith(LANGUAGES_LINE_PREFIX):
            continue
        codes = LANGUAGES_SPLIT_RE.split(line[len(LANGUAGES_LINE_PREFIX):])
        # Consecutive separators yield empty strings: drop them.
        languages.extend(code for code in codes if code != "")
    return languages
def get_version():
    '''
    Asks Cuneiform for its version.

    Runs CUNEIFORM_CMD without arguments and looks for the first output
    line matching VERSION_LINE_RE.

    Returns:
        A tuple of three ints built from the three groups captured by
        VERSION_LINE_RE, or None if no output line matches.
    '''
    proc = subprocess.Popen([CUNEIFORM_CMD], stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    # communicate() reads everything, waits for the process and closes the
    # pipe.
    output = proc.communicate()[0].decode('utf-8', 'replace')

    for line in output.split("\n"):
        m = VERSION_LINE_RE.match(line)
        # Most lines do not match; the match object must be checked before
        # its groups are accessed.
        if m is None:
            continue
        g = m.groups()
        return (int(g[0]), int(g[1]), int(g[2]))
    return None
|