1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
# -*- coding: utf-8 -*-
###########################################################################
# OCRFeeder - The complete OCR suite
# Copyright (C) 2009 Joaquim Rocha
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
###########################################################################
import string
import tempfile
import os
import xml.etree.ElementTree as ET
from xml.parsers.expat import ExpatError
from ocrfeeder.studio.dataHolder import TEXT_TYPE, IMAGE_TYPE
from ocrfeeder.util import lib
# Placeholder in an engine's argument string that Engine.read() replaces
# with the path of the image file to be processed.
IMAGE_ARGUMENT = '$IMAGE'
# Placeholder in an engine's argument string that Engine.read() replaces
# with the path of a temporary file created for that run.
FILE_ARGUMENT = '$FILE'
class Engine:
    """Wrapper around an external OCR program run through the shell.

    An engine is described by a display name, the path of its executable
    and an argument template. The template may contain the placeholders
    IMAGE_ARGUMENT and FILE_ARGUMENT, which read() substitutes before
    running the command.
    """

    def __init__(self, name, engine_path, arguments, image = None, temporary_folder = '/tmp/', image_format = 'PPM', failure_string = ''):
        """Validate and store the engine settings.

        name -- engine name; must not be empty.
        engine_path -- path of the engine executable; must be an existing file.
        arguments -- argument template passed to the executable.
        image -- optional image handed to setImage() right away.
        temporary_folder -- folder where read() creates its temporary file.
        image_format -- format name used when saving the image to disk.
        failure_string -- text the engine emits for content it cannot
                          recognize (may be empty).

        Raises WrongSettingsForEngine if the name is empty or the engine
        path is not an existing file.
        """
        self.name = name
        self.engine_path = engine_path
        self.arguments = arguments
        if not self.name:
            raise WrongSettingsForEngine("The engine's name cannot be empty!")
        if not self.engine_path or not os.path.isfile(self.engine_path):
            raise WrongSettingsForEngine("The engine's path must exist! Path: %s" % self.engine_path)
        # setImage() reads self.image_format, so every setting has to be in
        # place before an initial image is processed.
        self.image_format = image_format
        self.failure_string = failure_string
        self.temporary_folder = temporary_folder
        self.__color_information = None
        self.image_path = None
        if image is not None:
            # setImage() stores the path itself and returns nothing, so its
            # result must not be assigned to image_path.
            self.setImage(image)

    def setImage(self, image):
        """Save `image` to a temporary file and remember it in image_path.

        Only image.convert() and image.save() are used; presumably `image`
        is a PIL image -- confirm against the callers.
        """
        descriptor, image_file = tempfile.mkstemp(suffix = '.' + self.image_format.lower())
        # mkstemp() hands back an open descriptor; only the name is needed.
        os.close(descriptor)
        converted = image.convert('L')
        try:
            converted.save(image_file, format = self.image_format)
        except KeyError:
            # Format name not accepted: retry without an explicit format.
            converted.save(image_file)
        self.image_path = image_file

    def read(self):
        """Run the engine on the current image and return what it printed.

        The image file is deleted afterwards, whether or not the command
        succeeded. Byte output is decoded as UTF-8, with undecodable bytes
        replaced.
        """
        parsed_arguments = self.arguments.replace(IMAGE_ARGUMENT, self.image_path)
        file_name = None
        if FILE_ARGUMENT in self.arguments:
            descriptor, file_name = tempfile.mkstemp(dir = self.temporary_folder)
            os.close(descriptor)
            parsed_arguments = parsed_arguments.replace(FILE_ARGUMENT, file_name)
        try:
            # The command goes through the shell on purpose: the argument
            # template is a free-form string that is not parsed here.
            pipe = os.popen(self.engine_path + ' ' + parsed_arguments)
            try:
                text = pipe.read()
            finally:
                pipe.close()
        finally:
            if file_name is not None:
                try:
                    os.unlink(file_name)
                except OSError:
                    # The command may already have removed the file itself.
                    pass
            os.unlink(self.image_path)
        if isinstance(text, bytes):
            # With the 'replace' handler decoding cannot fail, so no
            # fallback encoding is required.
            text = text.decode('utf-8', 'replace')
        return text

    def classify(self, reading_output, rules = None):
        """Return IMAGE_TYPE or TEXT_TYPE for the given engine output.

        The output counts as an image when it is empty after stripping, when
        the failure string occurs more often than half the number of
        characters, or when the punctuation test succeeds. `rules` is
        accepted but currently unused.
        """
        stripped_output = reading_output.strip()
        if not stripped_output:
            return IMAGE_TYPE
        # Floor division keeps the integer threshold on every Python version.
        if self.failure_string and stripped_output.count(self.failure_string) > len(stripped_output) // 2:
            return IMAGE_TYPE
        if self.__punctuationTest(stripped_output):
            return IMAGE_TYPE
        # TODO: the original author left a note that this method is unfinished.
        return TEXT_TYPE

    def __punctuationTest(self, output):
        """Tell whether less than half of `output` is left once punctuation,
        the failure string and spaces are removed."""
        no_punctuation_output = output
        for char in string.punctuation:
            no_punctuation_output = no_punctuation_output.replace(char, '')
        if self.failure_string:
            # An unset (None or empty) failure string has nothing to remove.
            no_punctuation_output = no_punctuation_output.replace(self.failure_string, '')
        no_punctuation_output = no_punctuation_output.replace(' ', '')
        return len(no_punctuation_output) < len(output) // 2

    def __is_not_greyscale(self, image):
        """Return False as soon as a colour entry has a first channel that
        exceeds its second or third channel by more than 10, True otherwise.

        NOTE(review): this helper is not called anywhere in this class, and
        the result looks inverted with respect to its name -- confirm the
        intent before using it. `get_colors` is also not the PIL spelling
        (`getcolors`); verify which image type is expected.
        """
        colors = image.get_colors()
        if colors:
            for color in colors:
                channels = color[1]
                if (channels[0] - channels[1]) > 10 or (channels[0] - channels[2]) > 10:
                    return False
        return True

    def saveToXml(self, file_path):
        """Write the engine settings to `file_path` as an XML document.

        Each non-empty setting becomes a child element of the <engine> root.
        """
        engine_info = (('name', self.name),
                       ('engine_path', self.engine_path),
                       ('arguments', self.arguments),
                       ('image_format', self.image_format),
                       ('failure_string', self.failure_string))
        text_types = (str, type(u''))
        root = ET.Element('engine')
        for key, value in engine_info:
            if not value:
                continue
            subelement = ET.SubElement(root, key)
            # Text is stored untouched: str() on a non-ASCII unicode value
            # raises UnicodeEncodeError under Python 2.
            if isinstance(value, text_types):
                subelement.text = value
            else:
                subelement.text = str(value)
        return ET.ElementTree(root).write(file_path, 'UTF-8')
class OcrEnginesManager:
    """Keeps the list of known OCR engines and their XML definition files.

    self.ocr_engines holds (engine, xml_path) tuples.
    """

    def __init__(self, configuration_manager):
        """Start with no engines; `configuration_manager` supplies
        `user_engines_folder` when a new engine has to be saved."""
        self.ocr_engines = []
        self.configuration_manager = configuration_manager

    def getEnginesNames(self):
        """Return the names of all managed engines, in list order."""
        return [engine.name for engine, path in self.ocr_engines]

    def getEnginePath(self, engine):
        """Return the XML path stored for `engine`, or None if unknown."""
        for eng, path in self.ocr_engines:
            if eng == engine:
                return path
        return None

    def replaceEngine(self, engine, new_engine):
        """Replace `engine` by `new_engine`, rewriting its XML file.

        Returns True when `engine` was found and replaced, False otherwise.
        """
        for index, (eng, path) in enumerate(self.ocr_engines):
            if eng == engine:
                new_path = self.engineToXml(new_engine, path)
                # Keep whatever path the definition was actually written to.
                self.ocr_engines[index] = (new_engine, new_path)
                return True
        return False

    def makeEnginesFromFolder(self, folder):
        """Reload the engine list from every .xml file in `folder`.

        Files that cannot be turned into an engine are reported through
        lib.debug() and skipped, so one bad file does not prevent the
        remaining engines from loading.
        """
        self.ocr_engines = []
        # ElementTree raises ParseError where available, ExpatError otherwise.
        loading_errors = (WrongSettingsForEngine, ExpatError,
                          getattr(ET, 'ParseError', ExpatError), TypeError)
        for xml_file in self.getXmlFilesInFolder(folder):
            try:
                engine = self.getEngineFromXml(xml_file)
            except loading_errors as error:
                # TypeError covers definitions with unexpected or missing
                # settings, which Engine(**arguments) rejects.
                lib.debug("Cannot load engine at %s: %s" %( xml_file, str(error)))
                continue
            self.ocr_engines.append((engine, xml_file))
        if not self.ocr_engines:
            lib.debug("Warning: no engines found!")

    def getEngineFromXml(self, xml_file_name):
        """Build an Engine from an XML file.

        Every child element of the root becomes a keyword argument of the
        Engine constructor (tag -> text).
        """
        root_node = ET.parse(xml_file_name).getroot()
        # Iterating the element yields its children; getchildren() is gone
        # from recent Python versions.
        arguments = dict((child.tag, child.text) for child in root_node)
        return Engine(**arguments)

    def getXmlFilesInFolder(self, folder):
        """Return the paths of the entries in `folder` ending with '.xml'."""
        return [os.path.join(folder, entry) for entry in os.listdir(folder) if entry.endswith('.xml')]

    def newEngine(self, name, engine_path, arguments, image_format, failure_string):
        """Create an Engine from the given settings without registering it."""
        engine = Engine(name = name, engine_path = engine_path, arguments = arguments, image_format = image_format, failure_string = failure_string)
        return engine

    def delete(self, index):
        """Delete the XML file of the engine at `index` and drop the engine."""
        path = self.ocr_engines[index][1]
        os.remove(path)
        del self.ocr_engines[index]

    def addNewEngine(self, engine):
        """Save `engine` to a new XML file and add it to the list."""
        path = self.engineToXml(engine)
        self.ocr_engines.append((engine,path))

    def engineToXml(self, engine, path = None):
        """Write `engine` to `path` and return the path used.

        Without a path, a not yet existing file name based on the engine's
        name is picked inside the user's engines folder.
        """
        if not path:
            path = os.path.join(self.configuration_manager.user_engines_folder, engine.name + '.xml')
            path = lib.getNonExistingFileName(path)
        engine.saveToXml(path)
        return path
class WrongSettingsForEngine(Exception):
    """Error raised when an OCR engine is given unusable settings."""

    def __init__(self, message):
        # Hand the description to Exception so it shows up in args and str().
        Exception.__init__(self, message)
|