File: ocrEngines.py

package info (click to toggle)
ocrfeeder 0.6.6+dfsg1-1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 1,616 kB
  • ctags: 2,088
  • sloc: python: 16,603; makefile: 52
file content (190 lines) | stat: -rw-r--r-- 7,560 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# -*- coding: utf-8 -*-

###########################################################################
#    OCRFeeder - The complete OCR suite
#    Copyright (C) 2009 Joaquim Rocha
# 
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
###########################################################################

import string

import tempfile
import os
import xml.etree.ElementTree as ET
from xml.parsers.expat import ExpatError
from ocrfeeder.studio.dataHolder import TEXT_TYPE, IMAGE_TYPE
from ocrfeeder.util import lib
# Placeholder in an engine's argument string: Engine.read() replaces it with
# the path of the image file that should be recognized.
IMAGE_ARGUMENT = '$IMAGE'
# Placeholder in an engine's argument string: when present, Engine.read()
# replaces it with the path of a temporary file created in the engine's
# temporary folder.
FILE_ARGUMENT = '$FILE'

class Engine:
    """Wrapper around an external, command-line OCR engine.

    An engine is described by the path of its executable and an argument
    string. The argument string may contain the IMAGE_ARGUMENT and
    FILE_ARGUMENT placeholders, which read() substitutes with real paths
    before running the command.
    """

    def __init__(self, name, engine_path, arguments, image = None, temporary_folder = '/tmp/', image_format = 'PPM', failure_string = ''):
        """Validate and store the engine settings.

        name -- the engine's name; must not be empty.
        engine_path -- path of the engine's executable; must be an existing
            file.
        arguments -- argument string appended to engine_path by read().
        image -- optional image object (it must offer convert() and save());
            when given it is stored right away through setImage().
        temporary_folder -- folder where read() creates the temporary file
            that replaces FILE_ARGUMENT.
        image_format -- format name used when saving the image; its lower
            case form is also used as the temporary image's file extension.
        failure_string -- text the engine outputs for what it could not
            recognize; used by classify().

        Raises WrongSettingsForEngine if the name is empty or the engine's
        path is not an existing file.
        """
        self.name = name
        self.engine_path = engine_path
        self.arguments = arguments
        if not self.name:
            raise WrongSettingsForEngine("The engine's name cannot be empty!")
        if not self.engine_path or not os.path.isfile(self.engine_path):
            raise WrongSettingsForEngine("The engine's path must exist! Path: %s" % self.engine_path)
        # These must be assigned before setImage() is called because
        # setImage() reads self.image_format.
        self.image_format = image_format
        self.failure_string = failure_string
        self.temporary_folder = temporary_folder
        self.__color_information = None
        self.image_path = None
        if image is not None:
            # setImage() stores the path itself and returns nothing, so its
            # result must not be assigned to self.image_path.
            self.setImage(image)

    def setImage(self, image):
        """Save a greyscale copy of image to a temporary file.

        The path of that file is stored in self.image_path; read() deletes
        the file once the engine has run.
        """
        descriptor, image_file = tempfile.mkstemp(suffix = '.' + self.image_format.lower())
        # mkstemp() hands back an open descriptor; only the path is needed.
        os.close(descriptor)
        image = image.convert('L')
        try:
            image.save(image_file, format = self.image_format)
        except KeyError:
            # The format name was not accepted: save again without it.
            image.save(image_file)
        self.image_path = image_file

    def read(self):
        """Run the engine on the current image and return what it printed.

        The command line is engine_path followed by the arguments, with the
        placeholders replaced, and it is executed with os.popen() (that is,
        through the shell). The temporary image file is removed afterwards,
        even if running the command fails. Byte output is decoded as UTF-8,
        replacing the bytes that cannot be decoded.
        """
        parsed_arguments = self.arguments.replace(IMAGE_ARGUMENT, self.image_path)
        if FILE_ARGUMENT in self.arguments:
            descriptor, file_name = tempfile.mkstemp(dir = self.temporary_folder)
            os.close(descriptor)
            # The file itself is left in place for the engine's command;
            # this method never removes it.
            parsed_arguments = parsed_arguments.replace(FILE_ARGUMENT, file_name)
        try:
            pipe = os.popen(self.engine_path + ' ' + parsed_arguments)
            try:
                text = pipe.read()
            finally:
                pipe.close()
        finally:
            os.unlink(self.image_path)
        # Decoding with 'replace' cannot raise UnicodeDecodeError, so no
        # fallback is needed. Output that is already text is left untouched.
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'replace')
        return text

    def classify(self, reading_output, rules = None):
        """Tell whether an engine's output looks like text or like an image.

        Returns IMAGE_TYPE when the output is empty, when the failure string
        occurs more often than half of the output's length, or when the
        punctuation test fails; returns TEXT_TYPE otherwise.
        """
        # TODO: `rules` is accepted but not used yet.
        stripped_output = reading_output.strip()
        if not stripped_output:
            return IMAGE_TYPE
        # `//` keeps the integer division this code was written for.
        if self.failure_string and stripped_output.count(self.failure_string) > len(stripped_output) // 2:
            return IMAGE_TYPE
        if self.__punctuationTest(stripped_output):
            return IMAGE_TYPE
        return TEXT_TYPE

    def __punctuationTest(self, output):
        """Return True if less than half of output is meaningful characters.

        Failure strings, punctuation and spaces are not counted as
        meaningful.
        """
        meaningful = output
        # Remove the failure string first: if it contains punctuation it
        # would no longer be found once the punctuation is stripped.
        if self.failure_string:
            meaningful = meaningful.replace(self.failure_string, '')
        for char in string.punctuation:
            meaningful = meaningful.replace(char, '')
        meaningful = meaningful.replace(' ', '')
        return len(meaningful) < len(output) // 2

    def __is_not_greyscale(self, image):
        """Check the colors of image for channels that differ by more than 10.

        Returns False as soon as a color is found whose first channel
        exceeds its second or third channel by more than 10, True otherwise.
        """
        # NOTE(review): the return values look inverted with regard to this
        # method's name, and nothing in this class calls it. PIL images
        # offer getcolors() rather than get_colors() -- confirm which type
        # `image` is expected to be before relying on this method.
        colors = image.get_colors()
        if colors:
            for color in colors:
                if ((color[1])[0] - (color[1])[1])>10 or ((color[1])[0] - (color[1])[2])>10:
                    return False
        return True

    def saveToXml(self, file_path):
        """Write the engine's settings to file_path as an XML document.

        The root element is <engine> and every non-empty setting becomes a
        child element named after the setting.
        """
        engine_info = (('name', self.name),
                       ('engine_path', self.engine_path),
                       ('arguments', self.arguments),
                       ('image_format', self.image_format),
                       ('failure_string', self.failure_string))
        root = ET.Element('engine')
        for key, value in engine_info:
            # Empty settings are left out of the file.
            if not value:
                continue
            subelement = ET.SubElement(root, key)
            subelement.text = str(value)
        return ET.ElementTree(root).write(file_path, 'UTF-8')

class OcrEnginesManager:
    """Keeps the list of available OCR engines and their XML files.

    self.ocr_engines is a list of (engine, xml_path) tuples.
    """

    def __init__(self, configuration_manager):
        """Start with no engines.

        configuration_manager -- object whose user_engines_folder attribute
            is the folder where new engines' XML files are written.
        """
        self.ocr_engines = []
        self.configuration_manager = configuration_manager

    def getEnginesNames(self):
        """Return the names of the managed engines, in list order."""
        return [engine.name for engine, path in self.ocr_engines]

    def getEnginePath(self, engine):
        """Return the XML path stored for engine, or None if it is unknown."""
        for eng, path in self.ocr_engines:
            if eng == engine:
                return path
        return None

    def replaceEngine(self, engine, new_engine):
        """Replace engine with new_engine, reusing the same XML file.

        new_engine's settings are written over engine's XML file. Returns
        True if engine was found and replaced, False otherwise.
        """
        for index, (eng, path) in enumerate(self.ocr_engines):
            if eng == engine:
                self.engineToXml(new_engine, path)
                self.ocr_engines[index] = new_engine, path
                return True
        return False

    def makeEnginesFromFolder(self, folder):
        """Reload the engines list from the XML files found in folder.

        Files that hold wrong settings or that cannot be parsed are logged
        and skipped, so one bad file does not prevent the others from being
        loaded.
        """
        # ElementTree reports malformed XML with ExpatError or, where it
        # exists, with its own ParseError.
        load_errors = (WrongSettingsForEngine, ExpatError,
                       getattr(ET, 'ParseError', ExpatError))
        self.ocr_engines = []
        for xml_file in self.getXmlFilesInFolder(folder):
            try:
                engine = self.getEngineFromXml(xml_file)
            except load_errors as error:
                lib.debug("Cannot load engine at %s: %s" %( xml_file, str(error)))
            else:
                self.ocr_engines.append((engine, xml_file))
        if not self.ocr_engines:
            lib.debug("Warning: no engines found!")

    def getEngineFromXml(self, xml_file_name):
        """Build an Engine from an XML file.

        Each child of the root element is passed to Engine's constructor as
        a keyword argument: the tag is the argument's name and the text is
        its value.
        """
        document = ET.parse(xml_file_name)
        root_node = document.getroot()
        # Iterating an element yields its children.
        arguments = dict((child.tag, child.text) for child in root_node)
        return Engine(**arguments)

    def getXmlFilesInFolder(self, folder):
        """Return the paths of the files in folder whose names end in .xml."""
        return [os.path.join(folder, file_name)
                for file_name in os.listdir(folder)
                if file_name.endswith('.xml')]

    def newEngine(self, name, engine_path, arguments, image_format, failure_string):
        """Create an Engine from the given settings, without registering it."""
        return Engine(name = name, engine_path = engine_path, arguments = arguments, image_format = image_format, failure_string = failure_string)

    def delete(self, index):
        """Delete the XML file of the engine at index and forget the engine."""
        path = self.ocr_engines[index][1]
        os.remove(path)
        del self.ocr_engines[index]

    def addNewEngine(self, engine):
        """Save engine to a new XML file and add it to the engines list."""
        path = self.engineToXml(engine)
        self.ocr_engines.append((engine,path))

    def engineToXml(self, engine, path = None):
        """Save engine's settings as XML and return the path that was used.

        When no path is given, the file is named after the engine and placed
        in the configuration manager's user engines folder.
        """
        if not path:
            path = os.path.join(self.configuration_manager.user_engines_folder, engine.name + '.xml')
            # presumably picks a variant of the name that is not taken yet;
            # verify against lib.getNonExistingFileName
            path = lib.getNonExistingFileName(path)
        engine.saveToXml(path)
        return path

class WrongSettingsForEngine(Exception):
    """Raised when an engine is configured with settings that cannot work."""

    def __init__(self, message):
        # The message is mandatory; it becomes the exception's only argument.
        Exception.__init__(self, message)