File: classify.py

package info (click to toggle)
gamera 1:3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster, sid
  • size: 15,912 kB
  • sloc: xml: 122,324; cpp: 50,730; python: 35,044; ansic: 258; makefile: 114; sh: 101
file content (883 lines) | stat: -rw-r--r-- 33,810 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
#
# Copyright (C) 2001-2005 Ichiro Fujinaga, Michael Droettboom, Karl MacMillan
#               2009-2012 Christoph Dalitz
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#

import sys
import core # grab all of the standard gamera modules
import util, gamera_xml, config
from fudge import Fudge
from gamera.gui import has_gui
from gamera.gameracore import CONFIDENCE_DEFAULT

"""This file defines the Python part of classifiers.  These wrapper classes
contain a reference to a core classifier class (unusally written in C++).
They add functionality for XML loading/saving, splitting/grouping, and
keeping track of a database of glyphs (in the Interactive case.)"""

class ClassifierError(Exception):
   pass

class _Classifier:
   """The base class for both the interactive and noninteractive classifier."""

   ########################################
   # INFORMATION
   def get_name(self):
      return self.__class__.__name__

   def is_dirty(self):
      return self._database.is_dirty and len(self._database)
   def _set_is_dirty(self, val):
      self._database.is_dirty = val
   is_dirty = property(is_dirty, _set_is_dirty)

   def get_database(self):
      return self._database
   def set_database(self, other):
      raise RuntimeError("You can't change the database.  This exception should never be raised.  Please report it to the authors.")
   database = property(get_database, set_database)

   def __len__(self):
      return len(self._database)

   ########################################
   # GROUPING
   def group_list_automatic(self, glyphs, grouping_function=None,
                            evaluate_function=None, max_parts_per_group=4,
                            max_graph_size=16, criterion="min"):
      """**group_list_automatic** (ImageList *glyphs*, Function
*grouping_function* = ``None``, Function *evaluate_function* = ``None``,
int *max_parts_per_group* = 4, int *max_graph_size* = 16, *criterion* = ``'min'``)

Classifies the given list of glyphs.  Adjacent glyphs are joined
together if doing so results in a higher global confidence.  Each part
of a joined glyph is classified as HEURISTIC with the prefix
``_group``.

*glyphs*
  The list of glyphs to group and classify.

*grouping_function*
  A function that determines how glyphs are initially
  combined.  This function must take exactly two arguments, which the
  grouping algorithm will pass an arbitrary pair of glyphs from
  *glyphs*.  If the two glyphs should be considered for grouping, the
  function should return ``True``, else ``False``.  

  In ``gamera.classify``, there are two predefined grouping functions:

  **BoundingBoxGroupingFunction** (*threshold*)
      A callable class that returns ``True`` when the bounding boxes
      are at most *threshold* apart.

  **ShapedGroupingFunction** (*threshold*)
      A callable class that returns ``True`` when the closest distance
      between the black pixels is at most *threshold*.

  When *grouping_function* is ``None``, *BoundingBoxGroupingFunction(4)*
  is used.

*evaluate_function*
   A function that evaluates a grouping of glyphs.  This function must
   take exactly one argument which is a list of glyphs.  The function
   should return a confidence value between 0 and 1 (1 being most
   confident) representing how confidently the grouping forms a valid
   character.

   If no *evaluate_function* is provided, a default one will be used
   that returns the CONFIDENCE_DEFAULT of the knn classification.

*max_parts_per_group*
   The maximum number of connected components that will be grouped
   together and tested as a group.  For performance reasons, this
   number should be kept relatively small.

*max_graph_size*
   Subgraphs (potentially connected areas of the image) larger than
   the given number of nodes will be ignored.  This is a hack to
   prevent the runtime of the algorithm from exploding.

*criterion*
   Determines how the grouping candidates ccs are evaluated against
   each other in the optimization step. Default = *min* choses the 
   grouping with the highest minimum confidence, and *avg* that one
   with the highest average confidence.
   
The function returns a 2-tuple (pair) of lists: (*add*, *remove*).
*add* is a list of glyphs that were created by classifying any glyphs
as a split (See Initialization_) or grouping.  *remove* is a list of
glyphs that are no longer valid due to reclassifying glyphs from a
split to something else.

The list *glyphs* is never modified.  Instead, detected parts of groups
are classified as ``_group._part.*``, where ``*`` stands for the class
name of the grouped glyph. This means that after calling this function,
you must remove the *remove* CCs and all CCs with a class name beginning with
```_group._part`` from *glyph*, and you must add all glyphs from *add*
to it. Or you can instead call group_and_update_list_automatic_, which does 
this automatically for you.
"""
      glyphs = [x for x in glyphs if x.classification_state != 3]
      if len(glyphs) == 0:
         return [], []

      splits, removed = self.classify_list_automatic(glyphs)
      glyphs = [x for x in glyphs if not x.get_main_id().startswith('split')]
      if grouping_function is None:
         grouping_function = BoundingBoxGroupingFunction(4)
      G = self._pregroup(glyphs, grouping_function)
      if evaluate_function is None:
         evaluate_function = self._evaluate_subgroup
      found_unions = self._find_group_unions(
         G, evaluate_function, max_parts_per_group, max_graph_size, criterion)
      return found_unions + splits, removed

   def group_and_update_list_automatic(self, glyphs, *args, **kwargs):
      """**group_and_update_list_automatic** (ImageList *glyphs*, Function
*grouping_function* = ``None``, Function *evaluate_function* = ``None``,
int *max_parts_per_group* = 5, int *max_graph_size* = 16, string *criterion* = ``'min'``)

A convenience wrapper around group_list_automatic_ that returns
a list of glyphs that is already updated for splitting and grouping."""
      added, removed = self.group_list_automatic(glyphs, *args, **kwargs)
      glyphs = [g for g in glyphs if not g.get_main_id().startswith("_group._part")]
      return self._update_after_classification(glyphs, added, removed)

   def _pregroup(self, glyphs, function):
      from gamera import graph
      G = graph.Undirected()
      G.add_nodes(glyphs)
      progress = util.ProgressFactory("Pre-grouping glyphs...", len(glyphs))
      try:
         for i in range(len(glyphs)):
            gi = glyphs[i]
            for j in range(i + 1, len(glyphs)):
               gj = glyphs[j]
               if function(gi, gj):
                  G.add_edge(gi, gj)
            progress.step()
      finally:
         progress.kill()
      return G

   def _evaluate_subgroup(self, subgroup):
      import image_utilities
      if len(subgroup) > 1:
         union = image_utilities.union_images(subgroup)
         classification, confidence = self.guess_glyph_automatic(union)
         classification_name = classification[0][1]
         if (classification_name.startswith("_split") or
             classification_name.startswith("skip")):
            return 0.0
         else:
            return classification[0][0]
      if len(subgroup):
         classification = subgroup[0].id_name[0]
         if classification[1].startswith('_group._part'):
            return 0.0
         return classification[0]
      raise ValueError("Something is wrong here...  Either you don't have classifier data or there is an internal error in the grouping algorithm.")

   def _find_group_unions(self, G, evaluate_function, max_parts_per_group=5,
                          max_graph_size=16, criterion="min"):
      import image_utilities
      progress = util.ProgressFactory("Grouping glyphs...", G.nsubgraphs)
      try:
         found_unions = []
         for root in G.get_subgraph_roots():
            if G.size_of_subgraph(root) > max_graph_size:
               continue
            best_grouping = G.optimize_partitions(
               root, evaluate_function, max_parts_per_group, max_graph_size, criterion)
            if not best_grouping is None:
               for subgroup in best_grouping:
                  if len(subgroup) > 1:
                     union = image_utilities.union_images(subgroup)
                     found_unions.append(union)
                     classification, confidence = self.guess_glyph_automatic(union)
                     union.classify_heuristic(classification)
                     part_name = "_group._part." + classification[0][1]
                     for glyph in subgroup:
                        glyph.classify_heuristic(part_name)
            progress.step()
      finally:
         progress.kill()
      return found_unions

   ########################################
   # AUTOMATIC CLASSIFICATION
   def classify_glyph_automatic(self, glyph, max_recursion=10, recursion_level=0):
      """**classify_glyph_automatic** (Image *glyph*, int *max_recursion* = 10)

Classifies a glyph and sets its ``classification_state`` and
``id_name``.  (If you don't want the glyph changed, use
guess_glyph_automatic_.)

*glyph*
  The glyph to classify.

*max_recursion* (optional)
  Limit the number of split recursions.

Returns a 2-tuple (pair) of lists: (*add*, *remove*).  *add* is a list
of glyphs that were created by classifying *glyph* as a split (See
Initialization_).  *remove* is a list of glyphs that are no longer
valid due to reclassifying *glyph* from a split to something else.
Most often, both of these lists will be empty.  You will normally want
to use these lists to update the collection of glyphs on the current
page."""
      if recursion_level > max_recursion:
         return [], []
      # Since we only have one glyph to classify, we can't do any grouping
      if (len(self.database) and
          glyph.classification_state in (core.UNCLASSIFIED, core.AUTOMATIC)):
         self.generate_features(glyph)
         removed = glyph.children_images
         (id, conf) = self._classify_automatic_impl(glyph)
         glyph.classify_automatic(id)
         glyph.confidence = conf
         splits = self._do_splits(self, glyph)
         all_splits = splits[:]
         for g2 in splits:
            recurse_splits, recurse_removed = self.classify_glyph_automatic(
               g2, max_recursion, recursion_level + 1)
            all_splits.extend(recurse_splits)
            removed.extend(recurse_removed)
         return all_splits, removed
      return [], []

   def _classify_list_automatic(self, glyphs, max_recursion=10, recursion_level=0, progress=None):

      # There is a slightly convoluted handling of the progress bar here, since
      # this function is called recursively on split glyphs
      if recursion_level == 0:
         progress = util.ProgressFactory("Classifying glyphs...", len(glyphs))
      try:
         if (recursion_level > max_recursion):
            return [], []
         added = []
         removed = {}
         for glyph in glyphs:
            if glyph.classification_state in (core.UNCLASSIFIED, core.AUTOMATIC):
               for child in glyph.children_images:
                  removed[child] = None
         for glyph in glyphs:
            if not removed.has_key(glyph):
               self.generate_features(glyph)
               if (glyph.classification_state in
                   (core.UNCLASSIFIED, core.AUTOMATIC)):
                  (id, conf) = self._classify_automatic_impl(glyph)
                  glyph.classify_automatic(id)
                  glyph.confidence = conf
                  adds = self._do_splits(self, glyph)
                  progress.add_length(len(adds))
                  added.extend(adds)
            progress.step()
         if len(added):
            added_recurse, removed_recurse = self._classify_list_automatic(
               added, max_recursion, recursion_level+1, progress)
            added.extend(added_recurse)
            for glyph in removed_recurse:
               removed[glyph] = None
      finally:
         if recursion_level == 0:
            progress.kill()
      return added, removed.keys()

   def classify_list_automatic(self, glyphs, max_recursion=10, progress=None):
      """**classify_list_automatic** (ImageList *glyphs*, int *max_recursion* = 10)

Classifies a list of glyphs and sets the classification_state and
id_name of each glyph.  (If you don't want it set, use guess_glyph_automatic_.)

*glyphs*
  A list of glyphs to classify.

*max_recursion*
  The maximum level of recursion to follow when splitting glyphs.
  Since some glyphs will split into parts that then classify as
  ``_split`` in turn, a maximum depth should be set to avoid infinite
  recursion.  This number can normally be set quite low, depending on
  the application.

Return type: (*add*, *remove*)

  The list *glyphs* is never modified by the function.  Instead, it
  returns a 2-tuple (pair) of lists: (*add*, *remove*).  *add* is a
  list of glyphs that were created by classifying glyphs as a
  split (See Initialization_).  *remove* is a list of glyphs that are
  no longer valid due to reclassifying glyphs from a split to
  something else.  Most often, both of these lists will be empty.  You
  will normally want to use these lists to update the collection of
  glyphs on the current page.  If you just want a new list returned
  with these updates already made, use classify_and_update_list_automatic_.
"""
      return self._classify_list_automatic(glyphs, max_recursion, 0, progress)

   def _update_after_classification(self, glyphs, added, removed):
      result = glyphs + added
      for g in removed:
         if g in result:
            result.remove(g)
      return result

   def classify_and_update_list_automatic(self, glyphs, *args, **kwargs):
      """**classify_and_update_list_automatic** (ImageList *glyphs*, Int
*max_recursion* = ``10``)

A convenience wrapper around classify_list_automatic_ that returns
a list of glyphs that is already updated based on splitting."""
      added, removed = self.classify_list_automatic(glyphs, *args, **kwargs)
      return self._update_after_classification(glyphs, added, removed)

   # Since splitting glyphs is optional (when the classifier instance is
   # created) we have two versions of this function, so that there needn't
   # be an 'if' statement everywhere.
   def _do_splits_impl(self, glyph):
      id = glyph.get_main_id()
      if (id.startswith('_split.')):
         parts = id.split('.')
         if (len(parts) != 2 or not hasattr(glyph, parts[1])):
            raise ClassifierError("'%s' is not a known or valid split function." % parts[1])
         try:
            if sys.version_info[:2] < (2, 4):
               splits = getattr(glyph, parts[1]).__call__(glyph)
            else:
               splits = getattr(glyph, parts[1]).__call__()
         except core.SegmentationError:
            if len(glyph.id_name) >= 2:
               glyph.id_name = glyph.id_name[1:]
            else:
               glyph.id_name = [(0.0, '_ERROR')]
            return []
         else:
            glyph.children_images = splits
            return splits
      return []

   def _do_splits_null(self, glyph):
      return []

   ########################################
   # XML
   # Note that unclassified glyphs in the XML file are ignored.
   def to_xml(self, stream, with_features=True):
      """**to_xml** (stream *stream*)

Saves the training data in XML format to the given stream (which could
be any object supporting the file protocol, such as a file object or StringIO
object)."""
      self.is_dirty = False
      glyphs = [g for g in self.get_glyphs() 
                if not g.get_main_id().startswith("_group._part")]
      return gamera_xml.WriteXML(
         glyphs=glyphs, with_features=with_features).write_stream(stream)

   def to_xml_filename(self, filename, with_features=True):
      """**to_xml_filename** (FileSave *filename*)

Saves the training data in XML format to the given filename."""
      self.is_dirty = False
      glyphs = [g for g in self.get_glyphs() 
                if not g.get_main_id().startswith("_group._part")]
      return gamera_xml.WriteXMLFile(
         glyphs=glyphs).write_filename(filename, with_features)

   def from_xml(self, stream):
      """**from_xml** (stream *stream*)

Loads the training data from the given stream (which could be any object 
supporting the file protocol, such as a file object or StringIO object.)"""
      self._from_xml(gamera_xml.LoadXML().parse_stream(stream))

   def from_xml_filename(self, filename):
      """**from_xml_filename** (FileOpen *filename*)

Loads the training data from the given filename."""
      stream = gamera_xml.LoadXML().parse_filename(filename)
      self._from_xml(stream)

   def _from_xml(self, xml):
      database = [x for x in xml.glyphs
                  if x.classification_state != core.UNCLASSIFIED]
      self.set_glyphs(database)

   def merge_from_xml(self, stream):
      """**merge_from_xml** (stream *stream*)

Loads the training data from the given stream (which could be a file
handle or StringIO object) and adds it to the existing training data."""
      self._merge_xml(gamera_xml.LoadXML().parse_stream(stream))

   def merge_from_xml_filename(self, filename):
      """**merge_from_xml_filename** (stream *stream*)

Loads the training data from the given filename and adds it to the
existing training data."""
      self._merge_xml(gamera_xml.LoadXML().parse_filename(filename))

   def _merge_xml(self, xml):
      database = [x for x in xml.glyphs
                  if x.classification_state != core.UNCLASSIFIED]
      self.merge_glyphs(database)

   ##############################################
   # Features
   def generate_features_on_glyphs(self, glyphs):
      """Generates features for all the given glyphs."""
      progress = util.ProgressFactory("Generating features...",
                                      len(glyphs), numsteps=32)
      try:
         for i, glyph in enumerate(glyphs):
            self.generate_features(glyph)
            progress.step()
      finally:
         progress.kill()
   
class NonInteractiveClassifier(_Classifier):
   def __init__(self, database=[], perform_splits=True):
      """**NonInteractiveClassifier** (ImageList *database* = ``[]``, bool *perform_splits* = ``True``)

Creates a new classifier instance.

*database*
        Can come in two forms:

           - When a list (or Python iterable) each element is a glyph to
             use as training data for the classifier

           - *For non-interactive classifiers only*, when *database* is a filename,
             the classifier will be "unserialized" from the given file.

        Any images in the list that were manually classified (have
	classification_state == MANUAL) will be used as training data
	for the classifier.  Any UNCLASSIFIED or AUTOMATICALLY
	classified images will be ignored.

	When initializing a noninteractive classifier, the database
	*must* be non-empty.

*perform_splits*
          If ``perform_splits`` is ``True``, glyphs trained with names
	  beginning with ``_split.`` are run through a given splitting
	  algorithm.  For instance, glyphs that need to be broken into
	  upper and lower halves for further classification of those
	  parts would be trained as ``_split.splity``.  When the
	  automatic classifier encounters glyphs that most closely
	  match those trained as ``_split``, it will perform the
	  splitting algorithm and then continue to recursively
	  classify its parts.

	  The `splitting algorithms`__ are documented in the plugin documentation.

.. __: plugins.html#segmentation

          New splitting algorithms can be created by `writing plugin`__ methods
          in the category ``Segmentation``.  

.. __: writing_plugins.html

      """
      if type(database) == list:
         self._database = util.CallbackList(database)
         self.set_glyphs(database)
      elif database[-4:] == ".xml":
         self._database = util.CallbackList(database)
         self.from_xml_filename(database)
      else:
         self._database = util.CallbackList([])
         self.unserialize(database)

      if perform_splits:
         self._do_splits = self.__class__._do_splits_impl
      else:
         self._do_splits = self.__class__._do_splits_null
      self._perform_splits = perform_splits

      self.confidence_types = [CONFIDENCE_DEFAULT]

   def __del__(self):
      # Seems redundant, but this triggers a callback to the GUI, if running.
      del self._database

   def is_interactive():
      """Boolean **is_interactive** ()

Returns ``True`` if classifier is interactive, else ``False``."""
      return False
   is_interactive = staticmethod(is_interactive)

   ########################################
   # BASIC DATABASE MANIPULATION FUNCTIONS
   def get_glyphs(self):
      """ImageList **get_glyphs** ()

Returns a list of the glyphs in the classifier."""
      return list(self.database)
      
   def set_glyphs(self, glyphs):
      """**set_glyphs** (ImageList *glyphs*)

Sets the training data for the classifier to the given list of glyphs.

On some non-interactive classifiers, this operation can be quite
expensive.
"""
      self.generate_features_on_glyphs(glyphs)
      self.database.clear()
      self.database.extend(glyphs)
      self.instantiate_from_images(self.database, self.normalize)

   def merge_glyphs(self, glyphs):
      """**merge_glyphs** (ImageList *glyphs*)

Adds the given glyphs to the current set of training data.

On some non-interactive classifiers, this operation can be quite
expensive."""
      self.generate_features_on_glyphs(glyphs)
      self.database.extend(glyphs)
      self.instantiate_from_images(self.database, self.normalize)

   def clear_glyphs(self):
      """**clear_glyphs** ()

Removes all training data from the classifier.
"""
      self.database.clear()

   def load_settings(self, filename):
      """**load_settings** (FileOpen *filename*)

Loads classifier-specific settings from the given filename.  The
format of this file is entirely dependent on the concrete classifier
type."""
      _Classifier.load_settings(self, filename)
      self.instantiate_from_images(self.database, self.normalize)

   ########################################
   # AUTOMATIC CLASSIFICATION
   # (most of this is implemented in the base class, _Classifier)
   def guess_glyph_automatic(self, glyph):
      """(id_name, confidencemap) **guess_glyph_automatic** (Image *glyph*)

Classifies the given *glyph* without setting its classification.  The
return value is a tuple of the form ``(id_name,confidencemap)``, where
*idname* is a list of the form `idname`_, and *confidencemap* is a
map of the form `confidence`_ listing the confidences of the main id.

.. _idname: #id-name

.. _confidence: #confidence
"""
      self.generate_features(glyph)
      return self.classify(glyph)

   def _classify_automatic_impl(self, glyph):
      return self.classify(glyph)

class InteractiveClassifier(_Classifier):
   def __init__(self, database=[], perform_splits=True):
      """**InteractiveClassifier** (ImageList *database* = ``[]``, bool *perform_splits* = ``True``)

Creates a new classifier instance.

*database*
        Must be a list (or Python interable) containing glyphs to use
        as training data for the classifier.

        Any images in the list that were manually classified (have
	classification_state == MANUAL) will be used as training data
	for the classifier.  Any UNCLASSIFIED or AUTOMATICALLY
	classified images will be ignored.

	When initializing a noninteractive classifier, the database
	*must* be non-empty.

*perform_splits*
	  If ``perform_splits`` is ``True``, glyphs trained with names
	  beginning with ``_split.`` are run through a given splitting
	  algorithm.  For instance, glyphs that need to be broken into
	  upper and lower halves for further classification of those
	  parts would be trained as ``_split.splity``.  When the
	  automatic classifier encounters glyphs that most closely
	  match those trained as ``_split``, it will perform the
	  splitting algorithm and then continue to recursively
	  classify its parts.

	  The `splitting algorithms`__ are documented in the plugin documentation.

.. __: plugins.html#segmentation

          New splitting algorithms can be created by `writing plugin`__ methods
          in the category ``Segmentation``.  

.. __: writing_plugins.html

      """
      self._database = util.CallbackSet()
      self.set_glyphs(database)
      if perform_splits:
         self._do_splits = self.__class__._do_splits_impl
      else:
         self._do_splits = self.__class__._do_splits_null
      self._perform_splits = perform_splits
      self._display = None
      self.confidence_types = [CONFIDENCE_DEFAULT]

   def __del__(self):
      # Seems redundant, but this triggers a callback to the GUI, if running.
      del self._database

   def is_interactive():
      return True
   is_interactive = staticmethod(is_interactive)

   ########################################
   # BASIC DATABASE MANIPULATION FUNCTIONS
   def get_glyphs(self):
      return self.database
      
   def set_glyphs(self, glyphs):
      glyphs = util.make_sequence(glyphs)
      self.clear_glyphs()
      self.generate_features_on_glyphs(glyphs)
      self.database.extend(glyphs)

   def merge_glyphs(self, glyphs):
      glyphs = util.make_sequence(glyphs)
      self.generate_features_on_glyphs(glyphs)
      self.database.extend(glyphs)

   def clear_glyphs(self):
      self.database.clear()

   ########################################
   # AUTOMATIC CLASSIFICATION
   def _classify_automatic_impl(self, glyph):
      if len(self.database) == 0:
         raise ClassifierError(
            "Cannot classify using an empty production database.")
      for child in glyph.children_images:
         if child in self.database:
            self.database.remove(child)
      return self.classify_with_images(self.database, glyph)

   def guess_glyph_automatic(self, glyph):
      if len(self.database):
         self.generate_features(glyph)
         return self.classify_with_images(self.database, glyph)
      else:
         return ([(0.0, 'unknown')], {})

   ########################################
   # MANUAL CLASSIFICATION
   def classify_glyph_manual(self, glyph, id):
      """**classify_glyph_manual** (Image *glyph*, String *id*)

Sets the classification of the given *glyph* to the given *id* and
then adds the glyph to the training data.  Call this function when the
end user definitively knows the identity of the glyph.

*glyph*
	The glyph to classify.

*id*
	The class name.

.. note::
   Here *id* is a simple string, not of the `id_name`_ format, since
   the confidence of a manual classification is always 1.0."""
      # Deal with grouping
      if id.startswith('_group'):
         raise ClassifierError(
            "You must select more than one connected component " +
            "to create a group")

      removed = {}
      for child in glyph.children_images:
         removed[child] = None
         if child in self.database:
            self.database.remove(child)
      glyph.classify_manual([(1.0, id)])
      self.generate_features(glyph)
      self.database.append(glyph)
      return self._do_splits(self, glyph), removed.keys()

   def classify_list_manual(self, glyphs, id):
      """**classify_list_manual** (ImageList *glyphs*, String *id*)

Sets the classification of the given *glyphs* to the given *id* and
then adds the glyphs to the training data.  Call this function when the
end user definitively knows the identity of the glyphs.

If *id* begins with the special prefix ``_group``, all of the glyphs
in *glyphs* are combined and the result is added to the training
data.  This is useful for characters that always appear with multiple
connnected components, such as the lower-case *i*.

*glyphs*
	The glyphs to classify.

*id*
	The class name.

.. note::
   Here *id* is a simple string, not of the `id_name`_ format, since
   the confidence of a manual classification is always 1.0."""
      if id.startswith('_group'):
         if len(glyphs) > 1:
            import image_utilities
            parts = id.split('.')
            sub = '.'.join(parts[1:])
            union = image_utilities.union_images(glyphs)
            for glyph in glyphs:
               if glyph.nrows > 2 and glyph.ncols > 2:
                  glyph.classify_heuristic('_group._part.' + sub)
                  self.generate_features(glyph)
            added, removed = self.classify_glyph_manual(union, sub)
            added.append(union)
            return added, removed
         else:
            # grouping a single glyph corrupts the classifier_glyph.xml file
            raise ClassifierError("Grouping of only a single glyph is not allowed.")

      added = []
      removed = set()
      for glyph in glyphs:
         for child in glyph.children_images:
            removed.add(child)

      new_glyphs = []
      for glyph in glyphs:
         # Don't re-insert removed children glyphs
         if not glyph in removed:
            if not glyph in self.database:
               self.generate_features(glyph)
               new_glyphs.append(glyph)
            glyph.classify_manual([(1.0, id)])
            added.extend(self._do_splits(self, glyph))
      self.database.extend(new_glyphs)
      return added, list(removed)

   def classify_and_update_list_manual(self, glyphs, *args, **kwargs):
      """**classify_and_update_list_manual** (ImageList *glyphs*, Function
*grouping_function* = ``None``, Function *evaluate_function* = ``None``,
int *max_size* = 5)

A convenience wrapper around group_list_automatic_ that returns
a list of glyphs that is already updated for splitting and grouping."""
      added, removed = self.classify_list_manual(glyphs, *args, **kwargs)
      return self._update_after_classification(glyphs, added, removed)

   def add_to_database(self, glyphs):
      """**add_to_database** (ImageList *glyphs*)

Adds the given glyph (or list of glyphs) to the classifier training data.  Will not add duplicates
to the training data.  Unlike classify_glyph_manual_, no grouping support is performed.
"""
      glyphs = util.make_sequence(glyphs)
      new_glyphs = []
      for glyph in glyphs:
         if (glyph.classification_state == core.MANUAL and
             not glyph in self.database):
            self.generate_features(glyph)
            new_glyphs.append(glyph)
      self.database.extend(new_glyphs)

   def remove_from_database(self, glyphs):
      """**remove_from_database** (ImageList *glyphs*)

Removes the given glyphs from the classifier training data.  Ignores silently
if a given glyph is not in the training data.
"""
      glyphs = util.make_sequence(glyphs)
      for glyph in glyphs:
         if glyph in self.database:
            self.database.remove(glyph)

   def display(self, current_database=[], context_image=None, symbol_table=[]):
      """**display** (ImageList *current_database* = ``[]``, Image
*context_image* = ``None``, List *symbol_table* = ``[]``)

Displays the `interactive classifier window`__, which is where manual
training usually takes place.

*current_database*
  A list of glyphs yet to be trained.

*context_image*
  An image of the page where the glyphs in *current_database* came
  from.

*symbol_table*
  A set of id names to insert by default into the symbol table.

.. __: gui.html#interactive-classifier-window"""
      if has_gui.gui and self._display is None:
         self._display = has_gui.gui.ShowClassifier(
            self, current_database, context_image, symbol_table)
      else:
         self._display.Show(1)
      return self._display

   def set_display(self, display):
      self._display = display

########################################
# GROUPING UTILITIES

class BasicGroupingFunction:
   def __init__(self, threshold):
      self._threshold = threshold

   def __call__(self, a, b):
      return Fudge(a, self._threshold).intersects(b)

class ShapedGroupingFunction:
   def __init__(self, threshold):
      from gamera.plugins import structural
      self._function = structural.shaped_grouping_function
      self._threshold = threshold

   def __call__(self, a, b):
      return self._function(a, b, self._threshold)

class BoundingBoxGroupingFunction:
   def __init__(self, threshold):
      from gamera.plugins import structural
      self._function = structural.bounding_box_grouping_function
      self._threshold = threshold

   def __call__(self, a, b):
      return self._function(a, b, self._threshold)

def average_bb_distance(ccs):
   """Calculates the average distance between the bounding boxes
in the given list of ccs."""
   average = 0
   for i, cc in enumerate(ccs):
      minimum = None
      for j in range(i + 1, len(ccs)):
         distance = cc.distance_bb(ccs[j])
         if distance < minimum or minimum is None:
            minimum = distance
      if not minimum is None:
         average += minimum
   average /= float(len(ccs))
   return average