File: TextIndexNG.py

###########################################################################
#
# TextIndexNG                The next generation TextIndex for Zope
#
# This software is governed by a license. See
# LICENSE.txt for the terms of this license.
#
###########################################################################

"""
TextIndexNG 
Written by Andreas Jung

E-Mail: andreas@andreas-jung.com

$Id: TextIndexNG.py,v 1.167 2005/05/19 10:22:35 ajung Exp $
"""

import sys

from Globals import DTMLFile, InitializeClass
from AccessControl.SecurityInfo import ClassSecurityInfo
from zLOG import ERROR, WARNING, LOG
from OFS.SimpleItem import SimpleItem
from Products.PluginIndexes import PluggableIndex       
from Products.PluginIndexes.common.util import parseIndexRequest
from OFS.content_types import guess_content_type
from BTrees.IIBTree import IISet, difference 
from classVerify import verifyClass

from Products.TextIndexNG2.ResultSet import ResultSet
from Products.TextIndexNG2.Registry import ParserRegistry, ConverterRegistry, NormalizerRegistry, StorageRegistry, ThesaurusRegistry
from Products.TextIndexNG2.Registry import LexiconRegistry, SplitterRegistry, StopwordsRegistry, RegistryException
from ParseTree import Evaluator

from interfaces.IStopwords import StopwordsInterface

import parsers, normalizers
import storages, lexicons, splitters, stop_words

import indexsupport
import PositionMap

from AccessControl.Permissions import  search_zcatalog
try:
    from AccessControl.Permissions import manage_zcatalog_indexes
except ImportError:
    manage_zcatalog_indexes = 'Manage ZCatalogIndex Entries'


class TXNGError(Exception): pass


# Precalculate the term weight for terms derived by
# right truncation. The weight is calculated from the difference
# between the length of the original term and the derived term.
# The weight is inversely proportional to that difference:
#
# weight = 1.0 / (a * difference + 1)
# a = (1 - p) / (p * d)
# p is the weight assigned to terms with a difference of d
#
# We use p=0.5 and d=5
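#
# Worked example with these values: a = (1 - 0.5) / (0.5 * 5) = 0.2, so a
# derived term that is 5 characters longer than the query term is weighted
# 1.0 / (0.2 * 5 + 1) = 0.5, and longer expansions are weighted even lower.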


TRUNC_WEIGHT = {}
p = 0.5; d = 5
a = (1 - p) / (p * d)
for i in range(250): TRUNC_WEIGHT[i] = 1.0 / (a*i + 1)


class TextIndexNG(SimpleItem):
    """ TextIndexNG """

    meta_type = 'TextIndexNG2'
    __implements__ = PluggableIndex.PluggableIndexInterface

    security = ClassSecurityInfo()
    security.declareObjectProtected(manage_zcatalog_indexes)

    manage_options= (
        {'label': 'Settings',     
         'action': 'manage_workspace',
         'help': ('TextIndexNG','TextIndexNG_Settings.stx')},
        {'label': 'Stop words',     
         'action': 'manage_stopwords',
         'help': ('TextIndexNG','TextIndexNG_Stopwords.stx')},
        {'label': 'Normalizer',     
         'action': 'manage_normalizer',
         'help': ('TextIndexNG','TextIndexNG_Normalizer.stx')},
        {'label': 'Converters',     
         'action': 'manage_converters',
         'help': ('TextIndexNG','TextIndexNG_Converters.stx')},
        {'label': 'Vocabulary',     
         'action': 'manage_vocabulary',
         'help': ('TextIndexNG','TextIndexNG_Vocabulary.stx')},
        {'label': 'Test',     
         'action': 'manage_test',
         'help': ('TextIndexNG','TextIndexNG_Test.stx')},
        {'label': 'Statistics',     
         'action': 'manage_statistics',
         'help': ('TextIndexNG','TextIndexNG_Statistics.stx')},
    )

    _all_options = ('splitter_max_len', 'use_splitter', "splitter_separators",
         'splitter_single_chars', 'splitter_casefolding', 
         'lexicon', 'near_distance', 'truncate_left', 'autoexpand',
         'autoexpand_limit', 'numhits', 'use_storage', 'use_thesaurus', 'thesaurus_mode',
         'use_stopwords', 'use_normalizer', 'use_converters',
         'use_parser', 'indexed_fields', 'default_encoding'
        )

    query_options = ("query", "operator", "parser", "encoding", 'near_distance', 'autoexpand',
                     'numhits')

    def __init__(self, id, extra=None, caller=None):

        def _get(o, k, default):
            """ return a value for a given key of a dict/record 'o' """
            if isinstance(o, dict):
                return o.get(k, default)
            else:
                return getattr(o, k, default)
        
        self.id = id

        # check parameters
        if extra:
            for k in extra.keys():
                if not k in self._all_options:
                    raise TXNGError,'unknown parameter "%s"' % k

        if caller is not None:
            self.catalog_path = '/'.join(caller.getPhysicalPath())
        else:
            self.catalog_path = None

        # indexed attributes
        self._indexed_fields = _get(extra, 'indexed_fields', '').split(',')
        self._indexed_fields = [ attr.strip() for attr in  self._indexed_fields if attr ]
        if not self._indexed_fields:
            self._indexed_fields = [ self.id ]

        # splitter to be used
        self.use_splitter = _get(extra, 'use_splitter', 'TXNGSplitter')

        # maximum length of split words
        self.splitter_max_len= _get(extra, 'splitter_max_len', 64)

        # allow single characters
        self.splitter_single_chars = _get(extra,'splitter_single_chars',0)

        # valid word separators
        self.splitter_separators = _get(extra, 'splitter_separators','.+-_@')

        # case folding (lowercase all words)
        self.splitter_casefolding = _get(extra,'splitter_casefolding',1) 

        # left truncation
        self.truncate_left = _get(extra, 'truncate_left', 0)

        # Term autoexpansion
        self.autoexpand = _get(extra, 'autoexpand', 0)
        self.autoexpand_limit = _get(extra, 'autoexpand_limit', 4)

        # maximum number of hits
        self.numhits = _get(extra, 'numhits', 999999999)

        # default maximum distance for words with near search
        self.near_distance = _get(extra,'near_distance', 5)

        # Stopwords: a Stopwords object implementing StopwordsInterface, or None
        self.use_stopwords = _get(extra, 'use_stopwords', None) or None
        if self.use_stopwords:
            verifyClass(StopwordsInterface, self.use_stopwords.__class__)
     
        # Normalizer
        self.use_normalizer = _get(extra,'use_normalizer', None) or None

        # use converters from the ConvertersRegistry
        self.use_converters = _get(extra,'use_converters',0) 

        # Storage to be used
        self.use_storage = _get(extra,'use_storage', 'StandardStorage') 

        # encoding
        self.default_encoding = _get(extra,'default_encoding', 'iso-8859-15') 

        # check Parser
        self.use_parser = _get(extra, 'use_parser','PyQueryParser')
        
        # Thesaurus
        self.use_thesaurus = _get(extra, 'use_thesaurus', None)
        self.thesaurus_mode = _get(extra, 'thesaurus_mode', None)

        self.use_lexicon = 'StandardLexicon'
        self.clear()


    def clear(self):
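        """ (re)create the storage and lexicon, wiping all indexed data """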
        self._storage = StorageRegistry.get(self.use_storage)() 
        self._lexicon = LexiconRegistry.get(self.use_lexicon)(truncate_left=self.truncate_left)

    def getId(self):   return self.id
    def __len__(self): return len(self._storage)
    def __nonzero__(self): return len(self._storage) > 0
    def getLexicon(self): return self._lexicon
    def getStorage(self): return self._storage

    def index_object(self, documentId, obj, threshold=None):
        """ wrapper to handle indexing of multiple attributes """
        # needed for backward compatibility with indexes created before
        # multi-attribute support was added
        try: fields = self._indexed_fields
        except AttributeError: fields = [ self.id ]

        res = 0
        all_wids = []
        for attr in fields:
            try:
                wids = self._index_object(documentId, obj, threshold, attr)
                if wids is not None:
                    all_wids.extend(wids)
            except:
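                # swallow per-attribute failures so one broken attribute
                # does not prevent the remaining attributes from being indexed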
                pass

        # get rid of words removed by reindexing
        try:
            o_wids = IISet(self._storage.getWordIdsForDocId(documentId))
        except KeyError:
            o_wids = IISet()

        all_wids_set = IISet(all_wids)
        remove_wids = difference(o_wids, all_wids_set)
        insert_wids = difference(all_wids_set, o_wids)
        insert_dict = {}   # hash wids to dict for performance reasons
        for wid in insert_wids.keys(): insert_dict[wid] = 1

        if len(remove_wids) > 0:
            self._storage.removeWordIdsForDocId(documentId, remove_wids) 
        if all_wids:
            self._storage.insert([w for w in all_wids if insert_dict.has_key(w)], documentId)
        return len(all_wids)
        
    def _index_object(self, documentId, obj, threshold=None, attr=''):

        encoding = self.default_encoding
        source = mimetype = None

        # This is to support foreign file formats that
        # are stored as "File" objects when searching
        # through PrincipiaSearchSource

        if hasattr(obj, 'txng_get'):
            # Check if the object has a method txng_get()
            result = obj.txng_get([attr])
            if result is None: return None
            source, mimetype, encoding = result

        elif obj.meta_type in ('File', 'Portal File') and  \
           attr in ('PrincipiaSearchSource', 'SearchableText'):

            source= getattr(obj, attr, None)
            if source and not self.use_converters:
                if callable(source): source = source()
            else:              
                source = str(obj)
            mimetype = obj.content_type

        elif obj.meta_type == 'ExtFile' and \
           attr in ('PrincipiaSearchSource', 'SearchableText'):
            source = obj.index_html()
            mimetype = obj.getContentType()

        elif obj.meta_type in ('ZMSFile',):
            lang = attr[attr.rfind('_')+1:]
            req = {'lang' : lang}
            file = obj.getObjProperty('file', req)
            source = ''
            mimetype = None
            if file:
                source = file.getData()
                mimetype = file.getContentType()
   
        elif obj.meta_type in ('TTWObject',) and attr not in ('SearchableText', ): 
            field = obj.get(attr)
            source = str(field)
            if field.meta_type in ( 'ZMSFile', 'File' ):
                mimetype = field.getContentType()
            else:
                mimetype = None

        else:
            # default behaviour: try to obtain the source from
            # the attribute or method call return value

            try:
                source = getattr(obj, attr)
                if callable(source): source = source()
                if not isinstance(source, unicode):
                    source = str(source)
            except (AttributeError, TypeError):
                return None
        
        # If enabled, we try to find a valid document converter
        # and convert the data to get a hopefully text only representation
        # of the data.

        if self.use_converters:
            if mimetype is None or mimetype == 'application/octet-stream':
                mimetype, encoding = guess_content_type(obj.getId(), source)
                if not encoding:
                    encoding = self.default_encoding

            try: 
                converter = ConverterRegistry.get(mimetype)
            except RegistryException: 
                LOG('textindexng', ERROR, '%s could not be converted because no converter could be found for %s' % (obj.absolute_url(1), mimetype))
                return None

            if converter:
                try:
                    source, encoding = converter.convert2(source, encoding, mimetype)
                except:
                    try:
                        source = converter.convert(source)
                    except:
                        LOG('textindexng', ERROR, '%s could not be converted' % obj.absolute_url(1), error=sys.exc_info())
                        return None

            if obj.meta_type == 'Portal File': 
                source += ' ' + obj.SearchableText()

        # Now we try to get a valid encoding. For unicode strings
        # we have to perform no action. For string objects we check
        # if the document has an attribute (not a method) '<index>_encoding'.
        # As fallback we also check for the presence of an attribute
        # 'document_encoding'. Checking for the two attributes allows
        # us to define different encodings for different attributes
        # on an object. This is useful when an object stores multiple texts
        # as attributes within the same instance (e.g. for multilingual
        # versions of a text but with different encodings). 
        # If no encoding is specified as an object attribute, we fall back
        # to the index's default encoding.
        # After getting the encoding, we convert the data to unicode.
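        # Example (the indexed field name 'SearchableText' is illustrative):
        # if no encoding has been determined so far and the object defines
        #   document_encoding = 'utf-8'
        #   SearchableText_encoding = 'iso-8859-1'
        # then the 'SearchableText' attribute is decoded as 'iso-8859-1',
        # while any other indexed attribute falls back to 'utf-8'.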

        if isinstance(source, str):
            if encoding is None:
                try: encoding = self.default_encoding
                except: encoding = self.default_encoding = 'iso-8859-15'

                for k in ['document_encoding', attr + '_encoding']:
                    enc = getattr(obj, k, None)
                    if enc is not None: encoding = enc  

            if encoding=='ascii': encoding ='iso-8859-15'         
            try:
                source = unicode(source, encoding, 'strict')
            except UnicodeDecodeError:
                LOG('textindexng', WARNING, 'UnicodeDecodeError raised from %s - ignoring unknown unicode characters'  % obj.absolute_url(1))
                source = unicode(source, encoding, 'ignore')
 
        elif isinstance(source, unicode):  pass
        else: raise TXNGError,"unknown object type" 

        source = source.strip()
        if not source: return None

        # Normalization: apply translation table to data
        if self.use_normalizer:
            source = NormalizerRegistry.get(self.use_normalizer).process(source)    
 
        # Split the text into a list of words
        SP = SplitterRegistry.get(self.use_splitter)

        _source = source
        words = SP(casefolding  = self.splitter_casefolding,
                   separator    = self.splitter_separators,
                   maxlen       = self.splitter_max_len,
                   singlechar   = self.splitter_single_chars
                   ).split(_source)

        #  remove stopwords from data
        if self.use_stopwords:
            words = self.use_stopwords.process( words ) 

        # We pass the list of words to the corresponding lexicon
        # and obtain a list of wordIds. The "old" TextIndex iterated
        # over every single word (per-word overhead).
        return self._lexicon.getWordIdList(words)

    def unindex_object(self, documentId): 
        """ carefully unindex document with Id 'documentId'
            index and do not fail if it does not exist 
        """
        self._storage.removeDocument(documentId)

    def _apply_index(self, request, cid=''): 
        """ Apply the index to query parameters given in the argument,
        request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.
 
        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.  
        """

        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None: return None

        # extract some parameters from the request 

        query_operator = record.get('operator','dummy')
        if query_operator is None:
            raise TXNGError, ("Invalid operator '%s' "
                                            "for a TextIndex" % query_operator)

        query_parser = record.get('parser', self.use_parser)
        if not ParserRegistry.is_registered(query_parser): 
            raise TXNGError, "Unknown parser '%s'" %  query_parser

 
        query = record.keys[0]
        encoding = record.get('encoding', self.default_encoding)
        if isinstance(query, str): query = unicode(query, encoding)
        P = ParserRegistry.get( query_parser )
        parsed_query = P(query.strip(), operator=query_operator)
        if not parsed_query:
            raise TXNGError,"Bad query: '%s'" % q

        evaluator = Evaluator(self)
        evaluator.autoexpand = record.get('autoexpand', self.autoexpand)
        evaluator.near_distance = record.get('near_distance', self.near_distance)

        numhits = record.get('numhits', self.numhits)
        resultset = evaluator(parsed_query)

        if self.getStorage().providesWordFrequencies():
            resultset.cosine_ranking(self, numhits)
            return  resultset.result(), (self.id,) 
        else:
            return  resultset.docIds(), (self.id,) 
            


    ################################################################
    # callbacks for ParseTree.py
    ################################################################

    def _lookup(self, words, do_autoexpand=1):
        """ search a word or a list of words in the lexicon and 
            return a ResultSet of found documents.
        """

        docids = IISet()
        used_words = {} 

        #  remove stopwords from data
        if self.use_stopwords:
            words = self.use_stopwords.process( words ) 

        if self.use_thesaurus and self.thesaurus_mode == 'expand_always':
            TH = ThesaurusRegistry.get(self.use_thesaurus)
            for word in words[:]:
                r = TH.getTermsFor(word)
                words.extend(r)

        for word in words:

            # perform casefolding if necessary
            if self.splitter_casefolding:
                word = word.lower()

            if self.use_normalizer:
                word = NormalizerRegistry.get(self.use_normalizer).process(word)    
 
            used_words[word] = 1.0

            wid = self._lexicon.getWordId(word)

            # Retrieve list of docIds for this wordid
            if wid is not None:
                docids.update( self._storage.get(wid) )

            # perform autoexpansion of terms by performing
            # a search using right-truncation
            if do_autoexpand and self.autoexpand and len(word) >= self.autoexpand_limit:
                rs = self.lookupWordsByTruncation(word, right=1)
                docids.update(rs.docIds())
                wlen = len(word)
                for w in rs.words().keys():
                    used_words[w] = TRUNC_WEIGHT[len(w)-wlen]

        return ResultSet(docids, used_words)

    
    def lookupWord(self, word):
        """ search a word in the lexicon and return a ResultSet
            of found documents 
        """

        return self._lookup( [word] )


    def lookupWordsByPattern(self,word):
        """ perform full pattern matching """

        if self.splitter_casefolding: word = word.lower()
        words = self._lexicon.getWordsForPattern(word)

        return self._lookup(words, do_autoexpand=0)

    def lookupWordsByTruncation(self, word, left=0, right=0):
        """ perform right truncation lookup"""

        if self.use_normalizer:
            word = NormalizerRegistry.get(self.use_normalizer).process(word)    

        if self.splitter_casefolding: word = word.lower()
        if right:
            words = self._lexicon.getWordsForRightTruncation(word)
        if left:
            if  self.truncate_left:
                words = self._lexicon.getWordsForLeftTruncation(word)
            else: 
                raise TXNGError, "Left truncation not allowed"

        return self._lookup(words, do_autoexpand=0)


    def lookupRange(self, w1, w2):
        """ search all words between w1 and w2 """

        if self.splitter_casefolding: 
            w1 = w1.lower()
            w2 = w2.lower()

        words = self._lexicon.getWordsInRange(w1, w2)
        return self._lookup(words, do_autoexpand=0)


    def lookupWordsBySimilarity(self, word):       
        """ perform a similarity lookup """

        lst = self._lexicon.getSimiliarWords(word)

        docids = IISet()
        used_words = {} 

        getwid = self._lexicon.getWordId

        for word, threshold in lst:
            used_words[word] = threshold
            wid = getwid(word)

            docids.update( self._storage.get(wid) )

        return ResultSet(docids, used_words)


    def lookupWordsBySubstring(self, word):       
        """ perform a substring search """

        if self.splitter_casefolding: word = word.lower()
        words = self._lexicon.getWordsForSubstring(word)
        return self._lookup(words, do_autoexpand=0)
        

    ###################################################################
    # document lookup for near and phrase search 
    ###################################################################

    def positionsFromDocumentLookup(self,docId, words):
        """ search all positions for a list of words for
            a given document given by its documentId.
            positions() returns a mapping word to
            list of positions of the word inside the document.
        """

        # some query preprocessing  
        if self.splitter_casefolding:
            words = [word.lower() for word in words] 

        posMap = PositionMap.PositionMap() 

        # obtain wids from document
        wids = self._storage.getWordIdsForDocId(docId)
        word_lst = [self._lexicon.getWord(wid) for wid in wids] 
        for word in words:
            posLst = indexsupport.listIndexes(word_lst, word)        
            posMap.append(word, IISet(posLst) )

        return posMap

    ###################################################################
    # some helper functions 
    ###################################################################

    def numObjects(self):
        """ return number of index objects """
        return len(self._storage.getDocIds())

    def rebuild(self):
        """ rebuild the inverted index """
        self._storage.buildInvertedIndex()
        return "done"

    def info(self):
        """ return a list of TextIndexNG properties """

        lst = [ (k,str(getattr(self,k))) for k in dir(self) ] 
        lst.sort()
        return lst

    def getEntryForObject(self, docId, default=None):
        """Get all information contained for a specific object.
           This takes the objects record ID as it's main argument.
        """

        try:
            # use an IISet() here to iterate over a unique list
            # of wids 
            wids = IISet(self._storage.getWordIdsForDocId(docId))
            return [(self._lexicon.getWord(wid), self._storage.getWordFrequency(docId, wid)) for wid in wids]
        except:
            return []

    def getRegisteredObjectForObject(self, docId, default=None):
        """Get all information contained for a specific object.
           This takes the objects record ID as it's main argument.
        """

        return "%d distinct words" % \
            len(self._storage.getWordIdsForDocId( docId ))

    def uniqueValues(self, name=None, withLengths=0):
        """ we don't implement that ! """
        raise NotImplementedError

    ###################################################################
    # minor introspection API
    ###################################################################

    def allSettingOptions(self):
        return self._all_options


    def getSetting(self, key):
        if not key in self._all_options:
            raise TXNGError, "No such setting '%s'" % key

        return getattr(self, key, '')

    def getIndexSourceNames(self):
        """ return sequence of indexed attributes """
        
        try:
            return self._indexed_fields
        except:
            return [ self.id ]


    ###################################################################
    # Stopword handling
    ###################################################################

    def getStopWords(self):     
        """ return a list of all stopwords (for ZMI) """

        if self.use_stopwords:
            return self.use_stopwords.getStopWords()
        else:
            return []

    ###################################################################
    # Normalizer handling
    ###################################################################

    def getNormalizerTable(self):     
        """ return the normalizer translation table """
        
        if self.use_normalizer:       
            return NormalizerRegistry.get(self.use_normalizer).getTable()
        else:
            return None

    ###################################################################
    # Converters
    ###################################################################

    def allConverters(self):
        """ return a list of all registered converters """
        lst = []
        used = []
        converters = ConverterRegistry.allRegisteredObjects()
        converters.sort( lambda x,y: cmp(x.getType(),y.getType()) )
        for c in converters:
            if not c in used:
                used.append(c)
                lst.append( (c.getType(), c.getDescription(), c.getDependency() ) )

        return lst

    ###################################################################
    # Testing 
    ###################################################################

    def testTextIndexNG(self, query, parser, operator=None):
        """ test the TextIndexNG """
        
        res = self._getCatalog().searchResults({self.id: {'query': query,
                                                          'parser': parser,
                                                          'operator': operator} })

        return [r.getURL(relative=1) for r in res]


    ###################################################################
    # Vocabulary browser 
    ###################################################################

    def _getCatalog(self):
        """ return the Catalog instance """

        try: 
            self._v_catalog = self.restrictedTraverse(self.catalog_path)
        except KeyError:
            self._v_catalog = self.aq_parent.aq_parent
        return self._v_catalog

    def getDocumentsForWord(self, word):
        """ return a sequence of document paths that contain 'word' """

        catalog = self._getCatalog()

        wid = self._lexicon.getWordId(word)
        docIds = self._storage.getDocumentIdsForWordId(wid)
        paths =  [ catalog.getpath(docId) for docId in docIds ]
        paths.sort()
        return paths

    ###################################################################
    # Cleanup vocabulary
    ###################################################################

    def manage_cleanVocabulary(self):
        """ cleanup the vocabulary """

        wids = list(self._lexicon.getWordIds())
        for wid in wids:
            docids = self._storage.getDocumentIdsForWordId(wid)
            if not docids:
                self._lexicon.removeWordId(wid)

        return 'Vocabulary cleaned'

    ###################################################################
    # TextIndexNG preferences 
    ###################################################################

    def manage_setPreferences(self,extra, debug_mode,
                               REQUEST=None,RESPONSE=None,URL2=None):
        """ preferences of TextIndex """

        for x in ('near_distance', ): 

            if hasattr(extra,x):

                oldval = getattr(self,x)
                newval = getattr(extra,x)
                setattr(self, x, newval)

        if RESPONSE:
            RESPONSE.redirect(URL2 + 
                '/manage_main?manage_tabs_message=Preferences%20saved')

    def manage_checkIndex1(self):
        """ check index (only for internal tests) """
      
        # check lexicon
        fwidx = self._lexicon._forward_idx
        revidx = self._lexicon._inverse_idx
        all_wids = fwidx.values()
        assert len(fwidx) == len(revidx) 
        for word, wid in fwidx.items():
            assert revidx[wid] == word

        # check storage
        fwidx = self._storage._forward_idx
        revidx = self._storage._reverse_idx

        all_docids = revidx.keys()
        for wid,docids in fwidx.items():
            assert wid in all_wids
            for docid in (isinstance(docids, int) and [docids] or docids.keys()):
                assert docid in all_docids

        for docid in revidx.keys():
            for wid in self._storage.getWordIdsForDocId(docid):
                assert wid in all_wids
        return 'Index seems to be consistent'


    manage_workspace  = DTMLFile("dtml/manageTextIndexNG",globals())
    manage_stopwords  = DTMLFile("dtml/manageStopWords",globals())
    manage_normalizer = DTMLFile("dtml/manageNormalizer",globals())
    manage_converters = DTMLFile("dtml/showConverterRegistry",globals())
    manage_vocabulary = DTMLFile("dtml/vocabularyBrowser",globals())
    manage_statistics = DTMLFile("dtml/manageStatistics",globals())
    showDocuments     = DTMLFile("dtml/vocabularyShowDocuments",globals())
    manage_test       = DTMLFile("dtml/testTextIndexNG",globals())
    testResults       = DTMLFile("dtml/testResults",globals())

InitializeClass(TextIndexNG)


manage_addTextIndexNGForm = DTMLFile('dtml/addTextIndexNG', globals())

def manage_addTextIndexNG(self, id, extra, REQUEST=None, RESPONSE=None, URL3=None):
    """Add a new TextIndexNG """

    from Registry import StopwordsRegistry

    # the ZMI passes the name of a registered Stopwords object (usually the
    # language abbreviation like 'en' or 'de').

    if extra.use_stopwords:
        sw = StopwordsRegistry.get(extra.use_stopwords)
        extra.use_stopwords = sw

    return self.manage_addIndex(id, 'TextIndexNG2', extra, REQUEST, RESPONSE, URL3)
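
# Illustrative sketch (not part of the original module): adding this index to
# an existing ZCatalog programmatically instead of through the ZMI form above.
# The 'catalog' object and the chosen option values are assumptions; valid
# option names are listed in TextIndexNG._all_options.
#
#   class Extra: pass
#   extra = Extra()
#   extra.indexed_fields = 'SearchableText'
#   extra.use_converters = 1
#   extra.use_stopwords = StopwordsRegistry.get('en')  # assumes 'en' is registered
#   catalog.addIndex('SearchableText', 'TextIndexNG2', extra)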