#!/usr/local/bin/python
""" Words - tag words in a string (Version 0.2)
Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com
Copyright (c) 2000-2014, eGenix.com Software GmbH; mailto:info@egenix.com
See the documentation for further information on copyrights,
or contact the author. All Rights Reserved.
"""
from mx.TextTools import *
# Result lists, filled in place while the text is tagged: the tag tables
# below name them as tag objects, and the script at the bottom unpacks
# every entry as a 4-tuple (n, l, r, d), using l and r to slice the
# matched word out of the text.
# lcwords: words starting with a lower-case letter
lcwords = []
# cwords: words starting with a capital letter
cwords = []
# Tag table entry (without jump offsets) for a word that starts with a
# lower-case letter; matches are collected in the lcwords list.
lower_case_word = (
    lcwords,
    AppendToTag + Table,
    (
        # the word has to begin with a lower-case letter or umlaut
        (None, IsIn, a2z + umlaute),
        # remaining letters of the word, if there are any
        (None, AllIn, german_alpha, MatchOk),
    ),
)
# Tag table entry (without jump offsets) for a word that starts with a
# capital letter; matches are collected in the cwords list.
capital_word = (
    cwords,
    AppendToTag + Table,
    (
        # the word has to begin with a capital letter or umlaut
        (None, IsIn, A2Z + Umlaute),
        # remaining letters of the word, if there are any
        (None, AllIn, german_alpha, MatchOk),
    ),
)
# Top-level tag table that walks over the whole text.  The trailing
# integers of each entry are relative jumps between entries; per the
# mxTextTools tag table format the first one is taken when the entry
# does not match and the optional second one when it does.
tag_words = (
    # lower-case word: on failure try the next entry, on success skip
    # the capital-word entry
    lower_case_word + (+1, +2),
    # capital word: on failure try the next entry
    capital_word + (+1,),
    # run of whitespace and newlines
    (None, AllIn, white + newline, +1),
    # uninteresting: run of characters that are neither letters nor
    # whitespace
    (None, AllNotIn, german_alpha + white + newline, +1),
    # EOF: as long as the end of the text is not reached, go back four
    # entries to the top of the table
    (None, EOF, Here, -4),
)
if __name__ == '__main__':

    # Command line driver: tag the words of the file named by the first
    # argument, list the lower-case and capital words that were found,
    # and report how long the scan took.

    import sys

    def _emit(*fields):
        """ Write the fields to stdout separated by single blanks and
            followed by a newline.

            This is the layout the Python 2 print statement produces
            for the values printed below, written with syntax that
            does not depend on print being a statement.
        """
        sys.stdout.write(' '.join([str(field) for field in fields]) + '\n')

    # the file to scan is the only argument; without it, exit with a
    # usage message instead of an IndexError traceback
    if len(sys.argv) < 2:
        sys.exit('usage: %s <textfile>' % sys.argv[0])

    # read in the file; close the handle again even if reading fails
    f = open(sys.argv[1])
    try:
        text = f.read()
    finally:
        f.close()

    timer = TextTools._timer()
    timer.start()
    # the returned taglist is not used: the words are read from the
    # lcwords / cwords lists below
    result, taglist, nextindex = tag(text, tag_words, 0, len(text))
    # the value returned by stop() is indexed below; its first item is
    # reported as the elapsed seconds
    t = timer.stop()

    _emit(result, nextindex)

    _emit('lower case words:')
    for n, l, r, d in lcwords:
        _emit(' ', text[l:r])
    _emit()

    _emit('capital letter words:')
    for n, l, r, d in cwords:
        _emit(' ', text[l:r])
    _emit()

    _emit('found', len(lcwords) + len(cwords), 'words in', t[0],
          'sec (scanned', len(text), 'bytes)')