# File: wntools.py
#
# Package info:
#   python-pattern 2.6+git20150109-3
#   links: PTS, VCS
#   area: main
#   in suites: buster
#   size: 78,672 kB
#   sloc: python: 53,865; xml: 11,965; ansic: 2,318; makefile: 94
# File content: 343 lines, 12,214 bytes (mode -rw-r--r--)
# Module wntools.py
#
# Original author: Oliver Steele <steele@osteele.com>
# Project Page: http://sourceforge.net/projects/pywordnet
#
# Copyright (c) 1998-2004 by Oliver Steele.  Use is permitted under
# the Artistic License
# <http://www.opensource.org/licenses/artistic-license.html>

"""Utility functions to use with the wordnet module.

Usage
-----
    >>> dog = N['dog'][0]

    # (First 10) adjectives that are transitively SIMILAR to the main sense of 'red'
    >>> closure(ADJ['red'][0], SIMILAR)[:10]
    ['red' in {adjective: red, reddish, ruddy, blood-red, carmine, cerise, cherry, cherry-red, crimson, ruby, ruby-red, scarlet}, {adjective: chromatic}, {adjective: amber, brownish-yellow, yellow-brown}, {adjective: amethyst}, {adjective: aureate, gilded, gilt, gold, golden}, {adjective: azure, cerulean, sky-blue, bright blue}, {adjective: blue, bluish, blueish, light-blue, dark-blue, blue-black}, {adjective: bluish green, blue-green, cyan, teal}, {adjective: blushful, rosy}, {adjective: bottle-green}]

    >>> # Adjectives that are transitively SIMILAR to any of the senses of 'red'
    >>> #flatten1(map(lambda sense:closure(sense, SIMILAR), ADJ['red']))    # too verbose

    >>> # Hyponyms of the main sense of 'dog'(n.) that are homophonous with verbs
    >>> filter(lambda sense:V.get(sense.form), flatten1(map(lambda e:e.getSenses(), hyponyms(N['dog'][0]))))
    ['dog' in {noun: dog, domestic dog, Canis familiaris}, 'pooch' in {noun: pooch, doggie, doggy, barker, bow-wow}, 'toy' in {noun: toy dog, toy}, 'hound' in {noun: hound, hound dog}, 'basset' in {noun: basset, basset hound}, 'cocker' in {noun: cocker spaniel, English cocker spaniel, cocker}, 'bulldog' in {noun: bulldog, English bulldog}]

    >>> # Find the senses of 'raise'(v.) and 'lower'(v.) that are antonyms
    >>> filter(lambda p:p[0] in p[1].pointerTargets(ANTONYM), product(V['raise'].getSenses(), V['lower'].getSenses()))
    [('raise' in {verb: raise, lift, elevate, get up, bring up}, 'lower' in {verb: lower, take down, let down, get down, bring down})]
"""

# Module metadata: original author contact and the module's version string.
__author__  = "Oliver Steele <steele@osteele.com>"
__version__ = "2.0"

from wordnet import *

#
# Domain utilities
#

def _requireSource(entity):
    if not hasattr(entity, 'pointers'):
        if isinstance(entity, Word):
            raise TypeError(`entity` + " is not a Sense or Synset.  Try " + `entity` + "[0] instead.")
        else:
            raise TypeError(`entity` + " is not a Sense or Synset")

def tree(source, pointerType):
    """Return the tree of entities reachable from _source_ by following
    _pointerType_ pointers, as nested lists.

    For a Sense or Synset the result is ``[source, subtree, subtree, ...]``
    with one subtree per pointer target.  For a Word the result is a list
    holding one such tree per sense.

    Raises TypeError (via _requireSource) if _source_ is not a Word and has
    no ``pointers`` attribute.

    NOTE: there is no guard against revisiting a node, so a pointer type
    whose targets lead back to an earlier node will recurse without bound.

    >>> dog = N['dog'][0]
    >>> from pprint import pprint
    >>> pprint(tree(dog, HYPERNYM))
    ['dog' in {noun: dog, domestic dog, Canis familiaris},
     [{noun: canine, canid},
      [{noun: carnivore},
       [{noun: placental, placental mammal, eutherian, eutherian mammal},
        [{noun: mammal},
         [{noun: vertebrate, craniate},
          [{noun: chordate},
           [{noun: animal, animate being, beast, brute, creature, fauna},
            [{noun: organism, being},
             [{noun: living thing, animate thing},
              [{noun: object, physical object}, [{noun: entity}]]]]]]]]]]]]
    >>> #pprint(tree(dog, HYPONYM)) # too verbose to include here
    """
    if isinstance(source, Word):
        return [tree(sense, pointerType) for sense in source.getSenses()]
    _requireSource(source)
    # Comprehensions always yield real lists, so the concatenation below does
    # not depend on map() returning a list.
    return [source] + [tree(target, pointerType)
                       for target in source.pointerTargets(pointerType)]

def closure(source, pointerType, accumulator=None):
    """Return the transitive closure of source under the pointerType
    relationship.  If source is a Word, return the union of the
    closures of its senses.

    _accumulator_ is the list that collects visited entities; the recursion
    passes it along, and callers normally leave it as None so that a fresh
    list is created.  It is not used when _source_ is a Word.

    The result is a flat list that starts with _source_ and contains every
    visited entity once.  Raises TypeError (via _requireSource) if _source_
    is not a Word and has no ``pointers`` attribute.

    >>> dog = N['dog'][0]
    >>> closure(dog, HYPERNYM)
    ['dog' in {noun: dog, domestic dog, Canis familiaris}, {noun: canine, canid}, {noun: carnivore}, {noun: placental, placental mammal, eutherian, eutherian mammal}, {noun: mammal}, {noun: vertebrate, craniate}, {noun: chordate}, {noun: animal, animate being, beast, brute, creature, fauna}, {noun: organism, being}, {noun: living thing, animate thing}, {noun: object, physical object}, {noun: entity}]
    """
    if isinstance(source, Word):
        # Union the *closures* of the senses (previously tree() was called
        # here, which produced nested trees instead of a flat closure).
        # Starting from [] also makes a Word without senses yield [] rather
        # than failing inside reduce().
        merged = []
        for sense in source.getSenses():
            merged = union(merged, closure(sense, pointerType))
        return merged
    _requireSource(source)
    if accumulator is None:
        accumulator = []
    # The membership test doubles as the cycle guard: an entity already in
    # the accumulator is never expanded again.
    if source not in accumulator:
        accumulator.append(source)
        for target in source.pointerTargets(pointerType):
            closure(target, pointerType, accumulator)
    return accumulator

def hyponyms(source):
    """Return _source_ followed by everything reachable from it through
    HYPONYM pointers.

    This is closure() specialised to the HYPONYM pointer type; a Word
    argument is treated however closure() treats Words."""
    reachable = closure(source, pointerType=HYPONYM)
    return reachable

def hypernyms(source):
    """Return _source_ followed by everything reachable from it through
    HYPERNYM pointers.

    This is closure() specialised to the HYPERNYM pointer type; a Word
    argument is treated however closure() treats Words."""
    reachable = closure(source, pointerType=HYPERNYM)
    return reachable

def meet(a, b, pointerType=HYPERNYM):
    """Return the first entity in the closure of _a_ that also occurs in the
    closure of _b_ (both taken under _pointerType_), or None when the two
    closures share nothing.

    >>> meet(N['dog'][0], N['cat'][0])
    {noun: carnivore}
    >>> meet(N['dog'][0], N['person'][0])
    {noun: organism, being}
    >>> meet(N['thought'][0], N['belief'][0])
    {noun: content, cognitive content, mental object}
    """
    reachable_from_a = closure(a, pointerType)
    reachable_from_b = closure(b, pointerType)
    shared = intersection(reachable_from_a, reachable_from_b)
    if shared:
        # intersection() keeps the order of its first argument, so this is
        # the shared entity that appears earliest in a's closure.
        return shared[0]
    return None


#
# String Utility Functions
#
def startsWith(str, prefix):
    """Tell whether the leading ``len(prefix)`` items of _str_ equal _prefix_.

    >>> startsWith('unclear', 'un')
    True
    """
    prefix_length = len(prefix)
    leading = str[:prefix_length]
    return leading == prefix

def endsWith(str, suffix):
    """Return true iff _str_ ends with _suffix_.

    An empty _suffix_ matches any _str_ of the same sequence type, and a
    _suffix_ longer than _str_ never matches.

    >>> endsWith('clearly', 'ly')
    True
    >>> endsWith('clearly', '')
    True
    """
    # Slice from an explicit start index: str[-len(suffix):] would become
    # str[-0:] (the whole string) for an empty suffix and report a mismatch.
    start = max(len(str) - len(suffix), 0)
    return str[start:] == suffix

def equalsIgnoreCase(a, b):
    """Return true iff a and b have the same lowercase representation.

    >>> equalsIgnoreCase('dog', 'Dog')
    True
    >>> equalsIgnoreCase('dOg', 'DOG')
    True
    """
    # Test a == b first as a shortcut for the already-equal case.  The str
    # method replaces string.lower(), so this no longer depends on a `string`
    # module name being in scope.
    return a == b or a.lower() == b.lower()


#
# Sequence Utility Functions
#
def issequence(item):
    """Return true iff _item_ is a Sequence (a List, String, or Tuple).

    Instances of subclasses of list, str and tuple count as sequences too.

    >>> issequence((1,2))
    True
    >>> issequence([1,2])
    True
    >>> issequence('12')
    True
    >>> issequence(1)
    False
    """
    # The builtin types are used directly, so no ListType/StringType/TupleType
    # aliases need to be in scope.
    return isinstance(item, (list, str, tuple))

def intersection(u, v):
    """Return, as a list, the items of _u_ that are also contained in _v_.

    The items keep the order (and any repetition) they have in _u_.

    >>> intersection((1,2,3), (2,3,4))
    [2, 3]
    """
    return [item for item in u if item in v]

def union(u, v):
    """Return the union of _u_ and _v_.

    The result is a new list: the items of _u_ in their original order,
    followed by each item of _v_ that is not already present.  Neither
    argument is modified.

    >>> union((1,2,3), (2,3,4))
    [1, 2, 3, 4]
    """
    # list() always builds a new list, so no extra copy is needed to avoid
    # mutating u.
    w = list(u)
    for e in v:
        if e not in w:
            w.append(e)
    return w

def product(u, v):
    """Return the Cartesian product of u and v.

    The result is a list of ``(a, b)`` tuples, with every item _a_ of _u_
    paired with every item _b_ of _v_; pairs are ordered by _a_ first.

    >>> product("123", "abc")
    [('1', 'a'), ('1', 'b'), ('1', 'c'), ('2', 'a'), ('2', 'b'), ('2', 'c'), ('3', 'a'), ('3', 'b'), ('3', 'c')]
    """
    # A direct comprehension builds the pairs without relying on map()
    # returning lists or on lambda default arguments to capture variables.
    return [(a, b) for a in u for b in v]

def removeDuplicates(sequence):
    """Return a list of the items of _sequence_ in which every item that
    equals an earlier one has been dropped; first occurrences keep their
    order.

    >>> removeDuplicates("this is a test")
    ['t', 'h', 'i', 's', ' ', 'a', 'e']
    >>> removeDuplicates(map(lambda tuple:apply(meet, tuple), product(N['story'].getSenses(), N['joke'].getSenses())))
    [{noun: message, content, subject matter, substance}, None, {noun: abstraction}, {noun: communication}]
    """
    unique = []
    for candidate in sequence:
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique


#
# Tree Utility Functions
#

def flatten1(sequence):
    """Return a list with one level of nesting removed from _sequence_.

    Items that are exactly a list or a tuple are replaced by their elements;
    every other item is kept as is.  Nested lists or tuples inside those
    items are not flattened further.

    >>> flatten1([[1, 2], (3, 4), 5])
    [1, 2, 3, 4, 5]
    """
    accumulator = []
    for item in sequence:
        # Exact type match (not isinstance) on purpose: instances of
        # list/tuple subclasses are kept as single items, as before.
        if type(item) in (list, tuple):
            accumulator.extend(item)
        else:
            accumulator.append(item)
    return accumulator


#
# WordNet utilities
#

# (old, new) replacement pairs that getIndex() applies to a form, in this
# order, while searching for a spelling variant.
GET_INDEX_SUBSTITUTIONS = ((' ', '-'), ('-', ' '), ('-', ''), (' ', ''), ('.', ''))

def getIndex(form, pos='noun'):
    """Search for _form_ in the dictionary corresponding to _pos_.

    If _form_ itself is not an entry, the (old, new) replacements listed in
    GET_INDEX_SUBSTITUTIONS (spaces to hyphens, hyphens to spaces, hyphens
    removed, spaces removed, periods removed) are applied, alone and in
    combination, until a variant is found that is an entry.

    Returns the dictionary entry for the first matching variant, or None if
    no variant matches.
    """
    # Use the same private accessor that morphy() imports; being
    # underscore-prefixed, it is not provided by `from wordnet import *`.
    from wordnet import _dictionaryFor
    dictionary = _dictionaryFor(pos)

    def trySubstitutions(form, substitutions, lookup=1):
        # has_key() is kept on purpose: the dictionary is a wordnet object
        # and this file shows no evidence that it supports the `in` operator.
        if lookup and dictionary.has_key(form):
            return dictionary[form]
        if not substitutions:
            return None
        old, new = substitutions[0]
        remaining = substitutions[1:]
        substitute = form.replace(old, new)
        if not substitute or substitute == form:
            # The replacement changed nothing (or left nothing): there is no
            # separate variant to follow.
            substitute = None
        elif dictionary.has_key(substitute):
            return dictionary[substitute]
        # Both form and substitute have been looked up by now, so the
        # recursive calls only need to try the remaining replacements.
        return (trySubstitutions(form, remaining, lookup=0) or
                (substitute and trySubstitutions(substitute, remaining, lookup=0)) or
                None)

    return trySubstitutions(form, GET_INDEX_SUBSTITUTIONS)


# For each part of speech, the (suffix, replacement) pairs that morphy()
# tries in order: when a form ends with the suffix, the suffix is cut off
# and the replacement appended to produce a candidate base form.
MORPHOLOGICAL_SUBSTITUTIONS = {
    NOUN:
    [('s', ''),
     ('ses', 's'),
     ('ves', 'f'),
     ('xes', 'x'),
     ('zes', 'z'),
     ('ches', 'ch'),
     ('shes', 'sh'),
     ('men', 'man'),
     ('ies', 'y')],
    VERB:
    [('s', ''),
     ('ies', 'y'),
     ('es', 'e'),
     ('es', ''),
     ('ed', 'e'),
     ('ed', ''),
     ('ing', 'e'),
     ('ing', '')],
    ADJECTIVE:
    [('er', ''),
     ('est', ''),
     ('er', 'e'),
     ('est', 'e')],
    # No suffix rules for adverbs.
    ADVERB: []}

def morphy(form, pos='noun', collect=0):
    """Recursively uninflect _form_, and return the first form found
    in the dictionary.  If _collect_ is true, a sequence of all forms
    is returned, instead of just the first one.

    _pos_ selects both the dictionary and the exception file
    (``noun.exc``, ``verb.exc``, ``adj.exc`` or ``adv.exc`` under
    WNSEARCHDIR) and the suffix rules in MORPHOLOGICAL_SUBSTITUTIONS.
    The exception file is opened for the duration of the call and closed
    before returning, whether or not the search raises.

    >>> morphy('dogs')
    'dog'
    >>> morphy('churches')
    'church'
    >>> morphy('aardwolves')
    'aardwolf'
    >>> morphy('abaci')
    'abacus'
    >>> morphy('hardrock', 'adv')
    """
    import os
    from wordnet import _normalizePOS, _dictionaryFor
    pos = _normalizePOS(pos)
    fname = os.path.join(WNSEARCHDIR, {NOUN: 'noun', VERB: 'verb', ADJECTIVE: 'adj', ADVERB: 'adv'}[pos] + '.exc')
    substitutions = MORPHOLOGICAL_SUBSTITUTIONS[pos]
    excfile = open(fname)
    try:
        # The def statement runs on every morphy() call, so the mutable
        # default `collection=[]` is a fresh list per call; the recursive
        # calls below share it because they never pass `collection`.
        def trySubstitutions(trySubstitutions,   # workaround for lack of nested closures in Python < 2.1
                             form,               # reduced form
                             substitutions,      # remaining substitutions
                             lookup=1,
                             dictionary=_dictionaryFor(pos),
                             excfile=excfile,
                             collect=collect,
                             collection=[]):
            exceptions = binarySearchFile(excfile, form)
            if exceptions:
                # Keep what follows the first space and drop the final
                # character (presumably the line's trailing newline --
                # confirm against binarySearchFile).
                form = exceptions[exceptions.find(' ')+1:-1]
            if lookup and dictionary.has_key(form):
                if collect:
                    # NOTE(review): this path appends and then falls off the
                    # end of the function, returning None to its caller --
                    # confirm that this is intended for collect mode.
                    collection.append(form)
                else:
                    return form
            elif substitutions:
                old, new = substitutions[0]
                substitutions = substitutions[1:]
                substitute = None
                if endsWith(form, old):
                    substitute = form[:-len(old)] + new
                # First try the remaining rules on the unchanged form, then
                # on the form with this rule applied.
                form =              trySubstitutions(trySubstitutions, form, substitutions) or \
                    (substitute and trySubstitutions(trySubstitutions, substitute, substitutions))
                return (collect and collection) or form
            elif collect:
                return collection
        return trySubstitutions(trySubstitutions, form, substitutions)
    finally:
        # Previously the exception file was left open after every call.
        excfile.close()

#
# Testing
#
def _test(reset=0):
    """Run the doctests of the wntools module and return the result of
    doctest.testmod().

    Pass a true _reset_ to clear ``doctest.master`` first.
    """
    import doctest
    import wntools
    if reset:
        # This keeps doctest from complaining after a reload.
        doctest.master = None
    outcome = doctest.testmod(wntools)
    return outcome