File: 04-taxonomy.py

package info (click to toggle)
python-pattern 2.6%2Bgit20150109-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 78,672 kB
  • sloc: python: 53,865; xml: 11,965; ansic: 2,318; makefile: 94
file content (74 lines) | stat: -rw-r--r-- 2,576 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.search import search, taxonomy, Classifier
from pattern.en     import parsetree

# The search module includes a Taxonomy class 
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# This would make search patterns somewhat unwieldy:
# search("rose|lily|daisy|daffodil|begonia", txt).

# A better approach is to use the taxonomy:
# every flower name is registered once under the shared type "flower".
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

# Single-argument print(...) produces the same output on Python 2 and 3;
# print("") stands in for a bare "print" (print() would show "()" on Python 2).
print(taxonomy.children("flower"))
print(taxonomy.parents("rose"))
print(taxonomy.classify("rose")) # Yields the most recently added parent.
print("")
    
# Taxonomy terms can be included in a pattern by using uppercase:
# NOTE(review): lemmata=True presumably lets the plural "daffodils"
# match the taxonomy entry "daffodil" -- confirm against pattern.en.
t = parsetree("A field of white daffodils.", lemmata=True)
m = search("FLOWER", t)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(t)
print(m)
print("")

# Another example:
# a term can have several parents ("chicken" is both food and bird),
# and a parent can itself be given a parent ("bird" is an animal).
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
# Single-argument print(...) produces the same output on Python 2 and 3.
print(taxonomy.parents("chicken"))
print(taxonomy.children("animal", recursive=True))
print(search("FOOD", "I'm eating chicken."))
print("")

# The advantage is that the taxonomy can hold an entire hierarchy.
# For example, "flower" could be classified as "organism".
# Other organisms could be defined as well (insects, trees, mammals, ...).
# The ORGANISM constraint then matches everything that is an organism.

# A taxonomy entry can also be a proper name containing spaces
# (e.g. "windows vista", case insensitive).
# It will be detected as long as it is contained in a single chunk:
taxonomy.append("windows vista", type="operating system")
taxonomy.append("ubuntu", type="operating system")

# The type "operating system" is written as OPERATING_SYSTEM in the pattern.
t = parsetree("Which do you like more, Windows Vista, or Ubuntu?")
m = search("OPERATING_SYSTEM", t)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(t)
print(m)
print(m[0].constituents())
print("")

# Taxonomy entries cannot have wildcards (*),
# but you can use a classifier to simulate this.
# Classifiers are quite slow but useful in many ways.
# For example, a classifier could be written to dynamically 
# retrieve word categories from WordNet.

def find_parents(word):
    """ Returns the list of taxonomy parents for the given word.
        A word that starts with "mac os", "windows" or "ubuntu"
        yields ["operating system"]; any other word yields an empty list,
        so the return value is always a list (never an implicit None).
        The prefix test is case-sensitive.
        NOTE(review): assumes the taxonomy passes lowercased terms -- confirm.
    """
    if word.startswith(("mac os", "windows", "ubuntu")):
        return ["operating system"]
    return []
# Wrap find_parents in a Classifier and add it to the taxonomy's classifiers.
c = Classifier(parents=find_parents)
taxonomy.classifiers.append(c)

t = parsetree("I like Mac OS X 10.5 better than Windows XP or Ubuntu.")
m = search("OPERATING_SYSTEM", t)
# Single-argument print(...) produces the same output on Python 2 and 3;
# print("") stands in for a bare "print" (print() would show "()" on Python 2).
print(t)
print(m)
print(m[0].constituents())
print(m[1].constituents())
print("")