File: 07-slp.py

import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from codecs         import open
from collections    import defaultdict
from pattern.text   import Model
from pattern.vector import shuffled, SLP
from pattern.en     import lexicon, parsetree
from random         import seed

# This example demonstrates how a Perceptron classifier 
# can be used to construct an English language model 
# (i.e., a classifier that predicts part-of-speech tags),
# by learning from a training set of tagged sentences.

# First we need training data: a corpus of manually annotated (= tagged) sentences.
# Typically, Penn Treebank is used, which contains texts from the Wall Street Journal (WSJ).
# In this example we will use the freely available Open American National Corpus (OANC).

print "load training data..."

def corpus(path, encoding="utf-8"):
    """ Yields sentences of (word, tag)-tuples from the given corpus,
        which is a .txt file with a sentence on each line, 
        with slash-encoded tokens (e.g., the/DT cat/NN).
    """
    for s in open(path, encoding=encoding):
        s = [w.split("/") for w in s.strip().split(" ")]
        s = [(w[0].replace("&slash;", "/"), w[1]) for w in s]
        yield s
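
# For example, a (made-up) line "the/DT cat/NN sat/VBD ./." yields:
#
#   >>> [tuple(w.split("/")) for w in "the/DT cat/NN sat/VBD ./.".split(" ")]
#   [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('.', '.')]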

# The corpus is included in the Pattern download zip, in pattern/test/corpora:
path = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "tagged-en-oanc.txt")
data = list(corpus(path))

# A parser is typically based on a lexicon of known words (a.k.a. a tag dictionary)
# that maps frequent words to their most frequent part-of-speech tag.
# This approach is fast. However, some words can have more than one tag,
# depending on their context in the sentence (e.g., "a can" vs. "can I").

# When we train a language model (i.e., a classifier),
# we want to make sure that it captures this ambiguity,
# so we ignore ambiguous entries in the lexicon
# and let the classifier handle them instead.

# For example, the lexicon in pattern.en will always tag "about" as IN (preposition),
# even though it can also be used as RB (adverb) in about 25% of the cases.

# We will add "about" to the set of words in the lexicon to ignore
# when using a language model. 
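
# A quick check of the lexicon's fixed choice (a sketch, assuming the
# lexicon's dict-like lookup; the exact tag may differ per version):
#
#   >>> lexicon["about"]
#   'IN'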

print "load training lexicon..."

f = defaultdict(lambda: defaultdict(int)) # {word1: {tag1: count, tag2: count, ...}}
for s in data:
    for w, tag in s:
        f[w][tag] += 1

known, unknown = set(), set()
for w, tags in f.items():
    n = sum(tags.values()) # total count
    m = max(tags, key=tags.__getitem__) # most frequent tag
    if float(tags[m]) / n >= 0.97 and n > 1:
        # Words that are always handled by the lexicon.
        known.add(w)
    if float(tags[m]) / n <  0.92 and w in lexicon:
        # Words in the lexicon that should be ignored and handled by the model.
        unknown.add(w)
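
# To illustrate the thresholds with made-up counts, say f["about"] were
# {"IN": 75, "RB": 25}: the top tag covers 0.75 of all occurrences,
# so "about" is not "known" (0.75 < 0.97) and, since it is in the lexicon,
# it is "unknown" (0.75 < 0.92), i.e., left to the model:
#
#   >>> tags = {"IN": 75, "RB": 25}
#   >>> max(tags.values()) / float(sum(tags.values()))
#   0.75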

# A language model is a classifier (e.g., NB, KNN, SVM, SLP)
# trained on words and their context (= the words to their left & right in the sentence)
# that predicts the part-of-speech tag of unknown words.

# Take a look at the Model class in pattern/text/__init__.py.
# You'll see an internal Model._v() method
# that creates a training vector from a given word and its context,
# using information such as the word's suffix, its first letter
# (capitalization hints at proper nouns), the part-of-speech tags
# of the preceding words, the surrounding words, etc.
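
# The sketch below only illustrates the *kind* of feature dict that
# Model._v() builds; the function name, feature names and feature set
# are made up here and differ from the real implementation:

def context_features(w, prev=None, next=None):
    """ Illustrative context features for a word;
        prev and next are the neighboring (word, tag) tuples, or None.
    """
    return {
        "suffix:" + w[-3:]: 1,                       # word suffix (e.g., -ing, -ly)
        "upper:%s" % w[:1].isupper(): 1,             # capitalization (proper nouns)
        "prev_tag:" + (prev[1] if prev else ""): 1,  # tag of the preceding word
        "next_word:" + (next[0] if next else ""): 1  # the next word itself
    }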

# A perceptron (SLP, single-layer averaged perceptron) works well for language models.
# The perceptron is an error-driven classifier:
# when given a training example (e.g., a tagged word + surrounding words),
# it checks whether it can predict the example correctly.
# If not, it adjusts its weights.
# The accuracy of the perceptron therefore improves significantly
# when it is trained in multiple iterations, averaging all weights over time.
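
# A minimal sketch of one such update (illustrative, not pattern's actual
# implementation; weights is assumed to be a {(feature, tag): int} dict,
# and a real *averaged* perceptron also keeps a running total of each
# weight over all updates and predicts with the average):

def train_example(weights, features, gold, tags):
    """ features: a list of feature names, gold: the correct tag,
        tags: the list of candidate tags.
    """
    guess = max(tags, key=lambda tag: sum(weights.get((f, tag), 0) for f in features))
    if guess != gold:
        # Mispredicted: reward the correct tag, penalize the wrong guess.
        for f in features:
            weights[(f, gold)]  = weights.get((f, gold), 0) + 1
            weights[(f, guess)] = weights.get((f, guess), 0) - 1
    return guess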

# This will take several minutes.
# If you want it to run faster for experimentation,
# use fewer iterations or less data in the code below:

print "training model..."

seed(0) # Lock random list shuffling so we can compare.

m = Model(known=known, unknown=unknown, classifier=SLP())
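# Each training example is a word + its gold tag, with the (word, tag)
# of the previous and next word as context; we make 5 passes over
# (at most) 20,000 shuffled sentences.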
for iteration in range(5):
    for s in shuffled(data[:20000]):
        prev = None
        next = None
        for i, (w, tag) in enumerate(s):
            if i < len(s) - 1:
                next = s[i+1]
            m.train(w, tag, prev, next)
            prev = (w, tag)
            next = None

f = os.path.join(os.path.dirname(__file__), "en-model.slp")
m.save(f, final=True)
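# final=True appears to finalize (i.e., average) the perceptron weights:
# the saved model is ready for tagging, but can no longer be trained.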

# Each parser in Pattern (pattern.en, pattern.es, pattern.it, ...)
# assumes that a lexicon of known words and their most frequent tag is available,
# along with some rules for morphology (suffixes, e.g., -ly = adverb)
# and context (surrounding words) for unknown words.

# If a language model is also available, it overrides these (simpler) rules.
# For English, this can raise accuracy from about 94% up to about 97%,
# and makes the parser about 3x faster.

print "loading model..."

f = os.path.join(os.path.dirname(__file__), "en-model.slp")
lexicon.model = Model.load(lexicon, f)

# To test the accuracy of the language model,
# we can compare a tagged corpus to the predicted tags.
# This corpus must be different from the one used for training.
# Typically, sections 22, 23 and 24 of the WSJ are used.

# Note that the WSJ contains standardized English.
# The accuracy will be lower when tested on, for example, informal tweets.
# A different classifier could be trained for informal language use.

print "testing..."

i, n = 0, 0
for s1 in data[-5000:]:
    s2 = " ".join(w for w, tag in s1)
    s2 = parsetree(s2, tokenize=False)
    s2 = ((w.string, w.tag or "") for w in s2[0])
    for (w1, tag1), (w2, tag2) in zip(s1, s2):
        if tag1 == tag2.split("-")[0]: # NNP-PERS => NNP
            i += 1
        n += 1

print(float(i) / n) # accuracy
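
# To see which tags the model confuses most often, the same loop can also
# tally (gold, predicted) pairs (a sketch, reusing defaultdict from above):
#
#   errors = defaultdict(int)
#   for s1 in data[-5000:]:
#       s2 = parsetree(" ".join(w for w, tag in s1), tokenize=False)
#       for (w1, tag1), (w2, tag2) in zip(s1, ((w.string, w.tag or "") for w in s2[0])):
#           if tag1 != tag2.split("-")[0]:
#               errors[(tag1, tag2.split("-")[0])] += 1
#   print(sorted(errors.items(), key=lambda kv: -kv[1])[:10])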