File: 08-wiktionary.py

package info (click to toggle)
python-pattern 2.6%2Bgit20150109-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 78,672 kB
  • sloc: python: 53,865; xml: 11,965; ansic: 2,318; makefile: 94
file content (88 lines) | stat: -rw-r--r-- 2,850 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wiktionary, DOM
from pattern.db import csv, pd

# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
# The classifier is small (80KB) and fast.

w = Wiktionary(language="en")
f = csv() # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save(pd("given-names.csv"))
        print ch, gender

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv

class GenderByName(SVM):

    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)

    def classify(self, name):
        return SVM.classify(self, self.vector(name))

    def vector(self, name): 
        """ Returns a dictionary with character bigrams and suffix.
            For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
        """
        v = chngrams(name, n=2)
        v = count(v)
        v[name[-2:]+"$"] = 1
        v[len(name)] = 1
        return v

data = csv(pd("given-names.csv"))

# Test average (accuracy, precision, recall, F-score, standard deviation).

print kfoldcv(GenderByName, data, folds=3) # (0.81, 0.79, 0.77, 0.78, 0.00)

# Train and save the classifier in the current folder.
# With final=True, discards the original training data (= smaller file).

g = GenderByName(train=data)
g.save(pd("gender-by-name.svm"), final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.

g = GenderByName.load(pd("gender-by-name.svm"))

for name in (
  "Felix",
  "Felicia",
  "Rover",
  "Kitty",
  "Legolas",
  "Arwen",
  "Jabba",
  "Leia",
  "Flash",
  "Barbarella"):
    print name, g.classify(name)

# In the example above, Arwen and Jabba are misclassified.
# We can of course improve the classifier by hand:

#g.train("Arwen", gender="f")
#g.train("Jabba", gender="m")
#g.save(pd("gender-by-name.svm"), final=True)
#print g.classify("Arwen")
#print g.classify("Jabba")