#!/usr/bin/env python2.7
# ----------------------------------
"""
Usage: dectreelearn.py [options] <trainfile> [testfile]
Generate a decision tree for classifying the examples in the training
file and, depending on the options, print it, evaluate it, or use it to
classify further examples.
Options:
-x<n>
Perform n-fold stratified cross-validation. If n is omitted, 10-fold
cross-validation is performed. Folds are stratified by class as far as
possible, but proportions of very rare classes may still be way off
(if there are only 5 examples with class X, but you request 10-fold
cross-validation, 5 folds will have _no_ example for that class).
-X<n>
Same, but perform non-stratified cross-validation. THIS IS A
PARTICULARLY BAD IDEA IF THE INPUT SET IS SORTED BY CLASS! I know of
no reason why you should ever prefer -X to -x, but I was reminded of
the value of stratification the hard way.
-S<n>
Provide a seed for the pseudo-random number generator used in
splitting the set for cross-validation. If none is given, current
system time is used. Use this option to eliminate one random factor
when comparing different parameters.
-g<limit>
Set the relative information gain limit for refining nodes. Smaller
values make for bigger trees. The default is 0.5, but this needs more
experimentation (and maybe a knot in my thinking still has to unknot).
-a
Use absolute information gain instead of relative information gain to
select features. If you use -g0.0 -a -s2, this should behave (more or
less) like a standard ID3 algorithm.
-s<n>
Set the maximal number of subclasses for each numerical feature to
n. This only applies at a given node in the tree; the same feature
can later be picked again and refined.
-v
Verbose output. The program prints a lot of progress and diagnostic
information while it runs.
-c
Classify data from a second input file. If classes are given, the
program also reports its performance; otherwise it just prints the
predicted class for each instance.
-e
Evaluate the generated tree on the training data and print a report.
-n
Do NOT print the generated tree.
-h
Print this information and exit.
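
Example invocations (file names here are placeholders for your own
example files):

  dectreelearn.py -x10 -S12345 train.examples
      10-fold stratified cross-validation with a fixed PRNG seed, so
      repeated runs split the data identically.
  dectreelearn.py -e -n train.examples
      Build a tree from the full set, report its accuracy on the
      training data, and suppress printing of the tree itself.
  dectreelearn.py -c train.examples new.examples
      Build a tree from the first file and classify the instances in
      the second.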
Copyright 2003 Stephan Schulz, schulz@informatik.tu-muenchen.de
This code is part of the support structure for the equational
theorem prover E. Visit
http://www4.informatik.tu-muenchen.de/~schulz/WORK/eprover.html
for more information.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA
The original copyright holder can be contacted as
Stephan Schulz (I4)
Technische Universitaet Muenchen
Institut fuer Informatik
Boltzmannstrasse 3
Garching bei Muenchen
Germany
or via email (address above).
"""
import sys
import string
import random
import pylib_io
import pylib_basics
import pylib_ml_examples
import pylib_probabilities
import pylib_dectrees
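# Default parameter settings; most can be overridden by the command-line
# options parsed below (see the module docstring for details).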
relgain_limit = 0.5
entropy_compare_fun = cmp
max_split = 10
crossval = 0
eval_tree = False
classify = False
printtree = True
seed = None
stratified = True
feature_selection = "R"
dectree_constructor = pylib_dectrees.global_decision_tree
options = pylib_io.get_options()
args = pylib_io.get_args()
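# Process the recognized options. Numerical options carry their value
# directly after the flag (e.g. -x10 or -g0.25); for -x/-X a missing
# value falls back to 10 folds.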
for o in options:
    if o[0:2] == "-g":
        relgain_limit = float(o[2:])
    if o == "-a":
        entropy_compare_fun = pylib_basics.rl_lex_compare
        feature_selection = "A"
    if o[0:2] == "-s":
        max_split = int(float(o[2:]))
    if o[0:2] == "-x":
        try:
            crossval = int(float(o[2:]))
        except ValueError:
            crossval = 10
    if o[0:2] == "-X":
        try:
            crossval = int(float(o[2:]))
        except ValueError:
            crossval = 10
        stratified = False
    if o=="-v":
        pylib_basics.globals.verbose = True
    if o=="-c":
        classify = True
        pylib_io.check_argc(2,args)
    if o=="-e":
        eval_tree = True
    if o=="-n":
        printtree = False
    if o=="-h":
        print __doc__
        sys.exit()
    if o[0:2] == "-S":
        seed = long(o[2:])
pylib_io.check_argc(1,None,args)
set = pylib_ml_examples.ml_exampleset()
set.parse(args[0])
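# Illustrative sketch only, not used below: stratified splitting (what
# set.crossval_sets(n, True) presumably does internally) distributes the
# examples of each class round-robin over the folds, so every fold keeps
# roughly the class proportions of the full set. With only 5 examples of
# some class and n=10, at least 5 folds necessarily get none of them,
# which is the caveat mentioned for -x in the docstring.
def illustrate_stratified_folds(instances, n, class_of):
    """Return n lists of instances with approximately equal class mix."""
    folds = [[] for dummy in xrange(n)]
    by_class = {}
    for inst in instances:
        by_class.setdefault(class_of(inst), []).append(inst)
    count = 0
    for members in by_class.values():
        for inst in members:
            folds[count % n].append(inst)
            count = count + 1
    return folds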
if crossval:
    random.seed(seed)
    jobs = set.crossval_sets(crossval, stratified)
    tr_results = []
    te_results = []
    fold = 0
    for i in jobs:
        # i[0] is the training part of the fold, i[1] the held-out test part.
        fold += 1
        tree = dectree_constructor(i[0],
                                   entropy_compare_fun,
                                   relgain_limit,
                                   max_split)
        (tsize, tdepth, tleaves) = tree.characteristics()
        (apriori, remainder, absinfgain, relinfgain) = tree.entropies()
        # Performance on the training part of this fold.
        (succ, count) = tree.classify_set(i[0],pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Fold %-2d RIG: %5.3f (%2d,%4d,%4d) Train: %4d out of %4d, %7.3f%% " %\
              (fold, relinfgain,tdepth, tsize, tleaves, succ, count, succ_percent),
        tr_results.append((succ,count,succ_percent,relinfgain, tdepth,
                           tsize, tleaves))
        # Performance on the held-out part of this fold.
        (succ, count) = tree.classify_set(i[1],pylib_basics.verbose())
        succ_percent = float(succ)/count*100
        print "Test: %4d out of %4d, %7.3f%%" % (succ, count, succ_percent)
        te_results.append((succ, count,succ_percent))
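    # Aggregate the per-fold measurements and report mean +/- standard
    # deviation over all folds in a single summary line.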
    tr_percent = (map(lambda x:x[2],tr_results))
    te_percent = (map(lambda x:x[2],te_results))
    rig = (map(lambda x:x[3],tr_results))
    depths = (map(lambda x:x[4],tr_results))
    size = (map(lambda x:x[5],tr_results))
    leaves = (map(lambda x:x[6],tr_results))
    print "%s Splits %2d RIGL %5.3f RIG %5.3f+/-%5.3f (%5.2f+/-%4.2f, %7.2f+/-%4.2f, %7.2f+/-%4.2f) Train: %7.3f+/-%7.3f%% Test: %7.3f+/-%7.3f%%" %\
          (feature_selection,
           max_split,
           relgain_limit,
           pylib_basics.mean(rig),
           pylib_basics.standard_deviation(rig),
           pylib_basics.mean(depths),
           pylib_basics.standard_deviation(depths),
           pylib_basics.mean(size),
           pylib_basics.standard_deviation(size),
           pylib_basics.mean(leaves),
           pylib_basics.standard_deviation(leaves),
           pylib_basics.mean(tr_percent),
           pylib_basics.standard_deviation(tr_percent),
           pylib_basics.mean(te_percent),
           pylib_basics.standard_deviation(te_percent))
else:
    # No cross-validation: build a single tree from the complete input set.
    tree = dectree_constructor(set,
                               entropy_compare_fun,
                               relgain_limit,
                               max_split)
    if printtree:
        tree.printout()
    if eval_tree:
        (succ, count) = tree.classify_set(set, pylib_basics.verbose())
        print "Successes on training set: %d out of %d, %f%%"% (succ, count, float(succ)/count*100)
    if classify:
        # Classify the instances from the second input file (-c).
        testset = pylib_ml_examples.ml_exampleset()
        testset.parse(args[1])
        (succ, count) = tree.classify_set(testset, True)
        if count!=0:
            print "Successes on test set : %d out of %d, %f%%"% (succ, count, float(succ)/count*100)