1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
|
/*************************************************************************/
/* */
/* Centre for Speech Technology Research */
/* University of Edinburgh, UK */
/* Copyright (c) 1996,1997 */
/* All Rights Reserved. */
/* */
/* Permission to use, copy, modify and distribute this software and its */
/* documentation for research, educational and individual use only, is */
/* hereby granted without fee, subject to the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* This software may not be used for commercial purposes without */
/* specific prior written permission from the authors. */
/* */
/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author : Alan W Black */
/* Date : May 1996 */
/*-----------------------------------------------------------------------*/
/* A Classification and Regression Tree (CART) Program */
/* A basic implementation of many of the techniques in */
/* Breiman et al. 1984 */
/* */
/* Added decision list support, Feb 1997 */
/* */
/*=======================================================================*/
#include <stdlib.h>
#include <iostream.h>
#include <fstream.h>
#include <string.h>
#include "EST_Wagon.h"
#include "EST_cmd_line.h"
// The two kinds of model this program can build.
enum wn_strategy_type {wn_decision_list, wn_decision_tree};
// Model kind chosen for this run; starts as a decision tree and is
// switched to a decision list when -dlist is given on the command line.
static wn_strategy_type wagon_type = wn_decision_tree;
// Forward declaration: main() calls wagon_main before its definition.
static int wagon_main(int argc, char **argv);
// Program entry point.
//
// Hands the unmodified command line to wagon_main and uses its result as
// the process exit status.  Returning from main performs the same
// termination as calling exit() with that value, so the previous
// exit(0) followed by an unreachable "return 0" is not needed, and the
// status reported by wagon_main is no longer thrown away.
int main(int argc, char **argv)
{
    return wagon_main(argc, argv);
}
static int wagon_main(int argc, char **argv)
{
// Top level function sets up data and creates a tree
EST_Option al;
EST_StrList files;
ostream *wgn_coutput = 0;
float stepwise_limit = 0;
parse_command_line(argc, argv,
EST_String("Usage:\n")+
"wagon <options>\n"+
"Wagon CART building program (defaults in {})\n"+
"-desc <ifile> Field description file\n"+
"-data <ifile> Datafile, one vector per line\n"+
"-stop <int> {50} Minimum number of examples for leaf nodes\n"+
"-test <ifile> Datafile to test tree on\n"+
"-frs <float> {10} Float range split, number of partitions to\n"+
" split a float feature range into\n"+
"-distmatrix <ifile>\n"+
" A distance matrix for clustering\n"+
"-dlist Build a decision list (rather than tree)\n"+
"-dtree Build a decision tree (rather than list) default\n"+
"-output <ofile> File to save output tree in\n"+
"-v Print version number and exit\n"+
"-quiet No questions printed during building\n"+
"-verbose Lost of information printing during build\n"+
"The following are *provisional* and may not work\n"+
"-stepwise Incrementally find best features\n"+
"-swlimit <float> {0.0}\n"+
" Percentage necessary improvement for stepwise\n"+
"-balance <float> For derived stop size, if dataset at node, divided\n"+
" by balance is greater than stop it is used as stop\n"+
" if balance is 0 (default) always use stop as is.\n"+
"-held_out <int> Percent to hold out for pruning\n"+
"-heap <int> {210000}\n"+
" Set size of Lisp heap, should not normally need\n"+
" to be changed from its default, only with *very*\n"+
" large description files (> 1M)\n"+
"-noprune No (same class) pruning required\n",
files, al);
if (al.present("-v"))
{
printf("%s: %s\n",argv[0],wagon_version);
exit(0);
}
if (al.present("-subset"))
wgn_csubset = TRUE;
if (al.present("-held_out"))
wgn_held_out = al.ival("-held_out");
if (al.present("-balance"))
wgn_balance = al.fval("-balance");
if ((!al.present("-desc")) || ((!al.present("-data"))))
{
cerr << argv[0] << ": missing description and/or datafile" << endl;
cerr << "use -h for description of arguments" << endl;
}
if (al.present("-quiet"))
wgn_quiet = TRUE;
if (al.present("-verbose"))
wgn_verbose = TRUE;
if (al.present("-test"))
{
wgn_desc_file = al.val("-desc");
wgn_test_file = al.val("-test");
}
if (al.present("-stop"))
wgn_min_cluster_size = atoi(al.val("-stop"));
if (al.present("-noprune"))
wgn_prune = FALSE;
if (al.present("-swlimit"))
stepwise_limit = al.fval("-swlimit");
if (al.present("-frs")) // number of partitions to try in floats
wgn_float_range_split = atof(al.val("-frs"));
if (al.present("-output"))
{
wgn_coutput = new ofstream(al.val("-output"));
if (!(*wgn_coutput))
{
cerr << "Wagon: can't open file \"" << al.val("-output") <<
"\" for output " << endl;
exit(-1);
}
}
else
wgn_coutput = &cout;
if (al.present("-distmatrix"))
{
if (wgn_DistMatrix.load(al.val("-distmatrix")) != 0)
{
cerr << "Wagon: failed to load Distance Matrix from \"" <<
al.val("-distmatrix") << "\"\n" << endl;
exit(-1);
}
}
if (al.present("-dlist"))
wagon_type = wn_decision_list;
WNode *tree;
float score;
siod_init(al.ival("-heap"));
// Load in the data
wgn_load_datadescription(al.val("-desc"));
wgn_load_dataset(wgn_dataset,al.val("-data"));
if (al.present("-distmatrix") &&
(wgn_DistMatrix.num_rows() < wgn_dataset.length()))
{
cerr << "wagon: distance matrix is smaller than number of training elements\n";
exit(-1);
}
if (al.present("-test"))
wgn_load_dataset(wgn_test_dataset,al.val("-test"));
// Build and test the model
if (al.present("-stepwise"))
tree = wagon_stepwise(stepwise_limit);
else if (wagon_type == wn_decision_tree)
tree = wgn_build_tree(score); // default operation
else if (wagon_type == wn_decision_list)
// dlist is printed with build_dlist rather than returned
tree = wgn_build_dlist(score,wgn_coutput);
else
{
cerr << "Wagon: unknown operation, not tree or list" << endl;
exit(-1);
}
if (tree != 0)
{
*wgn_coutput << *tree;
summary_results(*tree,wgn_coutput);
}
if (wgn_coutput != &cout)
delete wgn_coutput;
return 0;
}
|