File: usersplits.Rout.save

package info (click to toggle)
rpart 4.1.24-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,640 kB
  • sloc: ansic: 2,994; makefile: 2
file content (172 lines) | stat: -rw-r--r-- 6,306 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

R Under development (unstable) (2019-04-05 r76323) -- "Unsuffered Consequences"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # Any necessary setup
> library(rpart)
> options(na.action="na.omit")
> options(digits=4) # to match earlier output
> set.seed(1234)
> 
> mystate <- data.frame(state.x77, region=factor(state.region))
> names(mystate) <- c("population","income" , "illiteracy","life" ,
+        "murder", "hs.grad", "frost",     "area",      "region")
> #
> # Test out the "user mode" functions, with an anova variant
> #
> 
> # The 'evaluation' function.  Called once per node.
> #  Produce a label (1 or more elements long) for labeling each node,
> #  and a deviance.  The latter is
> #	- of length 1
> #       - equal to 0 if the node is "pure" in some sense (unsplittable)
> #       - does not need to be a deviance: any measure that gets larger
> #            as the node is less acceptable is fine.
> #       - the measure underlies cost-complexity pruning, however
> temp1 <- function(y, wt, parms) {
+     wmean <- sum(y*wt)/sum(wt)
+     rss <- sum(wt*(y-wmean)^2)
+     list(label= wmean, deviance=rss)
+     }
> 
> # The split function, where most of the work occurs.
> #   Called once per split variable per node.
> # If continuous=T
> #   The actual x variable is ordered
> #   y is supplied in the sort order of x, with no missings,
> #   return two vectors of length (n-1):
> #      goodness = goodness of the split, larger numbers are better.
> #                 0 = couldn't find any worthwhile split
> #        the ith value of goodness evaluates splitting obs 1:i vs (i+1):n
> #      direction= -1 = send "y< cutpoint" to the left side of the tree
> #                  1 = send "y< cutpoint" to the right
> #         this is not a big deal, but making larger "mean y's" move towards
> #         the right of the tree, as we do here, seems to make it easier to
> #         read
> # If continuos=F, x is a set of integers defining the groups for an
> #   unordered predictor.  In this case:
> #       direction = a vector of length m= "# groups".  It asserts that the
> #           best split can be found by lining the groups up in this order
> #           and going from left to right, so that only m-1 splits need to
> #           be evaluated rather than 2^(m-1)
> #       goodness = m-1 values, as before.
> #
> # The reason for returning a vector of goodness is that the C routine
> #   enforces the "minbucket" constraint. It selects the best return value
> #   that is not too close to an edge.
> temp2 <- function(y, wt, x, parms, continuous) {
+     # Center y
+     n <- length(y)
+     y <- y- sum(y*wt)/sum(wt)
+ 
+     if (continuous) {
+ 	# continuous x variable
+ 	temp <- cumsum(y*wt)[-n]
+ 
+ 	left.wt  <- cumsum(wt)[-n]
+ 	right.wt <- sum(wt) - left.wt
+ 	lmean <- temp/left.wt
+ 	rmean <- -temp/right.wt
+ 	goodness <- (left.wt*lmean^2 + right.wt*rmean^2)/sum(wt*y^2)
+ 	list(goodness= goodness, direction=sign(lmean))
+ 	}
+     else {
+ 	# Categorical X variable
+ 	ux <- sort(unique(x))
+ 	wtsum <- tapply(wt, x, sum)
+ 	ysum  <- tapply(y*wt, x, sum)
+ 	means <- ysum/wtsum
+ 
+ 	# For anova splits, we can order the categories by their means
+ 	#  then use the same code as for a non-categorical
+ 	ord <- order(means)
+ 	n <- length(ord)
+ 	temp <- cumsum(ysum[ord])[-n]
+ 	left.wt  <- cumsum(wtsum[ord])[-n]
+ 	right.wt <- sum(wt) - left.wt
+ 	lmean <- temp/left.wt
+ 	rmean <- -temp/right.wt
+ 	list(goodness= (left.wt*lmean^2 + right.wt*rmean^2)/sum(wt*y^2),
+ 	     direction = ux[ord])
+ 	}
+     }
> 
> # The init function:
> #   fix up y to deal with offsets
> #   return a dummy parms list
> #   numresp is the number of values produced by the eval routine's "label"
> #   numy is the number of columns for y
> #   summary is a function used to print one line in summary.rpart
> # In general, this function would also check for bad data, see rpart.poisson
> #   for instace.
> temp3 <- function(y, offset, parms, wt) {
+     if (!is.null(offset)) y <- y-offset
+     list(y=y, parms=0, numresp=1, numy=1,
+ 	      summary= function(yval, dev, wt, ylevel, digits ) {
+ 		  paste("  mean=", format(signif(yval, digits)),
+ 			", MSE=" , format(signif(dev/wt, digits)),
+ 			sep='')
+ 	     })
+     }
> 
> 
> alist <- list(eval=temp1, split=temp2, init=temp3)
> 
> fit1 <- rpart(income ~population +illiteracy  + murder + hs.grad + region,
+ 	     mystate, control=rpart.control(minsplit=10, xval=0),
+ 	     method=alist)
> 
> fit2 <- rpart(income ~population +illiteracy + murder + hs.grad + region,
+ 	     mystate, control=rpart.control(minsplit=10, xval=0),
+ 	      method='anova')
> 
> # Other than their call statement, and a longer "functions" component in
> #  fit1, fit1 and fit2 should be identical.
> all.equal(fit1$frame, fit2$frame)
[1] TRUE
> all.equal(fit1$splits, fit2$splits)
[1] TRUE
> all.equal(fit1$csplit, fit2$csplit)
[1] TRUE
> all.equal(fit1$where, fit2$where)
[1] TRUE
> all.equal(fit1$cptable, fit2$cptable)
[1] TRUE
> 
> # Now try xpred on it
> xvtemp <- rep(1:5, length=50)
> xp1 <- xpred.rpart(fit1, xval=xvtemp)
> xp2 <- xpred.rpart(fit2, xval=xvtemp)
> aeq <- function(x,y) all.equal(as.vector(x), as.vector(y))
> aeq(xp1, xp2)
[1] TRUE
> 
> fit3 <- rpart(income ~population +illiteracy + murder + hs.grad + region,
+ 	     mystate, control=rpart.control(minsplit=10, xval=xvtemp),
+ 	      method='anova')
> zz <- apply((mystate$income - xp1)^2,2, sum)
> aeq(zz/fit1$frame$dev[1], fit3$cptable[,4])  #reproduce xerror
[1] TRUE
> 
> zz2 <- sweep((mystate$income-xp1)^2,2, zz/nrow(xp1))
> zz2 <- sqrt(apply(zz2^2, 2, sum))/ fit1$frame$dev[1]
> aeq(zz2, fit3$cptable[,5])          #reproduce se(xerror)
[1] TRUE
> 
> 
> proc.time()
   user  system elapsed 
  0.155   0.025   0.175