1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
|
# stat_kernel.tcl --
#
# Part of the statistics package for basic statistical analysis
# Based on http://en.wikipedia.org/wiki/Kernel_(statistics) and
# http://en.wikipedia.org/wiki/Kernel_density_estimation
#
# version 0.1: initial implementation, january 2014
# kernel-density --
# Estimate the probability density using the kernel density
# estimation method
#
# Arguments:
# data List of univariate data
# args List of options in the form of keyword-value pairs:
# -weights weights: per data point the weight
# -bandwidth value: bandwidth to be used for the estimation
# -number value: number of bins to be returned
# -interval {begin end}: begin and end of the interval for
# which the density is returned
# -kernel function: kernel to be used (gaussian, cosine,
# epanechnikov, uniform, triangular, biweight,
# logistic)
# For all options more or less sensible defaults are
# provided.
#
# Result:
# A list of the bin centres, a list of the corresponding density
# estimates and a list containing several computational parameters:
# begin and end of the interval, mean, standard deviation and bandwidth
#
# Note:
# The conditions for the kernel function are fairly weak:
# - It should integrate to 1
# - It should be symmetric around 0
#
# As for the implementation in Tcl: it should be reachable in the
# ::math::statistics namespace. As a consequence, you can define
# your own kernel function too. Hence there is no check.
#
proc ::math::statistics::kernel-density {data args} {
#
# Determine the basic statistics
#
set basicStats [BasicStats all $data]
set mean [lindex $basicStats 0]
set ndata [lindex $basicStats 3]
set stdev [lindex $basicStats 4]
if { $ndata < 1 } {
return -code error -errorcode ARG -errorinfo "Too few actual data"
}
#
# Get the options (providing defaults as needed)
#
set opt(-weights) {}
set opt(-number) 100
set opt(-kernel) gaussian
#
# The default bandwidth is set via a simple expression, which
# is supposed to be optimal for the Gaussian kernel.
# Perhaps a more sophisticated method should be provided as well
#
set opt(-bandwidth) [expr {1.06 * $stdev / pow($ndata,0.2)}]
#
# The default interval is derived from the mean and the
# standard deviation
#
set opt(-interval) [list [expr {$mean - 3.0 * $stdev}] [expr {$mean + 3.0 * $stdev}]]
#
# Retrieve the given options from $args
#
if { [llength $args] % 2 != 0 } {
return -code error -errorcode ARG -errorinfo "The options must all have a value"
}
array set opt $args
#
# Elementary checks
#
if { $opt(-bandwidth) <= 0.0 } {
return -code error -errorcode ARG -errorinfo "The bandwidth must be positive: $opt(-bandwidth)"
}
if { $opt(-number) <= 0.0 } {
return -code error -errorcode ARG -errorinfo "The number of bins must be positive: $opt(-number)"
}
if { [lindex $opt(-interval) 0] == [lindex $opt(-interval) 1] } {
return -code error -errorcode ARG -errorinfo "The interval has length zero: $opt(-interval)"
}
if { [llength [info proc $opt(-kernel)]] == 0 } {
return -code error -errorcode ARG -errorinfo "Unknown kernel function: $opt(-kernel)"
}
#
# Construct the weights
#
if { [llength $opt(-weights)] > 0 } {
if { [llength $data] != [llength $opt(-weights)] } {
return -code error -errorcode ARG -errorinfo "The list of weights must match the data"
}
set sum 0.0
foreach d $data w $opt(-weights) {
if { $d != {} } {
set sum [expr {$sum + $w}]
}
}
set scale [expr {1.0/$sum/$ndata}]
set weight {}
foreach w $opt(-weights) {
if { $d != {} } {
lappend weight [expr {$w / $scale}]
} else {
lappend weight {}
}
}
} else {
set weight [lrepeat [llength $data] [expr {1.0/$ndata}]] ;# Note: missing values have weight zero
}
#
# Construct the centres of the bins
#
set xbegin [lindex $opt(-interval) 0]
set xend [lindex $opt(-interval) 1]
set dx [expr {($xend - $xbegin) / double($opt(-number))}]
set xb [expr {$xbegin + 0.5 * $dx}]
set xvalue {}
for {set i 0} {$i < $opt(-number)} {incr i} {
lappend xvalue [expr {$xb + $i * $dx}]
}
#
# Construct the density function
#
set density {}
set scale [expr {1.0/$opt(-bandwidth)}]
foreach x $xvalue {
set sum 0.0
foreach d $data w $weight {
if { $d != {} } {
set kvalue [$opt(-kernel) [expr {$scale * ($x-$d)}]]
set sum [expr {$sum + $w * $kvalue}]
}
}
lappend density [expr {$sum * $scale}]
}
#
# Return the result
#
return [list $xvalue $density [list $xbegin $xend $mean $stdev $opt(-bandwidth)]]
}
# gaussian, uniform, triangular, epanechnikov, biweight, cosine, logistic --
# The Gaussian kernel
#
# Arguments:
# x (Scaled) argument
#
# Result:
# Value of the kernel
#
# Note:
# The standard deviation is 1.
#
proc ::math::statistics::gaussian {x} {
return [expr {exp(-0.5*$x*$x) / sqrt(2.0*acos(-1.0))}]
}
proc ::math::statistics::uniform {x} {
if { abs($x) <= 1.0 } {
return 0.5
} else {
return 0.0
}
}
proc ::math::statistics::triangular {x} {
if { abs($x) < 1.0 } {
return [expr {1.0 - abs($x)}]
} else {
return 0.0
}
}
proc ::math::statistics::epanechnikov {x} {
if { abs($x) < 1.0 } {
return [expr {0.75 * (1.0 - abs($x)*abs($x))}]
} else {
return 0.0
}
}
proc ::math::statistics::biweight {x} {
if { abs($x) < 1.0 } {
return [expr {0.9375 * pow((1.0 - abs($x)*abs($x)),2)}]
} else {
return 0.0
}
}
proc ::math::statistics::cosine {x} {
if { abs($x) < 1.0 } {
return [expr {0.25 * acos(-1.0) * cos(0.5 * acos(-1.0) * $x)}]
} else {
return 0.0
}
}
proc ::math::statistics::logistic {x} {
return [expr {1.0 / (exp($x) + 2.0 + exp(-$x))}]
}
|