# stat-wasserstein.tcl --
# Determine the Wasserstein distance between two probability distributions
#
# Note:
# This is an implementation for one-dimensional distributions (or better:
# non-negative patterns)
#
# Note 2:
# The lower bound of 1.0e-10 is probably not at all necessary
#
# LastNonZero --
# Auxiliary procedure to find the last non-zero entry
#
# Arguments:
# prob Probability distribution
#
# Result:
# Index in the list of the last non-zero entry
#
# Note:
# To avoid numerical problems any value smaller than 1.0e-10 is considered to
# be zero
#
proc ::math::statistics::LastNonZero {prob} {
    # Walk the distribution from the right and report the index of the
    # first entry that counts as non-zero. Anything at or below 1.0e-10
    # is treated as zero to sidestep round-off noise.
    set pos [expr {[llength $prob] - 1}]
    while { $pos >= 0 } {
        if { [lindex $prob $pos] > 1.0e-10 } {
            return $pos
        }
        incr pos -1
    }
    return -1 ;# Every entry is (numerically) zero
}
# Normalise --
# Auxiliary procedure to normalise the probability distribution
#
# Arguments:
# prob Probability distribution
#
# Result:
# Normalised distribution (i.e. the entries sum to 1)
#
# Note:
# To avoid numerical problems any value smaller than 1.0e-10 is set to zero
#
proc ::math::statistics::Normalise {prob} {
    # Normalise the probability distribution so that the entries sum to 1.
    #
    # Arguments:
    #     prob      Probability distribution (list of non-negative numbers)
    #
    # Result:
    #     Normalised distribution; entries at or below 1.0e-10 are set to 0.0
    #
    # Raises an error if the distribution contains no non-zero entry.
    #
    # Fix: accumulate the sum over the *significant* entries only. The
    # previous version summed every entry but then zeroed the negligible
    # ones, so the result did not sum exactly to 1 - the "slight problem
    # with elements of 1.0e-10" noted in the informal tests below.
    set sum 0.0
    foreach p $prob {
        if { $p > 1.0e-10 } {
            set sum [expr {$sum + $p}]
        }
    }
    if { $sum == 0.0 } {
        return -code error "Probability distribution should not consist of only zeroes"
    }
    set newprob {}
    foreach p $prob {
        lappend newprob [expr {$p > 1.0e-10? ($p/$sum) : 0.0}]
    }
    return $newprob
}
# wasserstein-distance --
# Determine the Wasserstein distance using a "greedy" algorithm.
#
# Arguments:
# prob1 First probability distribution, interpreted as a histogram
# with uniform bin width
# prob2 Second probability distribution
#
# Result:
# Distance between the two distributions
#
proc ::math::statistics::wasserstein-distance {prob1 prob2} {
    # Wasserstein ("earth mover's") distance between two one-dimensional
    # distributions, both interpreted as histograms with uniform bin width.
    # Greedy scheme: repeatedly move the right-most remaining mass of the
    # first histogram onto the right-most remaining mass of the second,
    # accumulating (moved mass) * (bin displacement).
    #
    # First step: make sure the histograms have the same length and the
    # same cumulative weight.
    #
    if { [llength $prob1] != [llength $prob2] } {
        return -code error "Lengths of the probability histograms must be the same"
    }
    # Both histograms now sum to 1, so the loop below can exhaust them
    # in lockstep (may raise an error for all-zero input - from Normalise)
    set prob1 [Normalise $prob1]
    set prob2 [Normalise $prob2]
    set distance 0.0
    #
    # Determine the last non-zero bin - this bin will be shifted to the second
    # distribution
    #
    while {1} {
        set idx1 [LastNonZero $prob1]
        set idx2 [LastNonZero $prob2]
        # All mass in the first histogram has been transported
        if { $idx1 < 0 } {
            break ;# We are done
        }
        set bin1 [lindex $prob1 $idx1]
        set bin2 [lindex $prob2 $idx2]
        # Transfer the smaller of the two bin contents; the cost is the
        # transported mass times the number of bins it travels.
        if { $bin1 <= $bin2 } {
            # Source bin is emptied, destination keeps the remainder
            lset prob1 $idx1 0.0
            lset prob2 $idx2 [expr {$bin2 - $bin1}]
            set distance [expr {$distance + abs($idx2-$idx1) * $bin1}]
        } else {
            # Destination bin is filled, source keeps the remainder
            lset prob1 $idx1 [expr {$bin1 - $bin2}]
            lset prob2 $idx2 0.0
            set distance [expr {$distance + abs($idx2-$idx1) * $bin2}]
        }
    }
    return $distance
}
# kl-divergence --
# Calculate the Kullback-Leibler (KL) divergence for two discrete distributions
#
# Arguments:
# prob1 First probability distribution - the divergence is calculated
# with this one as the basis
# prob2 Second probability distribution - the divergence of this
# distribution wrt the first is calculated
#
# Notes:
# - The KL divergence is an asymmetric measure
# - It is actually only defined if prob2 is only zero when prob1 is too
# - The number of elements in the two distributions must be the same and
# bins must be the same
#
proc ::math::statistics::kl-divergence {prob1 prob2} {
    # Kullback-Leibler divergence of the second distribution with respect
    # to the first. Asymmetric measure; it is only well-defined when the
    # second histogram is zero solely where the first one is zero too.
    # Both histograms must have the same number of (identical) bins.
    if { [llength $prob1] != [llength $prob2] } {
        return -code error "Lengths of the two probability histograms must be the same"
    }
    # Work with normalised histograms throughout
    set pbase  [Normalise $prob1]
    set pother [Normalise $prob2]
    # Accumulate -p1*log(p2/p1), checking well-definedness as we go
    set divergence 0.0
    foreach p1 $pbase p2 $pother {
        if { $p2 == 0.0 && $p1 != 0.0 } {
            return -code error "Second probability histogram contains unmatched zeroes"
        }
        # Bins with zero weight in the base distribution contribute nothing
        if { $p1 != 0.0 } {
            set divergence [expr {$divergence - $p1 * log($p2/$p1)}]
        }
    }
    return $divergence
}
if {0} {
# tests --
#     Informal sanity checks, disabled by the surrounding "if {0}".
#     Note: the commands are called without the ::math::statistics
#     qualifier, so enabling them requires importing the procedures.
# Almost trivial: all mass shifts by exactly one bin
set prob1 {0.0 0.0 0.0 1.0}
set prob2 {0.0 0.0 1.0 0.0}
puts "Expected distance: 1"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric: [wasserstein-distance $prob2 $prob1]"
# Less trivial: only 0.75 of the mass has to move one bin
set prob1 {0.0 0.75 0.25 0.0}
set prob2 {0.0 0.0 1.0 0.0}
puts "Expected distance: 0.75"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric: [wasserstein-distance $prob2 $prob1]"
# Shift trivial: the whole distribution moves two bins to the right
set prob1 {0.0 0.1 0.2 0.4 0.2 0.1 0.0 0.0}
set prob2 {0.0 0.0 0.0 0.1 0.2 0.4 0.2 0.1}
puts "Expected distance: 2"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric: [wasserstein-distance $prob2 $prob1]"
# KL-divergence: identical distributions must give exactly 0
set prob1 {0.0 0.1 0.2 0.4 0.2 0.1 0.0 0.0}
set prob2 {0.0 0.1 0.2 0.4 0.2 0.1 0.0 0.0}
puts "KL-divergence for equal distributions: 0"
puts "KL-divergence: [kl-divergence $prob1 $prob2]"
set prob1 {0.1e-8 0.1 0.2 0.4 0.2 0.1 0.0 0.0 0.0}
set prob2 {0.1e-8 0.1e-8 0.1 0.2 0.4 0.2 0.1 0.1e-8 0.1e-8}
puts "KL-divergence for shifted distributions: ??"
puts "KL-divergence: [kl-divergence $prob1 $prob2]"
# Hm, the normalisation proc causes a slight problem with elements of 1.0e-10
set prob1 {0.1e-8 0.1 0.2 0.4 0.2 0.1 0.0 0.0}
set prob2 {0.1e-8 0.11 0.19 0.4 0.24 0.06 0.1e-8 0.1e-8}
puts "KL-divergence slightly different distributions: ??"
puts "KL-divergence: [kl-divergence $prob1 $prob2]"
}