File: stat_wasserstein.tcl

package info (click to toggle)
tcllib 1.20%2Bdfsg-1
  • links: PTS
  • area: main
  • in suites: bullseye
  • size: 68,064 kB
  • sloc: tcl: 216,842; ansic: 14,250; sh: 2,846; xml: 1,766; yacc: 1,145; pascal: 881; makefile: 107; perl: 84; f90: 84; python: 33; ruby: 13; php: 11
file content (214 lines) | stat: -rw-r--r-- 5,855 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# stat-wasserstein.tcl --
#     Determine the Wasserstein distance between two probability distributions
#
#     Note:
#     This is an implementation for one-dimensional distributions (or better:
#     non-negative patterns)
#
#     Note 2:
#     The lower bound of 1.0e-10 is probably not at all necessary
#

# LastNonZero --
#     Auxiliary procedure to find the last non-zero entry
#
# Arguments:
#     prob           Probability distribution
#
# Result:
#     Index in the list of the last non-zero entry
#
# Note:
#     To avoid numerical problems any value smaller than 1.0e-10 is considered to
#     be zero
#
proc ::math::statistics::LastNonZero {prob} {
    set maxidx [expr {[llength $prob] - 1}]

    for {set idx $maxidx} {$idx >= 0} {incr idx -1} {
        if { [lindex $prob $idx] > 1.0e-10 } {
            return $idx
        }
    }

    return -1 ;# No non-zero entry
}

# Normalise --
#     Auxiliary procedure to normalise the probability distribution
#
# Arguments:
#     prob           Probability distribution
#
# Result:
#     Normalised distribution (i.e. the entries sum to 1)
#
# Note:
#     To avoid numerical problems any value smaller than 1.0e-10 is set to zero
#
proc ::math::statistics::Normalise {prob} {

    set newprob {}
    set sum     0.0

    foreach p $prob {
        set sum [expr {$sum + $p}]
    }

    if { $sum == 0.0 } {
        return -code error "Probability distribution should not consist of only zeroes"
    }

    foreach p $prob {
        lappend newprob [expr {$p > 1.0e-10? ($p/$sum) : 0.0}]
    }

    return $newprob
}

# wasserstein-distance --
#     Determine the Wasserstein distance using a "greedy" algorithm.
#
# Arguments:
#     prob1          First probability distribution, interpreted as a histogram
#                    with uniform bin width
#     prob2          Second probability distribution
#
# Result:
#     Distance between the two distributions
#
proc ::math::statistics::wasserstein-distance {prob1 prob2} {
    #
    # First step: make sure the histograms have the same length and the
    # same cumulative weight.
    #
    if { [llength $prob1] != [llength $prob2] } {
        return -code error "Lengths of the probability histograms must be the same"
    }

    set prob1 [Normalise $prob1]
    set prob2 [Normalise $prob2]

    set distance 0.0

    #
    # Determine the last non-zero bin - this bin will be shifted to the second
    # distribution
    #
    while {1} {
        set idx1 [LastNonZero $prob1]
        set idx2 [LastNonZero $prob2]

        if { $idx1 < 0 } {
            break ;# We are done
        }

        set bin1 [lindex $prob1 $idx1]
        set bin2 [lindex $prob2 $idx2]

        if { $bin1 <= $bin2 } {
            lset prob1 $idx1 0.0
            lset prob2 $idx2 [expr {$bin2 - $bin1}]
            set distance [expr {$distance + abs($idx2-$idx1) * $bin1}]
        } else {
            lset prob1 $idx1 [expr {$bin1 - $bin2}]
            lset prob2 $idx2 0.0
            set distance [expr {$distance + abs($idx2-$idx1) * $bin2}]
        }
    }

    return $distance
}

# kl-divergence --
#     Calculate the Kullback-Leibler (KL) divergence for two discrete distributions
#
# Arguments:
#     prob1          First probability distribution - the divergence is calculated
#                    with this one as the basis
#     prob2          Second probability distribution - the divergence of this
#                    distribution wrt the first is calculated
#
# Notes:
#     - The KL divergence is an asymmetric measure
#     - It is actually only defined if prob2 is only zero when prob1 is too
#     - The number of elements in the two distributions must be the same and
#       bins must be the same
#
proc ::math::statistics::kl-divergence {prob1 prob2} {
    if { [llength $prob1] != [llength $prob2] } {
        return -code error "Lengths of the two probability histograms must be the same"
    }

    #
    # Normalise the probability histograms
    #
    set prob1 [Normalise $prob1]
    set prob2 [Normalise $prob2]

    #
    # Check for well-definedness while going along
    #
    set sum 0.0
    foreach p1 $prob1 p2 $prob2 {
        if { $p2 == 0.0 && $p1 != 0.0 } {
            return -code error "Second probability histogram contains unmatched zeroes"
        }

        if { $p1 != 0.0 } {
            set sum [expr {$sum - $p1 * log($p2/$p1)}]
        }
    }

    return $sum
}

if {0} {
# tests --
#

# Almost trivial
set prob1 {0.0 0.0 0.0 1.0}
set prob2 {0.0 0.0 1.0 0.0}

puts "Expected distance: 1"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric:  [wasserstein-distance $prob2 $prob1]"

# Less trivial
set prob1 {0.0 0.75 0.25 0.0}
set prob2 {0.0 0.0  1.0  0.0}

puts "Expected distance: 0.75"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric:  [wasserstein-distance $prob2 $prob1]"

# Shift trivial
set prob1 {0.0 0.1 0.2 0.4 0.2 0.1 0.0 0.0}
set prob2 {0.0 0.0 0.0 0.1 0.2 0.4 0.2 0.1}

puts "Expected distance: 2"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric:  [wasserstein-distance $prob2 $prob1]"


# KL-divergence
set prob1 {0.0 0.1 0.2 0.4 0.2 0.1 0.0 0.0}
set prob2 {0.0 0.1 0.2 0.4 0.2 0.1 0.0 0.0}

puts "KL-divergence for equal distributions: 0"
puts "KL-divergence: [kl-divergence $prob1 $prob2]"

set prob1 {0.1e-8 0.1    0.2 0.4 0.2 0.1 0.0 0.0    0.0}
set prob2 {0.1e-8 0.1e-8 0.1 0.2 0.4 0.2 0.1 0.1e-8 0.1e-8}

puts "KL-divergence for shifted distributions: ??"
puts "KL-divergence: [kl-divergence $prob1 $prob2]"

# Hm, the normalisation proc causes a slight problem with elements of 1.0e-10
set prob1 {0.1e-8 0.1  0.2  0.4 0.2  0.1  0.0     0.0}
set prob2 {0.1e-8 0.11 0.19 0.4 0.24 0.06 0.1e-8  0.1e-8}

puts "KL-divergence slightly dififerent distributions: ??"
puts "KL-divergence: [kl-divergence $prob1 $prob2]"
}