File: plot_kolmogorov_statistics.py

"""
Kolmogorov-Smirnov : understand the statistics
==============================================
"""

# %%
# In this example, we illustrate how the Kolmogorov-Smirnov (KS) statistic is computed.
#
# * We generate a sample from a Normal distribution.
# * We create a Uniform distribution whose parameters are estimated from the sample.
# * We compute the Kolmogorov-Smirnov statistic and plot it on top of the empirical
#   cumulative distribution function.

# %%
import openturns as ot
import openturns.viewer as otv


# %%
# The `computeKSStatisticsIndex` function computes the Kolmogorov-Smirnov
# distance between the sample and the distribution.
# It also returns the index of the sorted sample at which the maximum
# distance is reached, together with the corresponding observation.
# The following function is for teaching purposes only: use
# `FittingTest` for real applications (a comparison with the library
# routine is shown further below, once the statistic has been computed).

# %%


def computeKSStatisticsIndex(sample, distribution):
    # Sort the sample: the empirical CDF only jumps at the sorted observations.
    sample = ot.Sample(sample.sort())
    print("Sorted sample:")
    print(sample)
    n = sample.getSize()
    D = 0.0
    index = -1
    D_previous = 0.0
    for i in range(n):
        F = distribution.computeCDF(sample[i])
        # Distances to the empirical CDF just below (i / n) and just above
        # ((i + 1) / n) the i-th sorted observation.
        S1 = abs(F - float(i) / n)
        S2 = abs(float(i + 1) / n - F)
        print(
            "i=%d, x[i]=%.4f, F(x[i])=%.4f, S1=%.4f, S2=%.4f"
            % (i, sample[i, 0], F, S1, S2)
        )
        # Keep track of the largest distance seen so far and of the index
        # at which it is reached.
        D = max(S1, S2, D)
        if D > D_previous:
            print("D max!")
            index = i
            D_previous = D
    observation = sample[index]
    return D, index, observation
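

# %%
# Written out explicitly, the loop above evaluates, for the sorted sample
# :math:`x_{(1)} \leq \dots \leq x_{(n)}` and the candidate CDF :math:`F`,
#
# .. math::
#
#     D_n = \max_{1 \leq i \leq n} \max \left( \left| F(x_{(i)}) - \frac{i-1}{n} \right|,
#     \left| \frac{i}{n} - F(x_{(i)}) \right| \right),
#
# where the two inner terms correspond to the variables `S1` and `S2` above.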


# %%
# The `drawKSDistance()` function plots the empirical distribution function
# of the sample and the Kolmogorov-Smirnov distance at the point :math:`x`.
# The empirical CDF is a staircase function which is discontinuous at each observation.
# Denote by :math:`\hat{F}` the empirical CDF. For the observation :math:`x`
# which achieves the maximum distance to the candidate distribution CDF,
# let us denote :math:`\hat{F}^- = \lim_{y \rightarrow x^-} \hat{F}(y)` and
# :math:`\hat{F}^+ = \lim_{y \rightarrow x^+} \hat{F}(y)`.
# The maximum distance can be achieved either by :math:`\hat{F}^-` or :math:`\hat{F}^+`.
# The `computeEmpiricalCDF(x)` method computes :math:`\hat{F}^+ = \mathbb{P}(X \leq x)`.
# We compute :math:`\hat{F}^-` with the equation :math:`\hat{F}^- = \hat{F}^+ - 1/n`,
# where :math:`n` is the sample size.
# A numeric check of this relation is shown further below, once the statistic
# has been computed.


# %%
def drawKSDistance(sample, distribution, observation, D, distFactory):
    graph = ot.Graph("KS Distance = %.4f" % (D), "X", "CDF", True, "upper left")
    # Thick vertical line at the point x which achieves the maximum distance
    ECDF_x_plus = sample.computeEmpiricalCDF(observation)
    ECDF_x_minus = ECDF_x_plus - 1.0 / sample.getSize()
    CDF_observation = distribution.computeCDF(observation)
    curve = ot.Curve(
        [observation[0], observation[0], observation[0]],
        [ECDF_x_plus, ECDF_x_minus, CDF_observation],
    )
    curve.setLegend("KS statistic")
    curve.setLineWidth(4.0 * curve.getLineWidth())
    graph.add(curve)
    # Empirical CDF of the sample
    empiricalCDF = ot.UserDefined(sample).drawCDF()
    empiricalCDF.setLegends(["Empirical CDF"])
    graph.add(empiricalCDF)
    # CDF of the candidate distribution, with the factory class name as legend
    distname = distFactory.getClassName()
    cdf = distribution.drawCDF()
    cdf.setLegends([distname])
    graph.add(cdf)
    return graph


# %%
# We generate a sample from a standard Normal distribution.

# %%
N = ot.Normal()
n = 10
sample = N.getSample(n)

# %%
# We then create a Uniform distribution whose parameters are estimated
# from the sample.
# Since the sample actually comes from a Normal distribution, the fit is deliberately
# poor, so that the KS distance is large enough to be clearly visible on the graph.

# %%
distFactory = ot.UniformFactory()
distribution = distFactory.build(sample)
distribution

# %%
# We compute the KS statistic, the index at which the maximum distance is
# reached, and the corresponding observation.

# %%
D, index, observation = computeKSStatisticsIndex(sample, distribution)
print("D=", D, ", Index=", index, ", Obs.=", observation)

# %%
graph = drawKSDistance(sample, distribution, observation, D, distFactory)
view = otv.View(graph)
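
# %%
# As a cross-check, we can compare the value computed by the teaching function
# with the statistic returned by the library routine mentioned earlier.
# This is a small sketch assuming the `FittingTest.Kolmogorov` static method and
# the `TestResult.getStatistic()` accessor available in recent OpenTURNS releases;
# the statistic should match the value of `D` computed above.
# Since the Uniform parameters were estimated from the very same sample, the
# p-value of this test is not meaningful here: `FittingTest.Lilliefors` would be
# the appropriate test in that situation.

# %%
result = ot.FittingTest.Kolmogorov(sample, distribution, 0.05)
print("D (manual)  =", D)
print("D (library) =", result.getStatistic())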


# %%
# Display the graph.
otv.View.ShowAll()

# %%
# We see that the KS statistic is reached at the observation where the distance
# between the empirical distribution function of the sample and the CDF of the
# candidate distribution is the largest.