File: plot_distribution_linear_regression.py

"""
Distribution of estimators in linear regression
===============================================
"""

# %%
# Introduction
# ~~~~~~~~~~~~
#
# In this example, we are interested in the distribution of the estimator of the variance
# of the observation error in linear regression.
# We are also interested in the estimator of the standard deviation of the
# observation error.
# We show how to use the :class:`~openturns.PythonRandomVector` class
# to study the sampling distribution of these estimators.
#
# In the general linear regression model, the observation error :math:`\epsilon` has the
# Normal distribution :math:`\cN(0, \sigma^2)` where :math:`\sigma > 0`
# is the standard deviation.
# We are interested in the estimators of the variance :math:`\sigma^2`
# and the standard deviation :math:`\sigma`:
#
# - the variance :math:`\sigma^2` is estimated by the residuals variance,
#   computed by :meth:`~openturns.LinearModelResult.getResidualsVariance`;
# - the standard deviation :math:`\sigma` is estimated by the residuals
#   standard error, computed by :meth:`~openturns.LinearModelAnalysis.getResidualsStandardError`.
#
# The asymptotic distribution of these estimators is known (see [vaart2000]_)
# but we want to perform an empirical study, based on simulation.
# To observe the distribution of an estimator, we simulate one observation
# of the estimator and repeat that experiment :math:`r \in \Nset` times,
# where :math:`r` is a large integer.
# Then we approximate the distribution using :class:`~openturns.KernelSmoothing`.
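#
# As a reference, assuming the usual convention where :math:`n` is the sample
# size, :math:`p` is the number of model parameters and
# :math:`\hat{\epsilon}_1, \dots, \hat{\epsilon}_n` are the residuals of the
# fit, the residuals variance estimator is
#
# .. math::
#
#     \hat{\sigma}^2 = \frac{1}{n - p} \sum_{i = 1}^n \hat{\epsilon}_i^2
#
# and the residuals standard error is :math:`\hat{\sigma} = \sqrt{\hat{\sigma}^2}`.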


import openturns as ot
import openturns.viewer as otv
from matplotlib import pyplot as plt

# %%
# The simulation engine
# ~~~~~~~~~~~~~~~~~~~~~
# The simulation is based on the :class:`~openturns.PythonRandomVector` class,
# which simulates independent observations of a random vector.
# The `getRealization()` method simulates one observation of the estimator.
#


class SampleEstimatorLinearRegression(ot.PythonRandomVector):
    def __init__(
        self, sample_size, true_standard_deviation, coefficients, estimator="variance"
    ):
        """
        Create a RandomVector for an estimator from a linear regression model.

        Parameters
        ----------
        sample_size : int
            The sample size n.
        true_standard_deviation : float
            The standard deviation of the Gaussian observation error.
        coefficients : :class:`~openturns.Point`
            The p coefficients of the linear model (intercept first).
        estimator : str
            The estimator.
            Available estimators are "variance" and "standard-deviation".
        """
        super(SampleEstimatorLinearRegression, self).__init__(1)
        self.sample_size = sample_size
        self.numberOfParameters = coefficients.getDimension()
        input_dimension = self.numberOfParameters - 1  # Because of the intercept
        centerCoefficient = [0.0] * input_dimension
        constantCoefficient = [coefficients[0]]
        linearCoefficient = ot.Matrix([coefficients[1:]])
        self.linearModel = ot.LinearFunction(
            centerCoefficient, constantCoefficient, linearCoefficient
        )
        self.distribution = ot.Normal(input_dimension)
        self.distribution.setDescription(["x%d" % (i) for i in range(input_dimension)])
        self.errorDistribution = ot.Normal(0.0, true_standard_deviation)
        self.estimator = estimator
        # In classical linear regression, the input is deterministic.
        # Here, we set it once and for all.
        self.input_sample = self.distribution.getSample(self.sample_size)
        self.output_sample = self.linearModel(self.input_sample)

    def getRealization(self):
        # Add a fresh realization of the Gaussian observation error to the
        # fixed, noise-free output sample.
        errorSample = self.errorDistribution.getSample(self.sample_size)
        noisy_output_sample = self.output_sample + errorSample
        # Fit the linear regression model on the noisy observations.
        algo = ot.LinearModelAlgorithm(self.input_sample, noisy_output_sample)
        lmResult = algo.getResult()
        if self.estimator == "variance":
            output = lmResult.getResidualsVariance()
        elif self.estimator == "standard-deviation":
            lmAnalysis = ot.LinearModelAnalysis(lmResult)
            output = lmAnalysis.getResidualsStandardError()
        else:
            raise ValueError("Unknown estimator %s" % (self.estimator))
        return [output]
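

# %%
# As a quick check, we can wrap this class into a
# :class:`~openturns.RandomVector` and draw a single realization of the
# variance estimator.
# The parameter values below are arbitrary, chosen only for this check.
check_estimator = ot.RandomVector(
    SampleEstimatorLinearRegression(6, 0.1, ot.Point([3.0, 2.0, -1.0]), "variance")
)
print(check_estimator.getRealization())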


def plot_sample_by_kernel_smoothing(
    sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
):
    """
    Plot the estimated distribution of the biased sample variance from a sample

    The method uses Kernel Smoothing with default kernel.

    Parameters
    ----------
    repetitions_size : int
        The number of repetitions of the experiments.
        This is the (children) size of the sample of the biased
        sample variance

    Returns
    -------
    graph : ot.Graph
        The plot of the PDF of the estimated distribution.

    """
    myRV = ot.RandomVector(
        SampleEstimatorLinearRegression(
            sample_size, true_standard_deviation, coefficients, estimator
        )
    )
    sample_estimator = myRV.getSample(repetitions_size)
    if estimator == "variance":
        sample_estimator.setDescription([r"$\hat{\sigma}^2$"])
    elif estimator == "standard-deviation":
        sample_estimator.setDescription([r"$\hat{\sigma}$"])
    else:
        raise ValueError("Unknown estimator %s" % (estimator))
    mean_sample_estimator = sample_estimator.computeMean()

    graph = ot.KernelSmoothing().build(sample_estimator).drawPDF()
    graph.setLegends(["Sample"])
    bb = graph.getBoundingBox()
    ylb = bb.getLowerBound()[1]
    yub = bb.getUpperBound()[1]
    if estimator == "variance":
        curve = ot.Curve([true_standard_deviation**2] * 2, [ylb, yub])
    elif estimator == "standard-deviation":
        curve = ot.Curve([true_standard_deviation] * 2, [ylb, yub])
    curve.setLegend("Exact")
    curve.setLineWidth(2.0)
    graph.add(curve)
    graph.setTitle(
        "Size = %d, True S.D. = %.4f, Mean = %.4f, Rep. = %d"
        % (
            sample_size,
            true_standard_deviation,
            mean_sample_estimator[0],
            repetitions_size,
        )
    )
    return graph


# %%
# Distribution of the variance estimator
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# We first consider the estimation of the variance :math:`\sigma^2`.
# In the next cell, we consider a sample size equal to :math:`n = 6` with
# :math:`p = 3` parameters.
# We use :math:`r = 100` repetitions.


repetitions_size = 100
true_standard_deviation = 0.1
sample_size = 6
coefficients = ot.Point([3.0, 2.0, -1.0])
estimator = "variance"
view = otv.View(
    plot_sample_by_kernel_smoothing(
        sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
    ),
    figure_kw={"figsize": (6.0, 3.5)},
)
plt.subplots_adjust(bottom=0.25)

# %%
# If we use a sample size equal to :math:`n = 6` with
# :math:`p = 3` parameters, the distribution is not symmetric.
# The mean of the observations of the sample variances remains close to
# the true value :math:`0.1^2 = 0.01`.
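
# %%
# This skewness has a classical explanation.
# If we assume the usual unbiased residuals variance estimator, then
# :math:`(n - p) \hat{\sigma}^2 / \sigma^2` follows the chi-squared
# distribution with :math:`n - p` degrees of freedom, so the reference
# distribution of :math:`\hat{\sigma}^2` can be built explicitly.
# The sketch below relies on this assumption.
degrees_of_freedom = sample_size - coefficients.getDimension()
reference_distribution = ot.ChiSquare(degrees_of_freedom) * (
    true_standard_deviation**2 / degrees_of_freedom
)
print("Reference mean:", reference_distribution.getMean())
print("Reference skewness:", reference_distribution.getSkewness())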

# %%
# Then we increase the sample size :math:`n`.

repetitions_size = 100
true_standard_deviation = 0.1
sample_size = 100
coefficients = ot.Point([3.0, 2.0, -1.0])
view = otv.View(
    plot_sample_by_kernel_smoothing(
        sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
    ),
    figure_kw={"figsize": (6.0, 3.5)},
)
plt.subplots_adjust(bottom=0.25)


# %%
# If we use a sample size equal to :math:`n = 100` with
# :math:`p = 3` parameters, the distribution is almost symmetric and
# almost normal.
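
# %%
# Under the same chi-squared assumption as above, the standard deviation of
# :math:`\hat{\sigma}^2` is :math:`\sigma^2 \sqrt{2 / (n - p)}`, which shrinks
# as :math:`n` grows: the distribution concentrates around :math:`\sigma^2`
# and, by the central limit theorem, becomes approximately Gaussian.
degrees_of_freedom = sample_size - coefficients.getDimension()
print(
    "Approximate standard deviation of the variance estimator:",
    true_standard_deviation**2 * (2.0 / degrees_of_freedom) ** 0.5,
)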

# %%
# Distribution of the standard deviation estimator
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# We now consider the estimation of the standard deviation :math:`\sigma`.


repetitions_size = 100
true_standard_deviation = 0.1
sample_size = 6
coefficients = ot.Point([3.0, 2.0, -1.0])
estimator = "standard-deviation"
view = otv.View(
    plot_sample_by_kernel_smoothing(
        sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
    ),
    figure_kw={"figsize": (6.0, 3.5)},
)
plt.subplots_adjust(bottom=0.25)


# %%
# If we use a sample size equal to :math:`n = 6` with
# :math:`p = 3` parameters, we see that the distribution is almost symmetric.
# We notice a slight bias: the mean of the observed standard deviations
# is not as close to the true value 0.1 as we could expect.
# This is consistent with the fact that the square root of an unbiased
# variance estimator is a downward-biased estimator of :math:`\sigma`,
# by Jensen's inequality (the square root is concave).
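
# %%
# Assuming, as before, that :math:`(n - p) \hat{\sigma}^2 / \sigma^2` follows
# the chi-squared distribution with :math:`n - p` degrees of freedom,
# :math:`\hat{\sigma}` is proportional to a chi random variable, so its exact
# mean can be computed.
# The sketch below relies on this assumption.
import math

degrees_of_freedom = sample_size - coefficients.getDimension()
exact_mean = (
    true_standard_deviation
    * math.sqrt(2.0 / degrees_of_freedom)
    * math.gamma((degrees_of_freedom + 1) / 2.0)
    / math.gamma(degrees_of_freedom / 2.0)
)
print("Exact mean of the standard deviation estimator:", exact_mean)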


repetitions_size = 100
true_standard_deviation = 0.1
sample_size = 100
coefficients = ot.Point([3.0, 2.0, -1.0])
estimator = "standard-deviation"
view = otv.View(
    plot_sample_by_kernel_smoothing(
        sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
    ),
    figure_kw={"figsize": (6.0, 3.5)},
)
plt.subplots_adjust(bottom=0.25)


# %%
# If we use a sample size equal to :math:`n = 100` with
# :math:`p = 3` parameters, we see that the distribution is almost normal.
# We notice that the bias has essentially disappeared.

# %%
otv.View.ShowAll()