1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
|
"""
Distribution of estimators in linear regression
===============================================
"""
# %%
# Introduction
# ~~~~~~~~~~~~
#
# In this example, we are interested in the distribution of the estimator of the variance
# of the observation error in linear regression.
# We are also interested in the estimator of the standard deviation of the
# observation error.
# We show how to use the :class:`~openturns.PythonRandomVector` class in order to
# perform a study of the sample distribution of these estimators.
#
# In the general linear regression model, the observation error :math:`\epsilon` has the
# Normal distribution :math:`\cN(0, \sigma^2)` where :math:`\sigma > 0`
# is the standard deviation.
# We are interested in the estimators of the variance :math:`\sigma^2`
# and the standard deviation :math:`\sigma`:
#
# - the variance of the residuals, :math:`\sigma^2`, is estimated from :meth:`~openturns.LinearModelResult.getResidualsVariance`;
# - the standard deviation :math:`\sigma` is estimated from :meth:`~openturns.LinearModelAnalysis.getResidualsStandardError`.
#
# The asymptotic distribution of these estimators is known (see [vaart2000]_)
# but we want to perform an empirical study, based on simulation.
# In order to see the distribution of the estimator, we simulate an observation of the estimator,
# and repeat that experiment :math:`r \in \Nset` times, where :math:`r`
# is a large integer.
# Then we approximate the distribution using :class:`~openturns.KernelSmoothing`.
import openturns as ot
import openturns.viewer as otv
from matplotlib import pyplot as plt
# %%
# The simulation engine
# ~~~~~~~~~~~~~~~~~~~~~
# The simulation is based on the :class:`~openturns.PythonRandomVector` class,
# which simulates independent observations of a random vector.
# The `getRealization()` method implements the simulation of the observation
# of the estimator.
#
class SampleEstimatorLinearRegression(ot.PythonRandomVector):
def __init__(
self, sample_size, true_standard_deviation, coefficients, estimator="variance"
):
"""
Create a RandomVector for an estimator from a linear regression model.
Parameters
----------
sample_size : int
The sample size n.
true_standard_deviation : float
The standard deviation of the Gaussian observation error.
coefficients: sequence of p floats
The coefficients of the linear model.
estimator: str
The estimator.
Available estimators are "variance" or "standard-deviation".
"""
super(SampleEstimatorLinearRegression, self).__init__(1)
self.sample_size = sample_size
self.numberOfParameters = coefficients.getDimension()
input_dimension = self.numberOfParameters - 1 # Because of the intercept
centerCoefficient = [0.0] * input_dimension
constantCoefficient = [coefficients[0]]
linearCoefficient = ot.Matrix([coefficients[1:]])
self.linearModel = ot.LinearFunction(
centerCoefficient, constantCoefficient, linearCoefficient
)
self.distribution = ot.Normal(input_dimension)
self.distribution.setDescription(["x%d" % (i) for i in range(input_dimension)])
self.errorDistribution = ot.Normal(0.0, true_standard_deviation)
self.estimator = estimator
# In classical linear regression, the input is deterministic.
# Here, we set it once and for all.
self.input_sample = self.distribution.getSample(self.sample_size)
self.output_sample = self.linearModel(self.input_sample)
def getRealization(self):
errorSample = self.errorDistribution.getSample(self.sample_size)
noisy_output_sample = self.output_sample + errorSample
algo = ot.LinearModelAlgorithm(self.input_sample, noisy_output_sample)
lmResult = algo.getResult()
if self.estimator == "variance":
output = lmResult.getResidualsVariance()
elif self.estimator == "standard-deviation":
lmAnalysis = ot.LinearModelAnalysis(lmResult)
output = lmAnalysis.getResidualsStandardError()
else:
raise ValueError("Unknown estimator %s" % (self.estimator))
return [output]
def plot_sample_by_kernel_smoothing(
sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
):
"""
Plot the estimated distribution of the biased sample variance from a sample
The method uses Kernel Smoothing with default kernel.
Parameters
----------
repetitions_size : int
The number of repetitions of the experiments.
This is the (children) size of the sample of the biased
sample variance
Returns
-------
graph : ot.Graph
The plot of the PDF of the estimated distribution.
"""
myRV = ot.RandomVector(
SampleEstimatorLinearRegression(
sample_size, true_standard_deviation, coefficients, estimator
)
)
sample_estimator = myRV.getSample(repetitions_size)
if estimator == "variance":
sample_estimator.setDescription([r"$\hat{\sigma}^2$"])
elif estimator == "standard-deviation":
sample_estimator.setDescription([r"$\hat{\sigma}$"])
else:
raise ValueError("Unknown estimator %s" % (estimator))
mean_sample_estimator = sample_estimator.computeMean()
graph = ot.KernelSmoothing().build(sample_estimator).drawPDF()
graph.setLegends(["Sample"])
bb = graph.getBoundingBox()
ylb = bb.getLowerBound()[1]
yub = bb.getUpperBound()[1]
if estimator == "variance":
curve = ot.Curve([true_standard_deviation**2] * 2, [ylb, yub])
elif estimator == "standard-deviation":
curve = ot.Curve([true_standard_deviation] * 2, [ylb, yub])
curve.setLegend("Exact")
curve.setLineWidth(2.0)
graph.add(curve)
graph.setTitle(
"Size = %d, True S.D. = %.4f, Mean = %.4f, Rep. = %d"
% (
sample_size,
true_standard_deviation,
mean_sample_estimator[0],
repetitions_size,
)
)
return graph
# %%
# Distribution of the variance estimator
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# We first consider the estimation of the variance :math:`\sigma^2`.
# In the next cell, we consider a sample size equal to :math:`n = 6` with
# :math:`p = 3` parameters.
# We use :math:`r = 100` repetitions.
repetitions_size = 100
true_standard_deviation = 0.1
sample_size = 6
coefficients = ot.Point([3.0, 2.0, -1.0])
estimator = "variance"
view = otv.View(
plot_sample_by_kernel_smoothing(
sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
),
figure_kw={"figsize": (6.0, 3.5)},
)
plt.subplots_adjust(bottom=0.25)
# %%
# If we use a sample size equal to :math:`n = 6` with
# :math:`p = 3` parameters, the distribution is not symmetric.
# The mean of the observations of the sample variances remains close to
# the true value :math:`0.1^2 = 0.01`.
# %%
# Then we increase the sample size :math:`n`.
repetitions_size = 100
true_standard_deviation = 0.1
sample_size = 100
coefficients = ot.Point([3.0, 2.0, -1.0])
view = otv.View(
plot_sample_by_kernel_smoothing(
sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
),
figure_kw={"figsize": (6.0, 3.5)},
)
plt.subplots_adjust(bottom=0.25)
# %%
# If we use a sample size equal to :math:`n = 100` with
# :math:`p = 3` parameters, the distribution is almost symmetric and
# almost normal.
# %%
# Distribution of the standard deviation estimator
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# We now consider the estimation of the standard deviation :math:`\sigma`.
repetitions_size = 100
true_standard_deviation = 0.1
sample_size = 6
coefficients = ot.Point([3.0, 2.0, -1.0])
estimator = "standard-deviation"
view = otv.View(
plot_sample_by_kernel_smoothing(
sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
),
figure_kw={"figsize": (6.0, 3.5)},
)
plt.subplots_adjust(bottom=0.25)
# %%
# If we use a sample size equal to :math:`n = 6` with
# :math:`p = 3` parameters, we see that the distribution is almost symmetric.
# We notice a slight bias, since the mean of the observations of the
# standard deviation is not as close to the true value 0.1
# as we could expect.
repetitions_size = 100
true_standard_deviation = 0.1
sample_size = 100
coefficients = ot.Point([3.0, 2.0, -1.0])
estimator = "standard-deviation"
view = otv.View(
plot_sample_by_kernel_smoothing(
sample_size, true_standard_deviation, coefficients, estimator, repetitions_size
),
figure_kw={"figsize": (6.0, 3.5)},
)
plt.subplots_adjust(bottom=0.25)
# %%
# If we use a sample size equal to :math:`n = 100` with
# :math:`p = 3` parameters, we see that the distribution is almost normal.
# We notice that the bias disappeared.
# %%
otv.View.ShowAll()
|