1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
|
// SPDX-FileCopyrightText: Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
// SPDX-FileCopyrightText: Copyright 2010 Sandia Corporation
// SPDX-License-Identifier: LicenseRef-BSD-3-Clause-Sandia-USGov
/**
* @class vtkDescriptiveStatistics
* @brief A class for univariate descriptive statistics
*
*
* Given a selection of columns of interest in an input data table, this
* class provides the following functionalities, depending on the chosen
* execution options:
* * Learn: calculate extremal values, sample mean, and M2, M3, and M4 aggregates
* (cf. P. Pebay, Formulas for robust, one-pass parallel computation of covariances
* and Arbitrary-Order Statistical Moments, Sandia Report SAND2008-6212, Sep 2008,
* http://infoserve.sandia.gov/sand_doc/2008/086212.pdf for details)
* * Derive: calculate unbiased variance estimator, standard deviation estimator,
* two skewness estimators, and two kurtosis excess estimators.
* * Assess: given an input data set, a reference value and a non-negative deviation,
* mark each datum with corresponding relative deviation (1-dimensional Mahalanobis
* distance). If the deviation is zero, then mark each datum that is equal to the
* reference value with 0, and all others with 1. By default, the reference value
* and the deviation are, respectively, the mean and the standard deviation of the
* input model.
* * Test: calculate Jarque-Bera statistic and, if VTK to R interface is available,
* retrieve corresponding p-value for normality testing.
*
* Among the derived statistics, the variance, the standard deviation, the skewness
* and the kurtosis can be estimated in two ways: using the sample version of those
* statistics, or the population version. Specify whether a sample estimate or population
* estimate is done by setting `SampleEstimate`. By default, `SampleEstimate == true`, hence
* the sample version of the statistics is estimated,
* which produces unbiased estimators (except for the sample standard deviation).
* The sample estimate should be used for input that represents a subset of the whole
* population of study. On the other hand, when `SampleEstimate == false`, the population
* version of the statistics is estimated. If the input doesn't contain all the samples
* from the population of study, then a bias is induced (the variance is slightly smaller than it
* should be). One can read about Bessel's correction to understand better where this comes from.
* That being said, on very large data, the difference between the 2 estimation formulas
* becomes very small, so in those instances,
* either state of `SampleEstimate` should yield very similar results
* (see explicit formulas below).
*
* \verbatim
*
* The formulas used are as follows, writing \f( x_k \f) for the samples of \f( X \f),
* \f( \bar{x} \f) for their mean and \f( N \f) for the number of samples:
* - Sample estimate:
* \f[
* Var{X} = s^2 = \frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^2 }{N - 1}
* \f]
* \f[
* Skew{X} = \frac{N}{(N - 1)(N - 2)}
* \frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^3 }{s^3}
* \f]
* \f[
* Kurt{X} = \frac{N(N + 1)}{(N - 1)(N - 2)(N - 3)}
* \frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^4 }{s^4}
* - 3 \frac{(N - 1)^2}{(N - 2)(N - 3)}
* \f]
* - Population estimate:
* \f[
* Var{X} = \sigma^2 = \frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^2 }{N}
* \f]
* \f[
* Skew{X} = \frac{1}{N}\frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^3 }{\sigma^3}
* \f]
* \f[
* Kurt{X} = \frac{1}{N}\frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^4 }{\sigma^4} - 3
* \f]
*
* \f(\sigma\f) is the population standard deviation, and \f(s\f) is the sample standard deviation.
* Note that the kurtosis reported is the excess kurtosis, i.e. it is corrected so that a Gaussian distribution yields a kurtosis of 0.
*
* In the instance where \f(\sigma = 0\f) or \f(s = 0\f), the skewness and kurtosis are undefined.
* Thus they output a `NaN`. Similarly, if there are no samples, then all derived statistics
* yield a `NaN`.
*
* \endverbatim
*
* @par Thanks:
* Thanks to Philippe Pebay and David Thompson from Sandia National Laboratories
* for implementing this class.
* Updated by Philippe Pebay, Kitware SAS 2012
*/
#ifndef vtkDescriptiveStatistics_h
#define vtkDescriptiveStatistics_h
#include "vtkFiltersStatisticsModule.h" // For export macro
#include "vtkStatisticsAlgorithm.h"
VTK_ABI_NAMESPACE_BEGIN
class vtkMultiBlockDataSet;
class vtkStringArray;
class vtkTable;
class vtkVariant;
class vtkDoubleArray;
class VTKFILTERSSTATISTICS_EXPORT vtkDescriptiveStatistics : public vtkStatisticsAlgorithm
{
public:
// Standard VTK type information, printing and factory declarations.
vtkTypeMacro(vtkDescriptiveStatistics, vtkStatisticsAlgorithm);
void PrintSelf(ostream& os, vtkIndent indent) override;
static vtkDescriptiveStatistics* New();
///@{
/**
* Getter / Setter on `SampleEstimate`. When turned on, descriptive statistics
* computed by this filter assume that the input data only holds a sample of the whole
* population of study. In effect, the sample variance, the sample standard deviation,
* the sample skewness and the sample kurtosis are estimated. When turned off, the population
* variance, the population standard deviation, the population skewness and the population
* kurtosis are estimated instead.
*
* In short, if the input data is a full description of the population being studied,
* `SampleEstimate` should be turned off. If the input data is a sample of the population being
* studied, then `SampleEstimate` should be turned on. By default, `SampleEstimate` is turned
* on, as it is the most likely case.
*
* Please see class description for a full description of the formulas.
*
* @note For large data, the difference between the population estimate and the sample
* estimate becomes small, so the choice of this parameter matters less.
*/
vtkSetMacro(SampleEstimate, bool);
vtkGetMacro(SampleEstimate, bool);
vtkBooleanMacro(SampleEstimate, bool);
///@}
///@{
/**
* Set/get whether the deviations returned should be signed, or should
* only have their magnitude reported.
*
* NOTE(review): the default is assigned in the constructor, which is defined
* outside this header. It is expected to be unsigned (magnitude-only)
* deviations; confirm against the implementation file.
*/
vtkSetMacro(SignedDeviations, vtkTypeBool);
vtkGetMacro(SignedDeviations, vtkTypeBool);
vtkBooleanMacro(SignedDeviations, vtkTypeBool);
///@}
///@{
/**
* If there is a ghost array in the input, then ghosts matching the `GhostsToSkip` mask
* will be skipped. It is set to 0xff by default (every ghost type is skipped).
*
* @sa
* vtkDataSetAttributes
* vtkFieldData
* vtkPointData
* vtkCellData
*/
vtkSetMacro(GhostsToSkip, unsigned char);
vtkGetMacro(GhostsToSkip, unsigned char);
///@}
/**
* Given a collection of models, calculate the aggregate model.
*/
void Aggregate(vtkDataObjectCollection*, vtkMultiBlockDataSet*) override;
protected:
vtkDescriptiveStatistics();
~vtkDescriptiveStatistics() override;
/**
* Execute the calculations required by the Learn option, given some input Data
* NB: input parameters are unused.
*/
void Learn(vtkTable*, vtkTable*, vtkMultiBlockDataSet*) override;
/**
* Execute the calculations required by the Derive option.
*/
void Derive(vtkMultiBlockDataSet*) override;
/**
* Execute the calculations required by the Test option.
*/
void Test(vtkTable*, vtkMultiBlockDataSet*, vtkTable*) override;
/**
* Execute the calculations required by the Assess option.
*
* This simply forwards to the superclass Assess overload, passing 1 as its
* final argument (presumably the number of variables assessed per request,
* since this class is univariate -- confirm against vtkStatisticsAlgorithm).
*/
void Assess(vtkTable* inData, vtkMultiBlockDataSet* inMeta, vtkTable* outData) override
{
this->Superclass::Assess(inData, inMeta, outData, 1);
}
/**
* Calculate p-value. This will be overridden using the object factory with an
* R implementation if R is present.
*/
virtual vtkDoubleArray* CalculatePValues(vtkDoubleArray*);
/**
* Provide the appropriate assessment functor.
* The functor is handed back through the `dfunc` reference-to-pointer argument.
*/
void SelectAssessFunctor(vtkTable* outData, vtkDataObject* inMeta, vtkStringArray* rowNames,
AssessFunctor*& dfunc) override;
// Storage for the properties exposed by the Set/Get macros above.
bool SampleEstimate;
vtkTypeBool SignedDeviations;
unsigned char GhostsToSkip;
private:
// Copy construction and copy assignment are disabled.
vtkDescriptiveStatistics(const vtkDescriptiveStatistics&) = delete;
void operator=(const vtkDescriptiveStatistics&) = delete;
};
VTK_ABI_NAMESPACE_END
#endif
|