File: univariate_statistics.qbk

package info (click to toggle)
scipy 1.16.0-1exp7
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 234,820 kB
  • sloc: cpp: 503,145; python: 344,611; ansic: 195,638; javascript: 89,566; fortran: 56,210; cs: 3,081; f90: 1,150; sh: 848; makefile: 785; pascal: 284; csh: 135; lisp: 134; xml: 56; perl: 51
file content (403 lines) | stat: -rw-r--r-- 19,301 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
[/
  Copyright 2018 Nick Thompson
  Copyright 2020 Matt Borland

  Distributed under the Boost Software License, Version 1.0.
  (See accompanying file LICENSE_1_0.txt or copy at
  http://www.boost.org/LICENSE_1_0.txt).
]

[section:univariate_statistics Univariate Statistics]

[heading Synopsis]

``
#include <boost/math/statistics/univariate_statistics.hpp>

namespace boost::math::statistics {

    template<class ExecutionPolicy, class Container>
    auto mean(ExecutionPolicy&& exec, Container const & c);

    template<class Container>
    auto mean(Container const & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto mean(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ForwardIterator>
    auto mean(ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class Container>
    auto variance(ExecutionPolicy&& exec, Container const & c);

    template<class Container>
    auto variance(Container const & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto variance(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ForwardIterator>
    auto variance(ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class Container>
    auto sample_variance(ExecutionPolicy&& exec, Container const & c);

    template<class Container>
    auto sample_variance(Container const & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto sample_variance(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ForwardIterator>
    auto sample_variance(ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class Container>
    auto mean_and_sample_variance(ExecutionPolicy&& exec, Container const & c);

    template<class Container>
    auto mean_and_sample_variance(Container const & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto mean_and_sample_variance(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class Container>
    auto skewness(ExecutionPolicy&& exec, class Container);

    template<class Container>
    auto skewness(Container const & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto skewness(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ForwardIterator>
    auto skewness(ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class Container>
    auto kurtosis(ExecutionPolicy&& exec, Container const & c);
    
    template<class Container>
    auto kurtosis(Container const & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto kurtosis(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ForwardIterator>
    auto kurtosis(ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class Container>
    auto excess_kurtosis(ExecutionPolicy&& exec, Container const & c);

    template<class Container>
    auto excess_kurtosis(Container const & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto excess_kurtosis(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ForwardIterator>
    auto excess_kurtosis(ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class Container>
    auto first_four_moments(ExecutionPolicy&& exec, Container const & c);

    template<class Container>
    auto first_four_moments(Container const & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto first_four_moments(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ForwardIterator>
    auto first_four_moments(ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class Container>
    auto median(ExecutionPolicy&& exec, Container const & c);

    template<class Container>
    auto median(Container & c);

    template<class ExecutionPolicy, class ForwardIterator>
    auto median(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last);

    template<class ForwardIterator>
    auto median(ForwardIterator first, ForwardIterator last);

    template<class ExecutionPolicy, class RandomAccessIterator>
    auto median_absolute_deviation(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator::value_type center=std::numeric_limits<Real>::quiet_NaN());

    template<class RandomAccessIterator>
    auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<Real>::quiet_NaN());

    template<class ExecutionPolicy, class RandomAccessContainer>
    auto median_absolute_deviation(ExecutionPolicy&& exec, RandomAccessContainer v, typename RandomAccessContainer::value_type center=std::numeric_limits<Real>::quiet_NaN());

    template<class RandomAccessContainer>
    auto median_absolute_deviation(RandomAccessContainer v, typename RandomAccessContainer::value_type center=std::numeric_limits<Real>::quiet_NaN());

    template<class ExecutionPolicy, class RandomAccessIterator>
    auto interquartile_range(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last);

    template<class RandomAccessIterator>
    auto interquartile_range(RandomAccessIterator first, RandomAccessIterator last);

    template<class ExecutionPolicy, class RandomAccessContainer>
    auto interquartile_range(ExecutionPolicy&& exec, RandomAccessContainer v);

    template<class RandomAccessContainer>
    auto interquartile_range(RandomAccessContainer v);

    template<class ExecutionPolicy, class RandomAccessContainer>
    auto gini_coefficient(ExecutionPolicy&& exec, RandomAccessContainer & c);

    template<class RandomAccessContainer>
    auto gini_coefficient(RandomAccessContainer & c);

    template<class ExecutionPolicy, class RandomAccessIterator>
    auto gini_coefficient(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last);

    template<class RandomAccessIterator>
    auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last);

    template<class ExecutionPolicy, class RandomAccessContainer>
    auto sample_gini_coefficient(ExecutionPolicy&& exec, RandomAccessContainer & c);

    template<class RandomAccessContainer>
    auto sample_gini_coefficient(RandomAccessContainer & c);

    template<class ExecutionPolicy, class RandomAccessIterator>
    auto sample_gini_coefficient(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last);

    template<class RandomAccessIterator>
    auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last);

    template<class ExecutionPolicy, class ForwardIterator, class OutputIterator>
    OutputIterator mode(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, OutputIterator output);

    template<class ForwardIterator, class OutputIterator>
    OutputIterator mode(ForwardIterator first, ForwardIterator last, OutputIterator output);

    template<class ExecutionPolicy, class Container, class OutputIterator>
    OutputIterator mode(ExecutionPolicy&& exec, Container const & c, OutputIterator output);

    template<class Container, class OutputIterator>
    OutputIterator mode(Container const & c, OutputIterator output);

    template<class ExecutionPolicy, class ForwardIterator, class Real = typename std::iterator_traits<ForwardIterator>::value_type>
    std::list<Real> mode(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last)

    template<class ExecutionPolicy, class Container, class Real = typename std::iterator_traits<Container>::value_type>
    std::list<Real> mode(ExecutionPolicy&& exec, Container & c)
    
    template<class ForwardIterator, class Real = typename std::iterator_traits<ForwardIterator>::value_type>
    std::list<Real> mode(ForwardIterator first, ForwardIterator last)

    template<class Container, class Real = typename std::iterator_traits<Container>::value_type>
    std::list<Real> mode(Container & c)
}
``

[heading Description]

The file `boost/math/statistics/univariate_statistics.hpp` is a set of facilities for computing scalar values from vectors. All methods in the above list are 
compatible with C++11. In order to pass an ExecutionPolicy to the methods and access parallel calculations C++17 is required.

Many of these functionals have trivial naive implementations, but experienced programmers will recognize that even trivial algorithms are easy to screw up, and that numerical instabilities often lurk in corner cases.
We have attempted to do our "due diligence" to root out these problems-scouring the literature for numerically stable algorithms for even the simplest of functionals.

/Nota bene/: Some similar functionality is provided in [@https://www.boost.org/doc/libs/release/doc/html/accumulators.html Boost Accumulators Framework].
These accumulators should be used in real-time applications; `univariate_statistics.hpp` should be used when CPU vectorization is needed.
As a reminder, remember that to actually /get/ vectorization, compile with `-march=native -O3` flags.

We now describe each functional in detail.
Our examples use `std::vector<double>` to hold the data, but this not required.
In general, you can store your data in an Eigen array, and Armadillo vector, `std::array`, and for many of the routines, a `std::forward_list`.
These routines are usable in float, double, long double, and Boost.Multiprecision precision, as well as their complex extensions whenever the computation is well-defined.
For certain operations (total variation, for example) integer inputs are supported.

/Nota bene/: The default execution policy for every function is std::execution::seq.

[heading Mean]

    std::vector<double> v{1,2,3,4,5};
    double mu = boost::math::statistics::mean(v.cbegin(), v.cend());
    // Alternative syntax if you want to use entire container:
    mu = boost::math::statistics::mean(v);
    // Alternative syntax if you want to use parallel execution:
    mu = boost::math::statistics::mean(std::execution::par, v);

The implementation follows [@https://doi.org/10.1137/1.9780898718027 Higham 1.6a].
The data is not modified and must be forward iterable.
Works with real and integer data.
If the input is an integer type, the output is a double precision float.

[heading Variance]

    std::vector<double> v{1,2,3,4,5};
    Real sigma_sq = boost::math::statistics::variance(v.cbegin(), v.cend());
    // Alternative syntax if you want to use parallel execution:
    sigma_sq = boost::math::statistics::variance(std::execution::par, v.cbegin(), v.cend());

If you don't need to calculate on a subset of the input, then the range call is more terse:

    std::vector<double> v{1,2,3,4,5};
    Real sigma_sq = boost::math::statistics::variance(v);

The implementation follows [@https://doi.org/10.1137/1.9780898718027 Higham 1.6b].
The input data must be forward iterable and the range `[first, last)` must contain at least two elements.
It is /not/ in general sensible to pass complex numbers to this routine.
If integers are passed as input, then the output is a double precision float.

`boost::math::statistics::variance` returns the population variance.
If you want a sample variance, use

    std::vector<double> v{1,2,3,4,5};
    Real sn_sq = boost::math::statistics::sample_variance(v);

[heading Skewness]

Computes the skewness of a dataset:

    std::vector<double> v{1,2,3,4,5};
    double skewness = boost::math::statistics::skewness(v);
    // skewness = 0.
    // Alternative syntax if you want to use parallel execution:
    skewness = boost::math::statistics::skewness(std::execution::par, v);

The input vector is not modified, works with integral and real data.
If the input data is integral, the output is a double precision float.

For a dataset consisting of a single constant value, we take the skewness to be zero by definition.

The implementation follows [@https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf Pebay].

[heading Kurtosis]

Computes the kurtosis of a dataset:

    std::vector<double> v{1,2,3,4,5};
    double kurtosis = boost::math::statistics::kurtosis(v);
    // kurtosis = 17/10
    // Alternative syntax if you want to use parallel execution:
    kurtosis = boost::math::statistics::kurtosis(std::execution::par, v);

The implementation follows [@https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf Pebay].
The input data must be forward iterable and must consist of real or integral values.
If the input data is integral, the output is a double precision float.
Note that this is /not/ the excess kurtosis.
If you require the excess kurtosis, use `boost::math::statistics::excess_kurtosis`.
This function simply subtracts 3 from the kurtosis, but it makes eminently clear our definition of kurtosis.

[heading First four moments]

Simultaneously computes the first four [@https://en.wikipedia.org/wiki/Central_moment central moments] in a single pass through the data:

    std::vector<double> v{1,2,3,4,5};
    auto [M1, M2, M3, M4] = boost::math::statistics::first_four_moments(v);
    // Alternative syntax if you want to use parallel execution:
    auto [M1, M2, M3, M4] = boost::math::statistics::first_four_moments(std::execution::par, v);

[heading Median]

Computes the median of a dataset:

    std::vector<double> v{1,2,3,4,5};
    double m = boost::math::statistics::median(v.begin(), v.end());
    // Alternative syntax if you want to use parallel execution:
    m = boost::math::statistics::median(std::execution::par, v.begin(), v.end());

/Nota bene: The input vector is modified./
The calculation of the median is a thin wrapper around the C++11 [@https://en.cppreference.com/w/cpp/algorithm/nth_element `nth_element`].
Therefore, all requirements of `std::nth_element` are inherited by the median calculation.
In particular, the container must allow random access.

[heading Median Absolute Deviation]

Computes the [@https://en.wikipedia.org/wiki/Median_absolute_deviation median absolute deviation] of a dataset:

    std::vector<double> v{1,2,3,4,5};
    double mad = boost::math::statistics::median_absolute_deviation(v);
    // Alternative syntax if you want to use parallel execution:
    mad = boost::math::statistics::median_absolute_deviation(std::execution::par, v);

By default, the deviation from the median is used.
If you have some prior that the median is zero, or wish to compute the median absolute deviation from the mean,
use the following:

    // prior is that center is zero:
    double center = 0;
    double mad = boost::math::statistics::median_absolute_deviation(v, center);

    // compute median absolute deviation from the mean:
    double mu = boost::math::statistics::mean(v);
    double mad = boost::math::statistics::median_absolute_deviation(v, mu);

/Nota bene:/ The input vector is modified.
Again the vector is passed into a call to [@https://en.cppreference.com/w/cpp/algorithm/nth_element `nth_element`].

[heading Interquartile Range]

Computes the [@https://en.wikipedia.org/wiki/Interquartile_range interquartile range] of a dataset:

    std::vector<double> v{1,2,3,4,5};
    double iqr = boost::math::statistics::interquartile_range(v);
    // Q1 = 1.5, Q3 = 4.5 => iqr = 3
    // Alternative syntax if you want to use parallel execution:
    iqr = boost::math::statistics::interquartile_range(std::execution::par, v);

For a vector of length /2n+1/ or /2n/, the first quartile /Q/[sub 1] is the median of the /n/ smallest values,
and the third quartile /Q/[sub 3] is the median of the /n/ largest values.
The interquartile range is then /Q/[sub 3] - /Q/[sub 1].
The function `interquartile_range`, like the `median`, calls into `std::nth_element`, and hence partially sorts the data.

[heading Gini Coefficient]

Compute the Gini coefficient of a dataset:

    std::vector<double> v{1,0,0,0};
    double gini = boost::math::statistics::gini_coefficient(v);
    // gini = 3/4
    double s_gini = boost::math::statistics::sample_gini_coefficient(v);
    // s_gini = 1.
    std::vector<double> w{1,1,1,1};
    gini = boost::math::statistics::gini_coefficient(w.begin(), w.end());
    // gini = 0, as all elements are now equal.
    // Alternative syntax if you want to use parallel execution:
    gini = boost::math::statistics::gini_coefficient(std::execution::par, w.begin(), w.end());

/Nota bene/: The input data is altered: in particular, it is sorted. Makes a call to `std::sort`, and as such requires random access iterators.

The sample Gini coefficient lies in the range \[0,1\], whereas the population Gini coefficient is in the range \[0, 1 - 1/ /n/\].

/Nota bene:/ There is essentially no reason to pass negative values to the Gini coefficient function.
However, a use case (measuring wealth inequality when some people have negative wealth) exists, so we do not throw an exception when negative values are encountered.
You should have /very/ good cause to pass negative values to the Gini coefficient calculator.
Another use case is found in signal processing, but the sorting is by magnitude and hence has a different implementation.
See `absolute_gini_coefficient` for details.

[heading Mode]

Compute the mode(s) of a data set:

    std::vector<int> v {1, 3, 2, 2, 5, 4};
    std::vector<int> modes; 
    boost::math::statistics::mode(v, std::back_inserter(modes));
    // Mode is 2, modes.size() == 1
    std::deque<int> d_modes;
    std::array<int, 7> w {2, 2, 3, 1, 5, 4, 4};
    boost::math::statistics::mode(w, std::back_inserter(d_modes));
    // Modes are 2 and 4, d_modes.size() == 2
    // Alternative syntax if you want to use parallel execution:
    boost::math::statistics::mode(std::execution::par, w, std::back_inserter(d_modes));
    // Additional syntax to have mode output a `std::list` of modes. The same execution policy syntax applies
    auto list_modes = boost::math::statistics::mode(w);

/Nota bene/: The input data must be sorted in order to pass a forward iterator. If data is not sorted random access iterators are required for a call to `std::sort`.

[heading References]

* Higham, Nicholas J. ['Accuracy and stability of numerical algorithms.] Vol. 80. Siam, 2002.
* Philippe P. Pebay. ['Formulas for Robust, One-Pass Parallel Computation of Covariances and Arbitrary-Order Statistical Moments.] Technical Report SAND2008-6212, Sandia National Laboratories, September 2008.
* Tony F. Chan, Gene H. Golub, Randall J. LeVeque (1979), ['Updating Formulae and a Pairwise Algorithm for Computing Sample Variances.], Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
* Philippe Pebay, Timothy Terriberry, Hemanth Kolla, Janine Bennett (2016), ['Numerically Stable, Scalable Formulas for Parallel and Online Computation of Higher-Order Multivariate Central Moments with Arbitrary Weights], Computational Statistics, Springer

[endsect]
[/section:univariate_statistics Univariate Statistics]