File: Quantile.hs

package info (click to toggle)
haskell-statistics 0.16.2.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 640 kB
  • sloc: haskell: 6,819; ansic: 35; python: 33; makefile: 9
file content (400 lines) | stat: -rw-r--r-- 14,338 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE DeriveFoldable     #-}
{-# LANGUAGE DeriveFunctor      #-}
{-# LANGUAGE DeriveGeneric      #-}
{-# LANGUAGE FlexibleContexts   #-}
{-# LANGUAGE ViewPatterns       #-}
-- |
-- Module    : Statistics.Quantile
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Functions for approximating quantiles, i.e. points taken at regular
-- intervals from the cumulative distribution function of a random
-- variable.
--
-- The number of quantiles is described below by the variable /q/, so
-- with /q/=4, a 4-quantile (also known as a /quartile/) has 4
-- intervals, and contains 5 points.  The parameter /k/ describes the
-- desired point, where 0 ≤ /k/ ≤ /q/.

module Statistics.Quantile
    (
    -- * Quantile estimation functions
    -- $cont_quantiles
      ContParam(..)
    , Default(..)
    , quantile
    , quantiles
    , quantilesVec
    -- ** Parameters for the continuous sample method
    , cadpw
    , hazen
    , spss
    , s
    , medianUnbiased
    , normalUnbiased
    -- * Other algorithms
    , weightedAvg
    -- * Median & other specializations
    , median
    , mad
    , midspread
    -- * Deprecated
    , continuousBy
    -- * References
    -- $references
    ) where

import           Data.Binary            (Binary)
import           Data.Aeson             (ToJSON,FromJSON)
import           Data.Data              (Data,Typeable)
import           Data.Default.Class
import qualified Data.Foldable        as F
import           Data.Vector.Generic ((!))
import qualified Data.Vector          as V
import qualified Data.Vector.Generic  as G
import qualified Data.Vector.Unboxed  as U
import qualified Data.Vector.Storable as S
import GHC.Generics (Generic)

import Statistics.Function (partialSort)


----------------------------------------------------------------
-- Quantile estimation
----------------------------------------------------------------

-- | O(/n/·log /n/). Estimate the /k/th /q/-quantile of a sample,
-- using the weighted average method. Up to rounding errors it's same
-- as @quantile s@.
--
-- The following properties should hold otherwise an error will be thrown.
--
--   * the length of the input is greater than @0@
--
--   * the input does not contain @NaN@
--
--   * k ≥ 0 and k ≤ q
weightedAvg :: G.Vector v Double =>
               Int        -- ^ /k/, the desired quantile.
            -> Int        -- ^ /q/, the number of quantiles.
            -> v Double   -- ^ /x/, the sample data.
            -> Double
weightedAvg k q x
  | G.any isNaN x   = modErr "weightedAvg" "Sample contains NaNs"
  | n == 0          = modErr "weightedAvg" "Sample is empty"
  | n == 1          = G.head x
  | q < 2           = modErr "weightedAvg" "At least 2 quantiles is needed"
  | k == q          = G.maximum x
  | k >= 0 || k < q = xj + g * (xj1 - xj)
  | otherwise       = modErr "weightedAvg" "Wrong quantile number"
  where
    j   = floor idx
    idx = fromIntegral (n - 1) * fromIntegral k / fromIntegral q
    g   = idx - fromIntegral j
    xj  = sx ! j
    xj1 = sx ! (j+1)
    sx  = partialSort (j+2) x
    n   = G.length x
{-# SPECIALIZE weightedAvg :: Int -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE weightedAvg :: Int -> Int -> V.Vector Double -> Double #-}
{-# SPECIALIZE weightedAvg :: Int -> Int -> S.Vector Double -> Double #-}


----------------------------------------------------------------
-- Quantiles continuous algorithm
----------------------------------------------------------------

-- $cont_quantiles
--
-- Below is family of functions which use same algorithm for estimation
-- of sample quantiles. It approximates empirical CDF as continuous
-- piecewise function which interpolates linearly between points
-- \((X_k,p_k)\) where \(X_k\) is k-th order statistics (k-th smallest
-- element) and \(p_k\) is probability corresponding to
-- it. 'ContParam' determines how \(p_k\) is chosen. For more detailed
-- explanation see [Hyndman1996].
--
-- This is the method used by most statistical software, such as R,
-- Mathematica, SPSS, and S.


-- | Parameters /α/ and /β/ to the 'continuousBy' function. Exact
--   meaning of parameters is described in [Hyndman1996] in section
--   \"Piecewise linear functions\"
data ContParam = ContParam {-# UNPACK #-} !Double {-# UNPACK #-} !Double
  deriving (Show,Eq,Ord,Data,Typeable,Generic)

-- | We use 's' as default value which is same as R's default.
instance Default ContParam where
  def = s

instance Binary   ContParam
instance ToJSON   ContParam
instance FromJSON ContParam

-- | O(/n/·log /n/). Estimate the /k/th /q/-quantile of a sample /x/,
--   using the continuous sample method with the given parameters.
--
--   The following properties should hold, otherwise an error will be thrown.
--
--     * input sample must be nonempty
--
--     * the input does not contain @NaN@
--
--     * 0 ≤ k ≤ q
quantile :: G.Vector v Double
         => ContParam  -- ^ Parameters /α/ and /β/.
         -> Int        -- ^ /k/, the desired quantile.
         -> Int        -- ^ /q/, the number of quantiles.
         -> v Double   -- ^ /x/, the sample data.
         -> Double
quantile param q nQ xs
  | nQ < 2         = modErr "continuousBy" "At least 2 quantiles is needed"
  | badQ nQ q      = modErr "continuousBy" "Wrong quantile number"
  | G.any isNaN xs = modErr "continuousBy" "Sample contains NaNs"
  | otherwise      = estimateQuantile sortedXs pk
  where
    pk       = toPk param n q nQ
    sortedXs = psort xs $ floor pk + 1
    n        = G.length xs
{-# INLINABLE quantile #-}
{-# SPECIALIZE
    quantile :: ContParam -> Int -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE
    quantile :: ContParam -> Int -> Int -> V.Vector Double -> Double #-}
{-# SPECIALIZE
    quantile :: ContParam -> Int -> Int -> S.Vector Double -> Double #-}

-- | O(/k·n/·log /n/). Estimate set of the /k/th /q/-quantile of a
--   sample /x/, using the continuous sample method with the given
--   parameters. This is faster than calling quantile repeatedly since
--   sample should be sorted only once
--
--   The following properties should hold, otherwise an error will be thrown.
--
--     * input sample must be nonempty
--
--     * the input does not contain @NaN@
--
--     * for every k in set of quantiles 0 ≤ k ≤ q
quantiles :: (G.Vector v Double, F.Foldable f, Functor f)
  => ContParam
  -> f Int
  -> Int
  -> v Double
  -> f Double
quantiles param qs nQ xs
  | nQ < 2             = modErr "quantiles" "At least 2 quantiles is needed"
  | F.any (badQ nQ) qs = modErr "quantiles" "Wrong quantile number"
  | G.any isNaN xs     = modErr "quantiles" "Sample contains NaNs"
  -- Doesn't matter what we put into empty container
  | null qs            = 0 <$ qs
  | otherwise          = fmap (estimateQuantile sortedXs) ks'
  where
    ks'      = fmap (\q -> toPk param n q nQ) qs
    sortedXs = psort xs $ floor (F.maximum ks') + 1
    n        = G.length xs
{-# INLINABLE quantiles #-}
{-# SPECIALIZE quantiles
      :: (Functor f, F.Foldable f) => ContParam -> f Int -> Int -> V.Vector Double -> f Double #-}
{-# SPECIALIZE quantiles
      :: (Functor f, F.Foldable f) => ContParam -> f Int -> Int -> U.Vector Double -> f Double #-}
{-# SPECIALIZE quantiles
      :: (Functor f, F.Foldable f) => ContParam -> f Int -> Int -> S.Vector Double -> f Double #-}

-- | O(/k·n/·log /n/). Same as quantiles but uses 'G.Vector' container
--   instead of 'Foldable' one.
quantilesVec :: (G.Vector v Double, G.Vector v Int)
  => ContParam
  -> v Int
  -> Int
  -> v Double
  -> v Double
quantilesVec param qs nQ xs
  | nQ < 2             = modErr "quantilesVec" "At least 2 quantiles is needed"
  | G.any (badQ nQ) qs = modErr "quantilesVec" "Wrong quantile number"
  | G.any isNaN xs     = modErr "quantilesVec" "Sample contains NaNs"
  | G.null qs          = G.empty
  | otherwise          = G.map (estimateQuantile sortedXs) ks'
  where
    ks'      = G.map (\q -> toPk param n q nQ) qs
    sortedXs = psort xs $ floor (G.maximum ks') + 1
    n        = G.length xs
{-# INLINABLE quantilesVec #-}
{-# SPECIALIZE quantilesVec
      :: ContParam -> V.Vector Int -> Int -> V.Vector Double -> V.Vector Double #-}
{-# SPECIALIZE quantilesVec
      :: ContParam -> U.Vector Int -> Int -> U.Vector Double -> U.Vector Double #-}
{-# SPECIALIZE quantilesVec
      :: ContParam -> S.Vector Int -> Int -> S.Vector Double -> S.Vector Double #-}


-- Returns True if quantile number is out of range
badQ :: Int -> Int -> Bool
badQ nQ q = q < 0 || q > nQ

-- Obtain k from equation for p_k [Hyndman1996] p.363.  Note that
-- equation defines p_k for integer k but we calculate it as real
-- value and will use fractional part for linear interpolation. This
-- is correct since equation is linear.
toPk
  :: ContParam
  -> Int        -- ^ /n/ number of elements
  -> Int        -- ^ /k/, the desired quantile.
  -> Int        -- ^ /q/, the number of quantiles.
  -> Double
toPk (ContParam a b) (fromIntegral -> n) q nQ
  = a + p * (n + 1 - a - b)
  where
    p = fromIntegral q / fromIntegral nQ

-- Estimate quantile for given k (including fractional part)
estimateQuantile :: G.Vector v Double => v Double -> Double -> Double
{-# INLINE estimateQuantile #-}
estimateQuantile sortedXs k'
  = (1-g) * item (k-1) + g * item k
  where
    (k,g) = properFraction k'
    item  = (sortedXs !) . clamp
    --
    clamp = max 0 . min (n - 1)
    n     = G.length sortedXs

psort :: G.Vector v Double => v Double -> Int -> v Double
psort xs k = partialSort (max 0 $ min (G.length xs - 1) k) xs
{-# INLINE psort #-}


-- | California Department of Public Works definition, /α/=0, /β/=1.
-- Gives a linear interpolation of the empirical CDF.  This
-- corresponds to method 4 in R and Mathematica.
cadpw :: ContParam
cadpw = ContParam 0 1

-- | Hazen's definition, /α/=0.5, /β/=0.5.  This is claimed to be
-- popular among hydrologists.  This corresponds to method 5 in R and
-- Mathematica.
hazen :: ContParam
hazen = ContParam 0.5 0.5

-- | Definition used by the SPSS statistics application, with /α/=0,
-- /β/=0 (also known as Weibull's definition).  This corresponds to
-- method 6 in R and Mathematica.
spss :: ContParam
spss = ContParam 0 0

-- | Definition used by the S statistics application, with /α/=1,
-- /β/=1.  The interpolation points divide the sample range into @n-1@
-- intervals.  This corresponds to method 7 in R and Mathematica and
-- is default in R.
s :: ContParam
s = ContParam 1 1

-- | Median unbiased definition, /α/=1\/3, /β/=1\/3. The resulting
-- quantile estimates are approximately median unbiased regardless of
-- the distribution of /x/.  This corresponds to method 8 in R and
-- Mathematica.
medianUnbiased :: ContParam
medianUnbiased = ContParam third third
    where third = 1/3

-- | Normal unbiased definition, /α/=3\/8, /β/=3\/8.  An approximately
-- unbiased estimate if the empirical distribution approximates the
-- normal distribution.  This corresponds to method 9 in R and
-- Mathematica.
normalUnbiased :: ContParam
normalUnbiased = ContParam ta ta
    where ta = 3/8

modErr :: String -> String -> a
modErr f err = error $ "Statistics.Quantile." ++ f ++ ": " ++ err


----------------------------------------------------------------
-- Specializations
----------------------------------------------------------------

-- | O(/n/·log /n/) Estimate median of sample
median :: G.Vector v Double
       => ContParam  -- ^ Parameters /α/ and /β/.
       -> v Double   -- ^ /x/, the sample data.
       -> Double
{-# INLINE median #-}
median p = quantile p 1 2

-- | O(/n/·log /n/). Estimate the range between /q/-quantiles 1 and
-- /q/-1 of a sample /x/, using the continuous sample method with the
-- given parameters.
--
-- For instance, the interquartile range (IQR) can be estimated as
-- follows:
--
-- > midspread medianUnbiased 4 (U.fromList [1,1,2,2,3])
-- > ==> 1.333333
midspread :: G.Vector v Double =>
             ContParam  -- ^ Parameters /α/ and /β/.
          -> Int        -- ^ /q/, the number of quantiles.
          -> v Double   -- ^ /x/, the sample data.
          -> Double
midspread param k x
  | G.any isNaN x = modErr "midspread" "Sample contains NaNs"
  | k <= 0        = modErr "midspread" "Nonpositive number of quantiles"
  | otherwise     = let Pair x1 x2 = quantiles param (Pair 1 (k-1)) k x
                    in  x2 - x1
{-# INLINABLE  midspread #-}
{-# SPECIALIZE midspread :: ContParam -> Int -> U.Vector Double -> Double #-}
{-# SPECIALIZE midspread :: ContParam -> Int -> V.Vector Double -> Double #-}
{-# SPECIALIZE midspread :: ContParam -> Int -> S.Vector Double -> Double #-}

data Pair a = Pair !a !a
  deriving (Functor, F.Foldable)


-- | O(/n/·log /n/). Estimate the median absolute deviation (MAD) of a
--   sample /x/ using 'continuousBy'. It's robust estimate of
--   variability in sample and defined as:
--
--   \[
--   MAD = \operatorname{median}(| X_i - \operatorname{median}(X) |)
--   \]
mad :: G.Vector v Double
    => ContParam  -- ^ Parameters /α/ and /β/.
    -> v Double   -- ^ /x/, the sample data.
    -> Double
mad p xs
  = median p $ G.map (abs . subtract med) xs
  where
    med = median p xs
{-# INLINABLE  mad #-}
{-# SPECIALIZE mad :: ContParam -> U.Vector Double -> Double #-}
{-# SPECIALIZE mad :: ContParam -> V.Vector Double -> Double #-}
{-# SPECIALIZE mad :: ContParam -> S.Vector Double -> Double #-}


----------------------------------------------------------------
-- Deprecated
----------------------------------------------------------------

continuousBy :: G.Vector v Double =>
                ContParam  -- ^ Parameters /α/ and /β/.
             -> Int        -- ^ /k/, the desired quantile.
             -> Int        -- ^ /q/, the number of quantiles.
             -> v Double   -- ^ /x/, the sample data.
             -> Double
continuousBy = quantile
{-# DEPRECATED continuousBy "Use quantile instead" #-}

-- $references
--
-- * Weisstein, E.W. Quantile. /MathWorld/.
--   <http://mathworld.wolfram.com/Quantile.html>
--
-- * [Hyndman1996] Hyndman, R.J.; Fan, Y. (1996) Sample quantiles in statistical
--   packages. /American Statistician/
--   50(4):361&#8211;365. <http://www.jstor.org/stable/2684934>