File: Quantile.hs

package info (click to toggle)
haskell-statistics 0.5.1.0-1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 180 kB
  • sloc: haskell: 1,265; makefile: 2
file content (185 lines) | stat: -rw-r--r-- 6,306 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
{-# LANGUAGE TypeOperators #-}
-- |
-- Module    : Statistics.Quantile
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Functions for approximating quantiles, i.e. points taken at regular
-- intervals from the cumulative distribution function of a random
-- variable.
--
-- The number of quantiles is described below by the variable /q/, so
-- with /q/=4, a 4-quantile (also known as a /quartile/) has 4
-- intervals, and contains 5 points.  The parameter /k/ describes the
-- desired point, where 0 ≤ /k/ ≤ /q/.

module Statistics.Quantile
    (
    -- * Quantile estimation functions
      weightedAvg
    , ContParam(..)
    , continuousBy
    , midspread

    -- * Parameters for the continuous sample method
    , cadpw
    , hazen
    , s
    , spss
    , medianUnbiased
    , normalUnbiased

    -- * References
    -- $references
    ) where

import Control.Exception (assert)
import Data.Vector.Unboxed ((!))
import Statistics.Constants (m_epsilon)
import Statistics.Function (partialSort)
import Statistics.Types (Sample)
import qualified Data.Vector.Unboxed as U

-- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample,
-- using the weighted average method.
weightedAvg :: Int              -- ^ /k/, the desired quantile.
            -> Int              -- ^ /q/, the number of quantiles.
            -> Sample           -- ^ /x/, the sample data.
            -> Double
weightedAvg k q x =
    assert (q >= 2) .
    assert (k >= 0) .
    assert (k < q) .
    assert (U.all (not . isNaN) x) $
    xj + g * (xj1 - xj)
  where
    j   = floor idx
    idx = fromIntegral (U.length x - 1) * fromIntegral k / fromIntegral q
    g   = idx - fromIntegral j
    xj  = sx ! j
    xj1 = sx ! (j+1)
    sx  = partialSort (j+2) x
{-# INLINE weightedAvg #-}

-- | Parameters /a/ and /b/ to the 'continuousBy' function.
data ContParam = ContParam {-# UNPACK #-} !Double {-# UNPACK #-} !Double

-- | O(/n/ log /n/). Estimate the /k/th /q/-quantile of a sample /x/,
-- using the continuous sample method with the given parameters.  This
-- is the method used by most statistical software, such as R,
-- Mathematica, SPSS, and S.
continuousBy :: ContParam       -- ^ Parameters /a/ and /b/.
             -> Int             -- ^ /k/, the desired quantile.
             -> Int             -- ^ /q/, the number of quantiles.
             -> Sample          -- ^ /x/, the sample data.
             -> Double
continuousBy (ContParam a b) k q x =
    assert (q >= 2) .
    assert (k >= 0) .
    assert (k <= q) .
    assert (U.all (not . isNaN) x) $
    (1-h) * item (j-1) + h * item j
  where
    j               = floor (t + eps)
    t               = a + p * (fromIntegral n + 1 - a - b)
    p               = fromIntegral k / fromIntegral q
    h | abs r < eps = 0
      | otherwise   = r
      where r       = t - fromIntegral j
    eps             = m_epsilon * 4
    n               = U.length x
    item            = (sx !) . bracket
    sx              = partialSort (bracket j + 1) x
    bracket m       = min (max m 0) (n - 1)
{-# INLINE continuousBy #-}

-- | O(/n/ log /n/). Estimate the range between /q/-quantiles 1 and
-- /q/-1 of a sample /x/, using the continuous sample method with the
-- given parameters.
--
-- For instance, the interquartile range (IQR) can be estimated as
-- follows:
--
-- > midspread medianUnbiased 4 (U.to [1,1,2,2,3])
-- > ==> 1.333333
midspread :: ContParam       -- ^ Parameters /a/ and /b/.
          -> Int             -- ^ /q/, the number of quantiles.
          -> Sample          -- ^ /x/, the sample data.
          -> Double
midspread (ContParam a b) k x =
    assert (U.all (not . isNaN) x) .
    assert (k > 0) $
    quantile (1-frac) - quantile frac
  where
    quantile i        = (1-h i) * item (j i-1) + h i * item (j i)
    j i               = floor (t i + eps) :: Int
    t i               = a + i * (fromIntegral n + 1 - a - b)
    h i | abs r < eps = 0
        | otherwise   = r
        where r       = t i - fromIntegral (j i)
    eps               = m_epsilon * 4
    n                 = U.length x
    item              = (sx !) . bracket
    sx                = partialSort (bracket (j (1-frac)) + 1) x
    bracket m         = min (max m 0) (n - 1)
    frac              = 1 / fromIntegral k
{-# INLINE midspread #-}

-- | California Department of Public Works definition, /a/=0, /b/=1.
-- Gives a linear interpolation of the empirical CDF.  This
-- corresponds to method 4 in R and Mathematica.
cadpw :: ContParam
cadpw = ContParam 0 1
{-# INLINE cadpw #-}

-- | Hazen's definition, /a/=0.5, /b/=0.5.  This is claimed to be
-- popular among hydrologists.  This corresponds to method 5 in R and
-- Mathematica.
hazen :: ContParam
hazen = ContParam 0.5 0.5
{-# INLINE hazen #-}

-- | Definition used by the SPSS statistics application, with /a/=0,
-- /b/=0 (also known as Weibull's definition).  This corresponds to
-- method 6 in R and Mathematica.
spss :: ContParam
spss = ContParam 0 0
{-# INLINE spss #-}

-- | Definition used by the S statistics application, with /a/=1,
-- /b/=1.  The interpolation points divide the sample range into @n-1@
-- intervals.  This corresponds to method 7 in R and Mathematica.
s :: ContParam
s = ContParam 1 1
{-# INLINE s #-}

-- | Median unbiased definition, /a/=1\/3, /b/=1\/3. The resulting
-- quantile estimates are approximately median unbiased regardless of
-- the distribution of /x/.  This corresponds to method 8 in R and
-- Mathematica.
medianUnbiased :: ContParam
medianUnbiased = ContParam third third
    where third = 1/3
{-# INLINE medianUnbiased #-}

-- | Normal unbiased definition, /a/=3\/8, /b/=3\/8.  An approximately
-- unbiased estimate if the empirical distribution approximates the
-- normal distribution.  This corresponds to method 9 in R and
-- Mathematica.
normalUnbiased :: ContParam
normalUnbiased = ContParam ta ta
    where ta = 3/8
{-# INLINE normalUnbiased #-}

-- $references
--
-- * Weisstein, E.W. Quantile. /MathWorld/.
--   <http://mathworld.wolfram.com/Quantile.html>
--
-- * Hyndman, R.J.; Fan, Y. (1996) Sample quantiles in statistical
--   packages. /American Statistician/
--   50(4):361&#8211;365. <http://www.jstor.org/stable/2684934>