1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
|
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
-- |
-- Module : Statistics.Distribution
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- Type classes for probability distributions
module Statistics.Distribution
(
-- * Type classes
Distribution(..)
, DiscreteDistr(..)
, ContDistr(..)
-- ** Distribution statistics
, MaybeMean(..)
, Mean(..)
, MaybeVariance(..)
, Variance(..)
, MaybeEntropy(..)
, Entropy(..)
, FromSample(..)
-- ** Random number generation
, ContGen(..)
, DiscreteGen(..)
, genContinuous
-- * Helper functions
, findRoot
, sumProbabilities
) where
import Prelude hiding (sum)
import Statistics.Function (square)
import Statistics.Sample.Internal (sum)
import System.Random.Stateful (StatefulGen, uniformDouble01M)
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Generic as G
-- | Type class common to all distributions. Only c.d.f. could be
-- defined for both discrete and continuous distributions.
class Distribution d where
-- | Cumulative distribution function. The probability that a
-- random variable /X/ is less or equal than /x/,
-- i.e. P(/X/≤/x/). Cumulative should be defined for
-- infinities as well:
--
-- > cumulative d +∞ = 1
-- > cumulative d -∞ = 0
cumulative :: d -> Double -> Double
cumulative d x = 1 - complCumulative d x
-- | One's complement of cumulative distribution:
--
-- > complCumulative d x = 1 - cumulative d x
--
-- It's useful when one is interested in P(/X/>/x/) and
-- expression on the right side begin to lose precision. This
-- function have default implementation but implementors are
-- encouraged to provide more precise implementation.
complCumulative :: d -> Double -> Double
complCumulative d x = 1 - cumulative d x
{-# MINIMAL (cumulative | complCumulative) #-}
-- | Discrete probability distribution.
class Distribution d => DiscreteDistr d where
-- | Probability of n-th outcome.
probability :: d -> Int -> Double
probability d = exp . logProbability d
-- | Logarithm of probability of n-th outcome
logProbability :: d -> Int -> Double
logProbability d = log . probability d
{-# MINIMAL (probability | logProbability) #-}
-- | Continuous probability distribution.
--
-- Minimal complete definition is 'quantile' and either 'density' or
-- 'logDensity'.
class Distribution d => ContDistr d where
-- | Probability density function. Probability that random
-- variable /X/ lies in the infinitesimal interval
-- [/x/,/x+/δ/x/) equal to /density(x)/⋅δ/x/
density :: d -> Double -> Double
density d = exp . logDensity d
-- | Natural logarithm of density.
logDensity :: d -> Double -> Double
logDensity d = log . density d
-- | Inverse of the cumulative distribution function. The value
-- /x/ for which P(/X/≤/x/) = /p/. If probability is outside
-- of [0,1] range function should call 'error'
quantile :: d -> Double -> Double
quantile d x = complQuantile d (1 - x)
-- | 1-complement of @quantile@:
--
-- > complQuantile x ≡ quantile (1 - x)
complQuantile :: d -> Double -> Double
complQuantile d x = quantile d (1 - x)
{-# MINIMAL (density | logDensity), (quantile | complQuantile) #-}
-- | Type class for distributions with mean. 'maybeMean' should return
-- 'Nothing' if it's undefined for current value of data
class Distribution d => MaybeMean d where
maybeMean :: d -> Maybe Double
-- | Type class for distributions with mean. If a distribution has
-- finite mean for all valid values of parameters it should be
-- instance of this type class.
class MaybeMean d => Mean d where
mean :: d -> Double
-- | Type class for distributions with variance. If variance is
-- undefined for some parameter values both 'maybeVariance' and
-- 'maybeStdDev' should return Nothing.
--
-- Minimal complete definition is 'maybeVariance' or 'maybeStdDev'
class MaybeMean d => MaybeVariance d where
maybeVariance :: d -> Maybe Double
maybeVariance = fmap square . maybeStdDev
maybeStdDev :: d -> Maybe Double
maybeStdDev = fmap sqrt . maybeVariance
{-# MINIMAL (maybeVariance | maybeStdDev) #-}
-- | Type class for distributions with variance. If distribution have
-- finite variance for all valid parameter values it should be
-- instance of this type class.
--
-- Minimal complete definition is 'variance' or 'stdDev'
class (Mean d, MaybeVariance d) => Variance d where
variance :: d -> Double
variance d = square (stdDev d)
stdDev :: d -> Double
stdDev = sqrt . variance
{-# MINIMAL (variance | stdDev) #-}
-- | Type class for distributions with entropy, meaning Shannon entropy
-- in the case of a discrete distribution, or differential entropy in the
-- case of a continuous one. 'maybeEntropy' should return 'Nothing' if
-- entropy is undefined for the chosen parameter values.
class (Distribution d) => MaybeEntropy d where
-- | Returns the entropy of a distribution, in nats, if such is defined.
maybeEntropy :: d -> Maybe Double
-- | Type class for distributions with entropy, meaning Shannon
-- entropy in the case of a discrete distribution, or differential
-- entropy in the case of a continuous one. If the distribution has
-- well-defined entropy for all valid parameter values then it
-- should be an instance of this type class.
class (MaybeEntropy d) => Entropy d where
-- | Returns the entropy of a distribution, in nats.
entropy :: d -> Double
-- | Generate discrete random variates which have given
-- distribution.
class Distribution d => ContGen d where
genContVar :: (StatefulGen g m) => d -> g -> m Double
-- | Generate discrete random variates which have given
-- distribution. 'ContGen' is superclass because it's always possible
-- to generate real-valued variates from integer values
class (DiscreteDistr d, ContGen d) => DiscreteGen d where
genDiscreteVar :: (StatefulGen g m) => d -> g -> m Int
-- | Estimate distribution from sample. First parameter in sample is
-- distribution type and second is element type.
class FromSample d a where
-- | Estimate distribution from sample. Returns 'Nothing' if there is
-- not enough data, or if no usable fit results from the method
-- used, e.g., the estimated distribution parameters would be
-- invalid or inaccurate.
fromSample :: G.Vector v a => v a -> Maybe d
-- | Generate variates from continuous distribution using inverse
-- transform rule.
genContinuous :: (ContDistr d, StatefulGen g m) => d -> g -> m Double
genContinuous d gen = do
x <- uniformDouble01M gen
return $! quantile d x
data P = P {-# UNPACK #-} !Double {-# UNPACK #-} !Double
-- | Approximate the value of /X/ for which P(/x/>/X/)=/p/.
--
-- This method uses a combination of Newton-Raphson iteration and
-- bisection with the given guess as a starting point. The upper and
-- lower bounds specify the interval in which the probability
-- distribution reaches the value /p/.
findRoot :: ContDistr d =>
d -- ^ Distribution
-> Double -- ^ Probability /p/
-> Double -- ^ Initial guess
-> Double -- ^ Lower bound on interval
-> Double -- ^ Upper bound on interval
-> Double
findRoot d prob = loop 0 1
where
loop !(i::Int) !dx !x !lo !hi
| abs dx <= accuracy || i >= maxIters = x
| otherwise = loop (i+1) dx'' x'' lo' hi'
where
err = cumulative d x - prob
P lo' hi' | err < 0 = P x hi
| otherwise = P lo x
pdf = density d x
P dx' x' | pdf /= 0 = P (err / pdf) (x - dx)
| otherwise = P dx x
P dx'' x''
| x' < lo' || x' > hi' || pdf == 0 = let y = (lo' + hi') / 2
in P (y-x) y
| otherwise = P dx' x'
accuracy = 1e-15
maxIters = 150
-- | Sum probabilities in inclusive interval.
sumProbabilities :: DiscreteDistr d => d -> Int -> Int -> Double
sumProbabilities d low hi =
-- Return value is forced to be less than 1 to guard against roundoff errors.
-- ATTENTION! this check should be removed for testing or it could mask bugs.
min 1 . sum . U.map (probability d) $ U.enumFromTo low hi
|