1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
-- |
-- Module : Statistics.KernelDensity
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- Kernel density estimation code, providing non-parametric ways to
-- estimate the probability density function of a sample.
module Statistics.KernelDensity
(
-- * Simple entry points
epanechnikovPDF
, gaussianPDF
-- * Building blocks
-- These functions may be useful if you need to construct a kernel
-- density function estimator other than the ones provided in this
-- module.
-- ** Choosing points from a sample
, Points(..)
, choosePoints
-- ** Bandwidth estimation
, Bandwidth
, bandwidth
, epanechnikovBW
, gaussianBW
-- ** Kernels
, Kernel
, epanechnikovKernel
, gaussianKernel
-- ** Low-level estimation
, estimatePDF
, simplePDF
) where
import Statistics.Constants (m_1_sqrt_2, m_2_sqrt_pi)
import Statistics.Function (minMax)
import Statistics.Sample (stdDev)
import Statistics.Types (Sample)
import qualified Data.Vector.Unboxed as U
-- | Points from the range of a 'Sample'.
newtype Points = Points {
fromPoints :: U.Vector Double
} deriving (Eq, Show)
-- | Bandwidth estimator for an Epanechnikov kernel.
epanechnikovBW :: Double -> Bandwidth
epanechnikovBW n = (80 / (n * m_2_sqrt_pi)) ** 0.2
-- | Bandwidth estimator for a Gaussian kernel.
gaussianBW :: Double -> Bandwidth
gaussianBW n = (4 / (n * 3)) ** 0.2
-- | The width of the convolution kernel used.
type Bandwidth = Double
-- | Compute the optimal bandwidth from the observed data for the given
-- kernel.
bandwidth :: (Double -> Bandwidth)
-> Sample
-> Bandwidth
bandwidth kern values = stdDev values * kern (fromIntegral $ U.length values)
-- | Choose a uniform range of points at which to estimate a sample's
-- probability density function.
--
-- If you are using a Gaussian kernel, multiply the sample's bandwidth
-- by 3 before passing it to this function.
--
-- If this function is passed an empty vector, it returns values of
-- positive and negative infinity.
choosePoints :: Int -- ^ Number of points to select, /n/
-> Double -- ^ Sample bandwidth, /h/
-> Sample -- ^ Input data
-> Points
choosePoints n h sample = Points . U.map f $ U.enumFromTo 0 n'
where lo = a - h
hi = z + h
(a, z) = minMax sample
d = (hi - lo) / fromIntegral n'
f i = lo + fromIntegral i * d
n' = n - 1
-- | The convolution kernel. Its parameters are as follows:
--
-- * Scaling factor, 1\//nh/
--
-- * Bandwidth, /h/
--
-- * A point at which to sample the input, /p/
--
-- * One sample value, /v/
type Kernel = Double
-> Double
-> Double
-> Double
-> Double
-- | Epanechnikov kernel for probability density function estimation.
epanechnikovKernel :: Kernel
epanechnikovKernel f h p v
| abs u <= 1 = f * (1 - u * u)
| otherwise = 0
where u = (v - p) / (h * 0.75)
-- | Gaussian kernel for probability density function estimation.
gaussianKernel :: Kernel
gaussianKernel f h p v = exp (-0.5 * u * u) * g
where u = (v - p) / h
g = f * m_2_sqrt_pi * m_1_sqrt_2
-- | Kernel density estimator, providing a non-parametric way of
-- estimating the PDF of a random variable.
estimatePDF :: Kernel -- ^ Kernel function
-> Bandwidth -- ^ Bandwidth, /h/
-> Sample -- ^ Sample data
-> Points -- ^ Points at which to estimate
-> U.Vector Double
estimatePDF kernel h sample
| n < 2 = errorShort "estimatePDF"
| otherwise = U.map k . fromPoints
where
k p = U.sum . U.map (kernel f h p) $ sample
f = 1 / (h * fromIntegral n)
n = U.length sample
{-# INLINE estimatePDF #-}
-- | A helper for creating a simple kernel density estimation function
-- with automatically chosen bandwidth and estimation points.
simplePDF :: (Double -> Double) -- ^ Bandwidth function
-> Kernel -- ^ Kernel function
-> Double -- ^ Bandwidth scaling factor (3 for a Gaussian kernel, 1 for all others)
-> Int -- ^ Number of points at which to estimate
-> Sample -- ^ Sample data
-> (Points, U.Vector Double)
simplePDF fbw fpdf k numPoints sample =
(points, estimatePDF fpdf bw sample points)
where points = choosePoints numPoints (bw*k) sample
bw = bandwidth fbw sample
{-# INLINE simplePDF #-}
-- | Simple Epanechnikov kernel density estimator. Returns the
-- uniformly spaced points from the sample range at which the density
-- function was estimated, and the estimates at those points.
epanechnikovPDF :: Int -- ^ Number of points at which to estimate
-> Sample
-> (Points, U.Vector Double)
epanechnikovPDF = simplePDF epanechnikovBW epanechnikovKernel 1
-- | Simple Gaussian kernel density estimator. Returns the uniformly
-- spaced points from the sample range at which the density function
-- was estimated, and the estimates at those points.
gaussianPDF :: Int -- ^ Number of points at which to estimate
-> Sample
-> (Points, U.Vector Double)
gaussianPDF = simplePDF gaussianBW gaussianKernel 3
errorShort :: String -> a
errorShort func = error ("Statistics.KernelDensity." ++ func ++
": at least two points required")
|