1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
|
{-# LANGUAGE DeriveDataTypeable, DeriveGeneric, FlexibleContexts #-}
-- |
-- Module : Statistics.Sample.KernelDensity.Simple
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- Kernel density estimation code, providing non-parametric ways to
-- estimate the probability density function of a sample.
--
-- The techniques used by functions in this module are relatively
-- fast, but they generally give inferior results to the KDE function
-- in the main 'Statistics.KernelDensity' module (due to the
-- oversmoothing documented for 'bandwidth' below).
module Statistics.Sample.KernelDensity.Simple
{-# DEPRECATED "Use Statistics.Sample.KernelDensity instead." #-}
(
-- * Simple entry points
epanechnikovPDF
, gaussianPDF
-- * Building blocks
-- These functions may be useful if you need to construct a kernel
-- density function estimator other than the ones provided in this
-- module.
-- ** Choosing points from a sample
, Points(..)
, choosePoints
-- ** Bandwidth estimation
, Bandwidth
, bandwidth
, epanechnikovBW
, gaussianBW
-- ** Kernels
, Kernel
, epanechnikovKernel
, gaussianKernel
-- ** Low-level estimation
, estimatePDF
, simplePDF
-- * References
-- $references
) where
import Data.Aeson (FromJSON, ToJSON)
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import Data.Vector.Binary ()
import GHC.Generics (Generic)
import Numeric.MathFunctions.Constants (m_1_sqrt_2, m_2_sqrt_pi)
import Prelude hiding (sum)
import Statistics.Function (minMax)
import Statistics.Sample (stdDev)
import Statistics.Sample.Internal (sum)
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
-- | Points from the range of a 'Sample'.
newtype Points = Points {
fromPoints :: U.Vector Double
} deriving (Eq, Read, Show, Typeable, Data, Generic)
instance FromJSON Points
instance ToJSON Points
instance Binary Points where
get = fmap Points get
put = put . fromPoints
-- | Bandwidth estimator for an Epanechnikov kernel.
epanechnikovBW :: Double -> Bandwidth
epanechnikovBW n = (80 / (n * m_2_sqrt_pi)) ** 0.2
-- | Bandwidth estimator for a Gaussian kernel.
gaussianBW :: Double -> Bandwidth
gaussianBW n = (4 / (n * 3)) ** 0.2
-- | The width of the convolution kernel used.
type Bandwidth = Double
-- | Compute the optimal bandwidth from the observed data for the
-- given kernel.
--
-- This function uses an estimate based on the standard deviation of a
-- sample (due to Deheuvels), which performs reasonably well for
-- unimodal distributions but leads to oversmoothing for more complex
-- ones.
bandwidth :: G.Vector v Double =>
(Double -> Bandwidth)
-> v Double
-> Bandwidth
bandwidth kern values = stdDev values * kern (fromIntegral $ G.length values)
-- | Choose a uniform range of points at which to estimate a sample's
-- probability density function.
--
-- If you are using a Gaussian kernel, multiply the sample's bandwidth
-- by 3 before passing it to this function.
--
-- If this function is passed an empty vector, it returns values of
-- positive and negative infinity.
choosePoints :: G.Vector v Double =>
Int -- ^ Number of points to select, /n/
-> Double -- ^ Sample bandwidth, /h/
-> v Double -- ^ Input data
-> Points
choosePoints n h sample = Points . U.map f $ U.enumFromTo 0 n'
where lo = a - h
hi = z + h
(a, z) = minMax sample
d = (hi - lo) / fromIntegral n'
f i = lo + fromIntegral i * d
n' = n - 1
-- | The convolution kernel. Its parameters are as follows:
--
-- * Scaling factor, 1\//nh/
--
-- * Bandwidth, /h/
--
-- * A point at which to sample the input, /p/
--
-- * One sample value, /v/
type Kernel = Double
-> Double
-> Double
-> Double
-> Double
-- | Epanechnikov kernel for probability density function estimation.
epanechnikovKernel :: Kernel
epanechnikovKernel f h p v
| abs u <= 1 = f * (1 - u * u)
| otherwise = 0
where u = (v - p) / (h * 0.75)
-- | Gaussian kernel for probability density function estimation.
gaussianKernel :: Kernel
gaussianKernel f h p v = exp (-0.5 * u * u) * g
where u = (v - p) / h
g = f * 0.5 * m_2_sqrt_pi * m_1_sqrt_2
-- | Kernel density estimator, providing a non-parametric way of
-- estimating the PDF of a random variable.
estimatePDF :: G.Vector v Double =>
Kernel -- ^ Kernel function
-> Bandwidth -- ^ Bandwidth, /h/
-> v Double -- ^ Sample data
-> Points -- ^ Points at which to estimate
-> U.Vector Double
estimatePDF kernel h sample
| n < 2 = errorShort "estimatePDF"
| otherwise = U.map k . fromPoints
where
k p = sum . G.map (kernel f h p) $ sample
f = 1 / (h * fromIntegral n)
n = G.length sample
{-# INLINE estimatePDF #-}
-- | A helper for creating a simple kernel density estimation function
-- with automatically chosen bandwidth and estimation points.
simplePDF :: G.Vector v Double =>
(Double -> Double) -- ^ Bandwidth function
-> Kernel -- ^ Kernel function
-> Double -- ^ Bandwidth scaling factor (3 for a Gaussian kernel, 1 for all others)
-> Int -- ^ Number of points at which to estimate
-> v Double -- ^ sample data
-> (Points, U.Vector Double)
simplePDF fbw fpdf k numPoints sample =
(points, estimatePDF fpdf bw sample points)
where points = choosePoints numPoints (bw*k) sample
bw = bandwidth fbw sample
{-# INLINE simplePDF #-}
-- | Simple Epanechnikov kernel density estimator. Returns the
-- uniformly spaced points from the sample range at which the density
-- function was estimated, and the estimates at those points.
epanechnikovPDF :: G.Vector v Double =>
Int -- ^ Number of points at which to estimate
-> v Double -- ^ Data sample
-> (Points, U.Vector Double)
epanechnikovPDF = simplePDF epanechnikovBW epanechnikovKernel 1
-- | Simple Gaussian kernel density estimator. Returns the uniformly
-- spaced points from the sample range at which the density function
-- was estimated, and the estimates at those points.
gaussianPDF :: G.Vector v Double =>
Int -- ^ Number of points at which to estimate
-> v Double -- ^ Data sample
-> (Points, U.Vector Double)
gaussianPDF = simplePDF gaussianBW gaussianKernel 3
errorShort :: String -> a
errorShort func = error ("Statistics.KernelDensity." ++ func ++
": at least two points required")
-- $references
--
-- * Deheuvels, P. (1977) Estimation non paramétrique de la densité
-- par histogrammes
-- généralisés. Mhttp://archive.numdam.org/article/RSA_1977__25_3_5_0.pdf>
|