1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
|
{-# LANGUAGE BangPatterns, FlexibleContexts, UnboxedTuples #-}
-- |
-- Module : Statistics.Sample.KernelDensity
-- Copyright : (c) 2011 Bryan O'Sullivan
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : portable
--
-- Kernel density estimation. This module provides a fast, robust,
-- non-parametric way to estimate the probability density function of
-- a sample.
--
-- This estimator does not use the commonly employed \"Gaussian rule
-- of thumb\". As a result, it outperforms many plug-in methods on
-- multimodal samples with widely separated modes.
module Statistics.Sample.KernelDensity
(
-- * Estimation functions
kde
, kde_
-- * References
-- $references
) where
import Data.Default.Class
import Numeric.MathFunctions.Constants (m_sqrt_2_pi)
import Numeric.RootFinding (fromRoot, ridders, RiddersParam(..), Tolerance(..))
import Prelude hiding (const, min, max, sum)
import Statistics.Function (minMax, nextHighestPowerOfTwo)
import Statistics.Sample.Histogram (histogram_)
import Statistics.Sample.Internal (sum)
import Statistics.Transform (CD, dct, idct)
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector as V
-- | Gaussian kernel density estimator for one-dimensional data, using
-- the method of Botev et al.
--
-- The result is a pair of vectors, containing:
--
-- * The coordinates of each mesh point. The mesh interval is chosen
-- to be 20% larger than the range of the sample. (To specify the
-- mesh interval, use 'kde_'.)
--
-- * Density estimates at each mesh point.
kde :: (G.Vector v CD, G.Vector v Double, G.Vector v Int)
=> Int
-- ^ The number of mesh points to use in the uniform discretization
-- of the interval @(min,max)@. If this value is not a power of
-- two, then it is rounded up to the next power of two.
-> v Double -> (v Double, v Double)
kde n0 xs = kde_ n0 (lo - range / 10) (hi + range / 10) xs
where
(lo,hi) = minMax xs
range | G.length xs <= 1 = 1 -- Unreasonable guess
| lo == hi = 1 -- All elements are equal
| otherwise = hi - lo
{-# INLINABLE kde #-}
{-# SPECIAlIZE kde :: Int -> U.Vector Double -> (U.Vector Double, U.Vector Double) #-}
{-# SPECIAlIZE kde :: Int -> V.Vector Double -> (V.Vector Double, V.Vector Double) #-}
-- | Gaussian kernel density estimator for one-dimensional data, using
-- the method of Botev et al.
--
-- The result is a pair of vectors, containing:
--
-- * The coordinates of each mesh point.
--
-- * Density estimates at each mesh point.
kde_ :: (G.Vector v CD, G.Vector v Double, G.Vector v Int)
=> Int
-- ^ The number of mesh points to use in the uniform discretization
-- of the interval @(min,max)@. If this value is not a power of
-- two, then it is rounded up to the next power of two.
-> Double
-- ^ Lower bound (@min@) of the mesh range.
-> Double
-- ^ Upper bound (@max@) of the mesh range.
-> v Double
-> (v Double, v Double)
kde_ n0 min max xs
| G.null xs = error "Statistics.KernelDensity.kde: empty sample"
| n0 <= 1 = error "Statistics.KernelDensity.kde: invalid number of points"
| otherwise = (mesh, density)
where
mesh = G.generate ni $ \z -> min + (d * fromIntegral z)
where d = r / (n-1)
density = G.map (/(2 * r)) . idct $ G.zipWith f a (G.enumFromTo 0 (n-1))
where f b z = b * exp (sqr z * sqr pi * t_star * (-0.5))
!n = fromIntegral ni
!ni = nextHighestPowerOfTwo n0
!r = max - min
a = dct . G.map (/ sum h) $ h
where h = G.map (/ len) $ histogram_ ni min max xs
!len = fromIntegral (G.length xs)
!t_star = fromRoot (0.28 * len ** (-0.4)) . ridders def{ riddersTol = AbsTol 1e-14 } (0,0.1)
$ \x -> x - (len * (2 * sqrt pi) * go 6 (f 7 x)) ** (-0.4)
where
f q t = 2 * pi ** (q*2) * sum (G.zipWith g iv a2v)
where g i a2 = i ** q * a2 * exp ((-i) * sqr pi * t)
a2v = G.map (sqr . (*0.5)) $ G.tail a
iv = G.map sqr $ G.enumFromTo 1 (n-1)
go s !h | s == 1 = h
| otherwise = go (s-1) (f s time)
where time = (2 * const * k0 / len / h) ** (2 / (3 + 2 * s))
const = (1 + 0.5 ** (s+0.5)) / 3
k0 = U.product (G.enumFromThenTo 1 3 (2*s-1)) / m_sqrt_2_pi
sqr x = x * x
{-# INLINABLE kde_ #-}
{-# SPECIAlIZE kde_ :: Int -> Double -> Double -> U.Vector Double -> (U.Vector Double, U.Vector Double) #-}
{-# SPECIAlIZE kde_ :: Int -> Double -> Double -> V.Vector Double -> (V.Vector Double, V.Vector Double) #-}
-- $references
--
-- Botev. Z.I., Grotowski J.F., Kroese D.P. (2010). Kernel density
-- estimation via diffusion. /Annals of Statistics/
-- 38(5):2916–2957. <http://arxiv.org/pdf/1011.2602>
|