File: Simple.hs

package info (click to toggle)
haskell-statistics 0.16.2.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 640 kB
  • sloc: haskell: 6,819; ansic: 35; python: 33; makefile: 9
file content (205 lines) | stat: -rw-r--r-- 7,190 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
{-# LANGUAGE DeriveDataTypeable, DeriveGeneric, FlexibleContexts #-}
-- |
-- Module    : Statistics.Sample.KernelDensity.Simple
-- Copyright : (c) 2009 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Kernel density estimation code, providing non-parametric ways to
-- estimate the probability density function of a sample.
--
-- The techniques used by functions in this module are relatively
-- fast, but they generally give inferior results to the KDE function
-- in the main 'Statistics.KernelDensity' module (due to the
-- oversmoothing documented for 'bandwidth' below).

module Statistics.Sample.KernelDensity.Simple
    {-# DEPRECATED "Use Statistics.Sample.KernelDensity instead." #-}
    (
    -- * Simple entry points
      epanechnikovPDF
    , gaussianPDF
    -- * Building blocks
    -- These functions may be useful if you need to construct a kernel
    -- density function estimator other than the ones provided in this
    -- module.

    -- ** Choosing points from a sample
    , Points(..)
    , choosePoints
    -- ** Bandwidth estimation
    , Bandwidth
    , bandwidth
    , epanechnikovBW
    , gaussianBW
    -- ** Kernels
    , Kernel
    , epanechnikovKernel
    , gaussianKernel
    -- ** Low-level estimation
    , estimatePDF
    , simplePDF
    -- * References
    -- $references
    ) where

import Data.Aeson (FromJSON, ToJSON)
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import Data.Vector.Binary ()
import GHC.Generics (Generic)
import Numeric.MathFunctions.Constants (m_1_sqrt_2, m_2_sqrt_pi)
import Prelude hiding (sum)
import Statistics.Function (minMax)
import Statistics.Sample (stdDev)
import Statistics.Sample.Internal (sum)
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

-- | Points from the range of a 'Sample'.
newtype Points = Points {
      fromPoints :: U.Vector Double
    } deriving (Eq, Read, Show, Typeable, Data, Generic)

instance FromJSON Points
instance ToJSON Points

instance Binary Points where
    get = fmap Points get
    put = put . fromPoints

-- | Bandwidth estimator for an Epanechnikov kernel.
epanechnikovBW :: Double -> Bandwidth
epanechnikovBW n = (80 / (n * m_2_sqrt_pi)) ** 0.2

-- | Bandwidth estimator for a Gaussian kernel.
gaussianBW :: Double -> Bandwidth
gaussianBW n = (4 / (n * 3)) ** 0.2

-- | The width of the convolution kernel used.
type Bandwidth = Double

-- | Compute the optimal bandwidth from the observed data for the
-- given kernel.
--
-- This function uses an estimate based on the standard deviation of a
-- sample (due to Deheuvels), which performs reasonably well for
-- unimodal distributions but leads to oversmoothing for more complex
-- ones.
bandwidth :: G.Vector v Double =>
             (Double -> Bandwidth)
          -> v Double
          -> Bandwidth
bandwidth kern values = stdDev values * kern (fromIntegral $ G.length values)

-- | Choose a uniform range of points at which to estimate a sample's
-- probability density function.
--
-- If you are using a Gaussian kernel, multiply the sample's bandwidth
-- by 3 before passing it to this function.
--
-- If this function is passed an empty vector, it returns values of
-- positive and negative infinity.
choosePoints :: G.Vector v Double =>
                Int             -- ^ Number of points to select, /n/
             -> Double          -- ^ Sample bandwidth, /h/
             -> v Double        -- ^ Input data
             -> Points
choosePoints n h sample = Points . U.map f $ U.enumFromTo 0 n'
  where lo     = a - h
        hi     = z + h
        (a, z) = minMax sample
        d      = (hi - lo) / fromIntegral n'
        f i    = lo + fromIntegral i * d
        n'     = n - 1

-- | The convolution kernel.  Its parameters are as follows:
--
-- * Scaling factor, 1\//nh/
--
-- * Bandwidth, /h/
--
-- * A point at which to sample the input, /p/
--
-- * One sample value, /v/
type Kernel =  Double
            -> Double
            -> Double
            -> Double
            -> Double

-- | Epanechnikov kernel for probability density function estimation.
epanechnikovKernel :: Kernel
epanechnikovKernel f h p v
    | abs u <= 1 = f * (1 - u * u)
    | otherwise  = 0
    where u = (v - p) / (h * 0.75)

-- | Gaussian kernel for probability density function estimation.
gaussianKernel :: Kernel
gaussianKernel f h p v = exp (-0.5 * u * u) * g
    where u = (v - p) / h
          g = f * 0.5 * m_2_sqrt_pi * m_1_sqrt_2

-- | Kernel density estimator, providing a non-parametric way of
-- estimating the PDF of a random variable.
estimatePDF :: G.Vector v Double =>
               Kernel           -- ^ Kernel function
            -> Bandwidth        -- ^ Bandwidth, /h/
            -> v Double         -- ^ Sample data
            -> Points           -- ^ Points at which to estimate
            -> U.Vector Double
estimatePDF kernel h sample
    | n < 2     = errorShort "estimatePDF"
    | otherwise = U.map k . fromPoints
  where
    k p = sum . G.map (kernel f h p) $ sample
    f   = 1 / (h * fromIntegral n)
    n   = G.length sample
{-# INLINE estimatePDF #-}

-- | A helper for creating a simple kernel density estimation function
-- with automatically chosen bandwidth and estimation points.
simplePDF :: G.Vector v Double =>
             (Double -> Double) -- ^ Bandwidth function
          -> Kernel             -- ^ Kernel function
          -> Double             -- ^ Bandwidth scaling factor (3 for a Gaussian kernel, 1 for all others)
          -> Int                -- ^ Number of points at which to estimate
          -> v Double           -- ^ sample data
          -> (Points, U.Vector Double)
simplePDF fbw fpdf k numPoints sample =
    (points, estimatePDF fpdf bw sample points)
  where points = choosePoints numPoints (bw*k) sample
        bw     = bandwidth fbw sample
{-# INLINE simplePDF #-}

-- | Simple Epanechnikov kernel density estimator.  Returns the
-- uniformly spaced points from the sample range at which the density
-- function was estimated, and the estimates at those points.
epanechnikovPDF :: G.Vector v Double =>
                   Int          -- ^ Number of points at which to estimate
                -> v Double     -- ^ Data sample
                -> (Points, U.Vector Double)
epanechnikovPDF = simplePDF epanechnikovBW epanechnikovKernel 1

-- | Simple Gaussian kernel density estimator.  Returns the uniformly
-- spaced points from the sample range at which the density function
-- was estimated, and the estimates at those points.
gaussianPDF :: G.Vector v Double =>
               Int              -- ^ Number of points at which to estimate
            -> v Double         -- ^ Data sample
            -> (Points, U.Vector Double)
gaussianPDF = simplePDF gaussianBW gaussianKernel 3

errorShort :: String -> a
errorShort func = error ("Statistics.KernelDensity." ++ func ++
                        ": at least two points required")

-- $references
--
-- * Deheuvels, P. (1977) Estimation non paramétrique de la densité
--   par histogrammes
--   généralisés. Mhttp://archive.numdam.org/article/RSA_1977__25_3_5_0.pdf>