File: Resampling.hs

package info (click to toggle)
haskell-statistics 0.16.2.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 640 kB
  • sloc: haskell: 6,819; ansic: 35; python: 33; makefile: 9
file content (276 lines) | stat: -rw-r--r-- 9,599 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
{-# LANGUAGE BangPatterns       #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE DeriveFoldable     #-}
{-# LANGUAGE DeriveFunctor      #-}
{-# LANGUAGE DeriveGeneric      #-}
{-# LANGUAGE DeriveTraversable  #-}
{-# LANGUAGE FlexibleContexts   #-}
{-# LANGUAGE TypeFamilies       #-}

-- |
-- Module    : Statistics.Resampling
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
-- License   : BSD3
--
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : portable
--
-- Resampling statistics.

module Statistics.Resampling
    ( -- * Data types
      Resample(..)
    , Bootstrap(..)
    , Estimator(..)
    , estimate
      -- * Resampling
    , resampleST
    , resample
    , resampleVector
      -- * Jackknife
    , jackknife
    , jackknifeMean
    , jackknifeVariance
    , jackknifeVarianceUnb
    , jackknifeStdDev
      -- * Helper functions
    , splitGen
    ) where

import Data.Aeson (FromJSON, ToJSON)
import Control.Concurrent.Async (forConcurrently_)
import Control.Monad (forM_, forM, replicateM, liftM2)
import Control.Monad.Primitive (PrimMonad(..))
import Data.Binary (Binary(..))
import Data.Data (Data, Typeable)
import Data.Vector.Algorithms.Intro (sort)
import Data.Vector.Binary ()
import Data.Vector.Generic (unsafeFreeze,unsafeThaw)
import Data.Word (Word32)
import qualified Data.Foldable as T
import qualified Data.Traversable as T
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as MU

import GHC.Conc (numCapabilities)
import GHC.Generics (Generic)
import Numeric.Sum (Summation(..), kbn)
import Statistics.Function (indices)
import Statistics.Sample (mean, stdDev, variance, varianceUnbiased)
import Statistics.Types (Sample)
import System.Random.MWC (Gen, GenIO, initialize, uniformR, uniformVector)


----------------------------------------------------------------
-- Data types
----------------------------------------------------------------

-- | A resample drawn randomly, with replacement, from a set of data
-- points.  Distinct from a normal array to make it harder for your
-- humble author's brain to go wrong.
newtype Resample = Resample {
      fromResample :: U.Vector Double
    } deriving (Eq, Read, Show, Typeable, Data, Generic)

instance FromJSON Resample
instance ToJSON Resample

instance Binary Resample where
    put = put . fromResample
    get = fmap Resample get

data Bootstrap v a = Bootstrap
  { fullSample :: !a
  , resamples  :: v a
  }
  deriving (Eq, Read, Show , Generic, Functor, T.Foldable, T.Traversable
           , Typeable, Data
           )

instance (Binary a,   Binary   (v a)) => Binary   (Bootstrap v a) where
  get = liftM2 Bootstrap get get
  put (Bootstrap fs rs) = put fs >> put rs
instance (FromJSON a, FromJSON (v a)) => FromJSON (Bootstrap v a)
instance (ToJSON a,   ToJSON   (v a)) => ToJSON   (Bootstrap v a)



-- | An estimator of a property of a sample, such as its 'mean'.
--
-- The use of an algebraic data type here allows functions such as
-- 'jackknife' and 'bootstrapBCA' to use more efficient algorithms
-- when possible.
data Estimator = Mean
               | Variance
               | VarianceUnbiased
               | StdDev
               | Function (Sample -> Double)

-- | Run an 'Estimator' over a sample.
estimate :: Estimator -> Sample -> Double
estimate Mean             = mean
estimate Variance         = variance
estimate VarianceUnbiased = varianceUnbiased
estimate StdDev           = stdDev
estimate (Function est) = est


----------------------------------------------------------------
-- Resampling
----------------------------------------------------------------

-- | Single threaded and deterministic version of resample.
resampleST :: PrimMonad m
           => Gen (PrimState m)
           -> [Estimator]         -- ^ Estimation functions.
           -> Int                 -- ^ Number of resamples to compute.
           -> U.Vector Double     -- ^ Original sample.
           -> m [Bootstrap U.Vector Double]
resampleST gen ests numResamples sample = do
  -- Generate resamples
  res <- forM ests $ \e -> U.replicateM numResamples $ do
    v <- resampleVector gen sample
    return $! estimate e v
  -- Sort resamples
  resM <- mapM unsafeThaw res
  mapM_ sort resM
  resSorted <- mapM unsafeFreeze resM
  return $ zipWith Bootstrap [estimate e sample | e <- ests]
                             resSorted


-- | /O(e*r*s)/ Resample a data set repeatedly, with replacement,
-- computing each estimate over the resampled data.
--
-- This function is expensive; it has to do work proportional to
-- /e*r*s/, where /e/ is the number of estimation functions, /r/ is
-- the number of resamples to compute, and /s/ is the number of
-- original samples.
--
-- To improve performance, this function will make use of all
-- available CPUs.  At least with GHC 7.0, parallel performance seems
-- best if the parallel garbage collector is disabled (RTS option
-- @-qg@).
resample :: GenIO
         -> [Estimator]         -- ^ Estimation functions.
         -> Int                 -- ^ Number of resamples to compute.
         -> U.Vector Double     -- ^ Original sample.
         -> IO [(Estimator, Bootstrap U.Vector Double)]
resample gen ests numResamples samples = do
  let ixs = scanl (+) 0 $
            zipWith (+) (replicate numCapabilities q)
                        (replicate r 1 ++ repeat 0)
          where (q,r) = numResamples `quotRem` numCapabilities
  results <- mapM (const (MU.new numResamples)) ests
  gens <- splitGen numCapabilities gen
  forConcurrently_ (zip3 ixs (tail ixs) gens) $ \ (start,!end,gen') -> do
    -- on GHCJS it doesn't make sense to do any forking.
    -- JavaScript runtime has only single capability.
      let loop k ers | k >= end = return ()
                     | otherwise = do
            re <- resampleVector gen' samples
            forM_ ers $ \(est,arr) ->
                MU.write arr k . est $ re
            loop (k+1) ers
      loop start (zip ests' results)
  mapM_ sort results
  -- Build resamples
  res <- mapM unsafeFreeze results
  return $ zip ests
         $ zipWith Bootstrap [estimate e samples | e <- ests]
                             res
 where
  ests' = map estimate ests

-- | Create vector using resamples
resampleVector :: (PrimMonad m, G.Vector v a)
               => Gen (PrimState m) -> v a -> m (v a)
resampleVector gen v
  = G.replicateM n $ do i <- uniformR (0,n-1) gen
                        return $! G.unsafeIndex v i
  where
    n = G.length v


----------------------------------------------------------------
-- Jackknife
----------------------------------------------------------------

-- | /O(n) or O(n^2)/ Compute a statistical estimate repeatedly over a
-- sample, each time omitting a successive element.
jackknife :: Estimator -> Sample -> U.Vector Double
jackknife Mean sample             = jackknifeMean sample
jackknife Variance sample         = jackknifeVariance sample
jackknife VarianceUnbiased sample = jackknifeVarianceUnb sample
jackknife StdDev sample = jackknifeStdDev sample
jackknife (Function est) sample
  | G.length sample == 1 = singletonErr "jackknife"
  | otherwise            = U.map f . indices $ sample
  where f i = est (dropAt i sample)

-- | /O(n)/ Compute the jackknife mean of a sample.
jackknifeMean :: Sample -> U.Vector Double
jackknifeMean samp
  | len == 1  = singletonErr "jackknifeMean"
  | otherwise = G.map (/l) $ G.zipWith (+) (pfxSumL samp) (pfxSumR samp)
  where
    l   = fromIntegral (len - 1)
    len = G.length samp

-- | /O(n)/ Compute the jackknife variance of a sample with a
-- correction factor @c@, so we can get either the regular or
-- \"unbiased\" variance.
jackknifeVariance_ :: Double -> Sample -> U.Vector Double
jackknifeVariance_ c samp
  | len == 1  = singletonErr "jackknifeVariance"
  | otherwise = G.zipWith4 go als ars bls brs
  where
    als = pfxSumL . G.map goa $ samp
    ars = pfxSumR . G.map goa $ samp
    goa x = v * v where v = x - m
    bls = pfxSumL . G.map (subtract m) $ samp
    brs = pfxSumR . G.map (subtract m) $ samp
    m = mean samp
    n = fromIntegral len
    go al ar bl br = (al + ar - (b * b) / q) / (q - c)
      where b = bl + br
            q = n - 1
    len = G.length samp

-- | /O(n)/ Compute the unbiased jackknife variance of a sample.
jackknifeVarianceUnb :: Sample -> U.Vector Double
jackknifeVarianceUnb samp
  | G.length samp == 2  = singletonErr "jackknifeVariance"
  | otherwise           = jackknifeVariance_ 1 samp

-- | /O(n)/ Compute the jackknife variance of a sample.
jackknifeVariance :: Sample -> U.Vector Double
jackknifeVariance = jackknifeVariance_ 0

-- | /O(n)/ Compute the jackknife standard deviation of a sample.
jackknifeStdDev :: Sample -> U.Vector Double
jackknifeStdDev = G.map sqrt . jackknifeVarianceUnb

pfxSumL :: U.Vector Double -> U.Vector Double
pfxSumL = G.map kbn . G.scanl add zero

pfxSumR :: U.Vector Double -> U.Vector Double
pfxSumR = G.tail . G.map kbn . G.scanr (flip add) zero

-- | Drop the /k/th element of a vector.
dropAt :: U.Unbox e => Int -> U.Vector e -> U.Vector e
dropAt n v = U.slice 0 n v U.++ U.slice (n+1) (U.length v - n - 1) v

singletonErr :: String -> a
singletonErr func = error $
                    "Statistics.Resampling." ++ func ++ ": not enough elements in sample"

-- | Split a generator into several that can run independently.
splitGen :: Int -> GenIO -> IO [GenIO]
splitGen n gen
  | n <= 0    = return []
  | otherwise =
  fmap (gen:) . replicateM (n-1) $
  initialize =<< (uniformVector gen 256 :: IO (U.Vector Word32))