File: ExactDistribution.hs

package info (click to toggle)
haskell-statistics 0.16.2.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 640 kB
  • sloc: haskell: 6,819; ansic: 35; python: 33; makefile: 9
file content (396 lines) | stat: -rw-r--r-- 14,476 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
{-# LANGUAGE BangPatterns,
             FlexibleInstances,
             FlexibleContexts,
             ScopedTypeVariables
  #-}
-- |
-- Module    : Tests.ExactDistribution
-- Copyright : (c) 2022 Lorenz Minder
-- License   : BSD3
--
-- Maintainer  : lminder@gmx.net
-- Stability   : experimental
-- Portability : portable
--
-- Tests comparing distributions to exact versions.
--
-- This module provides exact versions of some distributions, and tests
-- to compare them to the production implementations in
-- Statistics.Distribution.*.  It also contains the functionality to
-- test the production distributions against the exact versions.  Errors
-- are flagged if data points are discovered where the probability mass
-- function, the cumulative probability function, or its complement
-- deviates too far (more than a prescribed tolerance) from the exact
-- calculation.
--
-- The distributions here are implemented with rational integer
-- arithmetic, using pretty much the textbook definitions formulas.
-- Numerical problems like overflow or rounding errors cannot occur with
-- this approach, making them are easy to write, read and verify.  They
-- are, of course, substantially slower than the production
-- distributions in Statistics.Distribution.*.  This makes them
-- unsuitable for most uses other than testing and debugging.  (Also,
-- only a handful of distributions can be implemented exactly with
-- rational arithmetic.)
--
-- This module has the following sub-components:
-- 
-- * Exact (rational) definitions of some distribution functions,
--   including both the probability mass as well as the CDF.
--
-- * QC.Arbitrary implementations to sample test cases (i.e.,
--   distribution parameters and evaluation points).
--
-- * "Linkage": a mechanism to construct a production distribution
--   corresponding to a test case for an exact distribution.
--
-- * A set of tests for the distributions derived using all of the above
--   components.
--
-- This module exports a number symbols which can be useful for
-- debugging and experimentation.  For use in a test suite, only the
-- `exactDistributionTests` function is needed.

module Tests.ExactDistribution (
    -- * Exact math functions
      exactChoose

    -- * Exact distributions
    , ExactDiscreteDistr(..)

    , ExactBinomialDistr(..)
    , ExactDiscreteUniformDistr(..)
    , ExactGeometricDistr(..)
    , ExactHypergeomDistr(..)

    -- * Linking to production distributions
    , ProductionProbFuncs(..)
    , productionProbFuncs
    , ProductionLinkage

    -- * Individual test routines
    , pmfMatch
    , cdfMatch
    , complCdfMatch

    -- * Test groups
    , Tag(..)
    , distTests
    , exactDistributionTests
) where

----------------------------------------------------------------

import Data.Foldable
import Data.Ratio

import Test.Tasty                       (TestTree, testGroup)
import Test.Tasty.QuickCheck            (testProperty)
import Test.QuickCheck as QC
import Numeric.MathFunctions.Comparison (relativeError)

import Statistics.Distribution
import Statistics.Distribution.Binomial
import Statistics.Distribution.DiscreteUniform
import Statistics.Distribution.Geometric
import Statistics.Distribution.Hypergeometric

----------------------------------------------------------------
--
-- Math functions.
--
-- Used for implementing the distributions below.
--
----------------------------------------------------------------

-- | Exactly compute binomial coefficient.
--
-- /n/ need not be an integer, can be fractional.
exactChoose :: Ratio Integer -> Integer -> Ratio Integer
exactChoose n k
    | k < 0     = 0
    | otherwise = foldl' (*) 1 factors
    where   factors = [ (n - k' + j) / j | j <- [1..k'] ]
            k' = fromInteger k :: Ratio Integer

----------------------------------------------------------------
--
-- Exact distributions.
--
----------------------------------------------------------------

-- | Exact discrete distribution.
class ExactDiscreteDistr a where
    -- | Probability mass function.
    exactProb :: a -> Integer -> Ratio Integer
    exactProb d x = exactCumulative d x - exactCumulative d (x - 1)

    -- | Cumulative distribution function.
    exactCumulative :: a -> Integer -> Ratio Integer

-- | Exact Binomial distribution.
data ExactBinomialDistr = ExactBD Integer (Ratio Integer)
    deriving(Show)

instance ExactDiscreteDistr ExactBinomialDistr where
    -- Probability mass, computed with textbook formula.
    exactProb (ExactBD n p) k
        | k < 0 || k > n    = 0
        | otherwise         = exactChoose n' k * p^k * (1-p)^(n-k)
        where n' = fromIntegral n
    -- CDF 
    --
    -- Computed iteratively by summing up all the probabilities
    -- <= /k/.  Rather than computing everything from scratch for each
    -- probability, we reuse previous results.  The meanings of the
    -- variables in the "update" function are:
    -- 
    -- bc   is the binomial coefficient (n choose j),
    -- pj   is the term p^j,
    -- pnj  is the term (1 - p)^(n - j)
    -- r    is the (partial) sum of the probabilities 
    --
    exactCumulative (ExactBD n p) k
        | k < 0             = 0
        | k >= n            = 1
        -- Special case for p = 1, since in the below fold we
        -- divide by (1 - p).
        | p == 1            = if k == n then 1 else 0
        | otherwise
          = result $ foldl' update (1, 1, (1 - p)^n, (1 - p)^n) [1..k]
          where update (!bc, !pj, !pnj, !r) !j =
                    let bc' = bc * (n - j + 1) `div` j 
                        pj' = pj * p
                        pnj' = pnj / (1 - p)
                        r' = r + (fromIntegral bc') * pj' * pnj'
                    in  (bc', pj', pnj', r')
                result (_, _, _, r) = r

-- | Exact Discrete Uniform distribution.
data ExactDiscreteUniformDistr = ExactDU Integer Integer
    deriving(Show)

instance ExactDiscreteDistr ExactDiscreteUniformDistr  where
    exactProb (ExactDU lower upper) k
        | k < lower || k > upper    = 0
        | otherwise                 = 1 % (upper - lower + 1)
    exactCumulative (ExactDU lower upper) k
        | k < lower                 = 0
        | k > upper                 = 1
        | otherwise                 =
            let d = (k - lower + 1)
            in  d % (upper - lower + 1)

-- | Geometric distribution.
data ExactGeometricDistr = ExactGeom (Ratio Integer)
    deriving(Show)

instance ExactDiscreteDistr ExactGeometricDistr where
    exactProb (ExactGeom p) k
        | k < 1                     = 0
        | otherwise                 = (1 - p)^(k - 1) * p

    exactCumulative (ExactGeom p) k = 1 - (1 - p)^k

-- | Hypergeometric distribution.
--
--   Parameters are /K/, /N/ and /n/, where:
--   - /N/ is the total sample space size.
--   - /K/ is number of "good" objects among /N/.
--   - /n/ is the number of draws without replacement.
data ExactHypergeomDistr = ExactHG Integer Integer Integer
    deriving(Show)

instance ExactDiscreteDistr ExactHypergeomDistr where
    exactProb (ExactHG nK nN n) k
        | k < 0                     = 0
        | k > n || k > nN           = 0
        | otherwise                 =
            exactChoose nK' k * exactChoose (nN' - nK') (n - k)
                / exactChoose nN' n
            where nN' = fromIntegral nN
                  nK' = fromIntegral nK

    exactCumulative d k = sum [ exactProb d i | i <- [0..k] ]

----------------------------------------------------------------
--
-- TestCase construction.
--
-- Contains the TestCase data type which encapsulates an instance of an
-- exact distribution together with an evaluation point.
--
-- Then in contains the QC.Arbitrary implementations for TestCases of
-- the different exact distributions.  As a general rule, we try the
-- sampling to be relatively efficient, i.e., we only want to sample
-- valid distribution parameters.  The evaluation points are sampled
-- such that most points are within the support of the distribution.
--
----------------------------------------------------------------

-- Divisor to compute a rational number from an integer.
--
-- We want input parameters to be exactly representable as
-- Double values.  This is so that the production distribution does not
-- mismatch the exact one simply because the input values don't exactly
-- match.  (This can happen if the derivative of the distribution
-- function is large.)   For this reason, the gd value needs to be a
-- power of 2, and <= 2^53, since the mantissa of a Double is 53 bits.
--
-- A value of 2^53 gives the most accurate and diverse tests, but the
-- cost is increased running times, as the computed numerators and
-- denominators will become quite large.
gd :: Integer
gd = 2^(16 :: Int)

-- TestCase
--
-- Combination of an exact distribution together with an evaluation point.
data TestCase a = TestCase a Integer deriving (Show)

instance QC.Arbitrary (TestCase ExactBinomialDistr) where
    arbitrary = do
        -- This somewhat odd sampling of /n/ is done so that lower
        -- values (<1000) are more often represented as the larger ones.
        n <- (*) <$> chooseInteger (1,1000) <*> chooseInteger(1,2)
        p <- (% gd) <$> chooseInteger (0, gd)
        k <- chooseInteger (-1, n + 1)
        return $ TestCase (ExactBD n p) k
    shrink _ = []

instance QC.Arbitrary (TestCase ExactDiscreteUniformDistr) where
    arbitrary = do
        a <- chooseInteger (-1000, 1000)
        sz <- chooseInteger (1, 1000)
        let b = a + sz
        k <- chooseInteger (a - 10, b + 10)
        return $ TestCase (ExactDU a b) k
    shrink _ = []

instance QC.Arbitrary (TestCase ExactGeometricDistr) where
    arbitrary = do
        p <- (% gd) <$> chooseInteger (1, gd)
        let lim = (floor $ 100 / p) :: Integer
        k <- chooseInteger (0, lim)
        return $ TestCase (ExactGeom p) k
    shrink _ = []

instance QC.Arbitrary (TestCase ExactHypergeomDistr) where
    arbitrary = do
        nN <- chooseInteger (1, 100)        -- XXX lower bound should be 0
        nK <- chooseInteger (0, nN)
        n  <- chooseInteger (1, nN)         -- XXX lower bound should be 0
        k  <- chooseInteger (0, min n nK)
        return $ TestCase (ExactHG nK nN n) k
    shrink _ = []

----------------------------------------------------------------
--
-- Linking to the production distributions
--
-- This section contains the ProductionLinkage typeclass and
-- implementation, that allows to obtain a functions for evaluating
-- the production distribution functions for a corresponding exact
-- distribution.
--
----------------------------------------------------------------

-- | Distribution evaluation functions.
--
-- This is used to store a
data ProductionProbFuncs = ProductionProbFuncs {
        prodProb            :: Int -> Double
    ,   prodCumulative      :: Double -> Double
    ,   prodComplCumulative :: Double -> Double
    }

productionProbFuncs :: (DiscreteDistr a) => a -> ProductionProbFuncs
productionProbFuncs d = ProductionProbFuncs {
        prodProb = probability d
    ,   prodCumulative = cumulative d
    ,   prodComplCumulative = complCumulative d
    }

class (ExactDiscreteDistr a) => ProductionLinkage a where
    productionLinkage :: a -> ProductionProbFuncs

instance ProductionLinkage ExactBinomialDistr where
    productionLinkage (ExactBD n p) =
        let d = binomial (fromIntegral n) (fromRational p)
        in  productionProbFuncs d

instance ProductionLinkage ExactDiscreteUniformDistr where
    productionLinkage (ExactDU lower upper) =
        let d = discreteUniformAB (fromIntegral lower) (fromIntegral upper)
        in  productionProbFuncs d

instance ProductionLinkage ExactGeometricDistr where
    productionLinkage (ExactGeom p) =
        let d = geometric $ fromRational p
        in  productionProbFuncs d

instance ProductionLinkage ExactHypergeomDistr where
    productionLinkage (ExactHG nK nN n) =
        let d = hypergeometric (fromIntegral nK) (fromIntegral nN) (fromIntegral n)
        in  productionProbFuncs d

----------------------------------------------------------------
-- Tests
----------------------------------------------------------------

-- Check production probability mass function accuracy.
--
-- Inputs: tolerance (max relative error) and test case
pmfMatch :: (Show a, ProductionLinkage a) => Double -> TestCase a -> Bool
pmfMatch tol (TestCase dExact k) =
    let dProd = productionLinkage dExact
        pe = fromRational $ exactProb dExact k
        pa = prodProb dProd k'
        k' = fromIntegral k
    in  relativeError pe pa < tol

-- Check production cumulative probability function accuracy.
--
-- Inputs:  tolerance (max relative error) and test case.
cdfMatch :: (Show a, ProductionLinkage a) => Double -> TestCase a -> Bool
cdfMatch tol (TestCase dExact k) =
    let dProd = productionLinkage dExact
        pe = fromRational $ exactCumulative dExact k
        pa = prodCumulative dProd k'
        k' = fromIntegral k
    in  relativeError pe pa < tol

-- Check production complement cumulative function accuracy.
--
-- Inputs:  tolerance (max relative error) and test case.
complCdfMatch :: (Show a, ProductionLinkage a) => Double -> TestCase a -> Bool
complCdfMatch tol (TestCase dExact k) =
    let dProd = productionLinkage dExact
        pe = fromRational $ 1 - exactCumulative dExact k
        pa = prodComplCumulative dProd k'
        k' = fromIntegral k
    in  relativeError pe pa < tol

-- Phantom type to encode an exact distribution.
data Tag a = Tag

distTests :: (Show a, ProductionLinkage a, Arbitrary (TestCase a)) =>
    Tag a -> String -> Double -> TestTree
distTests (Tag :: Tag a) name tol =
    testGroup ("Exact tests for " ++ name) [
        testProperty "PMF match" $ ((pmfMatch tol) :: TestCase a -> Bool)
    ,   testProperty "CDF match" $ ((cdfMatch tol) :: TestCase a -> Bool)
    ,   testProperty "1 - CDF match" $ ((complCdfMatch tol) :: TestCase a -> Bool)
    ]


-- Test driver -------------------------------------------------

exactDistributionTests :: TestTree
exactDistributionTests = testGroup "Test distributions against exact"
  [
    distTests (Tag :: Tag ExactBinomialDistr)       "Binomial"          1.0e-12
  , distTests (Tag :: Tag ExactDiscreteUniformDistr) "DiscreteUniform"  1.0e-12
  , distTests (Tag :: Tag ExactGeometricDistr)      "Geometric"         1.0e-13
  , distTests (Tag :: Tag ExactHypergeomDistr)      "Hypergeometric"    1.0e-12
  ]