File: StudentT.hs

package info (click to toggle)
haskell-statistics 0.16.2.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 640 kB
  • sloc: haskell: 6,819; ansic: 35; python: 33; makefile: 9
file content (149 lines) | stat: -rw-r--r-- 5,958 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
{-# LANGUAGE FlexibleContexts, Rank2Types, ScopedTypeVariables #-}
-- | Student's T-test is for assessing whether two samples have
--   different mean. This module contain several variations of
--   T-test. It's a parametric tests and assumes that samples are
--   normally distributed.
module Statistics.Test.StudentT
    (
      studentTTest
    , welchTTest
    , pairedTTest
    , module Statistics.Test.Types
    ) where

import Statistics.Distribution hiding (mean)
import Statistics.Distribution.StudentT
import Statistics.Sample (mean, varianceUnbiased)
import Statistics.Test.Types
import Statistics.Types    (mkPValue,PValue)
import Statistics.Function (square)
import qualified Data.Vector.Generic  as G
import qualified Data.Vector.Unboxed  as U
import qualified Data.Vector.Storable as S
import qualified Data.Vector          as V



-- | Two-sample Student's t-test. It assumes that both samples are
--   normally distributed and have same variance. Returns @Nothing@ if
--   sample sizes are not sufficient.
studentTTest :: (G.Vector v Double)
             => PositionTest  -- ^ one- or two-tailed test
             -> v Double      -- ^ Sample A
             -> v Double      -- ^ Sample B
             -> Maybe (Test StudentT)
studentTTest test sample1 sample2
  | G.length sample1 < 2 || G.length sample2 < 2 = Nothing
  | otherwise                                    = Just Test
      { testSignificance = significance test t ndf
      , testStatistics   = t
      , testDistribution = studentT ndf
      }
  where
    (t, ndf) = tStatistics True sample1 sample2
{-# INLINABLE  studentTTest #-}
{-# SPECIALIZE studentTTest :: PositionTest -> U.Vector Double -> U.Vector Double -> Maybe (Test StudentT) #-}
{-# SPECIALIZE studentTTest :: PositionTest -> S.Vector Double -> S.Vector Double -> Maybe (Test StudentT) #-}
{-# SPECIALIZE studentTTest :: PositionTest -> V.Vector Double -> V.Vector Double -> Maybe (Test StudentT) #-}

-- | Two-sample Welch's t-test. It assumes that both samples are
--   normally distributed but doesn't assume that they have same
--   variance. Returns @Nothing@ if sample sizes are not sufficient.
welchTTest :: (G.Vector v Double)
           => PositionTest  -- ^ one- or two-tailed test
           -> v Double      -- ^ Sample A
           -> v Double      -- ^ Sample B
           -> Maybe (Test StudentT)
welchTTest test sample1 sample2
  | G.length sample1 < 2 || G.length sample2 < 2 = Nothing
  | otherwise                                    = Just Test
      { testSignificance = significance test t ndf
      , testStatistics   = t
      , testDistribution = studentT ndf
      }
  where
    (t, ndf) = tStatistics False sample1 sample2
{-# INLINABLE  welchTTest #-}
{-# SPECIALIZE welchTTest :: PositionTest -> U.Vector Double -> U.Vector Double -> Maybe (Test StudentT) #-}
{-# SPECIALIZE welchTTest :: PositionTest -> S.Vector Double -> S.Vector Double -> Maybe (Test StudentT) #-}
{-# SPECIALIZE welchTTest :: PositionTest -> V.Vector Double -> V.Vector Double -> Maybe (Test StudentT) #-}

-- | Paired two-sample t-test. Two samples are paired in a
-- within-subject design. Returns @Nothing@ if sample size is not
-- sufficient.
pairedTTest :: forall v. (G.Vector v (Double, Double), G.Vector v Double)
            => PositionTest          -- ^ one- or two-tailed test
            -> v (Double, Double)    -- ^ paired samples
            -> Maybe (Test StudentT)
pairedTTest test sample
  | G.length sample < 2 = Nothing
  | otherwise           = Just Test
      { testSignificance = significance test t ndf
      , testStatistics   = t
      , testDistribution = studentT ndf
      }
  where
    (t, ndf) = tStatisticsPaired sample
{-# INLINABLE  pairedTTest #-}
{-# SPECIALIZE pairedTTest :: PositionTest -> U.Vector (Double,Double) -> Maybe (Test StudentT) #-}
{-# SPECIALIZE pairedTTest :: PositionTest -> V.Vector (Double,Double) -> Maybe (Test StudentT) #-}


-------------------------------------------------------------------------------

significance :: PositionTest    -- ^ one- or two-tailed
             -> Double          -- ^ t statistics
             -> Double          -- ^ degree of freedom
             -> PValue Double   -- ^ p-value
significance test t df =
  case test of
    -- Here we exploit symmetry of T-distribution and calculate small tail
    SamplesDiffer -> mkPValue $ 2 * tailArea (negate (abs t))
    AGreater      -> mkPValue $ tailArea (negate t)
    BGreater      -> mkPValue $ tailArea  t
  where
    tailArea = cumulative (studentT df)


-- Calculate T statistics for two samples
tStatistics :: (G.Vector v Double)
            => Bool               -- variance equality
            -> v Double
            -> v Double
            -> (Double, Double)
{-# INLINE tStatistics #-}
tStatistics varequal sample1 sample2 = (t, ndf)
  where
    -- t-statistics
    t = (m1 - m2) / sqrt (
      if varequal
        then ((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2) * (1 / n1 + 1 / n2)
        else s1 / n1 + s2 / n2)

    -- degree of freedom
    ndf | varequal  = n1 + n2 - 2
        | otherwise = square (s1 / n1 + s2 / n2)
                    / (square s1 / (square n1 * (n1 - 1)) + square s2 / (square n2 * (n2 - 1)))
    -- statistics of two samples
    n1 = fromIntegral $ G.length sample1
    n2 = fromIntegral $ G.length sample2
    m1 = mean sample1
    m2 = mean sample2
    s1 = varianceUnbiased sample1
    s2 = varianceUnbiased sample2


-- Calculate T-statistics for paired sample
tStatisticsPaired :: (G.Vector v (Double, Double))
                  => v (Double, Double)
                  -> (Double, Double)
{-# INLINE tStatisticsPaired #-}
tStatisticsPaired sample = (t, ndf)
  where
    -- t-statistics
    t = let d    = U.map (uncurry (-)) $ G.convert sample
            sumd = U.sum d
        in sumd / sqrt ((n * U.sum (U.map square d) - square sumd) / ndf)
    -- degree of freedom
    ndf = n - 1
    n   = fromIntegral $ G.length sample