1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
|
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Denis Engemann <denis.engemann@gmail.com>
# Eric Larson <larson.eric.d@gmail.com>
#
# License: Simplified BSD
import numpy as np
from functools import reduce
from string import ascii_uppercase
from ..utils import _check_option
# The following function is a rewriting of scipy.stats.f_oneway
# Contrary to the scipy.stats.f_oneway implementation it does not
# copy the data while keeping the inputs unchanged.
def ttest_1samp_no_p(X, sigma=0, method='relative'):
"""Perform one-sample t-test.
This is a modified version of :func:`scipy.stats.ttest_1samp` that avoids
a (relatively) time-consuming p-value calculation, and can adjust
for implausibly small variance values [1]_.
Parameters
----------
X : array
Array to return t-values for.
sigma : float
The variance estate will be given by "var + sigma * max(var)" or
"var + sigma", depending on "method". By default this is 0 (no
adjustment). See Notes for details.
method : str
If 'relative', the minimum variance estimate will be sigma * max(var),
if 'absolute' the minimum variance estimate will be sigma.
Returns
-------
t : array
t-values, potentially adjusted using the hat method.
Notes
-----
To use the "hat" adjustment method [1]_, a value of ``sigma=1e-3`` may be a
reasonable choice.
You can use the conversion from ``scipy.stats.distributions.t.ppf``::
thresh = -scipy.stats.distributions.t.ppf(p_thresh, n_samples - 1) / 2.
to convert a desired p-value threshold to 2-tailed t-value threshold.
For one-tailed tests, ``thresh`` in the above should be multiplied by 2
(and for ``tail=-1``, multiplied by ``-1``).
References
----------
.. [1] Ridgway et al. 2012 "The problem of low variance voxels in
statistical parametric mapping; a new hat avoids a 'haircut'",
NeuroImage. 2012 Feb 1;59(3):2131-41.
"""
_check_option('method', method, ['absolute', 'relative'])
var = np.var(X, axis=0, ddof=1)
if sigma > 0:
limit = sigma * np.max(var) if method == 'relative' else sigma
var += limit
return np.mean(X, axis=0) / np.sqrt(var / X.shape[0])
def f_oneway(*args):
"""Perform a 1-way ANOVA.
The one-way ANOVA tests the null hypothesis that 2 or more groups have
the same population mean. The test is applied to samples from two or
more groups, possibly with differing sizes [1]_.
This is a modified version of :func:`scipy.stats.f_oneway` that avoids
computing the associated p-value.
Parameters
----------
*args : array_like
The sample measurements should be given as arguments.
Returns
-------
F-value : float
The computed F-value of the test.
Notes
-----
The ANOVA test has important assumptions that must be satisfied in order
for the associated p-value to be valid.
1. The samples are independent
2. Each sample is from a normally distributed population
3. The population standard deviations of the groups are all equal. This
property is known as homocedasticity.
If these assumptions are not true for a given set of data, it may still be
possible to use the Kruskal-Wallis H-test (:func:`scipy.stats.kruskal`)
although with some loss of power
The algorithm is from Heiman [2]_, pp.394-7.
References
----------
.. [1] Lowry, Richard. "Concepts and Applications of Inferential
Statistics". Chapter 14.
.. [2] Heiman, G.W. Research Methods in Statistics. 2002.
"""
n_classes = len(args)
n_samples_per_class = np.array([len(a) for a in args])
n_samples = np.sum(n_samples_per_class)
ss_alldata = reduce(lambda x, y: x + y,
[np.sum(a ** 2, axis=0) for a in args])
sums_args = [np.sum(a, axis=0) for a in args]
square_of_sums_alldata = reduce(lambda x, y: x + y, sums_args) ** 2
square_of_sums_args = [s ** 2 for s in sums_args]
sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
ssbn = 0
for k, _ in enumerate(args):
ssbn += square_of_sums_args[k] / n_samples_per_class[k]
ssbn -= square_of_sums_alldata / float(n_samples)
sswn = sstot - ssbn
dfbn = n_classes - 1
dfwn = n_samples - n_classes
msb = ssbn / float(dfbn)
msw = sswn / float(dfwn)
f = msb / msw
return f
def _map_effects(n_factors, effects):
"""Map effects to indices."""
if n_factors > len(ascii_uppercase):
raise ValueError('Maximum number of factors supported is 26')
factor_names = list(ascii_uppercase[:n_factors])
if isinstance(effects, str):
if '*' in effects and ':' in effects:
raise ValueError('Not "*" and ":" permitted in effects')
elif '+' in effects and ':' in effects:
raise ValueError('Not "+" and ":" permitted in effects')
elif effects == 'all':
effects = None
elif len(effects) == 1 or ':' in effects:
effects = [effects]
elif '+' in effects:
# all main effects
effects = effects.split('+')
elif '*' in effects:
pass # handle later
else:
raise ValueError('"{}" is not a valid option for "effects"'
.format(effects))
if isinstance(effects, list):
bad_names = [e for e in effects if e not in factor_names]
if len(bad_names) > 1:
raise ValueError('Effect names: {} are not valid. They should '
'the first `n_factors` ({}) characters from the'
'alphabet'.format(bad_names, n_factors))
indices = list(np.arange(2 ** n_factors - 1))
names = list()
for this_effect in indices:
contrast_idx = _get_contrast_indices(this_effect + 1, n_factors)
this_code = (n_factors - 1) - np.where(contrast_idx == 1)[0]
this_name = [factor_names[e] for e in this_code]
this_name.sort()
names.append(':'.join(this_name))
if effects is None or isinstance(effects, str):
effects_ = names
else:
effects_ = effects
selection = [names.index(sel) for sel in effects_]
names = [names[sel] for sel in selection]
if isinstance(effects, str):
if '*' in effects:
# hierarchical order of effects
# the * based effect can be used as stop index
sel_ind = names.index(effects.replace('*', ':')) + 1
names = names[:sel_ind]
selection = selection[:sel_ind]
return selection, names
def _get_contrast_indices(effect_idx, n_factors): # noqa: D401
"""Henson's factor coding, see num2binvec."""
binrepr = np.binary_repr(effect_idx, n_factors)
return np.array([int(i) for i in binrepr], dtype=int)
def _iter_contrasts(n_subjects, factor_levels, effect_picks):
"""Set up contrasts."""
from scipy.signal import detrend
sc = []
n_factors = len(factor_levels)
# prepare computation of Kronecker products
for n_levels in factor_levels:
# for each factor append
# 1) column vector of length == number of levels,
# 2) square matrix with diagonal == number of levels
# main + interaction effects for contrasts
sc.append([np.ones([n_levels, 1]),
detrend(np.eye(n_levels), type='constant')])
for this_effect in effect_picks:
contrast_idx = _get_contrast_indices(this_effect + 1, n_factors)
c_ = sc[0][contrast_idx[n_factors - 1]]
for i_contrast in range(1, n_factors):
this_contrast = contrast_idx[(n_factors - 1) - i_contrast]
c_ = np.kron(c_, sc[i_contrast][this_contrast])
df1 = np.linalg.matrix_rank(c_)
df2 = df1 * (n_subjects - 1)
yield c_, df1, df2
def f_threshold_mway_rm(n_subjects, factor_levels, effects='A*B',
pvalue=0.05):
"""Compute F-value thresholds for a two-way ANOVA.
Parameters
----------
n_subjects : int
The number of subjects to be analyzed.
factor_levels : list-like
The number of levels per factor.
effects : str
A string denoting the effect to be returned. The following
mapping is currently supported:
* ``'A'``: main effect of A
* ``'B'``: main effect of B
* ``'A:B'``: interaction effect
* ``'A+B'``: both main effects
* ``'A*B'``: all three effects
pvalue : float
The p-value to be thresholded.
Returns
-------
F_threshold : list | float
list of F-values for each effect if the number of effects
requested > 2, else float.
See Also
--------
f_oneway
f_mway_rm
Notes
-----
.. versionadded:: 0.10
"""
from scipy.stats import f
effect_picks, _ = _map_effects(len(factor_levels), effects)
F_threshold = []
for _, df1, df2 in _iter_contrasts(n_subjects, factor_levels,
effect_picks):
F_threshold.append(f(df1, df2).isf(pvalue))
return F_threshold if len(F_threshold) > 1 else F_threshold[0]
def f_mway_rm(data, factor_levels, effects='all',
correction=False, return_pvals=True):
"""Compute M-way repeated measures ANOVA for fully balanced designs.
Parameters
----------
data : ndarray
3D array where the first two dimensions are compliant
with a subjects X conditions scheme where the first
factor repeats slowest::
A1B1 A1B2 A2B1 A2B2
subject 1 1.34 2.53 0.97 1.74
subject ... .... .... .... ....
subject k 2.45 7.90 3.09 4.76
The last dimensions is thought to carry the observations
for mass univariate analysis.
factor_levels : list-like
The number of levels per factor.
effects : str | list
A string denoting the effect to be returned. The following
mapping is currently supported (example with 2 factors):
* ``'A'``: main effect of A
* ``'B'``: main effect of B
* ``'A:B'``: interaction effect
* ``'A+B'``: both main effects
* ``'A*B'``: all three effects
* ``'all'``: all effects (equals 'A*B' in a 2 way design)
If list, effect names are used: ``['A', 'B', 'A:B']``.
correction : bool
The correction method to be employed if one factor has more than two
levels. If True, sphericity correction using the Greenhouse-Geisser
method will be applied.
return_pvals : bool
If True, return p-values corresponding to F-values.
Returns
-------
F_vals : ndarray
An array of F-statistics with length corresponding to the number
of effects estimated. The shape depends on the number of effects
estimated.
p_vals : ndarray
If not requested via return_pvals, defaults to an empty array.
See Also
--------
f_oneway
f_threshold_mway_rm
Notes
-----
.. versionadded:: 0.10
"""
from scipy.stats import f
if data.ndim == 2: # general purpose support, e.g. behavioural data
data = data[:, :, np.newaxis]
elif data.ndim > 3: # let's allow for some magic here.
data = data.reshape(
data.shape[0], data.shape[1], np.prod(data.shape[2:]))
effect_picks, _ = _map_effects(len(factor_levels), effects)
n_obs = data.shape[2]
n_replications = data.shape[0]
# put last axis in front to 'iterate' over mass univariate instances.
data = np.rollaxis(data, 2)
fvalues, pvalues = [], []
for c_, df1, df2 in _iter_contrasts(n_replications, factor_levels,
effect_picks):
y = np.dot(data, c_)
b = np.mean(y, axis=1)[:, np.newaxis, :]
ss = np.sum(np.sum(y * b, axis=2), axis=1)
mse = (np.sum(np.sum(y * y, axis=2), axis=1) - ss) / (df2 / df1)
fvals = ss / mse
fvalues.append(fvals)
if correction:
# sample covariances, leave off "/ (y.shape[1] - 1)" norm because
# it falls out.
v = np.array([np.dot(y_.T, y_) for y_ in y])
v = (np.array([np.trace(vv) for vv in v]) ** 2 /
(df1 * np.sum(np.sum(v * v, axis=2), axis=1)))
eps = v
df1, df2 = np.zeros(n_obs) + df1, np.zeros(n_obs) + df2
if correction:
# numerical imprecision can cause eps=0.99999999999999989
# even with a single category, so never let our degrees of
# freedom drop below 1.
df1, df2 = [np.maximum(d[None, :] * eps, 1.) for d in (df1, df2)]
if return_pvals:
pvals = f(df1, df2).sf(fvals)
else:
pvals = np.empty(0)
pvalues.append(pvals)
# handle single effect returns
return [np.squeeze(np.asarray(vv)) for vv in (fvalues, pvalues)]
def _parametric_ci(arr, ci=.95):
"""Calculate the `ci`% parametric confidence interval for `arr`."""
mean = arr.mean(0)
if len(arr) < 2: # can't compute standard error
sigma = np.full_like(mean, np.nan)
return mean, sigma
from scipy import stats
sigma = stats.sem(arr, 0)
return stats.t.interval(ci, loc=mean, scale=sigma, df=arr.shape[0])
|