File: ks_test.sl

package info (click to toggle)
slang2 2.3.3-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 11,488 kB
  • sloc: ansic: 101,756; sh: 3,435; makefile: 1,046; pascal: 440
file content (91 lines) | stat: -rw-r--r-- 2,530 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
% Kolmogorov–Smirnov test

% Asymptotically correct.  Stephens 1974
private define ks_test_prob (n, d)
{
   variable sn = sqrt(n);
   variable factor = sn + 0.12 + 0.11/sn;
   return 1-smirnov_cdf (sn * d);
}

define ks_test ()
{
   variable d_ref = NULL;
   if (_NARGS == 2)
     d_ref = ();
   else if (_NARGS != 1)
     usage ("p = ks_test (CDF [,&D]);  %% 1-sample KS test\n",
	    + " Here CDF are the expected CDFs at the corresponding random points.");
   variable cdf = ();

   cdf = __tmp(cdf)[array_sort(cdf)];
   variable n = length (cdf);
   variable nn = 1.0*n;
   variable dplus = max ([1:n]/nn - cdf);
   variable dminus = max (cdf-[0:n-1]/nn);
   variable d = max ([dplus, dminus]);
   if (d_ref != NULL)
     @d_ref = d;

   return ks_test_prob (n, d);
}

% We want ks_test2_prob to return P(D_mn >= d), where d is the observed value.
% It is known that d can only take on values c/mn where c, m, and n are integers.
% So set d=c/mn.
% kim_jennrich_cdf returns P(D_mn <= c/mn)
% But we want P(D_mn >= c/mn) = 1-P(D_mn < c/mn)
%   P(D_mn <= (c-1)/mn) <= P(D_mn < c/mn) <= P(D_mn <= c/mn)
%   P(D_mn <= (c-1)/mn) <= P(D_mn < c/mn) <= P(D_mn < c/mn) + P(D_mn==c/mn)
%   P(D_mn <= (c-1)/mn) <= P(D_mn < c/mn) + P(D_mn==c/mn)
%
% Since D_mn can only take on values c/mn, it follows that
%   P(D_mn < c/mn) = P(D_mn <= (c-1)/mn)
%
private define ks_test2_prob ()
{
   %if (_NARGS != 3)
   %  usage ("p = %s(m, n, d); %% P(D_mn >= d)", _function_name ());
   variable d, m, n; (m, n, d) = ();

   % See the above note for why 1 is subtracted for the first argument of
   % kim_jennrich.
   variable fm = double (m);
   if (fm * n <= 10000.0)
     return 1.0 - kim_jennrich_cdf (m, n, int (d*m*n + 0.5) - 1);

   % Use asymptotic forms.
   return ks_test_prob ((fm*n)/(fm+n), d);
}

define ks_test2 ()
{
   variable d_ref = NULL;
   if (_NARGS == 3)
     d_ref = ();
   else if (_NARGS != 2)
     usage ("p = %s(X1, X2 [,&D]); %% Two-sample KS test", _function_name ());

   variable xm, xn; (xm, xn) = ();
   variable x = [xn, xm];
   variable n = length (xn);
   variable m = length (xm);
   variable mn = m + n;
   variable c = Int_Type[mn];
   c[[0:n-1]] = 1;

   variable i = array_sort (x);
   x = x[i];
   c = c[i]; c = cumsum (__tmp(c));
   variable dmn = (c/n - [1:mn]/(mn*1.0));
   variable factor = mn/(m*1.0);
   variable dplus = factor * max(dmn);
   variable dminus = factor * min(dmn);
   variable d = max([dplus, -dminus]);

   if (d_ref != NULL)
     @d_ref = d;

   return ks_test2_prob (m, n, d);
}