1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
|
% In this notebook, we demonstrate statistical tests and distributions,
% as implemented in the file STATIST.E. We load this file first.
>load "statist"
% Assume the following measurements. We wish to compute the mean
% value and the measured standard deviation.
>M=[1000,1004,998,997,1002];
>mean(M)
>dev(M)
% Next we plot the normal distribution with this mean and deviation.
>fplot("qnormal(x,mean(M),dev(M))",990,1010); wait(20);
% We wish to compute the probability that a value is bigger than
% 1005, assuming the measured distribution. To do this, we have
% to normalize the value and compute the normal distribution quantile
% of this value.
>1-normaldis((1005-mean(M))/dev(M))
% So the probability is 4.7 percent.
%
% Next we assume measured sizes of men in cm. We set data ranges
% 155.5-159.5, 159.5-163.5 and so on.
>r=155.5:4:190.5; v=[22,71,136,169,139,71,32,8];
% We can use xplotrange to plot these measurements. The function
% expects the end values of the ranges and the measurements. So
% r must be one longer than v.
>xplotrange(r,v); wait(20);
% Next we take as measurement point the middle of each range and
% compute the mean value and the deviation of all data.
>l=(r[1:8]+r[2:9])/2;
>{m,d}=meandev(l,v); m, d,
% meandev computes the mean value of measurements r with multiplicities
% v.
>{m1,d1}=meandev([100,200],[1,99]); m1, d1,
% We can add a plot of the expected distribution with this mean
% value and deviation over the true size measurements.
>hold on; linewidth(3);
>fplot("qnormal(x,m,d)*sum(v)*4",min(r),max(r));
>linewidth(1); hold off; shg;
% Next we test the distribution against the normal distribution.
% This uses a chi^2 test.
>testnormal(r,sum(v),v,m,d)
% This error probablity for rejection is much to high. So we do
% not reject the hypothesis of normal distribution.
%
% In comparision, we try the same with a sample of the built in
% random number generator.
>p=normal(1,1000); testnormal(0:10,1000,count(p*2.5+5,10),5,2.5)
% This will sometimes lead to rejection. To put up a real test,
% we repeat this test 1000 times and reject, if p<0.05. This should
% lead to about 5 percent rejections.
>function test (m=1000)
$n=0;
$loop 1 to m;
$if testnormal(0:10,100,count(normal(1,100)*2.5+5,10),5,2.5)<0.05;
$n=n+1;
$endif;
$end;
$return n/m;
$endfunction
>test
% Next we test dice throws to the equidistribution. Assuming 600
% throws, we got some values.
>chitest([90,103,114,101,103,89],dup(100,6)')
% So we do not reject equi-distribution.
%
% Next we generate 1000 dice throws and do the same.
>n=1000; t=random([1,n*6]); chitest(count(t*6,6),dup(n,6)')
% A result less than 0.05 leads to the assumption of a false dice.
% This may happen every 20-th time.
%
% Next we test for the mean value 100 with the t-test.
>s=100+normal([1,100])*10;
>ttest(mean(s),dev(s),10,100)
% The function accepts the mean, the deviation and the number
% of data, and the mean value to test for.
%
% Finally, we check two measurements for the same mean. We reject
% if the result is <0.05.
>tcomparedata(normal(1,10),normal(1,10))
% If bias one distribution, we get more rejections. Repeat this
% test several times to see the effect.
>tcomparedata(normal(1,10),normal(1,10)+1)
% In the next example, we generate 100 times 20 random dice throws
% and count the ones in it. There must be 20/6=3.3 ones on average.
>R=random(100,20); R=sum(R*6<=1)'; mean(R)
% We now compare the number of ones with the binomial distribution.
% First we plot the distribution of ones.
>t=count(R,21); xplotrange(0:21,t); wait(20);
% Then we compute the expected values.
>n=0:20; b=bin(20,n)*(1/6)^n*(5/6)^(20-n)*100;
% We now collect several numbers to get categories, which are
% big enough.
>t1=sum(t[1:2])|t[3:7]|sum(t[8:21]);
>b1=sum(b[1:2])|b[3:7]|sum(b[8:21]);
% The chitest rejects our result, if its result is <0.05.
>chitest(t1,b1)
% This may happen any 20-th time. So we repeat the test 100 times
% and should get 5 percent rejections.
>function test (m=100)
$n=0;
$b=bin(20,0:20)*(1/6)^(0:20)*(5/6)^(20-(0:20))*100;
$loop 1 to m;
$t=count(sum(random(100,20)*6<=1)',21);
$c=chitest(sum(t[1:2])|t[3:7]|sum(t[8:21]), ..
$sum(b[1:2])|b[3:7]|sum(b[8:21]));
$if c<0.05; n=n+1; endif;
$end;
$return n/m;
$endfunction
>test
% The following example contains results of two groups of persons
% (male and female, say) in voting any of six parties.
>A=[23,37,43,52,67,74;27,39,41,49,63,77]
% We wish to test for independence of the votes from the sex.
>tabletest(A)
% We now demonstrate how to read and write data to a file.
>a=random(1,100); mean(a), dev(a),
% To write it, we use the printformat function.
>open("test.dat","w"); printformat(a'); close();
% To read it, we use the getvector function, which accepts almost
% any data format.
>open("test.dat","r"); a=getvector(100); close();
>mean(a), dev(a),
% Next we use a variance analysis (F-test) to test three samples
% of normally distributed data for same mean value.
>x1=[109,111,98,119,91,118,109,99,115,109,94]; mean(x1),
>x2=[120,124,115,139,114,110,113,120,117]; mean(x2),
>x3=[120,112,115,110,105,134,105,130,121,111]; mean(x3)
>varanalysis(x1,x2,x3)
% This means, we reject the hypothesis of same mean value.
%
% We can compute the binomial distribution. First there is the
% binomialsum function, which returns the probability of i or
% less hits out of n trials.
>binsum(410,1000,0.4)
% The normalum function approximates the same with the normal
% distribution and is much faster.
>normalsum(410,1000,0.4)
% This is the direct way. But normalsum uses some tricks to avoid
% overflow and is faster.
>p=0.4; i=0:410; n=1000; sum(bin(n,i)*p^i*(1-p)^(n-i))
% invbinsum computes the inverse of binomialsum.
>invbinsum(0.75,1000,0.4)
% In Bridge there may be 5 outstanding cards (out of 52) in two
% hands (26 cards). Let us compute the probability of a distribution
% worse than 3:2 (e.g. 0:5, 1:4, 4:1 or 5:0).
>2*hypergeomsum(1,5,13,26)
% An application is the medin test, which is to reject data samples
% with different mean distribution by testing with the median
% of the united sample.
>a=[56,66,68,49,61,53,45,58,54];
>b=[72,81,51,73,69,78,59,67,65,71,68,71];
>mediantest(a,b)
% Another test on equality is the rank test. This is much sharper
% than the median test.
>ranktest(a,b)
% In the following example, both distributions have the same mean.
>ranktest(random(1,100),random(1,50)*3-1)
% Let us now try to simulate two treatments a and b applied to
% different persons.
>a=[8.0,7.4,5.9,9.4,8.6,8.2,7.6,8.1,6.2,8.9];
>b=[6.8,7.1,6.8,8.3,7.9,7.2,7.4,6.8,6.8,8.1];
% The signum test decides, wether a is better than b.
>signtest(a,b)
% This is to much error. We cannot reject that a not better than
% b.
%
% The Wilcoxon test is sharper than this test, but relies on the
% quantitative value of the differences.
>wilcoxon(a,b)
% Two tests with internally generated series.
>wilcoxon(normal(1,20),normal(1,20)-1)
>wilcoxon(normal(1,20),normal(1,20))
>
|