1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
#!/usr/local/bin/perl
# ********************************************************************
# * Copyright (C) 2016 and later: Unicode, Inc. and others.
# * License & terms of use: http://www.unicode.org/copyright.html
# ********************************************************************
# ********************************************************************
# * COPYRIGHT:
# * Copyright (c) 2002, International Business Machines Corporation and
# * others. All Rights Reserved.
# ********************************************************************
package Dataset;
use Statistics::Descriptive;
use Statistics::Distributions;
use strict;
# Create a new Dataset with the given data.
sub new {
    # Usage: Dataset->new(@samples)
    #
    # Builds a Dataset from zero or more numeric samples:
    #   - no samples : mean and error stay at 0.0
    #   - one sample : only the mean is computed
    #   - two or more: the error is computed as well
    # The scale always starts at 1.0.  Returns the new blessed object.
    #
    # The samples are copied out of @_.  The elements of @_ alias the
    # caller's variables, so storing a reference to @_ itself would let
    # later changes in the caller silently alter the stored data.
    my ($class, @samples) = @_;

    my $self = bless {
        _data  => \@samples,
        _scale => 1.0,
        _mean  => 0.0,
        _error => 0.0,
    }, $class;

    my $n = scalar @samples;
    if ($n >= 1) {
        my $stats = Statistics::Descriptive::Full->new();
        $stats->add_data(@samples);
        $self->{_mean} = $stats->mean();
        if ($n >= 2) {
            # Use a t distribution rather than Gaussian because (a) we
            # assume an underlying normal dist, (b) we do not know the
            # standard deviation -- we estimate it from the data, and (c)
            # we MAY have a small sample size (also works for large n).
            # 0.005 is the probability passed to tdistr for n-1 degrees
            # of freedom; presumably the upper-tail value for a two-sided
            # 99% interval -- confirm against Statistics::Distributions.
            my $t = Statistics::Distributions::tdistr($n - 1, 0.005);
            $self->{_error} = $t * $stats->standard_deviation();
        }
    }

    return $self;
}
# Set a scaling factor for all data; 1.0 means no scaling.
# Scale must be > 0.
sub setScale {
    # Arguments: the Dataset, then the new scaling factor.  The value is
    # stored as given; no range check is performed here.
    my $self  = shift;
    my $scale = shift;

    # The assignment is also the return value, as in the implicit-return
    # form.
    return $self->{_scale} = $scale;
}
# Multiply the scaling factor by a value.
sub scaleBy {
    # Arguments: the Dataset, then the factor to multiply the current
    # scale by.  Returns the updated scale.
    my $self   = shift;
    my $factor = shift;

    return $self->{_scale} *= $factor;
}
# Return the mean.
sub getMean {
    # Returns the stored mean multiplied by the current scale.
    my ($self) = @_;
    my ($raw_mean, $scale) = @{$self}{qw(_mean _scale)};
    return $raw_mean * $scale;
}
# Return a 99% error based on the t distribution. The dataset
# is described as getMean() +/- getError().
sub getError {
    # Returns the stored error multiplied by the current scale, i.e. the
    # half-width that goes with the value returned by getMean().
    my ($self) = @_;
    my $raw_error = $self->{_error};
    my $scale     = $self->{_scale};
    return $raw_error * $scale;
}
# Divide two Datasets and return a new one, maintaining the
# mean+/-error. The new Dataset has no data points.
sub divide {
    # Arguments: the numerator Dataset ($self) and the denominator
    # Dataset ($rhs).  Returns a new Dataset created with no data points.
    #
    # The quotient interval is taken from the extreme ratios of the two
    # unscaled intervals mean +/- error; the result's mean is the
    # midpoint of those ratios and its error is the distance from the
    # midpoint to the smaller ratio.  The scales are divided separately.
    #
    # NOTE(review): this looks like it assumes both intervals are
    # strictly positive; a denominator bound of zero makes Perl die with
    # a division-by-zero error -- confirm callers never pass such data.
    my ($self, $rhs) = @_;

    my $num_lo = $self->{_mean} - $self->{_error};
    my $den_hi = $rhs->{_mean} + $rhs->{_error};
    my $minratio = $num_lo / $den_hi;

    my $num_hi = $self->{_mean} + $self->{_error};
    my $den_lo = $rhs->{_mean} - $rhs->{_error};
    my $maxratio = $num_hi / $den_lo;

    my $quotient = Dataset->new();
    my $midpoint = ($minratio + $maxratio) / 2;
    $quotient->{_mean}  = $midpoint;
    $quotient->{_error} = $midpoint - $minratio;
    $quotient->{_scale} = $self->{_scale} / $rhs->{_scale};
    return $quotient;
}
# Subtracts two Datasets and returns a new one, maintaining the
# mean+/-error. The new Dataset has no data points.
sub subtract {
    # Arguments: the minuend Dataset ($self) and the subtrahend Dataset
    # ($rhs).  Returns a new Dataset created with no data points whose
    # mean is the difference of the means and whose error is the sum of
    # the errors.
    my ($self, $rhs) = @_;
    my $result = Dataset->new();

    if ($self->{_scale} == $rhs->{_scale}) {
        # Same scale on both sides: combine the raw values and keep that
        # scale.
        $result->{_mean}  = $self->{_mean} - $rhs->{_mean};
        $result->{_error} = $self->{_error} + $rhs->{_error};
        $result->{_scale} = $self->{_scale};
    }
    else {
        # Different scales: the raw values are not directly comparable,
        # so combine the scaled values and store the result with a
        # neutral scale of 1.0.  Taking only $self's scale here would
        # silently misreport $rhs's contribution.
        $result->{_mean}  = $self->{_mean} * $self->{_scale}
                          - $rhs->{_mean} * $rhs->{_scale};
        $result->{_error} = $self->{_error} * $self->{_scale}
                          + $rhs->{_error} * $rhs->{_scale};
        $result->{_scale} = 1.0;
    }

    return $result;
}
# Adds two Datasets and returns a new one, maintaining the
# mean+/-error. The new Dataset has no data points.
sub add {
    # Arguments: the two Datasets to add ($self and $rhs).  Returns a
    # new Dataset created with no data points whose mean is the sum of
    # the means and whose error is the sum of the errors.
    my ($self, $rhs) = @_;
    my $result = Dataset->new();

    if ($self->{_scale} == $rhs->{_scale}) {
        # Same scale on both sides: combine the raw values and keep that
        # scale.
        $result->{_mean}  = $self->{_mean} + $rhs->{_mean};
        $result->{_error} = $self->{_error} + $rhs->{_error};
        $result->{_scale} = $self->{_scale};
    }
    else {
        # Different scales: the raw values are not directly comparable,
        # so combine the scaled values and store the result with a
        # neutral scale of 1.0.  Taking only $self's scale here would
        # silently misreport $rhs's contribution.
        $result->{_mean}  = $self->{_mean} * $self->{_scale}
                          + $rhs->{_mean} * $rhs->{_scale};
        $result->{_error} = $self->{_error} * $self->{_scale}
                          + $rhs->{_error} * $rhs->{_scale};
        $result->{_scale} = 1.0;
    }

    return $result;
}
# Divides a dataset by a scalar.
# The new Dataset has no data points.
sub divideByScalar {
    # Arguments: the Dataset and a plain number $s to divide by.
    # Returns a new Dataset created with no data points; the scale is
    # copied from $self unchanged.  A zero divisor makes Perl die with
    # its usual division-by-zero error.
    my ($self, $s) = @_;
    my $result = Dataset->new();

    $result->{_mean} = $self->{_mean} / $s;

    # The error is a half-width, so divide by the magnitude of $s: a
    # negative divisor flips the sign of the mean but must not produce a
    # negative error.
    $result->{_error} = $self->{_error} / abs($s);

    $result->{_scale} = $self->{_scale};
    return $result;
}
# Multiplies a dataset by a scalar.
# The new Dataset has no data points.
sub multiplyByScalar {
    # Arguments: the Dataset and a plain number $s to multiply by.
    # Returns a new Dataset created with no data points; the scale is
    # copied from $self unchanged.
    my ($self, $s) = @_;
    my $result = Dataset->new();

    $result->{_mean} = $self->{_mean} * $s;

    # The error is a half-width, so multiply by the magnitude of $s: a
    # negative factor flips the sign of the mean but must not produce a
    # negative error.
    $result->{_error} = $self->{_error} * abs($s);

    $result->{_scale} = $self->{_scale};
    return $result;
}
1;
|