File: GLM.pm

package info (click to toggle)
libpdl-stats-perl 0.855-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 468 kB
sloc: perl: 1,459; makefile: 3
file content (2494 lines) | stat: -rw-r--r-- 70,276 bytes
#
# GENERATED WITH PDL::PP from lib/PDL/Stats/GLM.pd! Don't modify!
#
package PDL::Stats::GLM;

our @EXPORT_OK = qw(ols_t ols ols_rptd anova anova_rptd anova_design_matrix dummy_code effect_code effect_code_w interaction_code r2_change logistic pca pca_sorti plot_means plot_residuals plot_screes fill_m fill_rand dev_m stddz sse mse rmse pred_logistic d0 dm dvrs );
our %EXPORT_TAGS = (Func=>\@EXPORT_OK);

use PDL::Core;
use PDL::Exporter;
use DynaLoader;


   
   our @ISA = ( 'PDL::Exporter','DynaLoader' );
   push @PDL::Core::PP, __PACKAGE__;
   bootstrap PDL::Stats::GLM ;








#line 14 "lib/PDL/Stats/GLM.pd"

use strict;
use warnings;

use Carp;
use PDL::LiteF;
use PDL::MatrixOps;
use PDL::Stats::Basic;
use PDL::Stats::Kmeans;

eval { require PDL::Core; require PDL::GSL::CDF; };
my $CDF = 1 if !$@;

=encoding utf8

=head1 NAME

PDL::Stats::GLM -- general and generalized linear modelling methods such as ANOVA, linear regression, PCA, and logistic regression.

=head1 SYNOPSIS

    use PDL::LiteF;
    use PDL::Stats::GLM;

    # do a multiple linear regression and plot the residuals
    my $y = pdl( 8, 7, 7, 0, 2, 5, 0 );
    my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ],        # linear component
                 [ 0, 1, 4, 9, 16, 25, 36 ] );   # quadratic component
    my %m  = $y->ols( $x, {plot=>1} );
    print "$_\t$m{$_}\n" for sort keys %m;

=head1 DESCRIPTION

For more about general linear modelling, see
L<Wikipedia|https://en.wikipedia.org/wiki/General_linear_model>.
For an unbelievably thorough text on experimental design
and analysis, including linear modelling, see L<A First
Course in Design and Analysis of Experiments, Gary
W. Oehlert|http://users.stat.umn.edu/~gary/book/fcdae.pdf>.

The terms FUNCTIONS and METHODS are arbitrarily used to refer to
methods that are broadcastable and methods that are NOT broadcastable,
respectively. FUNCTIONS support bad values.

P-values, where appropriate, are provided if PDL::GSL::CDF is installed.

=cut
#line 75 "lib/PDL/Stats/GLM.pm"


=head1 FUNCTIONS

=cut






=head2 fill_m

=for sig

 Signature: (a(n); [o]b(n))
 Types: (float double)

=for ref

Replaces bad values with sample mean. Mean is set to 0 if all obs are
bad.

=for usage

     pdl> p $data
     [
      [  5 BAD   2 BAD]
      [  7   3   7 BAD]
     ]

     pdl> p $data->fill_m
     [
      [      5     3.5       2     3.5]
      [      7       3       7 5.66667]
     ]
  

=pod

Broadcasts over its inputs.

=for bad

The output pdl badflag is cleared.

=cut




*fill_m = \&PDL::fill_m;






=head2 fill_rand

=for sig

 Signature: (a(n); [o]b(n))
 Types: (sbyte byte short ushort long ulong indx ulonglong longlong
   float double ldouble)

=for ref

Replaces bad values with random sample (with replacement) of good
observations from the same variable.

=for usage

    pdl> p $data
    [
     [  5 BAD   2 BAD]
     [  7   3   7 BAD]
    ]

    pdl> p $data->fill_rand

    [
     [5 2 2 5]
     [7 3 7 7]
    ]

=pod

Broadcasts over its inputs.

=for bad

The output pdl badflag is cleared.

=cut




*fill_rand = \&PDL::fill_rand;






=head2 dev_m

=for sig

 Signature: (a(n); [o]b(n))
 Types: (float double)

=for usage

 $b = dev_m($a);
 dev_m($a, $b);      # all arguments given
 $b = $a->dev_m;     # method call
 $a->dev_m($b);
 $a->inplace->dev_m; # can be used inplace
 dev_m($a->inplace);

=for ref

Replaces values with deviations from the mean.

=pod

Broadcasts over its inputs.

=for bad

C<dev_m> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*dev_m = \&PDL::dev_m;






=head2 stddz

=for sig

 Signature: (a(n); [o]b(n))
 Types: (float double)

=for usage

 $b = stddz($a);
 stddz($a, $b);      # all arguments given
 $b = $a->stddz;     # method call
 $a->stddz($b);
 $a->inplace->stddz; # can be used inplace
 stddz($a->inplace);

=for ref

Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0).

=pod

Broadcasts over its inputs.

=for bad

C<stddz> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*stddz = \&PDL::stddz;






=head2 sse

=for sig

 Signature: (a(n); b(n); [o]c())
 Types: (float double)

=for usage

 $c = sse($a, $b);
 sse($a, $b, $c);  # all arguments given
 $c = $a->sse($b); # method call
 $a->sse($b, $c);

=for ref

Sum of squared errors between actual and predicted values.

=pod

Broadcasts over its inputs.

=for bad

C<sse> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*sse = \&PDL::sse;






=head2 mse

=for sig

 Signature: (a(n); b(n); [o]c())
 Types: (float double)

=for usage

 $c = mse($a, $b);
 mse($a, $b, $c);  # all arguments given
 $c = $a->mse($b); # method call
 $a->mse($b, $c);

=for ref

Mean of squared errors between actual and predicted values, ie variance around predicted value.

=pod

Broadcasts over its inputs.

=for bad

C<mse> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*mse = \&PDL::mse;






=head2 rmse

=for sig

 Signature: (a(n); b(n); [o]c())
 Types: (float double)

=for usage

 $c = rmse($a, $b);
 rmse($a, $b, $c);  # all arguments given
 $c = $a->rmse($b); # method call
 $a->rmse($b, $c);

=for ref

Root mean squared error, ie stdv around predicted value.

=pod

Broadcasts over its inputs.

=for bad

C<rmse> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*rmse = \&PDL::rmse;






=head2 pred_logistic

=for sig

 Signature: (a(n,m); b(m); [o]c(n))
 Types: (float double)

=for ref

Calculates predicted prob value for logistic regression.

=for usage

    # glue constant then apply coeff returned by the logistic method
    $pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} );

=pod

Broadcasts over its inputs.

=for bad

C<pred_logistic> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*pred_logistic = \&PDL::pred_logistic;






=head2 d0

=for sig

 Signature: (a(n); [o]c())
 Types: (float double)

=for usage

 $c = d0($a);
 d0($a, $c);  # all arguments given
 $c = $a->d0; # method call
 $a->d0($c);

=for ref

Null deviance for logistic regression.

=pod

Broadcasts over its inputs.

=for bad

C<d0> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*d0 = \&PDL::d0;






=head2 dm

=for sig

 Signature: (a(n); b(n); [o]c())
 Types: (float double)

=for ref

Model deviance for logistic regression.

=for usage

    my $dm = $y->dm( $y_pred );
      # null deviance
    my $d0 = $y->dm( ones($y->nelem) * $y->avg );

=pod

Broadcasts over its inputs.

=for bad

C<dm> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*dm = \&PDL::dm;






=head2 dvrs

=for sig

 Signature: (a(); b(); [o]c())
 Types: (float double)

=for usage

 $c = dvrs($a, $b);
 dvrs($a, $b, $c);  # all arguments given
 $c = $a->dvrs($b); # method call
 $a->dvrs($b, $c);

=for ref

Deviance residual for logistic regression.

=pod

Broadcasts over its inputs.

=for bad

C<dvrs> processes bad values.
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.

=cut




*dvrs = \&PDL::dvrs;





#line 359 "lib/PDL/Stats/GLM.pd"

#line 360 "lib/PDL/Stats/GLM.pd"

=head2 ols_t

=for ref

Broadcasted version of ordinary least squares regression (B<ols>). The
price of broadcasting was losing significance tests for coefficients
(but see B<r2_change>). The fitting function was shamelessly copied then
modified from PDL::Fit::Linfit. Intercept is FIRST of coeff if CONST => 1.

ols_t does not handle bad values. consider B<fill_m> or B<fill_rand>
if there are bad values.

=for options

Default options (case insensitive):

    CONST   => 1,

=for usage

Usage:

    # DV, 2 person's ratings for top-10 box office movies
    # ascending sorted by box office numbers

    pdl> p $y = pdl '1 1 2 4 4 4 4 5 5 5; 1 2 2 2 3 3 3 3 5 5'

    # model with 2 IVs, a linear and a quadratic trend component

    pdl> $x = cat sequence(10), sequence(10)**2

    # suppose our novice modeler thinks this creates 3 different models
    # for predicting movie ratings

    pdl> p $x = cat $x, $x * 2, $x * 3
    [
     [
      [ 0  1  2  3  4  5  6  7  8  9]
      [ 0  1  4  9 16 25 36 49 64 81]
     ]
     [
      [  0   2   4   6   8  10  12  14  16  18]
      [  0   2   8  18  32  50  72  98 128 162]
     ]
     [
      [  0   3   6   9  12  15  18  21  24  27]
      [  0   3  12  27  48  75 108 147 192 243]
     ]
    ]

    pdl> p $x->info
    PDL: Double D [10,2,3]

    # insert a dummy dim between IV and the dim (model) to be broadcasted

    pdl> %m = $y->ols_t( $x->dummy(2) )

    pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m

    # 2 persons' ratings, each fitted with 3 "different" models
    F   [
     [ 38.314159  25.087209]
     [ 38.314159  25.087209]
     [ 38.314159  25.087209]
    ]

    # df is the same across dv and iv models
    F_df    [2 7]
    F_p [
     [0.00016967051 0.00064215074]
     [0.00016967051 0.00064215074]
     [0.00016967051 0.00064215074]
    ]

    R2  [
     [ 0.9162963 0.87756762]
     [ 0.9162963 0.87756762]
     [ 0.9162963 0.87756762]
    ]

    b   [ # constant     linear      quadratic
     [
      [ 0.66363636    0.99015152   -0.056818182] # person 1
      [        1.4    0.18939394    0.022727273] # person 2
     ]
     [
      [ 0.66363636    0.49507576   -0.028409091]
      [        1.4    0.09469697    0.011363636]
     ]
     [
      [ 0.66363636    0.33005051   -0.018939394]
      [        1.4   0.063131313   0.0075757576]
     ]
    ]

    # our novice modeler realizes at this point that
    # the 3 models only differ in the scaling of the IV coefficients
    ss_model    [
     [ 20.616667  13.075758]
     [ 20.616667  13.075758]
     [ 20.616667  13.075758]
    ]

    ss_residual [
     [ 1.8833333  1.8242424]
     [ 1.8833333  1.8242424]
     [ 1.8833333  1.8242424]
    ]

    ss_total    [22.5 14.9]
    y_pred      [
     [
      [0.66363636  1.5969697  2.4166667  3.1227273  ...  4.9727273]
    ...

=cut

*ols_t = \&PDL::ols_t;
sub PDL::ols_t {
  _ols_common(1, @_);
}

sub _ols_common {
  my ($broadcasted, $y, $ivs, $opt) = @_;
  ($y, $ivs) = _ols_prep_inputs(@_);
  _ols_main($broadcasted, $y, $ivs, $opt);
}

sub _ols_prep_inputs {
    # y [n], ivs [n x attr] pdl
  my ($broadcasted, $y, $ivs, $opt) = @_;
  my %opt = (
    CONST => 1,
    PLOT  => 0,
    WIN    => undef,      # for plotting
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  if (!$broadcasted) {
    $y = $y->squeeze;
    $y->getndims > 1 and
      croak "use ols_t for broadcasted version";
  }
  $ivs = $ivs->dummy(1) if $ivs->getndims == 1;
  ($y, $ivs) = _rm_bad_value( $y, $ivs ) if !$broadcasted;
    # set up ivs and const as ivs
  $opt{CONST} and
    $ivs = ones($ivs->dim(0))->glue( 1, $ivs );
  ($y, $ivs);
}

sub _ols_main {
  my ($broadcasted, $y, $ivs, $opt) = @_;
  my %opt = (
    CONST => 1,
    PLOT  => 0,
    WIN    => undef,      # for plotting
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  my $C = inv( $ivs x $ivs->t );
  # Internally normalise data
  # (double) it or ushort y and sequence iv won't work right
  my $ymean = $y->abs->avgover->double;
  $ymean->where( $ymean==0 ) .= 1;
  my $divisor = $broadcasted ? $ymean->dummy(0) : $ymean;
  my $y2 = $y / $divisor;
  my $Y = $ivs x $y2->dummy(0);
  # Do the fit
    # Fitted coefficients vector
  my $coeff = PDL::squeeze( $C x $Y );
  $coeff = $coeff->dummy(0)
    if $broadcasted and $coeff->getndims == 1 and $y->getndims > 1;
  $coeff *= $divisor; # Un-normalise
    # ***$coeff x $ivs looks nice but produces nan on successive tries***
  my $y_pred = sumover( ($broadcasted ? $coeff->dummy(1) : $coeff) * $ivs->transpose );
  $opt{PLOT} and $y->plot_residuals( $y_pred, \%opt );
  return $coeff unless wantarray;
  my %ret = (y_pred => $y_pred);
  $ret{ss_total} = $opt{CONST} ? $y->ss : sumover( $y ** 2 );
  $ret{ss_residual} = $y->sse( $ret{y_pred} );
  $ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
  $ret{R2} = $ret{ss_model} / $ret{ss_total};
  my $n_var = $opt{CONST} ? $ivs->dim(1) - 1 : $ivs->dim(1);
  $ret{F_df} = pdl( $n_var, my $df1 = $y->dim(0) - $ivs->dim(1) );
  $ret{F} = $ret{ss_model} / $n_var
    / ($ret{ss_residual} / $df1);
  $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $n_var, $df1 )
    if $CDF;
  if (!$broadcasted) {
    my $se_b = ones( $coeff->dims? $coeff->dims : 1 );
    $opt{CONST} and
      $se_b->slice(0) .= sqrt( $ret{ss_residual} / $df1 * $C->slice(0,0) );
      # get the se for bs by successively regressing each iv by the rest ivs
    if ($ivs->dim(1) > 1) {
      my @coords = $opt{CONST} ? 1..$n_var : 0..$n_var-1;
      my $ones = !$opt{CONST} ? undef : ones($ivs->dim(0));
      for my $k (@coords) {
        my $G = $ivs->dice_axis(1, [grep $_ != $k, @coords]);
        $G = $ones->glue( 1, $G ) if $opt{CONST};
        my $b_G = $ivs->slice(':',$k)->ols( $G, {CONST=>0,PLOT=>0} );
        my $ss_res_k = $ivs->slice(':',$k)->squeeze->sse( sumover($b_G * $G->transpose) );
        $se_b->slice($k) .= sqrt( $ret{ss_residual} / $df1 / $ss_res_k );
      }
    }
    else {
      $se_b->slice(0)
        .= sqrt( $ret{ss_residual} / $df1 / sum( $ivs->slice(':',0)**2 ) );
    }
    $ret{b_se} = $se_b;
    $ret{b_t} = $coeff / $ret{b_se};
    $ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $df1 ) )
      if $CDF;
  }
  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  $ret{b} = $coeff;
  return %ret;
}

=head2 r2_change

=for ref

Significance test for the incremental change in R2 when new variable(s)
are added to an ols regression model.

Returns the change stats as well as stats for both models. Based on
L</ols_t>. (One way to make up for the lack of significance tests for
coeffs in ols_t).

=for options

Default options (case insensitive):

    CONST   => 1,

=for usage

Usage:

    # suppose these are two persons' ratings for top 10 box office movies
    # ascending sorted by box office

    pdl> p $y = qsort ceil(random(10, 2) * 5)
    [
     [1 1 2 2 2 3 4 4 4 4]
     [1 2 2 3 3 3 4 4 5 5]
    ]

    # first IV is a simple linear trend

    pdl> p $x1 = sequence 10
    [0 1 2 3 4 5 6 7 8 9]

    # the modeler wonders if adding a quadratic trend improves the fit

    pdl> p $x2 = sequence(10) ** 2
    [0 1 4 9 16 25 36 49 64 81]

    # two difference models are given in two pdls
    # each as would be pass on to ols_t
    # the 1st model includes only linear trend
    # the 2nd model includes linear and quadratic trends
    # when necessary use dummy dim so both models have the same ndims

    pdl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) )

    pdl> p "$_\t$c{$_}\n" for sort keys %c
      #              person 1   person 2
    F_change        [0.72164948 0.071283096]
      # df same for both persons
    F_df    [1 7]
    F_p     [0.42370145 0.79717232]
    R2_change       [0.0085966043 0.00048562549]
    model0  HASH(0x8c10828)
    model1  HASH(0x8c135c8)

    # the answer here is no.

=cut

*r2_change = \&PDL::r2_change;
sub PDL::r2_change {
  my ($self, $ivs0, $ivs1, $opt) = @_;
  $ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1);

  my %ret;

  $ret{model0} = { $self->ols_t( $ivs0, $opt ) };
  $ret{model1} = { $self->ols_t( $ivs1, $opt ) };

  $ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2};
  $ret{F_df}
    = pdl(my $df0 = $ivs1->dim(1) - $ivs0->dim(1),
          my $df1 = $ret{model1}->{F_df}->slice('(1)') );
  $ret{F_change}
    = $ret{R2_change} * $df1
    / ( (1-$ret{model1}->{R2}) * $df0 );
  $ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $df0, $df1 )
    if $CDF;

  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };

  %ret;
}

=head1 METHODS

=head2 anova

=for ref

Analysis of variance. Uses type III sum of squares for unbalanced data.

Dependent variable should be a 1D pdl. Independent variables can be
passed as 1D perl array ref or 1D pdl.

Will only calculate p-value (C<F_p>) if there are more samples than the
product of categories of all the IVs.

Supports bad value (by ignoring missing or BAD values in dependent and
independent variables list-wise).

For more on ANOVA, see L<https://en.wikipedia.org/wiki/Analysis_of_variance>.

=for options

Default options (case insensitive):

    V      => 1,          # carps if bad value in variables
    IVNM   => [],         # auto filled as ['IV_0', 'IV_1', ... ]
    PLOT   => 0,          # plots highest order effect
                          # can set plot_means options here
    WIN    => undef,      # for plotting

=for usage

Usage:

    # suppose this is ratings for 12 apples

    pdl> p $y = qsort ceil( random(12)*5 )
    [1 1 2 2 2 3 3 4 4 4 5 5]

    # IV for types of apple

    pdl> p $a = sequence(12) % 3 + 1
    [1 2 3 1 2 3 1 2 3 1 2 3]

    # IV for whether we baked the apple

    pdl> @b = qw( y y y y y y n n n n n n )

    pdl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } )

    pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m
    F	2.46666666666667
    F_df	[5 6]
    F_p	0.151168719948632
    ms_model	3.08333333333333
    ms_residual	1.25
    ss_model	15.4166666666667
    ss_residual	7.5
    ss_total	22.9166666666667
    | apple | F	0.466666666666667
    | apple | F_p	0.648078345471096
    | apple | df	2
    | apple | m	[2.75    3  3.5]
    | apple | ms	0.583333333333334
    | apple | se	[0.85391256 0.81649658 0.64549722]
    | apple | ss	1.16666666666667
    | apple || err df	6
    | apple ~ bake | F	0.0666666666666671
    | apple ~ bake | F_p	0.936190104380701
    | apple ~ bake | df	2
    | apple ~ bake | m	[
     [1.5   2 2.5]
     [  4   4 4.5]
    ]
    | apple ~ bake | ms	0.0833333333333339
    | apple ~ bake | se	[
     [0.5   1 0.5]
     [  1   1 0.5]
    ]
    | apple ~ bake | ss	0.166666666666668
    | apple ~ bake || err df	6
    | bake | F	11.2666666666667
    | bake | F_p	0.015294126084452
    | bake | df	1
    | bake | m	[2 4.1666667]
    | bake | ms	14.0833333333333
    | bake | se	[0.36514837 0.40138649]
    | bake | ss	14.0833333333333
    | bake || err df	6

This is implemented as a call to L</anova_rptd>, with an C<undef>
subjects vector.

=cut

*anova = \&PDL::anova;
sub PDL::anova {
  my ($y, @args) = @_;
  anova_rptd($y, undef, @args);
}

sub _interactions {
  my ($ivs_ref, $idv) = @_;
  my (@inter, @idv_inter);
  for my $nway ( 2 .. @$ivs_ref ) {
    my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] );
    while ( my @v = &$iter_idv() ) {
      push @inter, interaction_code(@$ivs_ref[@v]);
      push @idv_inter, join ' ~ ', @$idv[@v];
    }
  }
  (\@inter, \@idv_inter);
}

# now prepare for cell mean
sub _interactions_cm {
  my ($ivs_ref, $pdl_ivs_raw) = @_;
  my ($dim0, @inter_cm, @inter_cmo) = $ivs_ref->[0]->dim(0);
  for my $nway ( 2 .. @$ivs_ref ) {
    my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] );
    while ( my @v = &$iter_idv() ) {
      my @i_cm;
      for my $o ( 0 .. $dim0 - 1 ) {
        push @i_cm, join '', map $_->slice("($o)"), @$pdl_ivs_raw[@v];
      }
      my ($inter, $map) = effect_code( \@i_cm );
      push @inter_cm, $inter;
        # get the order to put means in correct multi dim pdl pos
        # this is order in var_e dim(1)
      my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map;
        # this is order needed for cell mean
      my @i_cmo  = sort { reverse($levels[$a]) cmp reverse($levels[$b]) }
                        0 .. $#levels;
      push @inter_cmo, pdl @i_cmo;
    }
  }
  (\@inter_cmo, \@inter_cm);
}

sub _cell_means {
  my ($data, $ivs_cm_ref, $i_cmo_ref, $idv, $pdl_ivs_raw) = @_;
  my %ind_id;
  @ind_id{ @$idv } = 0..$#$idv;
  my %cm;
  my $i = 0;
  for (@$ivs_cm_ref) {
    confess "_cell_means passed empty ivs_cm_ref ndarray at pos $i" if $_->isempty;
    my $last = zeroes $_->dim(0);
    my $i_neg = which $_->slice(':',0) == -1;
    $last->slice($i_neg) .= 1;
    $_->where($_ == -1) .= 0;
    $_ = $_->glue(1, $last);
    my @v = split ' ~ ', $idv->[$i];
    my @shape = map $pdl_ivs_raw->[$_]->uniq->nelem, @ind_id{@v};
    my ($m, $ss) = $data->centroid( $_ );
    $m  = $m->slice($i_cmo_ref->[$i])->sever;
    $ss = $ss->slice($i_cmo_ref->[$i])->sever;
    $m = $m->reshape(@shape);
    my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape);
    $cm{ "| $idv->[$i] | m" }  = $m;
    $cm{ "| $idv->[$i] | se" } = $se;
    $i++;
  }
  \%cm;
}

  # http://www.perlmonks.org/?node_id=371228
sub _combinations {
  my ($num, $arr) = @_;
  return sub { return }
    if $num == 0 or $num > @$arr;
  my @pick;
  return sub {
    return @$arr[ @pick = ( 0 .. $num - 1 ) ]
      unless @pick;
    my $i = $#pick;
    $i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i;
    return if $i < 0;
    @pick[$i .. $#pick] = $pick[$i] .. $#$arr;
    return @$arr[@pick];
  };
}

=head2 anova_rptd

=for ref

Repeated measures and mixed model anova. Uses type III sum of squares.

The standard error (se) for the means are based on the relevant mean
squared error from the anova, ie it is pooled across levels of the effect.
Will only calculate p-value (C<F_p>) if there are more samples than the
product of categories of all the IVs.

Uses L</anova_design_matrix>, so supports bad values.

For more on repeated measures ANOVA, see
L<https://en.wikipedia.org/wiki/Repeated_measures_design>,
and for mixed models, see
L<https://en.wikipedia.org/wiki/Mixed-design_analysis_of_variance>.

=for options

Default options (case insensitive):

    V      => 1,    # carps if bad value in dv
    IVNM   => [],   # auto filled as ['IV_0', 'IV_1', ... ]
    BTWN   => [],   # indices of between-subject IVs (matches IVNM indices)
    PLOT   => 0,    # plots highest order effect
                    # see plot_means() for more options
    WIN    => undef,      # for plotting

=for usage

Usage:

    Some fictional data: recall_w_beer_and_wings.txt

    Subject Beer    Wings   Recall
    Alex    1       1       8
    Alex    1       2       9
    Alex    1       3       12
    Alex    2       1       7
    Alex    2       2       9
    Alex    2       3       12
    Brian   1       1       12
    Brian   1       2       13
    Brian   1       3       14
    Brian   2       1       9
    Brian   2       2       8
    Brian   2       3       14
    ...

      # rtable allows text only in 1st row and col
    my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt';

    my ($b, $w, $dv) = $data->dog;
      # subj and IVs can be 1d pdl or @ ref
      # subj must be the first argument
    my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} );

    print "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m

    ss_residual	19.0833333333333
    ss_subject	24.8333333333333
    ss_total	133.833333333333
    | Beer | F	9.39130434782609
    | Beer | F_p	0.0547977008378944
    | Beer | df	1
    | Beer | m [10.916667  8.9166667]
    | Beer | ms	24
    | Beer | se [0.4614791  0.4614791]
    | Beer | ss	24
    | Beer || err df	3
    | Beer || err ms	2.55555555555556
    | Beer || err ss	7.66666666666667
    | Beer ~ Wings | F	0.510917030567687
    | Beer ~ Wings | F_p	0.623881438624431
    | Beer ~ Wings | df	2
    | Beer ~ Wings | m [
     [   10     7]
     [ 10.5  9.25]
     [12.25  10.5]
    ]
    | Beer ~ Wings | ms	1.625
    | Beer ~ Wings | se [
     [0.89170561 0.89170561]
     [0.89170561 0.89170561]
     [0.89170561 0.89170561]
    ]
    | Beer ~ Wings | ss	3.25000000000001
    | Beer ~ Wings || err df	6
    | Beer ~ Wings || err ms	3.18055555555555
    | Beer ~ Wings || err ss	19.0833333333333
    | Wings | F	4.52851711026616
    | Wings | F_p	0.0632754786153548
    | Wings | df	2
    | Wings | m [8.5 9.875 11.375]
    | Wings | ms	16.5416666666667
    | Wings | se [0.67571978 0.67571978 0.67571978]
    | Wings | ss	33.0833333333333
    | Wings || err df	6
    | Wings || err ms	3.65277777777778
    | Wings || err ss	21.9166666666667

For mixed model anova, ie when there are between-subject IVs
involved, feed the IVs as above, but specify in BTWN which IVs are
between-subject. For example, if we had added age as a between-subject
IV in the above example, we would do

    my %m = $dv->anova_rptd( $subj, $age, $b, $w,
                           { ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] });

=cut

*anova_rptd = \&PDL::anova_rptd;
sub PDL::anova_rptd {
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, $subj, @ivs_raw) = @_;
  confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
  croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?"
    if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem);
  for (@ivs_raw) {
    croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1
  }
  my %opt = (
    V      => 1,    # carps if bad value in dv
    IVNM   => [],   # auto filled as ['IV_0', 'IV_1', ... ]
    ( !defined($subj) ? () : (
    BTWN   => [],   # indices of between-subject IVs (matches IVNM indices)
    )),
    PLOT   => 0,    # plots highest order effect
    WIN    => undef,      # for plotting
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
    if !$opt{IVNM} or !@{ $opt{IVNM} };
  (my ($dsgn, $idv), $y, my ($sj, $ivs_ref_filtrd, $pdl_ivs_raw, $ivs_ref, $err_ref)) = $y->anova_design_matrix($subj, @ivs_raw, \%opt);
  confess "anova_rptd: got singular matrix for X' x X"
    if any +($dsgn->t x $dsgn)->det == 0;
  my $b_full = _ols_main(1, $y, $dsgn->t, {CONST=>0});

  my %ret = (ss_total => $y->ss,
    ss_residual => $y->sse( sumover( $b_full * $dsgn ) ));

  if (defined $subj) {
    my @full = (@$ivs_ref, @$err_ref);
    my $has_btwn = @{ $opt{BTWN} };
    my @is_btwn; $is_btwn[$_] = 1 for @{ $opt{BTWN} };
    my @within_inds = 0 .. $#ivs_raw;
    @within_inds = grep !$is_btwn[$_], @within_inds if $has_btwn;
    my $within_df = pdl(map $_->dim(1), @full[@within_inds])->prodover->sclr;
    EFFECT: for my $k (0 .. $#full) {
      my $e = ($k > $#$ivs_ref)?  '| err' : '';
      my $i = ($k > $#$ivs_ref)?  $k - @$ivs_ref : $k;
      my $i_pref = $k == $#full ? undef : "| $idv->[$i] |";

      if (!defined $full[$k]) {     # ss_residual as error
        $ret{ "$i_pref$e ss" } = $ret{ss_residual};
          # highest ord inter for purely within design, (p-1)*(q-1)*(n-1)
        my $factor = (ref $full[-1] ? $full[-1] : $err_ref->[$full[-1]])->dim(1);
        my $df = $ret{ "$i_pref$e df" } = $factor * $within_df;
        die "${i_pref}residual df = 0" if $df <= 0;
        $ret{ "$i_pref$e ms" } = $ret{ "$i_pref$e ss" } / $df;
      } elsif (ref $full[$k]) {       # unique error term
        next EFFECT
          unless my @G = grep $_ != $k && defined $full[$_], 0 .. $#full;
        my $G = ones($y->dim(0))->glue(1, grep ref $_, @full[@G]);
        my $b_G = $y->ols_t( $G, {CONST=>0} );
        my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
          = $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual};
        if ($k != $#full) {
          my $df = $ret{"$i_pref$e df"} = $full[$k]->dim(1);
          die "residual df = 0" if $df <= 0;
          $ret{"$i_pref$e ms"} = $ss / $df;
        }
      } else {                        # repeating error term
        my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
          = $ret{"| $idv->[$full[$k]] |$e ss"};
        if ($k != $#full) {
          my $df = $ret{"$i_pref$e df"} = $ret{"| $idv->[$full[$k]] |$e df"};
          die "residual df = 0" if $df <= 0;
          $ret{"$i_pref$e ms"} = $ss / $df;
        }
      }
    }
  } else {
    $ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
    $ret{F_df} = pdl(my $F_df0 = $dsgn->dim(0) - 1, my $df1 = $y->nelem - $dsgn->dim(0));
    $ret{ms_model} = $ret{ss_model} / $F_df0;
    $ret{ms_residual} = $ret{ss_residual} / $df1;
    $ret{F} = $ret{ms_model} / $ret{ms_residual};
    $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $F_df0, $df1 )
      if $CDF and $df1 > 0;

    # get IV ss from $ivs_ref instead of $dsgn pdl
    my $ones = ones($y->dim(0));
    for my $k (0 .. $#$ivs_ref) {
      my $G = $ones->glue(1, @$ivs_ref[grep $_ != $k, 0 .. $#$ivs_ref]);
      my $b_G = $y->ols_t( $G, {CONST=>0} );
      $ret{ "| $idv->[$k] | ss" }
        = $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual};
      my $df0 = $ret{ "| $idv->[$k] | df" } = $ivs_ref->[$k]->dim(1);
      $ret{ "| $idv->[$k] || err df" } = $df1;
      die "residual df = 0" if $df1 <= 0;
      $ret{ "| $idv->[$k] | ms" } = $ret{ "| $idv->[$k] | ss" } / $df0;
    }
  }
    # have all iv, inter, and error effects. get F and F_p
  for (0 .. $#$ivs_ref) {
    my $ms_residual = defined $subj ? $ret{ "| $idv->[$_] || err ms" } : $ret{ms_residual};
    my ($df0, $df1) = @ret{"| $idv->[$_] | df" , "| $idv->[$_] || err df"};
    my $F = $ret{ "| $idv->[$_] | F" } = $ret{ "| $idv->[$_] | ms" } / $ms_residual;
    $ret{ "| $idv->[$_] | F_p" } = 1 - $F->gsl_cdf_fdist_P($df0, $df1)
      if $CDF and $df1 > 0;
  }

  for (keys %ret) {ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze};

  my ($inter_cmo_ref, $inter_cm_ref)
    = _interactions_cm($ivs_ref_filtrd, $pdl_ivs_raw);
  # append inter info to cell means effects
  my $ivs_cm_ref = [@$ivs_ref_filtrd, @$inter_cm_ref];
  my @i_cmo_ref = map pdl(values %{ (effect_code($_))[1] })->qsort, @$pdl_ivs_raw;
#line 1068 "lib/PDL/Stats/GLM.pd"
  push @i_cmo_ref, @$inter_cmo_ref;

  my $cm_ref
    = _cell_means( $y, $ivs_cm_ref, \@i_cmo_ref, $idv, $pdl_ivs_raw );
  if (defined $subj) {
    my @ls = map { $_->uniq->nelem } @$pdl_ivs_raw;
    $cm_ref
      = _fix_rptd_se( $cm_ref, \%ret, $opt{IVNM}, \@ls, $sj->uniq->nelem );
  }
    # integrate mean and se into %ret
  @ret{ keys %$cm_ref } = values %$cm_ref;

  my $highest = join(' ~ ', @{ $opt{IVNM} });
  $cm_ref->{"| $highest | m"}->plot_means( $cm_ref->{"| $highest | se"},
                                           { %opt, IVNM=>$idv } )
    if $opt{PLOT};

  %ret;
}

=head2 anova_design_matrix

=for ref

Effect-coded design matrix for anova, including repeated-measures and
mixed-model. The C<X> for use in linear regression i.e. C<Y = X β + ε>.

Added in 0.854. See L<https://en.wikipedia.org/wiki/Design_matrix> for more.

Supports bad value in the dependent and independent variables. It
automatically removes bad data listwise, i.e. remove a subject's data if
there is any cell missing for the subject.

=for options

Default options (case insensitive):

    V      => 1,    # carps if bad value in dv
    IVNM   => [],   # auto filled as ['IV_0', 'IV_1', ... ]
    BTWN   => [],   # indices of between-subject IVs (matches IVNM indices)

=for usage

  $matrix = $dv->anova_design_matrix(undef, $b, $w, {ivnm=>[qw(b w)]});
  $matrix = $dv->anova_design_matrix(
    $subj, $b, $w, {ivnm=>[qw(b w)]}); # repeated-measures
  $matrix = $dv->anova_design_matrix(
    $subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # mixed-model
  ($matrix, $ivnm_ref) = $dv->anova_design_matrix(
    $subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # list context also names

=cut

*anova_design_matrix = \&PDL::anova_design_matrix;
sub PDL::anova_design_matrix {
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, $subj, @ivs_raw) = @_;
  confess "No IVs: did you omit 'undef' for anova?" if !@ivs_raw;
  confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
  for (@ivs_raw) {
    croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1;
  }
  my %opt = (
    V      => 1,    # carps if bad value in dv
    IVNM   => [],   # auto filled as ['IV_0', 'IV_1', ... ]
    ( !defined($subj) ? () : (
    BTWN   => [],   # indices of between-subject IVs (matches IVNM indices)
    )),
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
    if !$opt{IVNM} or !@{ $opt{IVNM} };
  my @idv_orig = @{ $opt{IVNM} };
  my @pdl_ivs_raw = map scalar PDL::Stats::Basic::code_ivs($_), @ivs_raw;
  my $pdl_ivs = pdl(\@pdl_ivs_raw);
    # explicit set badflag because pdl() removes badflag
  $pdl_ivs->badflag( scalar grep $_->badflag, @pdl_ivs_raw );
  my $sj;
  if (defined($subj)) {
    # delete bad data listwise ie remove subj if any cell missing
    $sj = PDL::Stats::Basic::code_ivs($subj);
    my $ibad = which( $y->isbad | nbadover($pdl_ivs->transpose) );
    my $sj_bad = $sj->slice($ibad)->uniq;
    if ($sj_bad->nelem) {
      warn $sj_bad->nelem . " subjects with missing data removed\n"
        if $opt{V};
      $sj = $sj->setvaltobad($_)
        for (list $sj_bad);
      my $igood = which $sj->isgood;
      for ($y, $sj, @pdl_ivs_raw) {
        $_ = $_->slice( $igood )->sever;
        $_->badflag(0);
      }
    }
  } else {
    ($y, $pdl_ivs) = _rm_bad_value( $y, $pdl_ivs );
    if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) {
      warn sprintf "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem;
    }
    @pdl_ivs_raw = $pdl_ivs->dog;
  }
  my @ivs_ref_fltrd = map scalar effect_code($_), @pdl_ivs_raw;
  my ($ivs_inter_ref, $idv_inter) = _interactions(\@ivs_ref_fltrd, \@idv_orig);
  # append inter info to main effects
  my $ivs_ref = [@ivs_ref_fltrd, @$ivs_inter_ref];
  my @idv = (@idv_orig, @$idv_inter);
    # matches $ivs_ref, with an extra last pdl for subj effect
  my $err_ref = !defined($subj) ? [] :
    _add_errors( $sj, $ivs_ref, \@idv, \@pdl_ivs_raw, $opt{BTWN} );
  for (grep ref $err_ref->[$_], 0..$#$err_ref) {
    my ($null_row_ids, $non_null_row_ids) = $err_ref->[$_]->zcover->which_both;
    confess "got null columns $null_row_ids in error entry #$_ ($idv[$_])"
      if !$null_row_ids->isempty;
  }
  my $dsgn = PDL::glue(1, ones($y->dim(0)), @$ivs_ref, (grep ref($_), @$err_ref))->t;
  !wantarray ? $dsgn : ($dsgn, \@idv, $y, $sj, \@ivs_ref_fltrd, \@pdl_ivs_raw, $ivs_ref, $err_ref);
}

# code (btwn group) subjects. Rutherford (2011) pp 208-209
sub _code_btwn {
  my ($subj, $btwn) = @_;
  my (@grp, %grp_s);
  for my $n (0 .. $subj->nelem - 1) {
      # construct string to code group membership
      # something not treated as BAD by code_ivs to start off marking group membership
      # if no $btwn, everyone ends up in the same grp
    my $s = '_' . join '', map $_->slice($n), @$btwn;
    push @grp, $s;                 # group membership
    $s .= $subj->slice($n);        # keep track of total uniq subj
    $grp_s{$s} = 1;
  }
  my $grp = PDL::Stats::Kmeans::iv_cluster \@grp;
  my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1);
  my $d1 = 0;
  for my $g (0 .. $grp->dim(1)-1) {
    my $col_inds = which $grp->slice(':',$g);
    my $gsub = $subj->slice( $col_inds )->effect_code;
    my ($nobs, $nsub) = $gsub->dims;
    $spdl->slice($col_inds, [$d1,$d1+$nsub-1]) .= $gsub;
    $d1 += $nsub;
  }
  $spdl;
}

sub _add_errors {
  my ($subj, $ivs_ref, $idv, $raw_ivs, $btwn) = @_;
  my $spdl = _code_btwn($subj, [@$raw_ivs[@$btwn]]);
  # if btwn factor involved, or highest order inter for within factors
  # elem is undef, so that
  # @errors ind matches @$ivs_ref, with an extra elem at the end for subj
    # mark btwn factors for error terms
    # same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p205)
  my %is_btwn = map +($_=>1), @$idv[ @$btwn ];
  my $has_btwn = keys %is_btwn;
  my %idv2indx = map +($idv->[$_]=>$_), 0..$#$idv;
  my $ie_subj;
  my @errors = map {
    my @fs = split ' ~ ', $idv->[$_];
      # separate bw and wn factors
      # if only bw, error is bw x subj
      # if only wn or wn and bw, error is wn x subj
    my @bw = !$has_btwn ? ()  : grep  $is_btwn{$_}, @fs;
    my @wn = !$has_btwn ? @fs : grep !$is_btwn{$_}, @fs;
    $ie_subj = $_ if !defined($ie_subj) and !@wn;
    my $err = join ' ~ ', @wn ? @wn : @bw;
      # highest order inter of within factors, use ss_residual as error
    if ( @wn == @$raw_ivs - @$btwn )                            { undef }
      # repeating btwn factors use ss_subject as error
    elsif (!@wn and $_ > $ie_subj)                           { $ie_subj }
      # repeating error term
    elsif ($_ > $idv2indx{$err})                      { $idv2indx{$err} }
    elsif (@wn)               { interaction_code($ivs_ref->[$_], $spdl) }
    else                                                        { $spdl }
  } 0 .. $#$ivs_ref;
  push @errors, $has_btwn ? $ie_subj : $spdl;
  \@errors;
}

sub _fix_rptd_se {
    # if ivnm lvls_ref for within ss only this can work for mixed design
  my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_;
  my @se = map /^\| (.+?) \| se$/ ? $1 : (), keys %$cm_ref;
  my @n_obs = map {
    my @ivs = split / ~ /, $_;
    my $i_ivs = which_id $ivnm, \@ivs;
    my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs;
    my $collapsed = $icollapsed->nelem?
                      pdl( @$lvls_ref[(list $icollapsed)] )->prodover
                  :   1
                  ;
    $n * $collapsed;
  } @se;
  for my $i (0 .. $#se) {
    $cm_ref->{"| $se[$i] | se"}
      .= sqrt( $ret->{"| $se[$i] || err ms"} / $n_obs[$i] );
  }
  $cm_ref;
}

=head2 dummy_code

=for ref

Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in regression.

Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).

=for usage

    pdl> @a = qw(a a a b b b c c c)
    pdl> p $a = dummy_code(\@a)
    [
     [1 1 1 0 0 0 0 0 0]
     [0 0 0 1 1 1 0 0 0]
    ]

=cut

*dummy_code = \&PDL::dummy_code;
sub PDL::dummy_code {
  my ($var_ref) = @_;
  my $var_e = effect_code( $var_ref );
  $var_e->where( $var_e == -1 ) .= 0;
  $var_e;
}

=head2 effect_code

=for ref

Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for
use in regression. returns in @ context coded pdl and % ref to level -
pdl->dim(1) index.

Supports BAD value (missing or 'BAD' values result in the corresponding
pdl elements being marked as BAD).

=for usage

    my @var = qw( a a a b b b c c c );
    my ($var_e, $map) = effect_code( \@var );

    print $var_e . $var_e->info . "\n";

    [
     [ 1  1  1  0  0  0 -1 -1 -1]
     [ 0  0  0  1  1  1 -1 -1 -1]
    ]
    PDL: Double D [9,2]

    print "$_\t$map->{$_}\n" for sort keys %$map
    a       0
    b       1
    c       2

=cut

*effect_code = \&PDL::effect_code;
sub PDL::effect_code {
  my ($var_ref) = @_;
  my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
  my $var_max = $var->max;
  confess "effect_code called with only one unique value" if $var_max < 1;
  my $var_e = yvals( float, $var->nelem, $var_max ) == $var;
  $var_e->slice(which( $var == $var_max ), ) .= -1;
  $var_e = $var_e->setbadif( $var->isbad ) if $var->badflag;
  wantarray ? ($var_e, $map_ref) : $var_e;
}

=head2 effect_code_w

=for ref

Weighted effect code for nominal variable. returns in @ context coded pdl and % ref to level - pdl->dim(1) index.

Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).

=for usage

    pdl> @a = qw( a a b b b c c )
    pdl> p $a = effect_code_w(\@a)
    [
     [   1    1    0    0    0   -1   -1]
     [   0    0    1    1    1 -1.5 -1.5]
    ]

=cut

*effect_code_w = \&PDL::effect_code_w;
sub PDL::effect_code_w {
  my ($var_ref) = @_;
  my ($var_e, $map_ref) = effect_code( $var_ref );
  return wantarray ? ($var_e, $map_ref) : $var_e if $var_e->sum == 0;
  my $pos = $var_e == 1;
  my $neg = $var_e == -1;
  my $w = $pos->sumover / $neg->sumover;
  my $neg_ind = $neg->whichND;
  $var_e->indexND($neg_ind) *= $w->slice($neg_ind->slice('(1)'));
  wantarray ? ($var_e, $map_ref) : $var_e;
}

=head2 interaction_code

Returns the coded interaction term for effect-coded variables.

Supports BAD value (missing or 'BAD' values result in the corresponding
pdl elements being marked as BAD).

=for usage

    pdl> $a = sequence(6) > 2
    pdl> p $a = $a->effect_code
    [
     [ 1  1  1 -1 -1 -1]
    ]

    pdl> $b = pdl( qw( 0 1 2 0 1 2 ) )
    pdl> p $b = $b->effect_code
    [
     [ 1  0 -1  1  0 -1]
     [ 0  1 -1  0  1 -1]
    ]

    pdl> p $ab = interaction_code( $a, $b )
    [
     [ 1  0 -1 -1 -0  1]
     [ 0  1 -1 -0 -1  1]
    ]

=cut

*interaction_code = \&PDL::interaction_code;
sub PDL::interaction_code {
  my $i = ones( $_[0]->dim(0), 1 );
  $i = ($i * $_->dummy(1))->clump(1,2) for @_;
  $i;
}

=head2 ols

=for ref

Ordinary least squares regression, aka linear regression.

Unlike B<ols_t>, ols is not broadcastable, but it can handle bad value (by
ignoring observations with bad value in dependent or independent variables
list-wise) and returns the full model in list context with various stats.

IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
the constant vector in $x. Intercept is automatically added and returned
as FIRST of the coeffs if CONST=>1. Returns full model in list context
and coeff in scalar context.

For more on multiple linear regression see
L<https://en.wikipedia.org/wiki/Multiple_linear_regression>.

=for options

Default options (case insensitive):

    CONST  => 1,
    PLOT   => 0,   # see plot_residuals() for plot options
    WIN    => undef,      # for plotting

=for usage

Usage:

    # suppose this is a person's ratings for top 10 box office movies
    # ascending sorted by box office

    pdl> $y = pdl '[1 1 2 2 2 2 4 4 5 5]'

    # construct IV with linear and quadratic component

    pdl> p $x = cat sequence(10), sequence(10)**2
    [
     [ 0  1  2  3  4  5  6  7  8  9]
     [ 0  1  4  9 16 25 36 49 64 81]
    ]

    pdl> %m = $y->ols( $x )

    pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m

    F       40.4225352112676
    F_df    [2 7]
    F_p     0.000142834216344756
    R2      0.920314253647587

    # coeff  constant   linear    quadratic
    b        [0.981818  0.212121  0.030303]
    b_p      [0.039910  0.328001  0.203034]
    b_se     [0.389875  0.201746  0.021579]
    b_t      [2.518284  1.051422  1.404218]
    ss_model        19.8787878787879
    ss_residual     1.72121212121212
    ss_total        21.6
    y_pred  [0.98181818  1.2242424  1.5272727  ...  4.6181818  5.3454545]

=cut

*ols = \&PDL::ols;
sub PDL::ols {
  _ols_common(0, @_);
}

# ivs = [nobs x nivs] so can `dog` retval
sub _rm_bad_value {
  my ($y, $ivs) = @_;
  return ($y, $ivs, undef) if !$y->check_badflag and !$ivs->check_badflag;
  my $idx = which($y->isgood & (nbadover ($ivs->transpose)==0));
  $_ = $_->slice($idx)->sever for $y, $ivs;
  $_->badflag(0) for $y, $ivs;
  ($y, $ivs, $idx);
}

=head2 ols_rptd

=for ref

Repeated measures linear regression.
Handles purely within-subject design for now.

(Lorch & Myers, 1990; Van den Noortgate & Onghena, 2006).
See F<t/glm.t> for an example using the Lorch and Myers data.

=for usage

Usage:

    # This is the example from Lorch and Myers (1990),
    # a study on how characteristics of sentences affected reading time
    # Three within-subject IVs:
    # SP -- serial position of sentence
    # WORDS -- number of words in sentence
    # NEW -- number of new arguments in sentence

    # $subj can be 1D pdl or @ ref and must be the first argument
    # IV can be 1D @ ref or pdl
    # 1D @ ref is effect coded internally into pdl
    # pdl is left as is

    my %r = $rt->ols_rptd( $subj, $sp, $words, $new );

    print "$_\t$r{$_}\n" for sort keys %r;

    ss_residual   58.3754646504336
    ss_subject    51.8590337714286
    ss_total      405.188241771429
    #      SP        WORDS      NEW
    F   [  7.208473  61.354153  1.0243311]
    F_p [0.025006181 2.619081e-05 0.33792837]
    coeff   [0.33337285 0.45858933 0.15162986]
    df  [1 1 1]
    df_err  [9 9 9]
    ms  [ 18.450705  73.813294 0.57026483]
    ms_err  [ 2.5595857  1.2030692 0.55671923]
    ss  [ 18.450705  73.813294 0.57026483]
    ss_err  [ 23.036272  10.827623  5.0104731]

=cut

*ols_rptd = \&PDL::ols_rptd;
sub PDL::ols_rptd {
  my ($y, $subj, @ivs_raw) = @_;

  $y = $y->squeeze;
  $y->getndims > 1 and
    croak "ols_rptd does not support broadcasting";

  my @ivs = map {  (ref $_ eq 'PDL' and $_->ndims > 1)?  $_
                  : ref $_ eq 'PDL' ?                    $_->dummy(1)
                  :                   scalar effect_code($_)
                  ;
                } @ivs_raw;

  my %r;

  $r{ss_total} = $y->ss;

  # STEP 1: subj
  my $s = effect_code $subj;
  my $b_s = $y->ols_t($s);
  my $pred = sumover($b_s->slice('1:-1') * $s->transpose) + $b_s->slice(0);
  $r{ss_subject} = $r{ss_total} - $y->sse( $pred );

  # STEP 2: add predictor variables
  my $iv_p = $s->glue(1, @ivs);
  my $b_p = $y->ols_t($iv_p);

    # only care about coeff for predictor vars. no subj or const coeff
  $r{coeff} = $b_p->slice([-@ivs,-1])->sever;

    # get total sse for this step
  $pred = sumover($b_p->slice('1:-1') * $iv_p->transpose) + $b_p->slice(0);
  my $ss_pe  = $y->sse( $pred );

    # get predictor ss by successively reducing the model
  $r{ss} = zeroes scalar(@ivs);
  for my $i (0 .. $#ivs) {
    my $iv = $s->glue(1, @ivs[ grep $_ != $i, 0..$#ivs ]);
    my $b  = $y->ols_t($iv);
    $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
    $r{ss}->slice($i) .= $y->sse($pred) - $ss_pe;
  }

  # STEP 3: get predictor x subj interaction as error term
  my $iv_e = PDL::glue 1, map interaction_code( $s, $_ ), @ivs;

    # get total sse for this step. full model now.
  my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) );
  $pred = sumover($b_f->slice('1:-1') * $iv_p->glue(1,$iv_e)->transpose) + $b_f->slice(0);
  $r{ss_residual}  = $y->sse( $pred );

    # get predictor x subj ss by successively reducing the error term
  $r{ss_err} = zeroes scalar(@ivs);
  for my $i (0 .. $#ivs) {
    my $iv = $iv_p->glue(1, map interaction_code($s, $_),
      @ivs[grep $_ != $i, 0..$#ivs]);
    my $b  = $y->ols_t($iv);
    my $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
    $r{ss_err}->slice($i) .= $y->sse($pred) - $r{ss_residual};
  }

  # Finally, get MS, F, etc

  $r{df} = pdl( map $_->squeeze->ndims, @ivs );
  $r{ms} = $r{ss} / $r{df};

  $r{df_err} = $s->dim(1) * $r{df};
  $r{ms_err} = $r{ss_err} / $r{df_err};

  $r{F} = $r{ms} / $r{ms_err};

  $r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} )
    if $CDF;

  %r;
}

=head2 logistic

=for ref

Logistic regression with maximum likelihood estimation using L<PDL::Fit::LM>.

IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
the constant vector in $x. It is included in the model and returned as
LAST of coeff. Returns full model in list context and coeff in scalar
context.

The significance tests are likelihood ratio tests (-2LL deviance)
tests. IV significance is tested by comparing deviances between the
reduced model (ie with the IV in question removed) and the full model.

***NOTE: the results here are qualitatively similar to but not identical
with results from R, because different algorithms are used for the
nonlinear parameter fit. Use with discretion***

=for options

Default options (case insensitive):

    INITP => zeroes( $x->dim(1) + 1 ),    # n_iv + 1
    MAXIT => 1000,
    EPS   => 1e-7,

=for usage

Usage:

    # suppose this is whether a person had rented 10 movies

    pdl> p $y = ushort( random(10)*2 )
    [0 0 0 1 1 0 0 1 1 1]

    # IV 1 is box office ranking

    pdl> p $x1 = sequence(10)
    [0 1 2 3 4 5 6 7 8 9]

    # IV 2 is whether the movie is action- or chick-flick

    pdl> p $x2 = sequence(10) % 2
    [0 1 0 1 0 1 0 1 0 1]

    # concatenate the IVs together

    pdl> p $x = cat $x1, $x2
    [
     [0 1 2 3 4 5 6 7 8 9]
     [0 1 0 1 0 1 0 1 0 1]
    ]

    pdl> %m = $y->logistic( $x )

    pdl> p "$_\t$m{$_}\n" for sort keys %m

    D0	13.8629436111989
    Dm	9.8627829791575
    Dm_chisq	4.00016063204141
    Dm_df	2
    Dm_p	0.135324414081692
      #  ranking    genre      constant
    b	[0.41127706 0.53876358 -2.1201285]
    b_chisq	[ 3.5974504 0.16835559  2.8577151]
    b_p	[0.057868258  0.6815774 0.090936587]
    iter	12
    y_pred	[0.10715577 0.23683909 ... 0.76316091 0.89284423]

    # to get the covariance out, supply a true value for the COV option:
    pdl> %m = $y->logistic( $x, {COV=>1} )
    pdl> p $m{cov};

=cut

*logistic = \&PDL::logistic;
sub PDL::logistic {
  require PDL::Fit::LM;
  my ( $self, $ivs, $opt ) = @_;
  $self = $self->squeeze;
    # make compatible w multiple var cases
  $ivs->getndims == 1 and $ivs = $ivs->dummy(1);
  $self->dim(0) != $ivs->dim(0) and
    carp "mismatched n btwn DV and IV!";
  my %opt = (
    INITP => zeroes( $ivs->dim(1) + 1 ),    # n_ivs + 1
    MAXIT => 1000,
    EPS   => 1e-7,
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
    # not using it atm
  $opt{WT} = 1;
    # Use lmfit. Fourth input argument is reference to user-defined
    # copy INITP so we have the original value when needed
  my ($yfit,$coeff,$cov,$iter)
    = PDL::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic, $opt{INITP}->copy,
      { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
    # apparently at least coeff is child of some pdl
    # which is changed in later lmfit calls
  $yfit  = $yfit->copy;
  $coeff = $coeff->copy;
  return $coeff unless wantarray;
  my %ret;
  my $n0 = $self->where($self == 0)->nelem;
  my $n1 = $self->nelem - $n0;
  $ret{cov} = $cov if $opt{COV};
  $ret{D0} = -2*($n0 * log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem));
  $ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 );
  $ret{Dm_chisq} = $ret{D0} - $ret{Dm};
  $ret{Dm_df} = $ivs->dim(1);
  $ret{Dm_p}
    = 1 - PDL::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} )
    if $CDF;
  my $coeff_chisq = zeroes $opt{INITP}->nelem;
  if ( $ivs->dim(1) > 1 ) {
    for my $k (0 .. $ivs->dim(1)-1) {
      my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1);
      my $G = $ivs->dice_axis(1, \@G);
      my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy;
      my $y_G
        = PDL::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init,
        { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
      $coeff_chisq->slice($k) .= $self->dm( $y_G ) - $ret{Dm};
    }
  }
  else {
      # d0 is, by definition, the deviance with only intercept
    $coeff_chisq->slice(0) .= $ret{D0} - $ret{Dm};
  }
  my $y_c
      = PDL::Fit::LM::lmfit( $ivs, $self, $opt{WT}, \&_logistic_no_intercept, $opt{INITP}->slice('0:-2')->sever,
      { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
  $coeff_chisq->slice(-1) .= $self->dm( $y_c ) - $ret{Dm};
  $ret{b} = $coeff;
  $ret{b_chisq} = $coeff_chisq;
  $ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 )
    if $CDF;
  $ret{y_pred} = $yfit;
  $ret{iter} = $iter;
  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  %ret;
}

sub _logistic {
  my ($x,$par,$ym,$dyda) = @_;
    # $b and $c are fit parameters slope and intercept
  my $b = $par->slice([0,$x->dim(1) - 1])->sever;
  my $c = $par->slice(-1)->sever;
    # Write function with dependent variable $ym,
    # independent variable $x, and fit parameters as specified above.
    # Use the .= (dot equals) assignment operator to express the equality
    # (not just a plain equals)
  $ym .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) );
  my @dy = map $dyda->slice(",($_)"), 0 .. $par->dim(0)-1;
    # Partial derivative of the function with respect to each slope
    # fit parameter ($b in this case). Again, note .= assignment
    # operator (not just "equals")
  $dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
    for (0 .. $b->dim(0)-1);
    # Partial derivative of the function re intercept par
  $dy[-1] .= $ym * (1 - $ym);
}

sub _logistic_no_intercept {
  my ($x,$par,$ym,$dyda) = @_;
  my $b = $par->slice([0,$x->dim(1) - 1])->sever;
    # Write function with dependent variable $ym,
    # independent variable $x, and fit parameters as specified above.
    # Use the .= (dot equals) assignment operator to express the equality
    # (not just a plain equals)
  $ym .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) );
  my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);
    # Partial derivative of the function with respect to each slope
    # fit parameter ($b in this case). Again, note .= assignment
    # operator (not just "equals")
  $dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
    for 0 .. $b->dim(0)-1;
}

=head2 pca

=for ref

Principal component analysis. Based on corr instead of cov.

Bad values are ignored pair-wise. OK when bad values are few but otherwise
probably should fill_m etc before pca). Uses L<PDL::MatrixOps/eigens_sym>.

=for options

Default options (case insensitive):

    CORR  => 1,     # boolean. use correlation or covariance
    PLOT  => 0,     # calls plot_screes by default
                    # can set plot_screes options here
    WIN    => undef,      # for plotting

=for usage

Usage:

    my $d = qsort random 10, 5;      # 10 obs on 5 variables
    my %r = $d->pca( \%opt );
    print "$_\t$r{$_}\n" for (keys %r);

    eigenvalue    # variance accounted for by each component
    [4.70192 0.199604 0.0471421 0.0372981 0.0140346]

    eigenvector   # dim var x comp. weights for mapping variables to component
    [
     [ -0.451251  -0.440696  -0.457628  -0.451491  -0.434618]
     [ -0.274551   0.582455   0.131494   0.255261  -0.709168]
     [   0.43282   0.500662  -0.139209  -0.735144 -0.0467834]
     [  0.693634  -0.428171   0.125114   0.128145  -0.550879]
     [  0.229202   0.180393  -0.859217     0.4173  0.0503155]
    ]

    loadings      # dim var x comp. correlation between variable and component
    [
     [ -0.978489  -0.955601  -0.992316   -0.97901  -0.942421]
     [ -0.122661   0.260224  0.0587476   0.114043  -0.316836]
     [ 0.0939749   0.108705 -0.0302253  -0.159616 -0.0101577]
     [   0.13396 -0.0826915  0.0241629  0.0247483   -0.10639]
     [  0.027153  0.0213708  -0.101789  0.0494365 0.00596076]
    ]

    pct_var       # percent variance accounted for by each component
    [0.940384 0.0399209 0.00942842 0.00745963 0.00280691]

Plot scores along the first two components,

    $d->plot_scores( $r{eigenvector} );

=cut

*pca = \&PDL::pca;
sub PDL::pca {
  my ($self, $opt) = @_;
  my %opt = (
    CORR  => 1,
    PLOT  => 0,
    WIN    => undef,      # for plotting
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table;
    # value is axis pdl and score is var x axis
  my ($eigvec, $eigval) = $var_var->eigens_sym;
  $eigvec = $eigvec->transpose; # compatibility with PDL::Slatec::eigsys
    # ind is sticky point for broadcasting
  my $ind_sorted = $eigval->qsorti->slice('-1:0');
  $eigvec = $eigvec->slice(':',$ind_sorted)->sever;
  $eigval = $eigval->slice($ind_sorted)->sever;
    # var x axis
  my $var     = $eigval / $eigval->sum->sclr;
  my $loadings;
  if ($opt{CORR}) {
    $loadings = $eigvec * sqrt( $eigval->transpose );
  }
  else {
    my $scores = $eigvec x $self->dev_m;
    $loadings = $self->corr( $scores->dummy(1) );
  }
  $var->plot_screes(\%opt)
    if $opt{PLOT};
  ( eigenvalue=>$eigval, eigenvector=>$eigvec,
           pct_var=>$var, loadings=>$loadings );
}

=head2 pca_sorti

Determine by which vars a component is best represented. Descending sort
vars by size of association with that component. Returns sorted var and
relevant component indices.

=for options

Default options (case insensitive):

    NCOMP => 10,     # maximum number of components to consider

=for usage

Usage:

      # let's see if we replicated the Osgood et al. (1957) study
    pdl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0}

      # select a subset of var to do pca
    pdl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )]
    pdl> $data = $data( ,$ind)->sever
    pdl> @$idv = @$idv[list $ind]

    pdl> %m = $data->pca

    pdl> ($iv, $ic) = $m{loadings}->pca_sorti()

    pdl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv)

             #   COMP0     COMP1    COMP2    COMP3
    HAPPY	[0.860191 0.364911 0.174372 -0.10484]
    GOOD	[0.848694 0.303652 0.198378 -0.115177]
    CALM	[0.821177 -0.130542 0.396215 -0.125368]
    BRIGHT	[0.78303 0.232808 -0.0534081 -0.0528796]
    HEAVY	[-0.623036 0.454826 0.50447 0.073007]
    HARD	[-0.679179 0.0505568 0.384467 0.165608]
    ACTIVE	[-0.161098 0.760778 -0.44893 -0.0888592]
    FAST	[-0.196042 0.71479 -0.471355 0.00460276]
    LARGE	[-0.241994 0.594644 0.634703 -0.00618055]
    BASS	[-0.621213 -0.124918 0.0605367 -0.765184]

=cut

*pca_sorti = \&PDL::pca_sorti;
sub PDL::pca_sorti {
    # $self is pdl (var x component)
  my ($self, $opt) = @_;
  my %opt = (
    NCOMP => 10,     # maximum number of components to consider
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min;
  $self = $self->dice_axis( 1, pdl(0..$ncomp-1) );
  my $icomp = $self->transpose->abs->maximum_ind;
    # sort between comp
  my $ivar_sort = $icomp->qsorti;
  $self = $self->slice($ivar_sort)->sever;
    # sort within comp
  my $ic = $icomp->slice($ivar_sort)->iv_cluster;
  for my $comp (0 .. $ic->dim(1)-1) {
    my $i = $self->slice(which($ic->slice(':',$comp)), "($comp)")->qsorti->slice('-1:0');
    $ivar_sort->slice(which $ic->slice(':',$comp))
      .= $ivar_sort->slice(which $ic->slice(':',$comp))->slice($i);
  }
  wantarray ? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort;
}

=head2 plot_means

Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl).

=for options

Default options (case insensitive):

    IVNM  => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
    DVNM  => 'DV',
    AUTO  => 1,       # auto set dims to be on x-axis, line, panel
                      # if set 0, dim 0 goes on x-axis, dim 1 as lines
                      # dim 2+ as panels
      # see PDL::Graphics::Simple for next option
    WIN   => undef,   # pgswin object. not closed here if passed
                      # allows comparing multiple lines in same plot
    SIZE  => 640,           # individual square panel size in pixels

=for usage

Usage:

      # see anova for mean / se pdl structure
    $mean->plot_means( $se, {IVNM=>['apple', 'bake']} );

Or like this:

    $m{'| apple ~ bake | m'}->plot_means;

=cut

*plot_means = \&PDL::plot_means;
sub PDL::plot_means {
  require PDL::Graphics::Simple;
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($self, $se) = @_;
  $self = $self->squeeze;
  if ($self->ndims > 4) {
    carp "Data is > 4D. No plot here.";
    return;
  }
  my %opt = (
    IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
    DVNM => 'DV',
    AUTO  => 1,             # auto set vars to be on X axis, line, panel
    WIN   => undef,         # PDL::Graphics::Simple object
    SIZE  => 640,           # individual square panel size in pixels
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
    # decide which vars to plot as x axis, lines, panels
    # put var w most levels on x axis
    # put var w least levels on diff panels
  my @iD = 0..3;
  my @dims = (1, 1, 1, 1);
    # splice ARRAY,OFFSET,LENGTH,LIST
  splice @dims, 0, $self->ndims, $self->dims;
  $self = $self->reshape(@dims)->sever;
  $se = $se->reshape(@dims)->sever
    if defined $se;
  @iD = reverse list qsorti pdl @dims
    if $opt{AUTO};
    # $iD[0] on x axis
    # $iD[1] as separate lines
  my $nx = $self->dim($iD[2]);    # n xpanels
  my $ny = $self->dim($iD[3]);    # n ypanels
  my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(
    size=>[$opt{SIZE}*$nx, $opt{SIZE}*$ny,'px']);
  my $seq0 = sequence(my $dim0 = $self->dim($iD[0]));
  my ($pcount, @plots) = 0;
  for my $y (0..$ny-1) {
    for my $x (0..$nx-1) {
      my $key_prefix = "$opt{IVNM}[$iD[0]]|";
      $key_prefix .= $opt{IVNM}[$iD[2]] . "=$x|" if $nx > 1;
      $key_prefix .= $opt{IVNM}[$iD[3]] . "=$y|" if $ny > 1;
      for (0 .. $self->dim($iD[1]) - 1) {
        my $ke = "$key_prefix$opt{IVNM}[$iD[1]]=$_";
        my $ydiced = $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_)->squeeze;
        push @plots, with=>'lines', ke=>"$ke mean", style=>$pcount,
          $seq0+$pcount*0.05, $ydiced;
        push @plots, with=>'errorbars', ke=>"$ke error", style=>$pcount,
          $seq0+$pcount*0.05, $ydiced,
          $se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)
            ->dice_axis($iD[1],$_)->squeeze
          if defined($se);
        $pcount++;
      }
    }
  }
  my ($ymin, $ymax) = pdl($self, !defined $se ? () : ($self+$se, $self-$se))->minmax;
  $w->plot(@plots,
    { xlabel=>$opt{IVNM}[$iD[0]], ylabel=>$opt{DVNM},
      xrange=>[-0.05,$dim0-1+$pcount*0.05], yrange=>[$ymin-0.05,$ymax+0.05] }
  );
  $w;
}

=head2 plot_residuals

Plots residuals against predicted values.

=for usage

Usage:

    use PDL::Graphics::Simple;
    $w = pgswin();
    $y->plot_residuals( $y_pred, { win=>$w } );

=for options

Default options (case insensitive):

     # see PDL::Graphics::Simple for more info
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
    COLOR => 1,

=cut

*plot_residuals = \&PDL::plot_residuals;
sub PDL::plot_residuals {
  require PDL::Graphics::Simple;
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, $y_pred) = @_;
  my %opt = (
     # see PDL::Graphics::Simple for next options
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
    COLOR => 1,
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  my $residuals = $y - $y_pred;
  my $win = $opt{WIN} || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
  $win->plot(
    with=>'points', style=>$opt{COLOR}, $y_pred, $residuals,
    with=>'lines',  style=>$opt{COLOR}, pdl($y_pred->minmax), pdl(0,0), # 0-line
    {xlabel=>'predicted value', ylabel=>'residuals'},
  );
}

=head2 plot_scores

Plots standardized original and PCA transformed scores against two components. (Thank you, Bob MacCallum, for the documentation suggestion that led to this function.)

=for options

Default options (case insensitive):

  CORR  => 1,      # boolean. PCA was based on correlation or covariance
  COMP  => [0,1],  # indices to components to plot
    # see PDL::Graphics::Simple for next options
  WIN   => undef,  # pgswin object. not closed here if passed
                   # allows comparing multiple lines in same plot
  SIZE  => 640,    # plot size in pixels
  COLOR => [1,2],  # color for original and rotated scores

=for usage

Usage:

  my %p = $data->pca();
  $data->plot_scores( $p{eigenvector}, \%opt );

=cut

*plot_scores = \&PDL::plot_scores;
sub PDL::plot_scores {
  require PDL::Graphics::Simple;
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($self, $eigvec) = @_;
  my %opt = (
    CORR  => 1,      # boolean. PCA was based on correlation or covariance
    COMP  => [0,1],  # indices to components to plot
     # see PDL::Graphics::Simple for next options
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
    COLOR => [1,2],  # color for original and transformed scoress
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
  my $i = pdl $opt{COMP};
  my $z = $opt{CORR} ? $self->stddz : $self->dev_m;
    # transformed normed values
  my $scores = sumover($eigvec->slice(':',$i) * $z->transpose->dummy(1))->transpose;
  $z = $z->slice(':',$i)->sever;
  my $win = $opt{WIN} || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
  $win->plot(
    with=>'points', style=>$opt{COLOR}[0], ke=>'original', $z->slice(',(0)'), $z->slice(',(1)'),
    with=>'points', style=>$opt{COLOR}[1], ke=>'transformed', $scores->slice(',(0)'), $scores->slice(',(1)'),
    {xlabel=>"Component $opt{COMP}[0]", ylabel=>"Component $opt{COMP}[1]"},
  );
}

=head2 plot_screes

Scree plot. Plots proportion of variance accounted for by PCA components.

=for options

Default options (case insensitive):

  NCOMP => 20,     # max number of components to plot
  CUT   => 0,      # set to plot cutoff line after this many components
                   # undef to plot suggested cutoff line for NCOMP comps
   # see PDL::Graphics::Simple for next options
  WIN   => undef,  # pgswin object. not closed here if passed
                   # allows comparing multiple lines in same plot
  SIZE  => 640,    # plot size in pixels

=for usage

Usage:

  # variance should be in descending order
  $d = qsort random 10, 5;      # 10 obs on 5 variables
  %pca = $d->pca( \%opt );
  $pca{pct_var}->plot_screes( {ncomp=>16, win=>$win=PDL::Graphics::Simple::pgswin()} );

Or, because NCOMP is used so often, it is allowed a shortcut,

  $pca{pct_var}->plot_screes( 16 );

=cut

*plot_scree = \&PDL::plot_screes;      # here for now for compatibility
*plot_screes = \&PDL::plot_screes;
sub PDL::plot_screes {
  require PDL::Graphics::Simple;
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($self, $ncomp) = @_;
  my %opt = (
    NCOMP => 20,     # max number of components to plot
    CUT   => 0,      # set to plot cutoff line after this many components
                     # undef to plot suggested cutoff line for NCOMP comps
     # see PDL::Graphics::Simple for next options
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{NCOMP} = $ncomp
    if $ncomp;
    # re-use $ncomp below
  $ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP};
  my $self_comp = $self->slice([0,$ncomp-1]);
  $opt{CUT}   = PDL::Stats::Kmeans::_scree_ind $self_comp
    if !defined $opt{CUT};
  my $win = $opt{WIN} ||
    PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
  $win->plot(
    with=>'lines', ke=>'scree', sequence($ncomp), $self_comp,
    !$opt{CUT} ? () : (with=>'lines', ke=>'cut', pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max->sclr+.05)),
    {xlabel=>'Component', ylabel=>'Proportion of Variance Accounted for',
      xrange=>[-0.05,$ncomp-0.95], yrange=>[0,1], le=>'tr'},
  );
}

=head2 plot_stripchart

Stripchart plot. Plots ANOVA-style data, categorised against given IVs.

=for options

Default options (case insensitive):

  IVNM => [],   # auto filled as ['IV_0', 'IV_1', ... ]
  DVNM => 'DV',
   # see PDL::Graphics::Simple for next options
  WIN => undef,  # pgswin object. not closed here if passed

=for usage

Usage:

  %m = $y->plot_stripchart( $a, \@b, { IVNM=>[qw(apple bake)] } );

=cut

my $CHART_GAP = 0.1;
*plot_stripchart = \&PDL::plot_stripchart;
sub PDL::plot_stripchart {
  require PDL::Graphics::Simple;
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, @ivs_raw) = @_;
  my %opt = (
    IVNM => [],   # auto filled as ['IV_0', 'IV_1', ... ]
    DVNM => 'DV',
    WIN => undef,  # pgswin object. not closed here if passed
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
    if !$opt{IVNM} or !@{ $opt{IVNM} };
  my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin();
  my @codes = map [code_ivs($_)], @ivs_raw;
  my @levels = map {
    my $map = $_->[1];
    [sort {$map->{$a} <=> $map->{$b}} keys %$map];
  } @codes;
  my $xjitter = $y->random * $CHART_GAP;
  my ($pcount, @plots) = 0;
  push @plots, with=>'points', ke=>"all data", $xjitter+$pcount, $y;
  $pcount++;
  for my $i (0..$#ivs_raw) {
    my $levs = $levels[$i];
    my $name = $opt{IVNM}[$i];
    my $coded = $codes[$i][0];
    for my $j (0..$#$levs) {
      my $inds = which($coded == $j);
      push @plots, with=>'points', ke=>"$name=$levs->[$j]",
        $xjitter->slice($inds)+$pcount+$j*$CHART_GAP, $y->slice($inds);
    }
    $pcount++;
  }
  my ($ymin, $ymax) = $y->minmax;
  my $xmax = $pcount-1 + $CHART_GAP*($#{$levels[-1]} + 2);
  $w->plot(@plots,
    { xlabel=>'IV', ylabel=>$opt{DVNM},
      xrange=>[-1,$xmax], yrange=>[$ymin-$CHART_GAP,$ymax+$CHART_GAP] }
  );
  $w;
}

=head1 SEE ALSO

L<PDL::Fit::Linfit>

L<PDL::Fit::LM>

=head1 REFERENCES

Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied
Multiple Regression/correlation Analysis for the Behavioral Sciences
(3rd ed.). Mahwah, NJ: Lawrence Erlbaum Associates Publishers.

Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd
ed.). New York, NY: Wiley-Interscience.

Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated
measures data in cognitive research. Journal of Experimental Psychology:
Learning, Memory, & Cognition, 16, 149-157.

Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of
Meaning. Champaign, IL: University of Illinois Press.

Rutherford, A. (2011). ANOVA and ANCOVA: A GLM Approach (2nd ed.). Wiley.

Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved
April 10, 2011 from http://citeseerx.ist.psu.edu/

The GLM procedure: unbalanced ANOVA for two-way design with
interaction. (2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18,
2009 from http://support.sas.com/

Van den Noortgate, W., & Onghena, P. (2006). Analysing repeated
measures data in cognitive research: A comment on regression coefficient
analyses. European Journal of Cognitive Psychology, 18, 937-952.

=head1 AUTHOR

Copyright (C) 2009 Maggie J. Xiong <maggiexyz users.sourceforge.net>

All rights reserved. There is no warranty. You are allowed to redistribute
this software / documentation as described in the file COPYING in the
PDL distribution.

=cut
#line 2491 "lib/PDL/Stats/GLM.pm"

# Exit with OK status

1;