% hoeffd.Rd

\name{hoeffd}
\alias{hoeffd}
\alias{print.hoeffd}
\title{
Matrix of Hoeffding's D Statistics
}
\description{
Computes a matrix of Hoeffding's (1948) \code{D} statistics for all
possible pairs of columns of a matrix. \code{D} is a measure of the
distance between \code{F(x,y)} and \code{G(x)H(y)}, where \code{F(x,y)}
is the joint CDF of \code{X} and \code{Y}, and \code{G} and \code{H} are
marginal CDFs. Missing values are deleted in pairs rather than deleting
all rows of \code{x} having any missing variables. The \code{D}
statistic is robust against a wide variety of alternatives to
independence, such as nonmonotonic relationships. The larger the value
of \code{D}, the more dependent are \code{X} and \code{Y} (for many
types of dependencies). \code{D} used here is 30 times Hoeffding's
original \code{D}, and ranges from -0.5 to 1.0 if there are no ties in
the data. \code{print.hoeffd} prints the information derived by
\code{hoeffd}. The higher the value of \code{D}, the more dependent are
\code{x} and \code{y}. \code{hoeffd} also computes the mean and maximum
absolute values of the difference between the joint empirical CDF and
the product of the marginal empirical CDFs.
}
\usage{
hoeffd(x, y)
\method{print}{hoeffd}(x, \dots)
}
\arguments{
\item{x}{
a numeric matrix with at least 5 rows and at least 2 columns (if
\code{y} is absent), or an object created by \code{hoeffd}
}
\item{y}{
a numeric vector or matrix which will be concatenated to \code{x}
}
\item{\dots}{ignored}
}
\value{
a list with elements \code{D}, the
matrix of D statistics, \code{n} the
matrix of number of observations used in analyzing each pair of variables,
and \code{P}, the asymptotic P-values.
Pairs with fewer than 5 nonmissing values have the D statistic set to NA.
The diagonals of \code{n} are the number of non-NAs for the single variable
corresponding to that row and column.
}
\details{
Uses midranks in case of ties, as described by Hollander and Wolfe.
P-values are approximated by linear interpolation on the table
in Hollander and Wolfe, which uses the asymptotically equivalent
Blum-Kiefer-Rosenblatt statistic. For \code{P<.0001} or \code{>0.5}, \code{P} values are
computed using a well-fitting linear regression function in \code{log P} vs.
the test statistic.
Ranks (but not bivariate ranks) are computed using efficient
algorithms (see reference 3).
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\references{
Hoeffding W. (1948): A nonparametric test of independence. Ann Math Stat
19:546--57.
Hollander M. and Wolfe D.A. (1973). Nonparametric Statistical Methods,
pp. 228--235, 423. New York: Wiley.
Press WH, Flannery BP, Teukolsky SA, Vetterling, WT (1988): Numerical
Recipes in C. Cambridge: Cambridge University Press.
}
\seealso{
\code{\link{rcorr}}, \code{\link{varclus}}
}
\examples{
# x/y illustrate a symmetric (nonmonotonic) y = x^2 relationship;
# z has a missing value to show pairwise deletion; q is strictly monotonic.
x <- c(-2, -1, 0, 1, 2)
y <- c(4, 1, 0, 1, 4)
z <- c(1, 2, 3, 4, NA)
q <- c(1, 2, 3, 4, 5)
hoeffd(cbind(x, y, z, q))
# Hoeffding's test can detect even one-to-many dependency
set.seed(1)
x <- seq(-10, 10, length=200)
# sign(runif(., -1, 1)) randomly flips y's sign, so |y| depends on x
# but y is not a function of x
y <- x * sign(runif(200, -1, 1))
plot(x, y)
hoeffd(x, y)
}
\keyword{nonparametric}
\keyword{htest}
