1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
|
### cvsegments.R: A utility function to generate segments for k-fold
### cross-validation.
#' @title Generate segments for cross-validation
#'
#' @description The function generates a list of segments for cross-validation. It can
#' generate random, consecutive and interleaved segments, and supports keeping
#' replicates in the same segment.
#'
#' @details If \code{length.seg} is specified, it is used to calculate the number of
#' segments to generate. Otherwise \code{k} must be specified. If
#' \eqn{k*length.seg \ne N}{k*length.seg <> N}, the \eqn{k*length.seg - N} last
#' segments will contain only \eqn{length.seg - 1} indices.
#'
#' If \code{type} is \code{"random"}, the indices are allocated to segments in
#' random order. If it is \code{"consecutive"}, the first segment will contain
#' the first \eqn{length.seg} indices, and so on. If \code{type} is
#' \code{"interleaved"}, the first segment will contain the indices \eqn{1,
#' length.seg+1, 2*lenght.seg+1, \ldots, (k-1)*length.seg+1}, and so on.
#'
#' If \eqn{nrep > }, it is assumed that each \code{nrep} consecutive rows are
#' replicates (repeated measurements) of the same object, and care is taken
#' that replicates are never put in different segments.
#'
#' Warning: If \code{k} does not divide \code{N}, a specified \code{length.seg}
#' does not divide \code{N}, or \code{nrep} does not divide \code{length.seg},
#' the number of segments and/or the segment length will be adjusted as needed.
#' Warnings are printed for some of these cases, and one should always inspect
#' the resulting segments to make sure they are as expected.
#'
#' Stratification of samples is enabled by the \code{stratify} argument. This
#' is useful if there are sub-groups in the data set that should have a
#' proportional representation in the cross-validation segments or if the
#' response is categorical (classifiation). If \code{stratify} is combined with
#' \code{nrep}, \code{stratify} corresponds to the sets of replicates (see
#' example).
#'
#' @param N Integer. The number of rows in the data set.
#' @param k Integer. The number of segments to return.
#' @param length.seg Integer. The length of the segments. If given, it
#' overrides \code{k}.
#' @param nrep Integer. The number of (consecutive) rows that are replicates
#' of the same object. Replicates will always be kept in the same segment.
#' @param type One of \code{"random"}, \code{"consecutive"} and
#' \code{"interleaved"}. The type of segments to generate. Default is
#' \code{"random"}.
#' @param stratify Either a \code{list} of indices or an integer \code{vector}
#' indicating which stratum each sample (or set of replicates) belongs to (see
#' Details).
#' @return A list of vectors. Each vector contains the indices for one
#' segment. The attribute \code{"incomplete"} contains the number of
#' incomplete segments, and the attribute \code{"type"} contains the type of
#' segments.
#' @author Bjørn-Helge Mevik, Ron Wehrens and Kristian Hovde Liland
#' @keywords models
#' @examples
#'
#' ## Segments for 10-fold randomised cross-validation:
#' cvsegments(100, 10)
#'
#' ## Segments with four objects, taken consecutive:
#' cvsegments(60, length.seg = 4, type = "cons")
#'
#' ## Incomplete segments
#' segs <- cvsegments(50, length.seg = 3)
#' attr(segs, "incomplete")
#'
#' ## Leave-one-out cross-validation:
#' cvsegments(100, 100)
#' ## Leave-one-out with variable/unknown data set size n:
#' n <- 50
#' cvsegments(n, length.seg = 1)
#'
#' ## Data set with replicates
#' cvsegments(100, 25, nrep = 2)
#' ## Note that rows 1 and 2 are in the same segment, rows 3 and 4 in the
#' ## same segment, and so on.
#'
#' ## Stratification
#' cvsegments(10, 3, type = "consecutive", stratify = c(rep(1,7), rep(2,3)))
#' ## Note that the last three samples are spread across the segments
#' ## according to the stratification vector.
#' cvsegments(20, 3, type = "consecutive", nrep = 2, stratify = c(rep(1,7), rep(2,3)))
#' ## Note the length of stratify matching number of replicate sets, not samples.
#'
#' @export
cvsegments <- function(N, k, length.seg = ceiling(N / k), nrep = 1,
type = c("random", "consecutive", "interleaved"), stratify = NULL)
{
## length.seg overrides k:
if (!missing(length.seg)) k <- ceiling(N / length.seg)
## Check arguments:
if (k > N) stop("More segments than observations requested")
if (N %% nrep != 0)
stop("The number of replicates does not divide ",
"the number of observations")
if (length.seg %% nrep != 0)
warning("Segment length is not a multiple of the number of ",
"replicates.\n A best effort segment size will be used.")
if (!missing(length.seg) && N %% length.seg != 0)
warning("Required segment length does not divide the number of ",
"observations.\n A best effort segment size will be used.")
## The idea is to generate a k times length.seg matrix with indices, and
## use each column as a segment. If k*length.seg > N, the last element of
## the N - k*length.seg last rows will be NA. Any NAs are stripped when
## the matrix is converted to a list of vectors.
## If nrep > 1, N and length.seg is first divided by nrep, and the matrix
## of indices is created as above. The matrix is then expanded by
## replacing each element i with a column vector nrep * (i - 1) + 1:nrep,
## before using the columns as segments.
## Reduce N and length.seg if needed
if (nrep > 1) {
N <- N / nrep
length.seg <- ceiling(N / k)
}
incomplete <- k * length.seg - N # Number of incomplete segments
complete <- k - incomplete # Number of complete segments
## Create matrix of indices
type <- match.arg(type)
if (is.null(stratify)) { # No stratification
switch(type,
random = {
inds <- matrix(c(sample(1:N), rep(NA, incomplete)),
nrow = length.seg, byrow = TRUE)
},
consecutive = {
if (complete < k) {
inds <- cbind(matrix(1:(length.seg*complete),
nrow = length.seg),
rbind(matrix((length.seg*complete+1):N,
nrow = length.seg-1), NA))
} else {
inds <- matrix(1:N, nrow = length.seg)
}
},
interleaved = {
inds <- matrix(c(1:N, rep(NA, incomplete)),
nrow = length.seg, byrow = TRUE)
}
)
} else {
## Prepare stratification
if (!is.list(stratify)) {
stratify <- lapply(1:max(stratify), function(i)which(stratify==i))
}
stratVec <- unlist(lapply(1:length(stratify), function(i)rep(i,length(stratify[[i]]))))
indVec <- unlist(stratify)
segs <- integer(N)
blinds <- rep(1:k, length.out=N)
switch(type,
random = {
stratRand <- sample(length(stratify))
stratify <- stratify[stratRand]
indVec <- unlist(stratify)
stratVec <- unlist(lapply(1:length(stratify), function(i)rep(i,length(stratify[[i]]))))
for(i in 1:length(stratify)){
segs[stratify[[i]]] <- blinds[indVec[stratVec==i]]
}
inds <- lapply(1:k,function(i)indVec[segs==i])
inds <- lapply(inds, function(i)i[sample(length(i))])
if(incomplete>0)
inds[-(1:complete)] <- lapply(inds[-(1:complete)], function(i){i[length.seg] <- NA;i})
inds <- do.call(cbind, inds)
inds <- inds[,sample(k)]
},
consecutive = {
for(i in 1:length(stratify)){
blinds[stratVec==i] <- sort(blinds[stratVec==i])
}
for(i in 1:length(stratify)){
segs[stratify[[i]]] <- blinds[stratVec==i]
}
inds <- lapply(1:k,function(i)which(segs==i))
if(incomplete>0)
inds[-(1:complete)] <- lapply(inds[-(1:complete)], function(i){i[length.seg] <- NA;i})
inds <- do.call(cbind, inds)
},
interleaved = {
for(i in 1:length(stratify)){
segs[stratify[[i]]] <- blinds[indVec[stratVec==i]]
}
inds <- lapply(1:k,function(i)indVec[segs==i])
if(incomplete>0)
inds[-(1:complete)] <- lapply(inds[-(1:complete)], function(i){i[length.seg] <- NA;i})
inds <- do.call(cbind, inds)
}
)
}
## Expand matrix if needed
if (nrep > 1) {
inds <- outer(1:nrep, nrep * (inds - 1), '+')
dims <- dim(inds)
dim(inds) <- c(dims[1] * dims[2], dims[3])
}
## Convert to list of segments defined by the columns, and add attributes
res <- lapply(as.data.frame(inds), function(x) c(na.omit(x)))
attr(res, "incomplete") <- incomplete
attr(res, "type") <- if (length.seg == 1) "leave-one-out" else type
res
}
|