File: cvsegments.R

package info (click to toggle)
r-cran-pls 2.8-5-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,056 kB
  • sloc: sh: 13; makefile: 2
file content (211 lines) | stat: -rw-r--r-- 9,481 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
### cvsegments.R: A utility function to generate segments for k-fold
### cross-validation.



#' @title Generate segments for cross-validation
#'
#' @description The function generates a list of segments for cross-validation.  It can
#' generate random, consecutive and interleaved segments, and supports keeping
#' replicates in the same segment.
#'
#' @details If \code{length.seg} is specified, it is used to calculate the number of
#' segments to generate.  Otherwise \code{k} must be specified.  If
#' \eqn{k*length.seg \ne N}{k*length.seg <> N}, the \eqn{k*length.seg - N} last
#' segments will contain only \eqn{length.seg - 1} indices.
#'
#' If \code{type} is \code{"random"}, the indices are allocated to segments in
#' random order.  If it is \code{"consecutive"}, the first segment will contain
#' the first \eqn{length.seg} indices, and so on.  If \code{type} is
#' \code{"interleaved"}, the first segment will contain the indices \eqn{1,
#' length.seg+1, 2*lenght.seg+1, \ldots, (k-1)*length.seg+1}, and so on.
#'
#' If \eqn{nrep > }, it is assumed that each \code{nrep} consecutive rows are
#' replicates (repeated measurements) of the same object, and care is taken
#' that replicates are never put in different segments.
#'
#' Warning: If \code{k} does not divide \code{N}, a specified \code{length.seg}
#' does not divide \code{N}, or \code{nrep} does not divide \code{length.seg},
#' the number of segments and/or the segment length will be adjusted as needed.
#' Warnings are printed for some of these cases, and one should always inspect
#' the resulting segments to make sure they are as expected.
#'
#' Stratification of samples is enabled by the \code{stratify} argument.  This
#' is useful if there are sub-groups in the data set that should have a
#' proportional representation in the cross-validation segments or if the
#' response is categorical (classifiation). If \code{stratify} is combined with
#' \code{nrep}, \code{stratify} corresponds to the sets of replicates (see
#' example).
#'
#' @param N Integer.  The number of rows in the data set.
#' @param k Integer.  The number of segments to return.
#' @param length.seg Integer.  The length of the segments.  If given, it
#' overrides \code{k}.
#' @param nrep Integer.  The number of (consecutive) rows that are replicates
#' of the same object.  Replicates will always be kept in the same segment.
#' @param type One of \code{"random"}, \code{"consecutive"} and
#' \code{"interleaved"}.  The type of segments to generate.  Default is
#' \code{"random"}.
#' @param stratify Either a \code{list} of indices or an integer \code{vector}
#' indicating which stratum each sample (or set of replicates) belongs to (see
#' Details).
#' @return A list of vectors.  Each vector contains the indices for one
#' segment.  The attribute \code{"incomplete"} contains the number of
#' incomplete segments, and the attribute \code{"type"} contains the type of
#' segments.
#' @author Bjørn-Helge Mevik, Ron Wehrens and Kristian Hovde Liland
#' @keywords models
#' @examples
#'
#' ## Segments for 10-fold randomised cross-validation:
#' cvsegments(100, 10)
#'
#' ## Segments with four objects, taken consecutive:
#' cvsegments(60, length.seg = 4, type = "cons")
#'
#' ## Incomplete segments
#' segs <- cvsegments(50, length.seg = 3)
#' attr(segs, "incomplete")
#'
#' ## Leave-one-out cross-validation:
#' cvsegments(100, 100)
#' ## Leave-one-out with variable/unknown data set size n:
#' n <- 50
#' cvsegments(n, length.seg = 1)
#'
#' ## Data set with replicates
#' cvsegments(100, 25, nrep = 2)
#' ## Note that rows 1 and 2 are in the same segment, rows 3 and 4 in the
#' ## same segment, and so on.
#'
#' ## Stratification
#' cvsegments(10, 3, type = "consecutive", stratify = c(rep(1,7), rep(2,3)))
#' ## Note that the last three samples are spread across the segments
#' ## according to the stratification vector.
#' cvsegments(20, 3, type = "consecutive", nrep = 2, stratify = c(rep(1,7), rep(2,3)))
#' ## Note the length of stratify matching number of replicate sets, not samples.
#'
#' @export
cvsegments <- function(N, k, length.seg = ceiling(N / k), nrep = 1,
                            type = c("random", "consecutive", "interleaved"), stratify = NULL)
{
    ## length.seg overrides k:
    if (!missing(length.seg)) k <- ceiling(N / length.seg)

    ## Check arguments:
    if (k > N) stop("More segments than observations requested")
    if (N %% nrep != 0)
        stop("The number of replicates does not divide ",
             "the number of observations")
    if (length.seg %% nrep != 0)
        warning("Segment length is not a multiple of the number of ",
                "replicates.\n  A best effort segment size will be used.")
    if (!missing(length.seg) && N %% length.seg != 0)
        warning("Required segment length does not divide the number of ",
                "observations.\n  A best effort segment size will be used.")

    ## The idea is to generate a k times length.seg matrix with indices, and
    ## use each column as a segment.  If k*length.seg > N, the last element of
    ## the N - k*length.seg last rows will be NA.  Any NAs are stripped when
    ## the matrix is converted to a list of vectors.

    ## If nrep > 1, N and length.seg is first divided by nrep, and the matrix
    ## of indices is created as above.  The matrix is then expanded by
    ## replacing each element i with a column vector nrep * (i - 1) + 1:nrep,
    ## before using the columns as segments.

    ## Reduce N and length.seg if needed
    if (nrep > 1) {
        N <- N / nrep
        length.seg <- ceiling(N / k)
    }

    incomplete <- k * length.seg - N    # Number of incomplete segments
    complete   <- k - incomplete        # Number of complete segments

    ## Create matrix of indices
    type <- match.arg(type)
    if (is.null(stratify)) { # No stratification
        switch(type,
               random = {
                   inds <- matrix(c(sample(1:N), rep(NA, incomplete)),
                                  nrow = length.seg, byrow = TRUE)
               },
               consecutive = {
                   if (complete < k) {
                       inds <- cbind(matrix(1:(length.seg*complete),
                                            nrow = length.seg),
                                     rbind(matrix((length.seg*complete+1):N,
                                                  nrow = length.seg-1), NA))
                   } else {
                       inds <- matrix(1:N, nrow = length.seg)
                   }
               },
               interleaved = {
                   inds <- matrix(c(1:N, rep(NA, incomplete)),
                                  nrow = length.seg, byrow = TRUE)
               }
        )
    } else {

        ## Prepare stratification
        if (!is.list(stratify)) {
            stratify <- lapply(1:max(stratify), function(i)which(stratify==i))
        }
        stratVec <- unlist(lapply(1:length(stratify), function(i)rep(i,length(stratify[[i]]))))
        indVec   <- unlist(stratify)
        segs     <- integer(N)
        blinds   <- rep(1:k, length.out=N)
        switch(type,
               random = {
                   stratRand <- sample(length(stratify))
                   stratify  <- stratify[stratRand]
                   indVec    <- unlist(stratify)
                   stratVec  <- unlist(lapply(1:length(stratify), function(i)rep(i,length(stratify[[i]]))))
                   for(i in 1:length(stratify)){
                       segs[stratify[[i]]] <- blinds[indVec[stratVec==i]]
                   }
                   inds <- lapply(1:k,function(i)indVec[segs==i])
                   inds <- lapply(inds, function(i)i[sample(length(i))])
                   if(incomplete>0)
                       inds[-(1:complete)] <- lapply(inds[-(1:complete)], function(i){i[length.seg] <- NA;i})
                   inds <- do.call(cbind, inds)
                   inds <- inds[,sample(k)]
               },
               consecutive = {
                   for(i in 1:length(stratify)){
                       blinds[stratVec==i] <- sort(blinds[stratVec==i])
                   }
                   for(i in 1:length(stratify)){
                       segs[stratify[[i]]] <- blinds[stratVec==i]
                   }
                   inds <- lapply(1:k,function(i)which(segs==i))
                   if(incomplete>0)
                       inds[-(1:complete)] <- lapply(inds[-(1:complete)], function(i){i[length.seg] <- NA;i})
                   inds <- do.call(cbind, inds)
               },
               interleaved = {
                   for(i in 1:length(stratify)){
                       segs[stratify[[i]]] <- blinds[indVec[stratVec==i]]
                   }
                   inds <- lapply(1:k,function(i)indVec[segs==i])
                   if(incomplete>0)
                       inds[-(1:complete)] <- lapply(inds[-(1:complete)], function(i){i[length.seg] <- NA;i})
                   inds <- do.call(cbind, inds)
               }
        )
    }

    ## Expand matrix if needed
    if (nrep > 1) {
        inds <- outer(1:nrep, nrep * (inds - 1), '+')
        dims <- dim(inds)
        dim(inds) <- c(dims[1] * dims[2], dims[3])
    }

    ## Convert to list of segments defined by the columns, and add attributes
    res <- lapply(as.data.frame(inds), function(x) c(na.omit(x)))
    attr(res, "incomplete") <- incomplete
    attr(res, "type") <- if (length.seg == 1) "leave-one-out" else type
    res
}