1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
|
#
# This routine creates a stacked data set.
# The key input is the cmap matrix, which has one row for each column
# of X and one column per transition. It may have extra rows if there are
# proportional baseline hazards
# The first row of smat contains state to strata information.
# Input data is X, Y, strata, and initial state (integer).
# The model.frame is used only for strata, currently a rare case in multistate
#
# For each transition the expanded data has a set of rows, all those whose
# initial state makes them eligible for the transition.
# Strata is most often null; it encodes a users strata() addition(s). Such terms
# occur less often in multistate models (in my experience so far.)
#
stacker <- function(cmap, smap, istate, X, Y, mf, states, dropzero=TRUE) {
from.state <- as.numeric(sub(":.*$", "", colnames(cmap)))
to.state <- as.numeric(sub("^.*:", "", colnames(cmap)))
# just in case cmap has columns I don't need (I don't think this can
# happen
check <- match(from.state, istate, nomatch=0)
if (any(check==0)){
# I think that this is impossible
warning("extra column in cmap, this is a bug") # debugging line
# browser()
cmap <- cmap[,check>0, drop=FALSE]
smap <- smap[,check>0, drop=FALSE]
from.state <- from.state[check>0]
to.state <- to.state[check>0]
}
# Don't create X and Y matrices for transitions with no covariates, for
# coxph calls. But I need them for survfit.coxph (at present)
zerocol <- apply(cmap==0, 2, all)
if (dropzero && any(zerocol)) {
cmap <- cmap[,!zerocol, drop=FALSE]
smap <- smap[,!zerocol, drop=FALSE]
smap[,] <- match(smap, sort(unique(c(smap)))) # relabel as 1, 2,...
from.state <- from.state[!zerocol]
to.state <- to.state[!zerocol]
}
endpoint <- c(0, match(attr(Y, "states"), states))
endpoint <- endpoint[ 1 + Y[,ncol(Y)]] # endpoint of each row, 0=censor
# Jan 2021: changed from looping once per strata to once per transition.
# Essentially, a block of data for each unique column of cmap. If two
# of those columns have the same starting state, it makes me nervous
# (statistically), but forge onward and sort the issues out in the
# fits.
# Dec 2024: change to once per stratum. The issue was multiple
# transitions like 1:4 and 1:5 in the same stratum, which led to the
# same row of data twice in one stratum; which is not valid.
# May 2025: further update in conjunction with "Shared coefficients
# and shared baselines in multistate models", which forced a re-thinking
# of the process. For the use cases found there, the code below is now
# correct. For others, the data produced may be questionable;
# we await an actual use case to work things out.
# The constructed X matrix will have a block of rows for each column of
# cmap such that: the column is not 0, it is not a duplicate of another
# cmap column that is in the same stratum.
# Within a stratum we retain sort order of the data
sgrp <- match(smap[1,], unique(smap[1,])) # the stratum group
nblock <- max(sgrp) # total number of blocks
# Pass 1 to find the total data set size
n.perblock <- integer(nblock)
cgrp <- vector("list", nblock)
for (i in 1:nblock) {
j <- which(sgrp == i) # which cols of cmap & smap apply
ctemp <- !duplicated(t(cmap[,j, drop=FALSE])) # unique coef sets in sgrp
cgrp[[i]] <- lapply(j[which(ctemp)],
function(i) {
temp <- cmap[,j,drop=FALSE] - cmap[,i]
j[which(apply(temp, 2, function(x) all(x==0)))]})
# cgrp[[i]] will be a list, 99% of the time it will be of length 1:
# a vector containing all the columns of cmap that go with this strata
# This is the case found in the vignette (with multiple subcases):
# everyone at risk appears once in the stratum
# if cgrp has length >1 the resulting risk sets have dups of some or
# all obs, but with different coefficient patterns.
temp <- 0
for (k in cgrp[[i]])
temp <- temp + sum(istate %in% from.state[k])
n.perblock[i] <- temp
}
# The constructed X matrix has a block of rows for each stratum
n2 <- sum(n.perblock) # number of rows in new data
newX <- matrix(0, nrow=n2, ncol=max(cmap))
k <- 0
rindex <- integer(n2) # original row for each new row of data
newstat <- integer(n2) # new status
Xcols <- ncol(X) # number of columns in X
for (i in 1:nblock) {
for (j in cgrp[[i]]) {
subject <- which(istate %in% from.state[j]) # data rows in strata
nr <- k + seq(along.with =subject) # rows in the newX for subblock
rindex[nr] <- subject
nc <- cmap[,min(j)] # all cols j of cmap are the same
if (any(nc > Xcols)) { # constructed PH variables
newX[nr, nc[nc>Xcols] ] <- 1
nc <- nc[1:Xcols]
}
newX[nr, nc[nc>0]] <- X[subject, which(nc>0)] #row of cmap= col of X
event.that.counts <- (endpoint[subject] %in% to.state[j])
newstat[nr] <- ifelse(event.that.counts, 1L, 0L)
k <- max(nr)
}
}
# which (grouped) transition each row of newX represents
transition <- rep(1:nblock, n.perblock)
newstrat <- factor(colnames(smap)[transition], colnames(smap))
# newstrat will be 1:2, ... i.e. the from:to pairs
if (nrow(smap) >1) {
# there are strata in the call as well, so expand the strata to
# "strata from the model".1:2, etc
sstrat <- function(...) strata(..., shortlabel=TRUE)
tmap <- smap[-1,, drop=FALSE] #ignore (Baseline) row
# rtemp will be list, row numbers for transition 1, rows for 2,..
rtemp <- split(rindex, rep(1:nblock, n.perblock)) #rows per trans
temp <- vector("list", nblock) # one per transition
for (i in 1:nblock){
j <- (tmap[,i]>0) # which stata vars for this transition?
if (sum(j) ==1) {
zz <- mf[[(rownames(tmap))[j]]] # the strata
temp[[i]] <- zz[rtemp[[i]]]
} else if (sum(j)>1) {
zz <- mf[rtemp[[i]], (rownames(tmap))[j]]
temp[[i]] <- do.call(sstrat, zz)
} else temp[[i]] <- rep(0, n.perblock[i]) #no prior strata
}
newstrat <- do.call(sstrat, list(newstrat, unlist(temp)))
}
# remove any rows where X or strata is missing
# these arise when a variable is used only for some transitions
# the row of data needs to be tossed for the given ones, but will be
# okay for other transitions which do not use the offending variable.
keep <- !apply(is.na(newX), 1, any) & !is.na(newstrat)
if (!all(keep)) {
newX <- newX[keep,, drop=FALSE]
rindex <- rindex[keep]
newstat <- newstat[keep]
transition <- transition[keep]
newstrat <- newstrat[keep]
}
if (ncol(Y) ==2) newY <- Surv(Y[rindex,1], newstat)
else newY <- Surv(Y[rindex,1], Y[rindex,2], newstat)
# If there were multiple unique cmap columns within one strata, then an
# observation might appear twice, and we might no longer be sorted.
# (This is also the case where we do not yet know what a 'sensible'
# result of stacker should be, i.e., very rare.)
if (any(sapply(cgrp, length)) > 1) {
indx <- order(newstrat, rindex)
if (any(diff(indx) !=1)) {
newX <- newX[indx,, drop=FALSE]
rindex <- rindex[indx]
newstat <- newstat[indx]
transition <- transition[indx]
}
}
#
# give variable names to the new data (some names get used more than once)
# vname <- rep("", ncol(newX))
# vname[cmap[cmap>0]] <- colnames(X)[row(cmap)[cmap>0]]
first <- match(sort(unique(cmap[cmap>0])), cmap) #first instance of each value
vname <- rownames(cmap)[row(cmap)[first]]
colnames(newX) <- vname
list(X=newX, Y=newY, strata=as.integer(newstrat),
transition= as.integer(transition), rindex=rindex)
}
|