1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
|
# Several binarization methods for time series. <measurements> is a list of matrices with the
# genes in the rows, each specifying one time series.
# If <method> is "kmeans", k-means binarization is used.
# <nstart> and <iter.max> are the corresponding parameters for k-means. This clustering
# is employed for binarization.
# If <method> is "edgeDetector", an edge detector is used for binarization.
# If <method> is "edgeDetector" and <edge> is "firstEdge",
# the first "significant" edge in the sorted data is the threshold for the binarization.
# If <edge> is "maxEdge", the algorithm searches for the edge with the highest gradient.
# With the <scaling> factor, the size of the first edge can be adapted.
# If <method> is "scanStatistic", a scan statistic is used to binarize the data
# If <dropInsignificant> is true, insignificant genes (that are not recommended by the statistic) are removed.
# Returns a list containing the binarized matrix list and (for k-means) a vector
# of thresholds and (for scan statistic) a vector specifying genes to remove.
binarizeTimeSeries <- function(measurements, method=c("kmeans","edgeDetector","scanStatistic"), nstart=100, iter.max=1000, edge=c("firstEdge","maxEdge"), scaling=1, windowSize=0.25, sign.level=0.1, dropInsignificant=FALSE)
{
if (!is.null(dim(measurements)))
fullData <- measurements
else
# in case of list, paste all matrices before clustering
{
fullData <- measurements[[1]]
for (m in measurements[-1])
{
fullData <- cbind(fullData,m)
}
}
#switch between the different methods
switch(match.arg(method),
kmeans={
cluster <- apply(fullData,1,function(gene)
# cluster data using k-means
{
cl_res <- kmeans(gene, 2, nstart=nstart,iter.max=iter.max)
if (cl_res$centers[1] > cl_res$centers[2])
# exchange clusters if necessary, so that smaller numbers
# are binarized to 0, and larger numbers are binarized to 1
group <- abs(cl_res$cluster-2)
else
group <- cl_res$cluster-1
# calculate the binarization threshold
threshold <- min(cl_res$centers) + dist(cl_res$centers)[1]/2
list(bin=group, threshold=threshold)
})
if (is.null(dim(measurements)))
# split up the collated binarized measurements into a list of matrices of the original size
{
startIndex <- 0
binarizedTimeSeries <- lapply(measurements,function(m)
{
currentSplit <- (startIndex+1):(startIndex+ncol(m))
startIndex <<- startIndex + ncol(m)
t(sapply(cluster,function(cl)
cl$bin[currentSplit]))
})
}
else
{
binarizedTimeSeries <- t(sapply(cluster,function(cl)
cl$bin))
}
#colnames(binarizedTimeSeries)<-colnames(fullData)
return(list(binarizedMeasurements=binarizedTimeSeries,
thresholds=sapply(cluster,function(cl)cl$threshold)))
},
edgeDetector={
#switch between the different edgedetectors
switch(match.arg(edge),
firstEdge={
cluster <- apply(fullData,1,function(gene)
# cluster data using edgedetector
{
cl_res <- edgeDetector(gene,scaling,edge="firstEdge")
list(bin=cl_res$bindata,thresholds=cl_res$thresholds)
})
if (is.null(dim(measurements)))
# split up the collated binarized measurements into a list of matrices of the original size
{
startIndex <- 0
binarizedTimeSeries <- lapply(measurements,function(m)
{
currentSplit <- (startIndex+1):(startIndex+ncol(m))
startIndex <<- startIndex + ncol(m)
t(sapply(cluster,function(cl)
cl$bin[currentSplit]))
})
threshlist<-sapply(cluster,function(cl) cl$thresholds)
}
else
{
binarizedTimeSeries <- t(sapply(cluster,function(cl)
cl$bin))
threshlist<-sapply(cluster,function(cl) cl$thresholds)
}
return(list(binarizedMeasurements=binarizedTimeSeries,thresholds= threshlist))
},
maxEdge={
cluster <- apply(fullData,1,function(gene)
# cluster data using edgedetector
{
cl_res <- edgeDetector(gene,edge="maxEdge")
list(bin=cl_res$bindata,thresholds=cl_res$thresholds)
})
if (is.null(dim(measurements)))
# split up the collated binarized measurements into a list of matrices of the original size
{
startIndex <- 0
binarizedTimeSeries <- lapply(measurements,function(m)
{
currentSplit <- (startIndex+1):(startIndex+ncol(m))
startIndex <<- startIndex + ncol(m)
t(sapply(cluster,function(cl)
cl$bin[currentSplit]))
})
threshlist<-sapply(cluster,function(cl) cl$thresholds)
}
else
{
binarizedTimeSeries <- t(sapply(cluster,function(cl)
cl$bin))
threshlist<-sapply(cluster,function(cl) cl$thresholds)
}
return(list(binarizedMeasurements=binarizedTimeSeries,thresholds= threshlist))
},
stop("'method' must be one of \"firstEdge\",\"maxEdge\"")
)
},
scanStatistic={
cluster <- apply(fullData,1,function(gene)
# cluster data using scanStatistic
{
cl_res <- scanStatistic(gene,windowSize,sign.level)
list(bin= cl_res$bindata,thresholds=cl_res$thresholds,reject=cl_res$reject)
})
significant <- sapply(cluster,function(cl)(cl$reject==FALSE))
# remove not recommended genes
if (dropInsignificant)
{
significant <- sapply(cluster,function(cl)(cl$reject==FALSE))
cluster <- cluster[significant]
}
if (is.null(dim(measurements)))
# split up the collated binarized measurements into a list of matrices of the original size
{
startIndex <- 0
binarizedTimeSeries <- lapply(measurements,function(m)
{
currentSplit <- (startIndex+1):(startIndex+ncol(m))
startIndex <<- startIndex + ncol(m)
t(sapply(cluster,function(cl)
cl$bin[currentSplit]))
})
rejectlist<-sapply(cluster,function(cl) cl$reject)
threshlist<-sapply(cluster,function(cl) cl$thresholds)
}
else
{
binarizedTimeSeries <- t(sapply(cluster,function(cl)
cl$bin))
rejectlist<-sapply(cluster,function(cl) cl$reject)
threshlist<-sapply(cluster,function(cl) cl$thresholds)
}
if(any(rejectlist))
{
warning("The following genes show a uniform behaviour and should possibly be excluded from binarization:", paste(rownames(fullData)[!significant],collapse=" "))
}
return(list(binarizedMeasurements=binarizedTimeSeries,thresholds=threshlist,reject=rejectlist))
},
stop("'method' must be one of \"kmeans\",\"edgeDetector\",\"scanStatistic\"")
)
}
|