1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
|
# Feature selection functions
#
# Author: Renaud Gaujoux
# Created: Mar 18, 2013
###############################################################################
#' @include NMF-class.R
NULL
#' Feature Selection in NMF Models
#'
#' The function \code{featureScore} implements different methods to computes
#' basis-specificity scores for each feature in the data.
#'
#' One of the properties of Nonnegative Matrix Factorization is that is tend to
#' produce sparse representation of the observed data, leading to a natural
#' application to bi-clustering, that characterises groups of samples by
#' a small number of features.
#'
#' In NMF models, samples are grouped according to the basis
#' components that contributes the most to each sample, i.e. the basis
#' components that have the greatest coefficient in each column of the coefficient
#' matrix (see \code{\link{predict,NMF-method}}).
#' Each group of samples is then characterised by a set of features selected
#' based on basis-specifity scores that are computed on the basis matrix.
#'
#' @section Feature scores:
#' The function \code{featureScore} can compute basis-specificity scores using
#' the following methods:
#'
#' \describe{
#'
#' \item{\sQuote{kim}}{ Method defined by \cite{KimH2007}.
#'
#' The score for feature \eqn{i} is defined as:
#' \deqn{S_i = 1 + \frac{1}{\log_2 k} \sum_{q=1}^k p(i,q) \log_2 p(i,q)}{
#' S_i = 1 + 1/log2(k) sum_q [ p(i,q) log2( p(i,q) ) ] },
#'
#' where \eqn{p(i,q)} is the probability that the \eqn{i}-th feature contributes
#' to basis \eqn{q}: \deqn{p(i,q) = \frac{W(i,q)}{\sum_{r=1}^k W(i,r)} }{
#' p(i,q) = W(i,q) / (sum_r W(i,r)) }
#'
#' The feature scores are real values within the range [0,1].
#' The higher the feature score the more basis-specific the corresponding feature.
#' }
#'
#' \item{\sQuote{max}}{Method defined by \cite{Carmona-Saez2006}.
#'
#' The feature scores are defined as the row maximums.
#' }
#'
#' }
#'
#' @param object an object from which scores/features are computed/extracted
#' @param ... extra arguments to allow extension
#'
#' @return \code{featureScore} returns a numeric vector of the length the number
#' of rows in \code{object} (i.e. one score per feature).
#'
#' @export
#' @rdname scores
#' @inline
#'
setGeneric('featureScore', function(object, ...) standardGeneric('featureScore') )
#' Computes feature scores on a given matrix, that contains the basis component in columns.
setMethod('featureScore', 'matrix',
function(object, method=c('kim', 'max')){
method <- match.arg(method)
score <- switch(method,
kim = {
#for each row compute the score
s <- apply(object, 1, function(g){
g <- abs(g)
p_i <- g/sum(g)
crossprod(p_i, log2(p_i))
})
# scale, translate and return the result
1 + s / log2(ncol(object))
}
, max = {
apply(object, 1L, function(x) max(abs(x)))
}
)
# return the computed score
return(score)
}
)
#' Computes feature scores on the basis matrix of an NMF model.
setMethod('featureScore', 'NMF',
function(object, ...){
featureScore(basis(object), ...)
}
)
#' The function \code{extractFeatures} implements different methods to select the
#' most basis-specific features of each basis component.
#'
#' @section Feature selection:
#' The function \code{extractFeatures} can select features using the following
#' methods:
#' \describe{
#' \item{\sQuote{kim}}{ uses \cite{KimH2007} scoring schema and
#' feature selection method.
#'
#' The features are first scored using the function
#' \code{featureScore} with method \sQuote{kim}.
#' Then only the features that fulfil both following criteria are retained:
#'
#' \itemize{
#' \item score greater than \eqn{\hat{\mu} + 3 \hat{\sigma}}, where \eqn{\hat{\mu}}
#' and \eqn{\hat{\sigma}} are the median and the median absolute deviation
#' (MAD) of the scores respectively;
#'
#' \item the maximum contribution to a basis component is greater than the median
#' of all contributions (i.e. of all elements of W).
#' }
#'
#' }
#'
#' \item{\sQuote{max}}{ uses the selection method used in the \code{bioNMF}
#' software package and described in \cite{Carmona-Saez2006}.
#'
#' For each basis component, the features are first sorted by decreasing
#' contribution.
#' Then, one selects only the first consecutive features whose highest
#' contribution in the basis matrix is effectively on the considered basis.
#' }
#'
#' }
#'
#' @return \code{extractFeatures} returns the selected features as a list of indexes,
#' a single integer vector or an object of the same class as \code{object}
#' that only contains the selected features.
#'
#' @rdname scores
#' @inline
#' @export
#'
setGeneric('extractFeatures', function(object, ...) standardGeneric('extractFeatures') )
# internal functio to trick extractFeatures when format='subset'
.extractFeaturesObject <- local({
.object <- NULL
function(object){
# first call resets .object
if( missing(object) ){
res <- .object
.object <<- NULL
res
}else # set .object for next call
.object <<- object
}
})
#' Select features on a given matrix, that contains the basis component in columns.
#'
#' @param method scoring or selection method.
#' It specifies the name of one of the method described in sections \emph{Feature scores}
#' and \emph{Feature selection}.
#'
#' Additionally for \code{extractFeatures}, it may be an integer vector that
#' indicates the number of top most contributing features to
#' extract from each column of \code{object}, when ordered in decreasing order,
#' or a numeric value between 0 and 1 that indicates the minimum relative basis
#' contribution above which a feature is selected (i.e. basis contribution threshold).
#' In the case of a single numeric value (integer or percentage), it is used for all columns.
#'
#' Note that \code{extractFeatures(x, 1)} means relative contribution threshold of
#' 100\%, to select the top contributing features one must explicitly specify
#' an integer value as in \code{extractFeatures(x, 1L)}.
#' However, if all elements in methods are > 1, they are automatically treated as
#' if they were integers: \code{extractFeatures(x, 2)} means the top-2 most
#' contributing features in each component.
#' @param format output format.
#' The following values are accepted:
#' \describe{
#' \item{\sQuote{list}}{(default) returns a list with one element per column in
#' \code{object}, each containing the indexes of the selected features, as an
#' integer vector.
#' If \code{object} has row names, these are used to name each index vector.
#' Components for which no feature were selected are assigned a \code{NA} value.}
#'
#' \item{\sQuote{combine}}{ returns all indexes in a single vector.
#' Duplicated indexes are made unique if \code{nodups=TRUE} (default).}
#'
#' \item{\sQuote{subset}}{ returns an object of the same class as \code{object},
#' but subset with the selected indexes, so that it contains data only from
#' basis-specific features.}
#' }
#'
#' @param nodups logical that indicates if duplicated indexes,
#' i.e. features selected on multiple basis components (which should in
#' theory not happen), should be only appear once in the result.
#' Only used when \code{format='combine'}.
#'
#' @examples
#'
#' # random NMF model
#' x <- rnmf(3, 50,20)
#'
#' # probably no feature is selected
#' extractFeatures(x)
#' # extract top 5 for each basis
#' extractFeatures(x, 5L)
#' # extract features that have a relative basis contribution above a threshold
#' extractFeatures(x, 0.5)
#' # ambiguity?
#' extractFeatures(x, 1) # means relative contribution above 100%
#' extractFeatures(x, 1L) # means top contributing feature in each component
#'
setMethod('extractFeatures', 'matrix',
function(object, method=c('kim', 'max')
, format=c('list', 'combine', 'subset'), nodups=TRUE){
res <-
if( is.numeric(method) ){
# repeat single values
if( length(method) == 1L ) method <- rep(method, ncol(object))
# float means percentage, integer means count
# => convert into an integer if values > 1
if( all(method > 1L) ) method <- as.integer(method)
if( is.integer(method) ){ # extract top features
# only keep the specified number of feature for each column
mapply(function(i, l) head(order(object[,i], decreasing=TRUE), l)
, seq(ncol(object)), method, SIMPLIFY=FALSE)
}else{ # extract features with contribution > threshold
# compute relative contribution
so <- sweep(object, 1L, rowSums(object), '/')
# only keep features above threshold for each column
mapply(function(i, l) which(so[,i] >= l)
, seq(ncol(object)), method, SIMPLIFY=FALSE)
}
}else{
method <- match.arg(method)
switch(method,
kim = { # KIM & PARK method
# first score the genes
s <- featureScore(object, method='kim')
# filter for the genes whose score is greater than \mu + 3 \sigma
th <- median(s) + 3 * mad(s)
sel <- s >= th
#print( s[sel] )
#print(sum(sel))
# build a matrix with:
#-> row#1=max column index, row#2=max value in row, row#3=row index
temp <- 0;
g.mx <- apply(object, 1L,
function(x){
temp <<- temp +1
i <- which.max(abs(x));
#i <- sample(c(1,2), 1)
c(i, x[i], temp)
}
)
# test the second criteria
med <- median(abs(object))
sel2 <- g.mx[2,] >= med
#print(sum(sel2))
# subset the indices
g.mx <- g.mx[, sel & sel2, drop=FALSE]
# order by decreasing score
g.mx <- g.mx[,order(s[sel & sel2], decreasing=TRUE)]
# return the indexes of the features that fullfil both criteria
cl <- factor(g.mx[1,], levels=seq(ncol(object)))
res <- split(g.mx[3,], cl)
# add the threshold used
attr(res, 'threshold') <- th
# return result
res
},
max = { # MAX method from bioNMF
# determine the specific genes for each basis vector
res <- lapply(1:ncol(object),
function(i){
mat <- object
vect <- mat[,i]
#order by decreasing contribution to factor i
index.sort <- order(vect, decreasing=TRUE)
for( k in seq_along(index.sort) )
{
index <- index.sort[k]
#if the feature contributes more to any other factor then return the features above it
if( any(mat[index,-i] >= vect[index]) )
{
if( k == 1 ) return(as.integer(NA))
else return( index.sort[1:(k-1)] )
}
}
# all features meet the criteria
seq_along(vect)
}
)
# return res
res
}
)
}
#Note: make sure there is an element per basis (possibly NA)
res <- lapply(res, function(ind){ if(length(ind)==0) ind<-NA; as.integer(ind)} )
# add names if possible
if( !is.null(rownames(object)) ){
noNA <- sapply(res, is_NA)
res[noNA] <- lapply(res[noNA], function(x){
setNames(x, rownames(object)[x])
})
}
# apply the desired output format
format <- match.arg(format)
res <- switch(format
#combine: return all the indices in a single vector
, combine = {
# ensure that there is no names: for unlist no to mess up feature names
names(res) <- NULL
ind <- na.omit(unlist(res))
if( nodups ) unique(ind)
else ind
}
#subset: return the object subset with the selected indices
, subset = {
ind <- na.omit(unique(unlist(res)))
sobject <- .extractFeaturesObject()
{if( is.null(sobject) ) object else sobject}[ind, , drop=FALSE]
}
#else: leave as a list
,{
# add component names if any
names(res) <- colnames(object)
res
}
)
# add attribute method to track the method used
attr(res, 'method') <- method
# return result
return( res )
}
)
#' Select basis-specific features from an NMF model, by applying the method
#' \code{extractFeatures,matrix} to its basis matrix.
#'
#'
setMethod('extractFeatures', 'NMF',
function(object, ...){
# extract features from the basis matrix, but subset the NMF model itself
.extractFeaturesObject(object)
extractFeatures(basis(object), ...)
}
)
unit.test(extractFeatures, {
.check <- function(x){
msg <- function(...) paste(class(x), ':', ...)
checkTrue( is.list(extractFeatures(x)), msg("default returns list"))
checkTrue( is.list(extractFeatures(x, format='list')), msg("format='list' returns list"))
checkTrue( is.integer(extractFeatures(x, format='combine')), msg("format='combine' returns an integer vector"))
checkTrue( is(extractFeatures(x, format='subset'), class(x)), msg("format='subset' returns same class as object"))
}
.check(rmatrix(50, 5))
.check(rnmf(3, 50, 5))
})
|