File: DGEList.R

package info (click to toggle)
r-bioc-edger 3.40.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,484 kB
  • sloc: cpp: 1,425; ansic: 1,109; sh: 21; makefile: 5
file content (133 lines) | stat: -rw-r--r-- 4,724 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
DGEList <- function(counts=matrix(0,0,0), lib.size=colSums(counts), norm.factors=rep(1,ncol(counts)), samples=NULL, group=NULL, genes=NULL, remove.zeros=FALSE) 
#	Construct DGEList object from components, with some checking
#	Created 28 Sep 2008. Last modified 16 Nov 2020.
{
#	Check whether counts is a data.frame
	if(is.data.frame(counts)) {
		cl <- vapply(counts,class,FUN.VALUE="")
		IsNumeric <- (cl %in% c("integer","numeric"))
		a <- which(!IsNumeric)
		if(length(a)) {
			if(length(a)==1L && a[1]==1L) {
				stop(
"The count matrix is a data.frame instead of a matrix and the first column is of class ",cl[1],"\n",
"  instead of being numeric. Was the first column intended to contain geneids?"
				)
			} else {
				stop(
"The count matrix is a data.frame instead of a matrix and ",length(a)," columns are non-numeric.\n",
"  Should these columns be gene annotation instead of counts?"
				)
			}
		}
	}

#	Check counts
	counts <- as.matrix(counts)
	if( !any(typeof(counts) == c("integer","double")) ) stop("non-numeric values found in counts")
	nlib <- ncol(counts)
	ntags <- nrow(counts)
	if(nlib>0L && is.null(colnames(counts))) colnames(counts) <- paste0("Sample",1L:nlib)
	if(ntags>0L && is.null(rownames(counts))) rownames(counts) <- 1L:ntags
	.isAllZero(counts) # don't really care about all-zeroes, but do want to protect against NA's, negative values.

#	Check lib.size
	if(is.null(lib.size)) {
		lib.size <- colSums(counts)
		if(min(lib.size) <= 0) warning("library size of zero detected")
	} else {
		if(!is.numeric(lib.size)) stop("'lib.size' must be numeric")
		if(nlib != length(lib.size)) stop("length of 'lib.size' must equal number of columns in 'counts'")
		minlibsize <- min(lib.size)
		if(is.na(minlibsize)) stop("NA library sizes not allowed")
		if(minlibsize < 0) stop("negative library sizes not permitted")
		if(minlibsize == 0) {
			if(any(lib.size==0 & colSums(counts)>0)) stop("library size set to zero but counts are nonzero")
			warning("library size of zero detected")
		}
	}

#	Check norm.factors
	if(is.null(norm.factors)) {
		norm.factors <- rep_len(1,ncol(counts))
	} else {
		if(!is.numeric(norm.factors)) stop("'lib.size' must be numeric")
		if(!identical(nlib,length(norm.factors))) stop("Length of 'norm.factors' must equal number of columns in 'counts'")
		minnf <- min(norm.factors)
		if(is.na(minnf)) stop("NA norm factors not allowed")
		if(minnf <= 0) stop("norm factors should be positive")
		if( abs(prod(norm.factors) - 1) > 1e-6 ) warning("norm factors don't multiply to 1")
	}

#	Check samples
	if(!is.null(samples)) {
		samples <- as.data.frame(samples)
		if(nlib != nrow(samples)) stop("Number of rows in 'samples' must equal number of columns in 'counts'")
	}

#	Get group from samples if appropriate
	if(is.null(group) && !is.null(samples$group)) {
		group <- samples$group
		samples$group <- NULL
	}

#	Check group
	if(is.null(group)) {
		group <- rep_len(1L,nlib)
		levels(group) <- "1"
		class(group) <- "factor"
	} else {
		if(length(group) != nlib) stop("Length of 'group' must equal number of columns in 'counts'")
		group <- dropEmptyLevels(group)
	}

#	Make data frame of sample information
	sam <- data.frame(group=group,lib.size=lib.size,norm.factors=norm.factors)
	if(!is.null(samples)) sam <- data.frame(sam, samples)
	samples <- sam
	if(anyDuplicated(colnames(counts))) {
		message("Repeated column names found in count matrix")
		row.names(samples) <- 1L:nlib
	} else 
		row.names(samples) <- colnames(counts)

#	Make object
	x <- new("DGEList",list(counts=counts,samples=samples))

#	Add data frame of gene information
	if(!is.null(genes)) {
		genes <- as.data.frame(genes, stringsAsFactors=FALSE)
		if(nrow(genes) != ntags) stop("Counts and genes have different numbers of rows")
		if(anyDuplicated(row.names(counts)))
			warning("Count matrix has duplicated rownames",call.=FALSE)
		else 
			row.names(genes) <- row.names(counts)
		x$genes <- genes
	}

#	Remove rows with all zeros
	if(remove.zeros) {
		all.zeros <- rowSums(counts>0,na.rm=TRUE)==0
		if(any(all.zeros)) {
			x <- x[!all.zeros,]
			message("Removing ",sum(all.zeros)," rows with all zero counts")
		}
	}

#	x$offset <- expandAsMatrix(getOffset(x),dim(counts))
#	x$weights <- matrix(1,ntags,nlib)

	x
}

.isAllZero <- function(y) 
# Check whether all counts are zero.
# Also checks and stops with an informative error message if negative, NA or infinite counts are present.
{
	if (!length(y)) return(FALSE)
	check.range <- range(y)
	if (is.na(check.range[1])) stop("NA counts not allowed", call.=FALSE)
	if (check.range[1] < 0) stop("Negative counts not allowed", call.=FALSE)
	if (is.infinite(check.range[2])) stop("Infinite counts not allowed", call.=FALSE)
	check.range[2]==0
}