File: featureCounts2DGEList.R

package info (click to toggle)
r-bioc-edger 3.40.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,484 kB
  • sloc: cpp: 1,425; ansic: 1,109; sh: 21; makefile: 5
file content (132 lines) | stat: -rw-r--r-- 4,972 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
featureCounts2DGEList <- function(x)
#	Convert featureCounts output to DGEList
#	Gordon Smyth
#	Created 7 Jan 2021. Last modified 26 Jan 2021.
{
#	Check for featureCounts output
	if(!is.list(x) || !all(c("counts","annotation","targets","stat") %in% names(x)))
		stop("x should be a list containing components `counts`, `annotation`, `targets` and `stat`")

#	Get and simplify sample names
	SampleNames <- colnames(x$counts)
	if(is.null(SampleNames)) SampleNames <- x$targets
	if(is.null(SampleNames)) SampleNames <- seq_len(ncol(x$counts))
	SampleNames <- removeExt(removeExt(SampleNames))
	colnames(x$counts) <- SampleNames

#	Mapping statistics	
	Stat <- t(as.matrix(x$stat[,-1,drop=FALSE]))
	rownames(Stat) <- SampleNames
	colnames(Stat) <- x$stat[,1]
	PropAssigned <- Stat[,1] / rowSums(Stat)
	Samples <- data.frame(PropAssigned=PropAssigned)

#	GeneID must be character
	x$annotation$GeneID <- as.character(x$annotation$GeneID)
	if(!identical(row.names(x$counts),x$annotation$GeneID)) warning("row.names of counts are not the same as GeneIDs")

#	Detect use of meta-features
	DupRowNames <- as.logical(anyDuplicated(rownames(x$counts)))
	MultipleFeatures <- as.logical(length(grep(";",x$annotation$Start)))
	Junctions <- !is.null(x$counts_junction)
	if(MultipleFeatures && DupRowNames) stop("Meta-feature counts should have unique row.names")
	if(MultipleFeatures && Junctions) stop("Junction counts incompatible with meta-feature counts")

##	Meta-features (genes)

	if(MultipleFeatures) {
#		Collapse Chr, Start, End and Strand for meta-feature
		firstfun <- function(z) z[1]
		lastfun <- function(z) z[length(z)]
		z <- strsplit(x$annotation$Chr,split=";")
		z1 <- x$annotation$Chr <- vapply(z,firstfun,"")
		z2 <- x$annotation$Chr <- vapply(z,lastfun,"")
		i <- z1 != z2
		z1[i] <- paste(z1[i],z2[i],sep=";")
		x$annotation$Chr <- z1
		z <- strsplit(as.character(x$annotation$Start),split=";")
		x$annotation$Start <- as.integer(vapply(z,firstfun,""))
		z <- strsplit(as.character(x$annotation$End),split=";")
		x$annotation$End <- as.integer(vapply(z,lastfun,""))
		z <- strsplit(x$annotation$Strand,split=";")
		x$annotation$Strand <- vapply(z,firstfun,"")

		x$annotation$GeneID <- NULL
		return(DGEList(counts=x$counts,genes=x$annotation,samples=Samples))
	}

##	Features

#	Ensure classes for features
	x$annotation$Start <- as.integer(x$annotation$Start)
	x$annotation$End <- as.integer(x$annotation$End)
	x$annotation$Strand <- as.character(x$annotation$Strand)
	x$annotation$Length <- as.integer(x$annotation$Length)

##	Simple features

	if(!DupRowNames && !Junctions) {
		x$annotation$GeneID <- NULL
		return(DGEList(counts=x$counts,genes=x$annotation,samples=Samples))
	}

##	Features within meta-features (exons)

#	Expand the row names to include exon number (counting in genomic order)
	o <- order(x$annotation$GeneID,x$annotation$Chr,x$annotation$Start)
	Counts <- x$counts[o,]
	Genes <- x$annotation[o,]
	FirstExon <- which(!duplicated(Genes$GeneID))
	LastExon <- c(FirstExon[-1]-1L,nrow(Counts))
	RowNum <- seq_len(nrow(Counts))
	NExons <- RowNum[LastExon]-RowNum[FirstExon]+1L
	ExonNum <- RowNum-RowNum[rep.int(FirstExon,times=NExons)]+1L
	Genes$Exon <- ExonNum
	rownames(Counts) <- paste0(Genes$GeneID,".e",ExonNum)
	row.names(Genes) <- rownames(Counts)

#	Sort in genomic order	
	o <- order(Genes$Chr,Genes$Start)
	Counts <- Counts[o,]
	Genes <- Genes[o,]

	if(!Junctions) return (DGEList(counts=Counts,genes=Genes,samples=Samples))

##	Exons and junctions

#	JCounts <- as.matrix(x$counts_junction[,-seq_len(8)])
#	colnames(JCounts) <- SampleNames
#	JGenes <- x$counts_junction[,c("PrimaryGene","Site1_chr","Site1_location","Site2_location","Site1_strand")]
#	names(JGenes) <- c("GeneID","Chr","Start","End","Strand")
#	JGenes$GeneID <- as.character(JGenes$GeneID)
#	JGenes$Length <- JGenes$End-JGenes$Start+1L
#	JGenes$Exon <- rep_len(0L,nrow(JGenes))

	JCounts <- x$counts_junction
    JCounts <- JCounts[!is.na(JCounts$PrimaryGene),]
    JGenes <- JCounts[,c("PrimaryGene","Site1_chr","Site1_location","Site2_location","Site1_strand")]
    names(JGenes) <- c("GeneID","Chr","Start","End","Strand")
	JGenes$GeneID <- as.character(JGenes$GeneID)
	JGenes$Start <- as.integer(JGenes$Start)
	JGenes$End <- as.integer(JGenes$End)
	JGenes$Strand <- as.character(JGenes$Strand)
    JGenes$Length <- 1L
    JCounts <- as.matrix(JCounts[,-seq_len(8)])
    o <- order(JGenes$GeneID,JGenes$Chr,JGenes$Start)
    JCounts <- JCounts[o,]
    JGenes <- JGenes[o,]
    RowNames <- JGenes$GeneID
    N <- length(RowNames)
    First <- which(!duplicated(RowNames))
    Last <- c(First[-1]-1L,N)
    NExons <- Last-First+1L
    One <- rep_len(1L,N)
    One[First[-1]] <- 1L-NExons[-length(NExons)]
    ExonNumber <- cumsum(One)
    RowNames <- paste0(RowNames,".j",ExonNumber)
    JGenes$Exon <- ExonNumber
    row.names(JGenes) <- RowNames
    rownames(JCounts) <- RowNames

	DGEList(counts=rbind(Counts,JCounts),genes=rbind(Genes,JGenes),samples=Samples)
}