File: binning.R

package info (click to toggle)
metabat 2.18-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 972 kB
  • sloc: cpp: 10,869; sh: 422; python: 297; perl: 163; makefile: 19; ansic: 11
file content (142 lines) | stat: -rw-r--r-- 6,881 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# CAMI-1 data analysis
source('benchmark.R')

#How we got the bam files (BBMap is required for this method):
#If you already have bam files, skip this step.
#low
#system("runBBmap.sh CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta.gz RL_S001__insert_270.fq.gz")
#system("for i in $(ls *.fq.gz); do runBBmap.sh CAMI_medium_GoldStandardAssembly.fasta.gz $i; done > /dev/null 2>&1")
#system("for i in $(ls *.fq.gz); do runBBmap.sh CAMI_high_GoldStandardAssembly.fasta.gz $i; done > /dev/null 2>&1")

#system("MetaBAT/jgi_summarize_bam_contig_depths --outputDepth depth.txt CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta.gz.d/*.bam")
#for medium need to re-calculate depth.txt using combined reads
#system("MetaBAT/jgi_summarize_bam_contig_depths --outputDepth depth.txt CAMI_medium_GoldStandardAssembly.fasta.gz.d/*.bam")
#system("MetaBAT/jgi_summarize_bam_contig_depths --outputDepth depth.txt CAMI_high_GoldStandardAssembly.fasta.gz.d/*.bam")

args = commandArgs(trailingOnly=TRUE)#Example arguments will be provided in comments next to the R commands below.

#low
#system("zcat CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta.gz | fastaLengths.pl - > contigs-low.txt")#We did not end up using this
truth <- read.table(args[1], skip=4, as.is=T)#i.e. /cami/low/gsa_mapping.binning
sizes <- read.table(args[2], as.is=T)#i.e. /cami/low/sizes

res <- read.table(args[3], as.is=T)#i.e. /cami/low/resA-1.txt
res <- read.table(args[4], as.is=T)#i.e. /cami/low/resB-1.txt

#medium
#system("zcat CAMI_medium_GoldStandardAssembly.fasta.gz | fastaLengths.pl - > contigs-medium.txt")#We did not end up using this
truth <- read.table(args[5], skip=4, as.is=T)#i.e. /cami/medium/pooled_gsa_mapping.binning
sizes <- read.table(args[6], as.is=T)#i.e. /cami/medium/sizes

res <- read.table(args[7], as.is=T)#i.e. /cami/medium/resA-1.txt
res <- read.table(args[8], as.is=T)#i.e. /cami/medium/resB-1.txt

#high
#system("zcat CAMI_high_GoldStandardAssembly.fasta.gz | fastaLengths.pl - > contigs-high.txt")#We did not end up using this
truth <- read.table(args[9], skip=4, as.is=T)#i.e. /cami/high/gsa_mapping_pool.binning
sizes <- read.table(args[10], as.is=T)#i.e. /cami/high/sizes

res <- read.table(args[11], as.is=T)#i.e. /cami/high/resA-1.txt
res <- read.table(args[12], as.is=T)#i.e. /cami/high/resB-1.txt

nrow(truth) == nrow(sizes)
truth$size <- sizes$V2[match(truth$V1, sizes$V1)]
truth.genome <- ddply(truth, .(V2), function(x) cbind(ctgs=nrow(x), size=sum(x$size)))

res <- res[res$V2 > 0, ]
res <- merge(truth, res, by.x='V1', by.y='V1', all.x=T)

#table(res$V2.x, res$V2.y)
compB <- ddply(res, .(V2.x), function(x) ddply(x, .(V2.y), function(xx) sum(xx$size)))
compB <- ddply(compB, .(V2.x), function(x) {
	na <- x$V1[is.na(x$V2.y)]
	if(length(na) == 0)
		na <- 0
	x <- x[which(!is.na(x$V2.y)), ,drop=F]
	if (nrow(x) > 0) {
		cbind(binID=x$V2.y[which.max(x$V1)], comp=max(x$V1) / (sum(x$V1)+na))
	} else { #nothing binned
		cbind(binID=NA, comp=0)
	}
})

compB <- merge(truth.genome, compB, by.x='V2', by.y='V2.x')

compB <- cbind(compB, prec=foreach(x=iter(compB, by='row'), .combine=c) %dopar% {
	denom <- sum(res$size[which(res$V2.y==x$binID)])
	if (denom >= 200000) {
		numer <- sum(res$size[which(res$V2.x==x$V2 & res$V2.y==x$binID)])
		numer / denom
	} else
		0
})
sum(compB$comp > .50 & compB$prec > .90, na.rm=T) #21 => 87 => 432

#MaxBin 2
#low
for(i in seq(4,4,2)) {
	system(sprintf("cut -f1,%d depth.txt| tail -n+2 > depth.txt.mb.%d", i,i))
	write.table(sprintf("depth.txt.mb.%d", i), file='abund_list', col.names=F, row.names=F, quote=F, append=T)
}
system("~/files/MaxBin-2.2.3/run_MaxBin.pl -contig CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta -abund_list abund_list -out resMB-1/bin")
#0 hours 11 minutes and 5 seconds.

#medium
for(i in seq(4,10,2)) {
	system(sprintf("cut -f1,%d depth.txt| tail -n+2 > depth.txt.mb.%d", i,i))
	write.table(sprintf("depth.txt.mb.%d", i), file='abund_list', col.names=F, row.names=F, quote=F, append=T)
}

#high
for(i in seq(4,12,2)) {
	system(sprintf("cut -f1,%d depth.txt| tail -n+2 > depth.txt.mb.%d", i,i))
	write.table(sprintf("depth.txt.mb.%d", i), file='abund_list', col.names=F, row.names=F, quote=F, append=T)
}
system("~/files/MaxBin-2.2.3/run_MaxBin.pl -contig CAMI_high_GoldStandardAssembly.fasta -abund_list abund_list -out mb/bin")

system('metabat2 -i CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta.gz -a depth.txt -o resB-1/bin -v')

system('metabat2 -i CAMI_medium_GoldStandardAssembly.fasta.gz -a depth.txt -o resB-1/bin -v')

system('metabat2 -i CAMI_high_GoldStandardAssembly.fasta.gz -a depth.txt -o resB-1/bin -v')
printPerf(list(MetaBAT=calcPerfCAMI("MetaBAT","high/resB-1/bin", complexity='high')))

#CONCOCT
system('concoct --composition_file CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta --coverage_file depth.txt.mb.4 2>CONCOCT.log') #--length_threshold 2500

system("paste depth.txt.mb.* | cut -f1,2,4,6,8 -d$'\t' > depth-only-medium.txt")
system('concoct --composition_file CAMI_medium_GoldStandardAssembly.fasta --coverage_file depth-only-medium.txt 2>CONCOCT.log')

system("paste depth.txt.mb.* | cut -f1,2,4,6,8,10 -d$'\t' > depth-only-high.txt")
system('concoct --composition_file CAMI_high_GoldStandardAssembly.fasta --coverage_file depth-only-high.txt 2>CONCOCT.log')

printPerf(list(CONCOCT=calcPerfCAMI("CONCOCT")))

res <- list(MetaBAT2=calcPerfCAMI("MetaBAT","MetaBATLow/bin",complexity='low'), 
		MaxBin2=calcPerfCAMI("MaxBin","MaxBinLow/bin",complexity='low'), 
		CONCOCT=calcPerfCAMI("CONCOCT",complexity='low'),
		MyCC=calcPerfCAMI("MaxBin","20170808_0126_4mer_0.7_cov/Cluster",complexity='low'), 
		BinSanity=calcPerfCAMI("BinSanity","BinSanity-Final-bins",complexity='low'), 
		COCACOLA=calcPerfCAMI("CONCOCT","result.csv",complexity='low'))

res <- list(MetaBAT2=calcPerfCAMI("MetaBAT","MetaBATMed/bin",complexity='medium'), 
		MaxBin2=calcPerfCAMI("MaxBin","MaxBinMed/bin",complexity='medium'), 
		CONCOCT=calcPerfCAMI("CONCOCT",complexity='medium'), 
		MyCC=calcPerfCAMI("MaxBin","20170808_1000_4mer_0.7_cov/Cluster",complexity='medium'), 
		BinSanity=calcPerfCAMI("BinSanity","BinSanity-Final-bins",complexity='medium'), 
		COCACOLA=calcPerfCAMI("CONCOCT","result.csv",complexity='medium'))

res <- list(MetaBAT2=calcPerfCAMI("MetaBAT","MetaBATHigh/bin",complexity='high'), 
		MaxBin2=calcPerfCAMI("MaxBin","MaxBinHigh/bin",complexity='high'), 
		CONCOCT=calcPerfCAMI("CONCOCT",complexity='high'),
		MyCC=calcPerfCAMI("MaxBin","20170808_0155_4mer_0.7_cov/Cluster",complexity='high'), 
		BinSanity=calcPerfCAMI("BinSanity","BinSanity-Final-bins",complexity='high'), 
		COCACOLA=calcPerfCAMI("CONCOCT","result.csv",complexity='high'))

printPerf(res)

res <- res[c('MetaBAT2','MaxBin2','BinSanity','COCACOLA','CONCOCT','MyCC')]

pdf("Rplots.pdf", width=12, height=4)
plotPerf3(res, rec=seq(.5,.9,.1), legend.position=c(.95,.7))
dev.off()