File: sav.R

package info (click to toggle)
qcumber 1.0.14%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 360 kB
  • ctags: 160
  • sloc: python: 1,272; sh: 68; makefile: 12
file content (148 lines) | stat: -rwxr-xr-x 5,334 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env Rscript
#source("http://bioconductor.org/biocLite.R")
#biocLite("savR")
options(warn=-1)
library(savR)
library(reshape2)

args = commandArgs(trailingOnly=TRUE)
project <- savR(args[1])

################
##  Indexing  ##
################

#total reads
total_reads<- clusters(project, 1L)
pf_reads<- pfClusters(project, 1L)


################
##    Plots   ##
################

##
# Data By Cycle
##

extraction<- extractionMetrics((project))

# Data By Cycle, FWHM/All Lanes / Both surfaces / All Bases
reshaped_extraction <- melt(extraction, measure.vars= c("FWHM_A","FWHM_C", "FWHM_T","FWHM_G"))
FWHM<- (aggregate(reshaped_extraction$value, by=list(reshaped_extraction$cycle, reshaped_extraction$variable), FUN=mean))
colnames(FWHM) <- c("Cycles","FWHM", "Value")
FWHM$FWHM<- sub("FWHM_","",FWHM$FWHM)
ggplot(data=FWHM )+
  geom_line( aes(x=Cycles , y =Value, color=FWHM)) +
  ggtitle("Data by Cycle - FWHM") +
  xlab("Cycle") +
  ylab("All bases FWHM")
ggsave(paste(args[2], "/FWHM.png", sep=""))

# Data By Cycle,Intensity /All Lanes / Both surfaces / All Bases
reshaped_extraction <- melt(extraction, measure.vars= c("int_A","int_C", "int_T","int_G"))
intensity<- (aggregate(reshaped_extraction$value, by=list(reshaped_extraction$cycle, reshaped_extraction$variable), FUN=mean))
colnames(intensity) <- c("Cycles","Intensity", "Value")
intensity$Intensity<- sub("int_","", intensity$Intensity)
ggplot(data=intensity )+
  geom_line( aes(x=Cycles , y =Value, color=Intensity))+
  ggtitle("Data By Cycle - Intensity")+
  xlab("Cycle")+ylab("All bases intensity")
ggsave(paste(args[2], "/intensity.png", sep=""))
# Data By Cycle, %Base /All Lanes / Both surfaces / All Bases
#
corr<- correctedIntensities(project)


corr[,seq(14,17)]<-round(corr[,seq(14,17)] / apply(corr[,seq(14,17)], 1, sum) *100,2)
corr<- melt(corr, measure.vars= c("num_A","num_C", "num_T","num_G"))
corr<-(aggregate(corr$value, by=list(corr$cycle, corr$variable), FUN=mean))
colnames(corr)<- c("Cycle", "Base", "Perc_Base")
corr$Base<- sub("num_","", corr$Base)
ggplot(corr) +
  geom_line(aes(x=Cycle, y= Perc_Base, color=Base)) +
  ylab("All Bases % Base") +
  ggtitle("Data by Cycle - % Base")
ggsave(paste(args[2], "/base_perc.png" , sep =""))

##
# Data By Lane
##

tiles<- tileMetrics(project)
# Density, Both Surfaces
#pfBoxplot(project) # Generate a boxplot of the numbers of clusters and the number of Illumina pass-filter clusters per tile and lane
dens <-(tiles[which(tiles$code==100 | tiles$code==101 ),])
dens[which(dens$code==100),]$code <- "Raw Clusters"
dens[which(dens$code==101),]$code<- "PF Clusters"
dens$value <- dens$value/1000
ggplot(data = dens , aes(x=lane, y=value, fill=code))+
  geom_boxplot() +
  ggtitle("Data By Lane - Cluster Density") +
  xlab("Lane")+ylab("Cluster Density (K/mm2)")
ggsave(paste(args[2], "/density.png", sep=""))

# Phasing, Both Surfaces, All Bases
phasing_code <- seq(200, (200 + (length(project@reads)-1)*2),2)
phasing <-(tiles[which(tiles$code %in% phasing_code) ,])
for(i in phasing_code){
  cat(paste("Read ",((i-200)/2)+1))
  phasing[which(phasing$code==i),]$code = paste("Read ",((i-200)/2)+1)
}
ggplot(data = phasing[which(phasing$value>0),] , aes(x=lane, y=value*100, fill=code))+
  geom_boxplot() +
  ggtitle("Data By Lane - Phasing")+
  xlab("Lane")+
  ylab("% Phasing")+
  scale_x_continuous(breaks = unique(phasing$lane))
ggsave(paste(args[2], "/phasing.png", sep=""))


# Pre-Phasing, Both Surfaces, All Bases
prephasing_code <- seq(201, (201 + (length(project@reads)-1)*2),2)
prephasing <-(tiles[which(tiles$code %in% prephasing_code) ,])
for(i in prephasing_code){

  prephasing[which(prephasing$code==i),]$code = paste("Read ",((i-201)/2)+1)
}
ggplot(data = prephasing[which(prephasing$value>0),] , aes(x=lane, y=value*100, fill=code))+
  geom_boxplot() +
  ggtitle("Data By Lane - Prephasing")+
  xlab("Lane")+
  ylab("% Prephasing") +
  scale_x_continuous(breaks = unique(prephasing$lane))
ggsave(paste(args[2], "/prephasing.png", sep=""))



##
# QScore Heatmap
##
png(paste(args[2], "/qscore_heatmap.png", sep=""), height=1025, width = 2571, res = 200)
qualityHeatmap(project, lane=seq(1,project@layout@lanecount) ,read=c(1,2))+ theme(axis.title.y = element_blank())
dev.off()

qualy<- qualityMetrics(project)
qualy<- data.frame(apply(qualy, 2, as.numeric))
qualy<- melt(qualy, measure.vars= colnames(qualy)[4:ncol(qualy)])

qualy<- aggregate(qualy$value, by=list(qualy$variable), FUN=sum)
colnames(qualy)<- c("QScore","Total")
qualy$Total <- qualy$Total/1000000
qualy$QScore <- as.numeric(qualy$QScore)
ggplot(qualy, aes(x=QScore, y = Total )) +
  geom_bar(stat="identity", aes(fill=QScore>=30)) +
  ylab("Total (million)") +
  geom_vline(aes(xintercept=30), linetype="dashed") +
  geom_text(aes(x=35, y=max(Total)-max(Total)*0.1 ,label=(paste("QScore >=30 \n",
                                                 round(sum(qualy[which(qualy$QScore>=30),]$Total)/1000,2),
                                                 "G \n",
                                                 round(sum(qualy[which(qualy$QScore>=30),]$Total)/ sum(qualy$Total)*100,2),
                                                 "%")
                                           ))) +
  ggtitle("QScore Distribution") +
  theme(legend.position="none")
ggsave(paste(args[2], "/qscore_distr.png", sep=""))