File: nanook_plot_comparison_reference.R

package info (click to toggle)
nanook 1.33%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 2,764 kB
  • sloc: java: 8,788; perl: 491; sh: 84; python: 42; makefile: 21
file content (215 lines) | stat: -rwxr-xr-x 9,482 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
library(ggplot2)
library(scales)
library(grid)
library(gridExtra)

# Filenames
args <- commandArgs(TRUE)
analysisdir <- args[1];
graphsdir <- args[2];
samplelist <- args[3];
outdir <- args[4];
reference <- args[5];
format <- args[6];

types = c("2D", "Template", "Complement");
colours = c("#68B5B9", "#CF746D", "#91A851");

if (format=="png") {
    textsize <- c(40)
    pointsize <- c(5)
    pointalpha <- c(0.5)
    pointshape <- c(1)
    pointwidth <- c(3)
    xvjust <- c(1.2)
    yvjust <- c(1.8)
} else {
    textsize <- c(10)
    pointsize <- c(2)
    pointalpha <-c(0.4)
    pointshape <- c(1)
    pointwidth <- c(1)
    xvjust <- c(0.2)
    yvjust <- c(0.8)
}


data_samples = read.table(samplelist, header=TRUE);

imagewidth <- 1 + nrow(data_samples) * 0.5;

# Query identity
for (t in 1:3) {
    df <- data.frame();
    listOfDataFrames <- NULL;
    count <- c(1);
    for (i in 1:nrow(data_samples)) {
        type = types[t];
        sampledir <- data_samples[i, "SampleDir"];
        filename_data <- paste(sampledir, "/", analysisdir, "/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
        message(filename_data);
        if (file.exists(filename_data)) {
            data_field = read.table(filename_data, header=TRUE);
            message(nrow(data_field));
            if (nrow(data_field) > 0) {
                thisid <- data_samples[i, "SampleName"];
                listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$QueryPercentIdentity);
                count <- count + 1;
            }
        }
    }

    df <- do.call("rbind", listOfDataFrames);
    output_file <- paste(graphsdir, "/", reference, "_", type, "_query_identity.pdf", sep="");
    message(output_file);
    pdf(output_file, width=imagewidth, height = 4);
    print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Read identity %"));
    garbage <- dev.off();

    output_file <- paste(graphsdir, "/", reference, "_", type, "_query_identity_zoom.pdf", sep="");
    message(output_file);
    pdf(output_file, width=imagewidth, height = 4);
    print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Read identity %") + scale_y_continuous(limits=c(60, 100)));
    garbage <- dev.off();
}

# Query GC
for (t in 1:3) {
    df <- data.frame();
    listOfDataFrames <- NULL;
    count <- c(1);
    for (i in 1:nrow(data_samples)) {
        type = types[t];
        sampledir <- data_samples[i, "SampleDir"];
        filename_data <- paste(sampledir, "/", analysisdir ,"/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
        if (file.exists(filename_data)) {
            data_field = read.table(filename_data, header=TRUE);
            message(nrow(data_field));
            if (nrow(data_field) > 0) {
                thisid <- data_samples[i, "SampleName"];
                listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$QueryGC);
                count <- count + 1;
            }
        }
    }
    
    df <- do.call("rbind", listOfDataFrames);
    output_file <- paste(graphsdir, "/", reference, "_", type, "_query_gc.pdf", sep="");
    message(output_file);
    pdf(output_file, width=imagewidth, height = 4);
    print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Read GC %"));
    garbage <- dev.off();
}

# Best perfect kmer
for (t in 1:3) {
    df <- data.frame();
    listOfDataFrames <- NULL;
    count <- c(1);
    for (i in 1:nrow(data_samples)) {
        type = types[t];
        sampledir <- data_samples[i, "SampleDir"];
        filename_data <- paste(sampledir, "/", analysisdir, "/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
        if (file.exists(filename_data)) {
            data_field = read.table(filename_data, header=TRUE);
            if (nrow(data_field) > 0) {
                thisid <- data_samples[i, "SampleName"];
                listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$LongestPerfectKmer);
                count <- count + 1;
            }
        }
    }
    
    df <- do.call("rbind", listOfDataFrames);
    output_file <- paste(graphsdir, "/", reference, "_", type, "_best_perfect_kmer.pdf", sep="");
    message(output_file);
    pdf(output_file, width=imagewidth, height = 4);
    print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Best perfect kmer"));
    garbage <- dev.off();
}

#PercentQueryAligned
for (t in 1:3) {
    df <- data.frame();
    listOfDataFrames <- NULL;
    count <- c(1);
    for (i in 1:nrow(data_samples)) {
        type = types[t];
        sampledir <- data_samples[i, "SampleDir"];
        filename_data <- paste(sampledir, "/", analysisdir, "/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
        if (file.exists(filename_data)) {
            data_field = read.table(filename_data, header=TRUE);
            if (nrow(data_field) > 0) {
                thisid <- data_samples[i, "SampleName"];
                listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$PercentQueryAligned);
                count <- count + 1;
            }
        }
    }
    
    df <- do.call("rbind", listOfDataFrames);
    output_file <- paste(graphsdir, "/", reference, "_", type, "_percent_query_aligned.pdf", sep="");
    message(output_file);
    pdf(output_file, width=imagewidth, height = 4);
    print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("% read aligned"));
    garbage <- dev.off();

    output_file <- paste(graphsdir, "/", reference, "_", type, "_percent_query_aligned_zoom.pdf", sep="");
    message(output_file);
    pdf(output_file, width=imagewidth, height = 4);
    print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("% read aligned") + scale_y_continuous(limits=c(75, 100)));
    garbage <- dev.off();
}

#AlignmentSize
for (t in 1:3) {
    df <- data.frame();
    listOfDataFrames <- NULL;
    count <- c(1);
    for (i in 1:nrow(data_samples)) {
        type = types[t];
        sampledir <- data_samples[i, "SampleDir"];
        filename_data <- paste(sampledir, "/", analysisdir, "/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
        if (file.exists(filename_data)) {
            data_field = read.table(filename_data, header=TRUE);
            if (nrow(data_field) > 0) {
                thisid <- data_samples[i, "SampleName"];
                listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$AlignmentSize);
                count <- count + 1;
            }
        }
    }
    
    df <- do.call("rbind", listOfDataFrames);
    output_file <- paste(graphsdir, "/", reference, "_", type, "_alignment_size.pdf", sep="");
    message(output_file);
    pdf(output_file, width=imagewidth, height = 4);
    print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Alignment size"));
    garbage <- dev.off();
}

#AlignmentPercentIdentity
#for (t in 1:3) {
#    df <- data.frame();
#    listOfDataFrames <- NULL;
#    count <- c(1);
#    for (i in 1:nrow(data_samples)) {
#        type = types[t];
#        sampledir <- data_samples[i, "SampleDir"];
#        filename_data <- paste(sampledir, "/analysis/", reference, "/", reference, "_",type,"_alignments.txt", sep="");
#        if (file.exists(filename_data)) {
#            data_field = read.table(filename_data, header=TRUE);
#            thisid <- data_samples[i, "SampleName"];
#            message(thisid);
#            listOfDataFrames[[count]] <- data.frame(Sample=thisid, Variable=data_field$AlignmentPercentIdentity);
#            count <- count + 1;
#        }
#    }
#
#    df <- do.call("rbind", listOfDataFrames);
#    output_file <- paste(outdir, "/graphs/", reference, "_", type, "_alignment_identity.pdf", sep="");
#    message(output_file);
#    pdf(output_file, width=imagewidth, height = 4);
#    print(ggplot(df, aes(x=Sample, y=Variable, fill=Sample)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill=FALSE) + theme(text = element_text(size=textsize)) + ggtitle(types[t]) + ylab("Alignment identity %"));
#    garbage <- dev.off();
#}