File: Assemblytics_dotplot.R

package info (click to toggle)
assemblytics 1.2.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 244 kB
  • sloc: python: 577; perl: 234; sh: 122; makefile: 13
file content (111 lines) | stat: -rwxr-xr-x 4,346 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env Rscript

# Author: Maria Nattestad
# github.com/marianattestad/assemblytics

library(ggplot2)

args<-commandArgs(TRUE)
prefix <- args[1]

filename <- paste(prefix,".oriented_coords.csv",sep="")
plot.output.filename <- paste(prefix,".Assemblytics.Dotplot_filtered",sep="")

plot.title <- "Dot plot of Assemblytics filtered alignments"

ref.pos <- function(chrom,pos,chr.lengths) {
    
    chrom.index <- which(names(chr.lengths)==chrom)-1
    offset.based.on.previous.chromosomes <- 0
    if (chrom.index != 0) {
        offset.based.on.previous.chromosomes <- sum(as.numeric(chr.lengths[c(1:chrom.index)])) 
    }
    
    loc <- offset.based.on.previous.chromosomes + pos   
    loc
}


coords <- read.csv(filename,sep=",",header=TRUE)

if (nrow(coords)>100000) {
    coords <- coords[1:100000,]
}

# names(coords) <- c("ref_start", "ref_end","query_start","query_end","ref_length","query_length","ref","query")


coords$ref <- as.character(coords$ref)
coords$query <- as.character(coords$query)



ordered_common_chromosome_names <- c(seq(1,100),paste("chr",seq(1,100),sep=""),paste("Chr",seq(1,100),sep=""),c("X","Y","M","MT","Chr0","chr0","0"))

all_chromosomes_some_ordered <- c(intersect(ordered_common_chromosome_names,unique(coords$ref)),setdiff(unique(coords$ref),ordered_common_chromosome_names))



coords$ref <- factor(coords$ref,levels=all_chromosomes_some_ordered)


chromosomes <- levels(coords$ref)

chr.lengths <- sapply(chromosomes,function(chr){max(coords[coords$ref==chr,"ref_length"])})
names(chr.lengths) <- chromosomes

coords <- cbind(coords, alignment.length=abs(coords$query_start-coords$query_end))


coords <- cbind(coords, ref.loc.start=mapply(FUN=ref.pos,coords$ref,coords$ref_start,MoreArgs=list(chr.lengths)),
                ref.loc.stop=mapply(FUN=ref.pos,coords$ref,coords$ref_end,MoreArgs=list(chr.lengths)))

# pick longest alignment. then pick the ref.loc.start of that
query.group <- split(coords,factor(coords$query))

ref.loc.of.longest.alignment.by.query <- unlist(sapply(query.group, function(coords.for.each.query) {coords.for.each.query$ref.loc.start[coords.for.each.query$alignment.length==max(coords.for.each.query$alignment.length)][1]}),recursive=FALSE)


# decide optimal-ish ordering of the queries
ordered.query.names <- names(ref.loc.of.longest.alignment.by.query)[order(ref.loc.of.longest.alignment.by.query)]

# construct a query.lengths list
query.lengths <- sapply(ordered.query.names,function(each.query){
    max(coords[coords$query==each.query,"query_length"])
})

# use the query.lengths to give offset positions to each query, adding a query.loc.start column and a query.loc.stop column to each entry in filtered.coords

coords$query.loc.start <- mapply(FUN=ref.pos,coords$query,coords$query_start,MoreArgs=list(query.lengths))
coords$query.loc.stop <- mapply(FUN=ref.pos,coords$query,coords$query_end,MoreArgs=list(query.lengths))


# Hide labels for chromosomes accounting for less than 2% of the reference
chr.labels <- names(chr.lengths)
chr.labels[chr.lengths < 0.02*sum(as.numeric(chr.lengths))] <- ""

query.labels <- names(query.lengths)
query.labels[query.lengths < 0.02*sum(as.numeric(query.lengths))] <- ""


theme_set(theme_bw(base_size = 24))

colors <- c("black","red")

coords$tag <- factor(coords$tag,levels=c("unique","repetitive"))



# CREATE PNG
png(file=paste(plot.output.filename, ".png",sep=""),width=1000,height=1000)    
print(ggplot(coords, aes(x=ref.loc.start,xend=ref.loc.stop,y=query.loc.start,yend=query.loc.stop,color=tag)) + geom_segment(lineend="butt",size=1.5) + labs(x="Reference",y="Query",title=plot.title) + scale_y_continuous(breaks = cumsum(as.numeric(query.lengths)),labels=query.labels,expand=c(0,0), limits = c(0,sum(as.numeric(query.lengths)))) + scale_x_continuous(breaks = cumsum(as.numeric(chr.lengths)),labels=chr.labels,expand=c(0,0),limits=c(0,sum(as.numeric(chr.lengths)))) + scale_color_manual(values=colors,name="Filter") + theme(
    axis.ticks.y=element_line(size=0),
    axis.text.x = element_text(angle = 90, hjust = 1,vjust=-0.5),
    axis.text.y = element_text(size=12,vjust=1.1),
    plot.title = element_text(vjust=3),
    panel.grid.major.x = element_line(colour = "black",size=0.1), 
    panel.grid.major.y = element_line(colour = "black",size=0.1), 
    panel.grid.minor = element_line(NA)
))

dev.off()