File: debug_est_alt.R

package info (click to toggle)
tombo 1.5.1-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 24,424 kB
  • sloc: python: 12,438; sh: 613; makefile: 10
file content (121 lines) | stat: -rw-r--r-- 6,024 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
library(ggplot2)
library(stringr)
library(ggridges)


## density basename and alternative base
densBase <- 'debug_est_alt'
altBase <- 'C'


## parse density data
densDat <- read.table(paste0(densBase, '.alternate_density.txt'), header=TRUE)
standardDensDat <- read.table(paste0(densBase, '.control_density.txt'), header=TRUE)
densDat$Sample <- "Alternative"
standardDensDat$Sample <- "Standard"

allDensDat <- rbind.data.frame(densDat, standardDensDat)
sAllDat <- split(allDensDat, allDensDat$Kmer)
sDiffs <- sort(unlist(lapply(sAllDat, function(x)
    weighted.mean(x[x$Sample == 'Alternative','Signal'], x[x$Sample == 'Alternative','Density']) -
    weighted.mean(x[x$Sample == 'Standard','Signal'], x[x$Sample == 'Standard','Density']))))


## plot k-mers with the largest shifts in average signal level
upDat <- do.call(rbind.data.frame, lapply(names(head(sDiffs, 20)), function(kmer) sAllDat[[kmer]]))
dnDat <- do.call(rbind.data.frame, lapply(names(tail(sDiffs, 20)), function(kmer) sAllDat[[kmer]]))

pdf(paste0(densBase, '.density.pdf'), width=10)
ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
    geom_ridgeline(alpha=0.4, size=0, color='white') +
    scale_fill_discrete(name='Contains\nAlternative\nBase') +
    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
    geom_ridgeline(alpha=0.4, size=0, color='white') +
    scale_fill_discrete(name='Contains\nAlternative\nBase') +
    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
foo <- dev.off()


## plot alternative densities in k-mers without the alternative base of interest
noAltBaseDiffs <- sDiffs[!str_detect(names(sDiffs), altBase)]

upDat <- do.call(rbind.data.frame, lapply(names(head(noAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]]))
dnDat <- do.call(rbind.data.frame, lapply(names(tail(noAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]]))

pdf(paste0(densBase, '.noAlt.density.pdf'), width=10)
ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
    geom_ridgeline(alpha=0.4, size=0, color='white') +
    scale_fill_discrete(name='Contains\nAlternative\nBase') +
    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
    geom_ridgeline(alpha=0.4, size=0, color='white') +
    scale_fill_discrete(name='Contains\nAlternative\nBase') +
    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
foo <- dev.off()


singleAltBaseDiffs <- sDiffs[str_count(names(sDiffs), altBase) == 1]

upDat <- do.call(rbind.data.frame, lapply(names(head(singleAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]]))
dnDat <- do.call(rbind.data.frame, lapply(names(tail(singleAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]]))

pdf(paste0(densBase, '.singleAlt.density.pdf'), width=10)
ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
    geom_ridgeline(alpha=0.4, size=0, color='white') +
    scale_fill_discrete(name='Contains\nAlternative\nBase') +
    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
    geom_ridgeline(alpha=0.4, size=0, color='white') +
    scale_fill_discrete(name='Contains\nAlternative\nBase') +
    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
foo <- dev.off()


onePlusAltBaseDiffs <- sDiffs[str_count(names(sDiffs), altBase) > 0]

upDat <- do.call(rbind.data.frame, lapply(names(head(onePlusAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]]))
dnDat <- do.call(rbind.data.frame, lapply(names(tail(onePlusAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]]))

pdf(paste0(densBase, '.onePlusAlt.density.pdf'), width=10)
ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
    geom_ridgeline(alpha=0.4, size=0, color='white') +
    scale_fill_discrete(name='Contains\nAlternative\nBase') +
    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
    geom_ridgeline(alpha=0.4, size=0, color='white') +
    scale_fill_discrete(name='Contains\nAlternative\nBase') +
    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
foo <- dev.off()


## plot estimated shift correction due to high mod content samples
getMedMean <- function(x, sampType){
    sampDat <- x[x$Sample == sampType,]
    densCum <- cumsum(sampDat$Density)
    return(c(sampDat$Signal[which.min(abs(densCum - (tail(densCum, 1) / 2)))],
             weighted.mean(sampDat$Signal, sampDat$Density)))
}

standDiffs <- do.call(rbind.data.frame, lapply(sAllDat, function(x){
    stdMedMean <- getMedMean(x, 'Standard')
    altMedMean <- getMedMean(x, 'Alternative')
    data.frame(stdMed=stdMedMean[1],
               altMed=altMedMean[1],
               stdMean=stdMedMean[2],
               altMean=altMedMean[2],
               kmer=x$Kmer[1],
               hasAltBase=str_detect(x$Kmer[1], altBase))}))
standDiffs$MedDiff <- standDiffs$stdMed - standDiffs$altMed
standDiffs$MeanDiff <- standDiffs$stdMean - standDiffs$altMean

pdf(paste0('signal_shifts.', densBase, '.pdf'))
ggplot(standDiffs[!standDiffs$hasAltBase,]) + geom_point(aes(x=stdMed, y=MedDiff), alpha=0.3) +
    geom_smooth(aes(x=stdMed, y=MedDiff), color='red', method = "lm", formula = y ~ x + I(x^2)) + theme_bw()
ggplot(standDiffs[!standDiffs$hasAltBase,]) + geom_point(aes(x=stdMean, y=MeanDiff), alpha=0.3) +
    geom_smooth(aes(x=stdMean, y=MeanDiff), color='red', method = "lm", formula = y ~ x + I(x^2)) + theme_bw()
ggplot(standDiffs) + geom_point(aes(x=stdMed, y=MedDiff, color=hasAltBase), alpha=0.3) +
    geom_smooth(aes(x=stdMed, y=MedDiff, color=hasAltBase), method = "lm", formula = y ~ x + I(x^2)) +  theme_bw()
ggplot(standDiffs) + geom_point(aes(x=stdMean, y=MeanDiff, color=hasAltBase), alpha=0.3) +
    geom_smooth(aes(x=stdMean, y=MeanDiff, color=hasAltBase), method = "lm", formula = y ~ x + I(x^2)) +  theme_bw()
foo <- dev.off()