File: debug_params.R

package info (click to toggle)
tombo 1.5.1-8
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 25,140 kB
  • sloc: python: 12,433; sh: 613; makefile: 16
file content (74 lines) | stat: -rw-r--r-- 3,137 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggridges))

## set _DEBUG_PARAMS = True in resquiggle.py
## example run for min_obs_per_base testing:
##for i in {0..2}; do
##    testParam=`echo $i | awk '{print ($1 * 1) + 1}'`
##    tombo resquiggle param_test_reads/ genome.fasta \
##        --segmentation-parameters 5 3 $testParam 5 \
##        --signal-align-parameters 4.2 4.2 300 1500 20.0 40 750 2500 250 \
##        --processes 4
##done > param_values.txt


dat <- read.table('param_values.txt', header=TRUE)

## update with output from tombo.resquiggle._write_params_debug
colnames(dat) <- c(
    'running_window', 'min_obs_per_base', 'raw_min_obs_per_base',
    'mean_obs_per_event', 'match_evalue', 'skip_pen', 'bandwidth',
    'read_name', 'mean_score')

## filter out save bandwidth reads if bandwidth was not tested here
if(length(unique(dat$bandwidth))  == 2){
    dat <- dat %>% filter(bandwidth == min(as.numeric(dat$bandwidth)))
}

#3 convert params to factors and get stat that was investigated in this run
param_names <- setdiff(colnames(dat), c('mean_score', 'read_name'))
for(param_name in param_names){
    dat[,param_name] <- factor(dat[,param_name])
}
stat <- param_names[which.max(sapply(param_names, function(param_name)
    length(unique(dat[,param_name]))))]

## take min score over same read with same params (over re-scalings)
dat <- dat %>% group_by_at(c(param_names, 'read_name')) %>%
    summarize(mean_score=min(mean_score))

## filter for reads included in all parameter groups
rdat <- dat %>% group_by(read_name) %>% summarize(nreads=n())
maxNReads <- rdat$read_name[which(rdat$nreads == max(rdat$nreads))]
fdat <- dat %>% filter(read_name %in% maxNReads)

## compute and print mean and median stats for plotting
sumDat <- dat %>% group_by_at(stat) %>%
    summarize(med=median(mean_score), mean=mean(mean_score))
sumFDat <- fdat %>% group_by_at(stat) %>%
    summarize(med=median(mean_score), mean=mean(mean_score))

sumDat %>% print.data.frame(digits=6)
sumFDat %>% print.data.frame(digits=6)

pdf(paste0('param_values.', stat, '.pdf'), width=10)
ggplot(dat, aes_string(y=stat, x='mean_score', fill=stat)) +
    geom_vline(aes(xintercept=min(sumDat$med))) +
    geom_density_ridges(alpha=0.3, cex=0.5) +
    geom_point(aes_string(x='med', y=stat),
               color='red', size=5, data=sumDat) +
    geom_point(aes_string(x='mean', y=stat),
               color='orange', size=5, data=sumDat) +
    theme_minimal() + theme(legend.position="none") +
    coord_cartesian(xlim=quantile(dat$mean_score, c(0.01, 0.98)))
ggplot(fdat, aes_string(y=stat, x='mean_score', fill=stat)) +
    geom_vline(aes(xintercept=min(sumFDat$med))) +
    geom_density_ridges(alpha=0.3, cex=0.5) +
    geom_point(aes_string(x='med', y=stat),
               color='red', size=5, data=sumFDat) +
    geom_point(aes_string(x='mean', y=stat),
               color='orange', size=5, data=sumFDat) +
    theme_minimal() + theme(legend.position="none") +
    coord_cartesian(xlim=quantile(fdat$mean_score, c(0.01, 0.98)))
foo <- dev.off()