1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
|
#!/usr/bin/env Rscript
#options(stringsAsFactors = FALSE)
library('caret')
library('rpart.plot')
num_longest = 500
max_length = 5000
main = function () {
args<-commandArgs(TRUE)
if (length(args) == 0) {
stop("require param: cds.longest.scores")
}
data_file = args[1]
data = read.table(data_file, header=T, com='', row.names=1)
rand_idx = grep("rand", row.names(data))
rand_data = data[rand_idx,]
reg_data = data[-rand_idx,]
rand_data = len_normalize_adj(rand_data)
reg_data = len_normalize_adj(reg_data)
rand_data = data.frame(rand_data, myvar='NO')
reg_data = data.frame(reg_data, myvar='YES')
target_data = rbind(rand_data, reg_data)
rownames(target_data) = NULL
write.table(target_data, 'adj.data', quote=F, sep="\t")
f = as.formula(myvar ~.)
t = train(f, target_data, method='rpart')
pdf(paste0(data_file, ".rpart.results.pdf"))
rpart.plot(t$finalModel)
boxplot(reg_data[,1:6], outline=F, main='top_orfs')
boxplot(rand_data[,1:6], outline=F, main='rand')
dev.off()
message("done")
}
len_normalize_adj = function(data) {
data = data[data$seq_length <= max_length,]
data = data[rev(order(data$seq_length)),]
num_top_select = min(num_longest, nrow(data))
data = data[1:num_top_select,]
scores = data[,c('score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'score_6')]
len_norm_scores = sweep(scores, 1, data$seq_length, '/')
return(len_norm_scores)
}
main()
|