File: m8CBl_to_plot2.R

package info (click to toggle)
fasta3 36.3.8i.14-Nov-2020-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,576 kB
  • sloc: ansic: 77,292; perl: 10,677; python: 2,461; sh: 428; csh: 86; sql: 55; makefile: 40
file content (276 lines) | stat: -rwxr-xr-x 9,771 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/env Rscript --vanilla

################################################################
# copyright (c) 2022 by William R. Pearson and The Rector &
# Visitors of the University of Virginia */
################################################################
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under this License is distributed on an "AS
# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied.  See the License for the specific language
# governing permissions and limitations under the License.
################################################################


## m8CBl_to_plot2.R alignment.m8CBl_file
##
## take an m8CBl output of the form:
##
## # lalign36 -m 8CBl ../seq/mchu.aa ../seq/mchu.aa
## # LALIGN 36.3.8h Aug, 2019
## # Query: sp|P62158|CALM_HUMAN Calmodulin; CaM - 149 aa
## # Database: ../seq/mchu.aa
## # Fields: query id, query length, subject id, subject length, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score, BTOP
## # 1 hits found
## sp|P62158|CALM_HUMAN	149	sp|P62158|CALM_HUMAN	149	47.95	73	38	3	1	76	77	149	3.1e-11	49.3	1AK1QTLDTS2QE1A-E-F-KR3SRLV5DN1TY1TSTAKA2GRTH2RTSN2QENKPL1EDAE1LVQDDE2NR1VA1AI2ND1TQIVDNFYPE2LVTQ2ATRA1
## sp|P62158|CALM_HUMAN	149	sp|P62158|CALM_HUMAN	149	37.11	97	61	8	11	111	47	147	6.7e-09	41.6	2FLKQEDAMFISNLEFV1KA2DN3TDTFKP1LFGL1VM1-A1SKLMGKQDNTPDTS1AE1LIQRDEMAIFNREVVF1AK4TY1DSFAPA1FL-R-H-VLM1MNMLAGRE1MLKT1TEDESVED1EM4F-R-V-F-1KI2ND1YQIVSNAYAE1LFRVHQVM2
## sp|P62158|CALM_HUMAN	149	sp|P62158|CALM_HUMAN	149	39.39	33	20	5	1	37	113	146	0.065	18.3	MLAGDEQK2ED1QEIVAD1FM-IKR2F-S-L-F-1KI4TQIVTNTYKE1LFGVTQVM1
## # LALIGN processed 1 queries

## and convert to an x-y plot using ggplot

################
## This is a second attempt to provide an alternative to
## lav2_plt.pl, which takes an lalign.lav file and converts it to
## postscript or SVG.  Unlike m8CBl_to_plot.R, this version uses more vector math and fewer for() loops
##
## This program does not use the .lav file -- it used the BTOP alignment encoding provided by lalign36 -m8CBl
## ( the 'l' in -m8CBl is required to get thte sequence lengths)
##
################
##
## future versions could include expectation values on the alignments
## and domain diagrams
##

library(ggplot2)
library(RColorBrewer)
library(cowplot)

args <- commandArgs(trailingOnly=TRUE)

p.name<-'m8CBl_to_plot2.R'
plabel=paste(c(p.name,args,date()),collapse=' ', sep=' ')

if (is.na(args[1])) {
    print("usage m8CBl_to_plot.R alignment.m8CBl_file")
    exit(1)
}

btop_split <- function(btop_str) {

    x = c()
    y = c()

    ## print(btop_str)

    parts <- strsplit(gsub("([:digit:]+)","~\\1~",btop_str),"~")[[1]]
    ## print(parts)

    parts2 <- strsplit(gsub("([A-Z\\-][A-Z\\-])","~\\1~",parts),"~")
    parts2d <- lapply(parts2, function(z){ z[!is.na(z) & z != ""]})
    partsv <- Reduce(c,parts2d)
    ## print(partsv)
    partsv
}

btop2inc <- function(btop_vec) {

    ## convert vector to numbers if it has numbers

    btop_len = length(btop_vec)
    ## print(btop_vec)

    xy_inc = ifelse(grepl('[[:digit:]]+',btop_vec), strtoi(btop_vec), 0)
    xy_mat = ifelse(grepl('[A-Z][A-Z]',btop_vec), 1, 0)

    x_inc  = ifelse(substr(btop_vec,2,2)=='-', 1, 0)
    y_inc  = ifelse(substr(btop_vec,1,1)=='-', 1, 0)

    x_inc = x_inc + xy_inc + xy_mat
    y_inc = y_inc + xy_inc + xy_mat

    xy_list = list(x=x_inc, y=y_inc)
    ## print(xy_list)
    xy_list
}

btop2xy <- function(btop_list) {

    this_x = 0
    this_y = 0

    x_list = c()
    y_list = c()
    ## print(length(btop_list))
    ## print(btop_list)

    xy_inc = btop2inc(btop_list)

    x_list = cumsum(xy_inc$x)
    y_list = cumsum(xy_inc$y)

    xy_list = list(x=x_list, y=y_list)
    ## print(xy_list)
    xy_list
}

eval_type <- function(eval, thresh, pref) {

    thresh_len = length(thresh)
    for (ix in 1:thresh_len) {
        if (eval < thresh[ix]) {
            etype = ix
            break
        }
    }

    if (eval >= thresh[thresh_len]) {
        etype = thresh_len+1
    }

    return(etype)
}

## end of functions // main program

elinval = c(1e-4, 1e-2, 1.0, 10.0)
blinval = c(40.0,30.0,20.0, 10.0)
line_pref = 'l'

m8_col_names = c("qseqid","qlen","sseqid","slen","percid","aln_len","mismat","gaps","q_start","q_end","s_start","s_end","evalue","bits","BTOP")
m8_col_names_ann = c("qseqid","qlen","sseqid","slen","percid","aln_len","mismat","gaps","q_start","q_end","s_start","s_end","evalue","bits","BTOP","ann_str")
m8_col_names_ann2 = c("qseqid","qlen","sseqid","slen","percid","aln_len","mismat","gaps","q_start","q_end","s_start","s_end","evalue","bits","BTOP","ann_str","seq_ann_str")

m8_df <- read.table(args[1],header=FALSE)

m8_df_ncols <- dim(m8_df)[2]

if (m8_df_ncols == length(m8_col_names)) {
    colnames(m8_df) = m8_col_names
} else if (m8_df_ncols == length(m8_col_names_ann)) {
    colnames(m8_df) = m8_col_names_ann
} else if (m8_df_ncols == length(m8_col_names_ann2)) {
    colnames(m8_df) = m8_col_names_ann2
} else {
    stop("ERROR --  do not recognize number of input columns")
}

m8_df$row_num = seq.int(nrow(m8_df))

## identical sequences, plot mirror alignment
do_mirror = (m8_df[1,]$qseqid == m8_df[1,]$sseqid)

## take the BTOP encoding and convert it to a vector of tokens

btop_list <- lapply(m8_df$BTOP, btop_split)

## take the list of token vectors (one for each alignment) and convert to xy-offsets
xy_list <- lapply(btop_list, btop2xy)

## create a vector of names for each alignment -- required to keep x,y pairs separate for each alignment
diag_name = paste0(line_pref,"0")

## keep a list of the alignment names used
lname_list = c(diag_name)

## line_colors <- brewer.pal(length(elinval)+1,'Dark2')
line_colors <- c('darkblue','brown','darkgreen','green','lightgreen')

## line_color for identity  alignemnt
line_colors_n <- c()

## create x,y,aln_name dataframe for plotting, including diagonal if do_mirror
if (do_mirror) {
    xy_df = data.frame(name='aln0',x=c(1,m8_df[1,]$qlen), y=c(1,m8_df[1,]$slen),ev_ix=0, u.eline=diag_name)
    xy_lab = data.frame(u.eline=diag_name,lab='',x=m8_df[1,]$qlen/2.0,y=m8_df[1,]$slen/2.0)
    line_colors_n <- c('black')
} else {
    xy_df = NULL
    xy_lab = NULL
}

## go through each of the alignments produced by btop_split/btop2xy -- add to dataframe for plotting
for (ix in 1:length(xy_list)) {

    q_off = m8_df[ix,]$q_start-1
    s_off = m8_df[ix,]$s_start-1

    ## data frame for this alignment
    tmp_df = data.frame(name=paste0('aln',ix), x=xy_list[[ix]]$x, y=xy_list[[ix]]$y)
    tmp_df$x = tmp_df$x + q_off
    tmp_df$y = tmp_df$y + s_off

    ## add linetype/alignment color
    ev_ix = eval_type(m8_df[ix,]$evalue, elinval)
    ## print(sprintf("ix: %d evalue: %g ev_ix: %d",ix, m8_df[ix,]$evalue, ev_ix))

    tmp_df$ev_ix = ev_ix
    aln_name = paste0(line_pref,ev_ix,"_",ix)
    tmp_df$u.eline= aln_name
    line_colors_n <- append(line_colors_n,line_colors[ev_ix])

    ## add this alignment to the alignemnt dataframe
    xy_df = rbind(xy_df, tmp_df)

    q_mid = (m8_df[ix,]$q_end - m8_df[ix,]$q_start+1)/2.0 + m8_df[ix,]$q_start-1
    s_mid = (m8_df[ix,]$s_end - m8_df[ix,]$s_start+1)/2.0 + m8_df[ix,]$s_start-1

    tmp_lab = data.frame(u.eline=aln_name,lab=sprintf("%0.2g",m8_df[ix,]$evalue),x=q_mid,y=s_mid)

    xy_lab = rbind(xy_lab, tmp_lab)

    if (do_mirror) {
        tmp_df = data.frame(name=paste0('aln',ix,'c'), y=xy_list[[ix]]$x, x=xy_list[[ix]]$y)
        tmp_df$x = tmp_df$x + s_off
        tmp_df$y = tmp_df$y + q_off
        tmp_df$ev_ix = ev_ix
        tmp_df$u.eline= paste0(line_pref,ev_ix,"_",ix,'c')
        xy_df = rbind(xy_df, tmp_df)

        line_colors_n <- append(line_colors_n,line_colors[ev_ix])
    }
}

x_label = sprintf("%s %d aa",m8_df[1,]$qseqid, m8_df[1,]$qlen)
y_label = sprintf("%s %d aa",m8_df[1,]$sseqid, m8_df[1,]$slen)

pdf(file=paste0(args[1],'.pdf'),width=5, height=6)

## set colors based on evalue's
e_colors = scale_color_manual(values=line_colors_n)

## print(xy_lab)

elab_size = 4
elab_size = 4 - (nrow(xy_lab)-4)/4 * 0.75
if (elab_size < 2) {
    elab_size = 2
}

## pA -- draw alignment plot

pA <- ggplot(xy_df) + geom_line(aes(x=x,y=y,color=u.eline)) + geom_text(data=xy_lab, aes(x=x,y=y,label=lab),angle=45,nudge_x= -2.0, nudge_y=2.0, hjust=0.5, vjust=0,size=elab_size) +
    coord_fixed(ratio=1) + xlab(x_label) + ylab(y_label) + e_colors  +
    theme_bw() + guides(color='none') + theme(plot.margin=unit(c(2.0, 0.5, 0.5, 0.5), "cm"))

## pB -- draw evalue color legend
xlab_pos = c(5, 45, 5, 45, 80)
ylab_pos = c(15, 15, 10, 10, 12.5)
l_types = c('l1','l2','l3','l4','l5')
lab_df <- data.frame(x1 = xlab_pos, x2=xlab_pos + 15, y1 = ylab_pos, y2=ylab_pos, u.eline=l_types, etext=c("< 0.0001", "< 0.01","< 1.0", "< 10","> 10"))

e_colors2 = scale_color_manual(values=line_colors)

pB <- ggplot(lab_df) + geom_segment(aes(x=x1, xend=x2, y=y1,yend=y2, color=u.eline)) + geom_text(aes(x=x2+2,y=y1+0.03, label=etext),hjust=0) + xlim(c(-20,120)) + ylim(c(5,18)) + coord_fixed() + e_colors2 + guides(color='none') +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_blank(), axis.text.x=element_blank(), axis.text.y=element_blank(),axis.ticks=element_blank(),axis.title=element_blank(),plot.margin=unit(c(-0.5, 0.5, 0.5, 0.5), "cm")) + labs(caption=plabel)

## plot out pA (the data) above pB (the legend)
plot_grid(pA,pB,ncol=1,rel_heights=c(0.8,0.2),rel_widths=c(1.0,1.0))