1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
|
#!/usr/bin/python3
# Given a blast_tabular file with search results from one or more protein queries
#
################################################################
# copyright (c) 2018 by William R. Pearson and The Rector & Visitors
# of the University of Virginia */
# ###############################################################
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You
# may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0 Unless required by
# applicable law or agreed to in writing, software distributed under
# this License is distributed on an "AS IS" BASIS, WITHOUT WRRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and
# limitations under the License.
# ###############################################################
import fileinput
import sys
import re
import argparse
import urllib.request, urllib.error, urllib.parse
from rename_exons import *
def replace_dom_number(line):
out_str = ''
if (not re.search(r'~',line)):
return line
(info, num, vdom) = re.search(r'^([^~]+)~(\d+)(v?)$',line).groups()
if (vdom is None):
vdom=''
if (num in homolog_dict):
return "%s~h%s%s" % (info, str(homolog_dict[num]['num']), vdom)
else:
name = line.split(" ")[-1].split("{")[0]
if (name == "NODOM"):
return line
else:
if (name in nonhomolog_dict):
return '~'.join(line.split('~')[:-1]) + "~" + str(nonhomolog_dict[name])
return out_str
################
# __main__ function
#
e_thresh = 1e-6
q_thresh = 30.0
homolog_dict = {}
nonhomolog_dict = {}
def main():
# print "#"," ".join(sys.argv)
hom_color = 1
n_hom_color = 11
parser=argparse.ArgumentParser(description='relabel_domains.py result_file.m8CB')
parser.add_argument('--have_qslen', help='bl_tab fields include query/subject lengths',dest='have_qslen',action='store_true',default=False)
parser.add_argument('--dom_info', help='raw domain coordinates included',action='store_true',default=False)
parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
args=parser.parse_args()
end_field = -1
data_fields_reset=False
(fields, end_field) = set_data_fields(args, [])
if (args.have_qslen and args.dom_info):
data_fields_reset=True
for line in fileinput.input(args.files):
# pass through comments
if (line[0] == '#'):
print(line, end='') # ',' because have not stripped
continue
################
# break up tab fields, check for extra fields
line = line.strip('\n')
line_data = line.split('\t')
if (not data_fields_reset): # look for --have_qslen number, --dom_info data, even if not set
(fields, end_field) = set_data_fields(args, line_data)
data_fields_reset = True
################
# get exon annotations
data = parse_protein(line_data,fields,'') # get score/alignment/domain data
if (len(data['sdom_list'])==0 and len(data['qdom_list'])==0):
print(line) # no domains to be edited, print stripped line and contine
continue
################
# have domains, check if significant and new, or old and known
# goals are: (1) consistent coloring between query and subject for same domain
# (2) homologous domains get special labels
# need dict of good domain names
################
# check to update doms with good E()-value
if float(data['evalue']) <= e_thresh:
for q_dom in data['qdom_list']:
if (float(q_dom.q_score) >= q_thresh and q_dom.name not in homolog_dict ):
homolog_dict['q_dom.name'] = q_dom_color
dom_color += 1
for s_dom in data['sdom_list']:
if (float(s_dom.q_score) >= q_thresh and s_dom.name not in homolog_dict):
homolog_dict['s_dom.name'] = s_dom.color
hom_color += 1
else:
for s_dom in data['sdom_list']:
if (s_dom.name not in homolog_dict):
nonhomolog_dict['s_dom.name'] = s_dom.color
n_hom_color += 1
################
# done storing good domains, write things out
btab_str = '\t'.join(str(data[x]) for x in fields[:end_field])
for s_dom in data['sdom_list']:
if (s_dom.name in homolog_dict):
s_dom.color=homolog_dict[s_dom.name]
elif (s_dom.name in nonhomolog_dict):
s_dom.color=nonhomolog_dict[s_dom.name]
dom_bar_str = ''
for dom in sorted(data['qdom_list']+data['sdom_list'],key=lambda r: r.idnum):
dom_bar_str += dom.make_bar_str()
print(btab_str+dom_bar_str)
if __name__ == '__main__':
main()
|