File: relabel_domains.py

package info (click to toggle)
fasta3 36.3.8h.2020-02-11-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 6,048 kB
  • sloc: ansic: 56,138; perl: 10,192; python: 2,205; sh: 416; csh: 85; sql: 55; makefile: 38
file content (154 lines) | stat: -rwxr-xr-x 5,135 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python3

# Given a blast_tabular file with search results from one or more protein queries
# 

################################################################
# copyright (c) 2018 by William R. Pearson and The Rector & Visitors
# of the University of Virginia */
# ###############################################################
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License.  You
# may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0 Unless required by
# applicable law or agreed to in writing, software distributed under
# this License is distributed on an "AS IS" BASIS, WITHOUT WRRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied.  See the
# License for the specific language governing permissions and
# limitations under the License.
# ###############################################################


import fileinput
import sys
import re
import argparse
import urllib.request, urllib.error, urllib.parse

from rename_exons import *

def replace_dom_number(line):

    out_str = ''
    if (not re.search(r'~',line)):
        return line

    (info, num, vdom) = re.search(r'^([^~]+)~(\d+)(v?)$',line).groups()
    if (vdom is None):
        vdom=''

    if (num in homolog_dict):
        return "%s~h%s%s" % (info, str(homolog_dict[num]['num']), vdom)
        
    else:
        name = line.split(" ")[-1].split("{")[0]
        if (name == "NODOM"):
            return line
        else:
            if (name in nonhomolog_dict):
                return '~'.join(line.split('~')[:-1]) + "~" + str(nonhomolog_dict[name])
    return out_str


################
# __main__ function
#

e_thresh = 1e-6
q_thresh = 30.0

homolog_dict = {}
nonhomolog_dict = {}

def main():

    # print "#"," ".join(sys.argv)

    hom_color = 1
    n_hom_color = 11

    parser=argparse.ArgumentParser(description='relabel_domains.py result_file.m8CB')

    parser.add_argument('--have_qslen', help='bl_tab fields include query/subject lengths',dest='have_qslen',action='store_true',default=False)
    parser.add_argument('--dom_info', help='raw domain coordinates included',action='store_true',default=False)
    parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')

    args=parser.parse_args()

    end_field = -1
    data_fields_reset=False

    (fields, end_field) = set_data_fields(args, [])

    if (args.have_qslen and args.dom_info):
        data_fields_reset=True


    for line in fileinput.input(args.files):
    # pass through comments
        if (line[0] == '#'):
            print(line, end='')	# ',' because have not stripped
            continue

        ################
        # break up tab fields, check for extra fields
        line = line.strip('\n')
        line_data = line.split('\t')
        if (not data_fields_reset):     # look for --have_qslen number, --dom_info data, even if not set
            (fields, end_field) = set_data_fields(args, line_data)
            data_fields_reset = True

        ################
        # get exon annotations
        data = parse_protein(line_data,fields,'')	# get score/alignment/domain data

        if (len(data['sdom_list'])==0 and len(data['qdom_list'])==0):
            print(line)	# no domains to be edited, print stripped line and contine
            continue

        ################
        # have domains, check if significant and new, or old and known
        # goals are: (1) consistent coloring between query and subject for same domain
        #            (2) homologous domains get special labels
        # need dict of good domain names

        ################
        # check to update doms with good E()-value
        if float(data['evalue']) <= e_thresh:
            for q_dom in data['qdom_list']:
                if (float(q_dom.q_score) >= q_thresh and q_dom.name not in homolog_dict ):
                    homolog_dict['q_dom.name'] = q_dom_color
                    dom_color += 1

            for s_dom in data['sdom_list']:
                if (float(s_dom.q_score) >= q_thresh and s_dom.name not in homolog_dict):
                    homolog_dict['s_dom.name'] = s_dom.color
                    hom_color += 1
        else:
            for s_dom in data['sdom_list']:
                if (s_dom.name not in homolog_dict):
                    nonhomolog_dict['s_dom.name'] = s_dom.color
                    n_hom_color += 1


        ################
        # done storing good domains, write things out

        btab_str = '\t'.join(str(data[x]) for x in fields[:end_field])

        for s_dom in data['sdom_list']:
            if (s_dom.name in homolog_dict):
                s_dom.color=homolog_dict[s_dom.name]
            elif (s_dom.name in nonhomolog_dict):
                s_dom.color=nonhomolog_dict[s_dom.name]
        

        dom_bar_str = ''
        for dom in sorted(data['qdom_list']+data['sdom_list'],key=lambda r: r.idnum):
            dom_bar_str += dom.make_bar_str()

        print(btab_str+dom_bar_str)


if __name__ == '__main__':
    main()