File: annot_blast_btop4.py

package info (click to toggle)
fasta3 36.3.8i.14-Nov-2020-3
links: PTS, VCS
area: main
in suites: sid, trixie
size: 7,016 kB
sloc: ansic: 77,269; perl: 10,677; python: 2,461; sh: 428; csh: 86; sql: 55; makefile: 40
file content (685 lines) | stat: -rwxr-xr-x 27,067 bytes
parent folder | download | duplicates (2)
#!/usr/bin/python3

################################################################
# copyright (c) 2017,2018 by William R. Pearson and The Rector &
# Visitors of the University of Virginia */
################################################################
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under this License is distributed on an "AS
# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied.  See the License for the specific language
# governing permissions and limitations under the License. 
################################################################

################################################################
# annot_blast_btop4.py --query query.file --ann_script ann_pfam_www.pl --include_doms blast_tab_btop_file
################################################################
# annot_blast_btop4.py associates domain annotation information and
# subalignment scores with a blast tabular (-outfmt 6 or -outfmt 7)
# file that contains the raw score and the BTOP alignment encoding
# This file can be generated from "blastp/n" or "blast_formatter"
# using the command:
#   blast_formatter -archive blast_output.asn -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore score btop'  > blast_output.tab_annot
#
# If the BTOP field or query_file is not available, the script
# produces domain content without sub-alignment scores.
################################################################
## 2-Dec-2019
# added --have_qslen, --raw_score/--no_raw_score
# made more robust to multiple HSPs when using --ann_file
#
################################################################
## 4-Nov-2018
# add --include_doms, which adds a new field with the coordinates of
# the domains in the protein (independent of alignment)
#
################################################################
## 21-July-2018
# include sequence length (actually alignment end) to produce NODOM's (no NODOM's without length).
#
################################################################
## 13-Jan-2017
# modified to provide query/subject coordinates and identities if no
# query sequence -- does not decrement for reverse-complement fastx/blastx DNA
################################################################
## 16-Nov-2015
# modify to allow multi-query blast searches
################################################################
## 19-Dec-2015
# add -q_annot_script to annotate query sequence
#

import argparse
import fileinput
import sys
import re
import shutil
import subprocess
from math import log

# read lines of the form:
# gi|121694|sp|P20432.1|GSTT1_DROME	gi|121694|sp|P20432|GSTT1_DROME	100.00	209	0	0	1	209	1	209	6e-156	433	1113	209
# gi|121694|sp|P20432.1|GSTT1_DROME	gi|1170090|sp|P04907|GSTF3_MAIZE	26.77	198	123	7	4	185	6	197	2e-08	51.2	121	FL1YG ... 1NKRA1YW1
# gi|121694|sp|P20432.1|GSTT1_DROME	gi|81174731|sp|P0ACA5|SSPA_ECO57	39.66	58	32	2	43	100	49	103	8e-06	43.9	102	EDFLLI ... V-I-NEQS3FM
# gi|121694|sp|P20432.1|GSTT1_DROME	gi|121695|sp|P12653|GSTF1_MAIZE	27.62	181	107	7	32	203	34	199	9e-05	40.8	94	LI1LF ... N-1AS1CLLM1

# and report the domain content ala -m 8CC

def init_blosum62():

    # ncbi_blaa -- list of amino acids
    ncbi_blaa = "A R N D C Q E G H I L K M F P S T W Y V B Z X *".split(' ')

    # blosum62: 2D dict of scoring matrix values

    blosum62 = {}
    blosum62['A'] = dict(zip(ncbi_blaa,[ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-2,-1, 0,-4]))
    blosum62['R'] = dict(zip(ncbi_blaa,[-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1, 0,-1,-4]))
    blosum62['N'] = dict(zip(ncbi_blaa,[-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, 3, 0,-1,-4]))
    blosum62['D'] = dict(zip(ncbi_blaa,[-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3,4,1,-1,-4]))
    blosum62['C'] = dict(zip(ncbi_blaa,[ 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4]))
    blosum62['Q'] = dict(zip(ncbi_blaa,[-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2,0,3,-1,-4]))
    blosum62['E'] = dict(zip(ncbi_blaa,[-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2,1,4,-1,-4]))
    blosum62['G'] = dict(zip(ncbi_blaa,[ 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3,-1,-2,-1,-4]))
    blosum62['H'] = dict(zip(ncbi_blaa,[-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3,0,0,-1,-4]))
    blosum62['I'] = dict(zip(ncbi_blaa,[-1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3,-3,-3,-1,-4]))
    blosum62['L'] = dict(zip(ncbi_blaa,[-1,-2,-3,-4,-1,-2,-3,-4,-3,2,4,-2,2,0,-3,-2,-1,-2,-1,1,-4,-3,-1,-4]))
    blosum62['K'] = dict(zip(ncbi_blaa,[-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2,5,-1,-3,-1,0,-1,-3,-2,-2,0,1,-1,-4]))
    blosum62['M'] = dict(zip(ncbi_blaa,[-1,-1,-2,-3,-1, 0,-2,-3,-2,1,2,-1,5,0,-2,-1,-1,-1,-1,1,-3,-1,-1,-4]))
    blosum62['F'] = dict(zip(ncbi_blaa,[-2,-3,-3,-3,-2,-3,-3,-3,-1,0,0,-3,0,6,-4,-2,-2,1,3,-1,-3,-3,-1,-4]))
    blosum62['P'] = dict(zip(ncbi_blaa,[-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4,7,-1,-1,-4,-3,-2,-2,-1,-2,-4]))
    blosum62['S'] = dict(zip(ncbi_blaa,[ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2,0,-1,-2,-1,4,1,-3,-2,-2,0,0,0,-4]))
    blosum62['T'] = dict(zip(ncbi_blaa,[ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1,1,5,-2,-2,0,-1,-1,0,-4]))
    blosum62['W'] = dict(zip(ncbi_blaa,[-3 -3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1,1,-4,-3,-2,11,2,-3,-4,-3,-2,-4]))
    blosum62['Y'] = dict(zip(ncbi_blaa,[-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1,3,-3,-2,-2,2,7,-1,-3,-2,-1,-4]))
    blosum62['V'] = dict(zip(ncbi_blaa,[ 0,-3,-3,-3,-1,-2,-2,-3,-3,3,1,-2,1,-1,-2,-2,0,-3,-1,4,-3,-2,-1,-4]))
    blosum62['B'] = dict(zip(ncbi_blaa,[-2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4,0,-3,-3,-2,0,-1,-4,-3,-3, 4, 1,-1,-4]))
    blosum62['Z'] = dict(zip(ncbi_blaa,[-1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3,1,-1,-3,-1,0,-1,-3,-2,-2,1,4,-1,-4]))
    blosum62['X'] = dict(zip(ncbi_blaa,[ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2,0,0,-2,-1,-1,-1,-1,-1,-4]))
    blosum62['*'] = dict(zip(ncbi_blaa,[-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,1]))

    if (len(blosum62.keys()) != len(ncbi_blaa)):
        sys.stderr.write(" blosum62 length mismatch %d != %d\n" %(len(blosum62), len(ncbi_blaa)))
        print(' '.join(ncbi_blaa),file=sys.stderr)
        print(' '.join(blosum62.keys()),file=sys.stderr)

        exit(1)
        
    blosum62_diag = {x:blosum62[x][x] for x in ncbi_blaa}

    return (blosum62, blosum62_diag, -11, -1)

################
# read_annots (\@hit_list)
# input: hit_entry['s_seq_id, etc'], target
# output: modified $hit_entry['domains']
#         modified $hit_entry['sites']
#
# extend to make robust to multiple hits on the same subject

def read_annots(Reader):

    target_set = {}

    current_domain = ""
    hit_ix = 0
    seq_domains = []
    seq_sites = []

    subj_domains = {}

    for line in Reader:
        if (line[0]=='='):
            continue

        line = line.strip("\n")

        # check for header
        if (line[0] == '>'):
            if (current_domain):  # previous domains/sites have already been found and parsed
                if (current_domain not in  target_set):
                    target_set[current_domain] = {}
                    target_set[current_domain]['domains'] = [ d for d in seq_domains ]  # previous domains
                    target_set[current_domain]['sites'] = [ s for s in seq_sites ]      # previous sites
                else:
                    sys.stderr.write("*** phase error: %s duplicate\n"%(current_domain))

            seq_domains = [];   # current domains
            seq_sites = [];     # current sites
            current_domain = line.split(' ')[0][1:]

        else:			# check for data
            a_fields = line.split('\t')
            a_fields[0]=int(a_fields[0])
            if (a_fields[1] == '-'):
                a_fields[2]=int(a_fields[2])
                annot_info = dict(zip(('d_pos','type','d_end','descr'), a_fields))
                re_df=re.compile(r' :(\d+)$')
                annot_info['descr'] =  re_df.sub(r'~\1',annot_info['descr'])
                seq_domains.append(annot_info)
            else:
                annot_info = dict(zip(('d_pos','type', 'd_val', 'descr', a_fields)))
                annot_info['d_end'] = annot_info['d_pos']
                seq_sites.append(annot_info)
    
    Reader.close()

    # get the last one
    if (current_domain):  # previous domains/sites have already been found and parsed
        if (current_domain not in  target_set):
            target_set[current_domain] = {}
            target_set[current_domain]['domains'] = [ d for d in seq_domains ]  # previous domains
            target_set[current_domain]['sites'] = [ s for s in seq_sites ]      # previous sites
        # else:
            # sys.stderr.write("*** phase error: %s duplicate\n"%(current_domain))

    return target_set

################
# merge_annots(hit_r):
#
# take different annotations in hit_r and put them in one list
#
def merge_annots(hit_r):
    merged_annots = []
    if ('q_aligned_domains' in hit_r):
        for annot in hit_r['q_aligned_domains']:
            annot['target']=1
            merged_annots.append(annot)

    if ('aligned_domains' in hit_r):
        for annot in hit_r['aligned_domains']:
            annot['target']=0
            merged_annots.append(annot)

    merged_annots = sorted(merged_annots, key=lambda x: x['qa_start'])

    return(merged_annots)


################
# get_file_annots(file_name)
#
def get_file_annots(file_name, hit_list):

    with open(file_name,'r') as Reader:
        ann_set = read_annots(Reader)

    return ann_set

################
# get_script_annots(script_name, hit_list)
#
# set up stdin/stdout pipe to send in hit list info and read results
#
def get_script_annots(script_name, hit_list, key_list):

    seq_set = {}

    proc = subprocess.Popen(script_name, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True, encoding='utf-8')

    for hit in hit_list:
        (seq_id, seq_len) = (hit[key_list[0]],hit[key_list[1]])
        if (seq_id not in seq_set):
            proc.stdin.write(("%s\t%s\n"%(seq_id,seq_len)))

    proc.stdin.close()

    while (proc.returncode is None):
        proc.poll()

    return read_annots(proc.stdout)

################
#
# link_annots(hit_list, annot_set)
#
# put 'domains' and 'sites' into each hit in the hit list
#
def link_annots(hit_list, annot_set):

    for hit in hit_list:
        seqid = hit['s_seq_id']
        if (seqid in annot_set):
            if ('domains' in annot_set[seqid]):
                hit['domains']=annot_set[seqid]['domains']
            if ('sites' in annot_set[seqid]):
                hit['sites']=annot_set[seqid]['sites']

# input: a blast BTOP string of the form: "1VA160TS7KG10RK27"
# returns a list_ref of tokens: (1, "VA", 60, "TS", 7, "KG, 10, "RK", 27)
#
def decode_btop(btop_str):

    tokens = re.split(r'(\d+)',btop_str)  # split with capture returns both strings between and separator (\d+)

    if not tokens[0]:
        tokens = tokens[1:]

    out_tokens = []

    for token in tokens:
        if re.match(r'\d+',token):
            out_tokens.append(token)
        else:
            mis_tokens = re.split(r'(..)',token)  # split with capture
            for mis in mis_tokens:
                if (mis):
                    out_tokens.append(mis)

    return out_tokens

def parse_query_lib(query_file):

    query_seqs = {}

    with open(query_file,"r") as qfd:
        header=''
        seq_data=''

        for line in qfd:
            line = line.strip("\n")
            
            if (line[0]=='>'):
                if (header):
                    # save existing sequence
                    seq_data = '' + seq_data
                    query_seqs[header]=seq_data

                header = line[1:].split(' ')[0]
            else:
                line = re.sub(r'[^A-Za-z]','',line)
                seq_data += line.upper()

    # save last entry
    if (header):
        query_seqs[header]=seq_data

    return query_seqs

# given: (1) a query sequence; (2) an encoded alignment; (3) a scoring matrix
# calculate:
# (1) the overall score
# (2) a per residue dictionary of scores and mappings from query -> subject
# (2) a per residue dictionary of scores and mappings from subject -> query
#
def alignment_score(query_r, hit, matrix_2d, matrix_diag, g_open, g_ext):

    query_start, subj_start = (int(hit['q_start']),int(hit['s_start']))
    btop_align_r = decode_btop(hit['BTOP'])
    hit['btop_align'] = btop_align_r

    q_map = []
    s_map = []

    gap0, gap1 = (0 ,0)

    q_ix = query_start - 1 # start from zero
    s_ix = subj_start - 1

    score, m_score = (0, 0)
    seq0, seq1 = ("","")

    for btop in btop_align_r:
        if (re.search(r'^\d+$',btop)):   # matching query sequence, add it up
            for i in  range(0,int(btop)):
                res = query_r[q_ix]
                score += matrix_diag[res]
                q_map.append({'s':score, 'y_ix':s_ix, 'res':res})
                s_map.append({'s':score, 'y_ix':q_ix, 'res':res})
                q_ix += 1
                s_ix += 1
        else:
            seq0, seq1 = (btop[0],btop[1])

            if (re.search(r'\-',btop)):  # is there a gap?
                if (seq0 == '-'):      # is it in query?
                    if gap0:             # are we in a gap?
                        score += g_ext
                    else:
                        score += g_open+g_ext
                        gap0 = True
                    # 'y_ix':-1 indicates alignment to gap
                    s_map.append({'s':score, 'y_ix':-1, 'res':seq1})
                    s_ix += 1
                else:			# gap is in subject
                    if gap1:
                        score += g_ext
                    else:
                        score += g_open+g_ext
                        gap1 = True
                    # 'y_ix':-1 indicates alignment to gap
                    q_map.append({'s':score, 'y_ix':-1, 'res':seq0})
                    q_ix += 1
            else:			# mismatch, not gap
                score += matrix_2d[seq0][seq1]
                gap1=gap0 = False
            
                q_map.append({'s':score, 'y_ix':s_ix, 'res':seq0})
                s_map.append({'s':score, 'y_ix':q_ix, 'res':seq1})

                q_ix += 1
                s_ix += 1

    return score, q_map, s_map

################################################################
# sub_alignment_stats -- calculate stats for ONE domain entry
# given x_map, xa_start, xa_end  where x=q/s depending on target
# domain_r : domain boundaries
# 
# calculate a score, identity and boundaries in both sequences and return values
#

def one_sub_alignment_stats(domain_r, x_map, y_map, xa_start, xa_end, ya_start, ya_end):

    td_start, td_end = (domain_r['d_pos'],domain_r['d_end'])

    if (td_end < xa_start or td_start > xa_end):
        return 0

    if (td_start < xa_start):
        td_start = xa_start

    if (td_end > xa_end):
        td_end = xa_end

    td_start -= xa_start
    td_end -= xa_start

    left_score = 0
    if (td_start>0) :
        left_score = x_map[td_start-1]['s']

    score = x_map[td_end]['s'] - left_score

    # map[] coordinates are 0-based
    # ya_start = x_map[td_start]['y_ix']+1
    # ya_end = x_map[td_end]['y_ix']+1

    #### identity calculation:
    n_len = 0
    n_id = 0
    for xi in range(td_start, td_end+1):
        this_x = x_map[xi]

        x_res=this_x['res']
        if (this_x['y_ix'] >= 0):
            n_len += 1
            y_res = y_map[this_x['y_ix']-ya_start+1]['res']
            if (x_res.upper() == y_res.upper()):
                n_id += 1

    ident = float(n_id)/float(n_len)

    return score, ident, td_start+xa_start-1, td_end+xa_start-1, x_map[td_start]['y_ix'], x_map[td_end]['y_ix']

################
# get domain scores, idents, boundaries for list of domains
#
def do_sub_alignment_stats(domain_list, x_map, y_map, xa_start, xa_end, ya_start, ya_end, keys_str):

    aligned_doms = []

    for domain in domain_list:
        subalign_data = one_sub_alignment_stats(domain, x_map, y_map, xa_start, xa_end, ya_start, ya_end)
        if (subalign_data and len(subalign_data)==6):
            sub_data = dict(zip(keys_str,subalign_data))
            for k in ('type','descr'):
                sub_data[k] = domain[k]
            aligned_doms.append(sub_data)

    return(aligned_doms)

####
# print raw domain info:
# |DX:%d-%d;C=dom_info|XD:%d-%d:C=dom_info
#
def format_dom_info(q_dom_r, dom_r):

    dom_str = ""
    for dom  in q_dom_r:
        dom_str += "|DX:%d-%d;C=%s"%(dom['d_pos'],dom['d_end'], dom['descr'])

    for dom  in dom_r:
        dom_str += "|XD:%d-%d;C=%s"%(dom['d_pos'],dom['d_end'], dom['descr'])

    return dom_str


def format_annot_info(annot_list_r, hit):

    annot_str = "";

    # two types of annotations, domains and sites.

    score_scale = hit['score']/hit['raw_score']

    for annot_r in (annot_list_r ):

        if (annot_r['type'] == '-'):
            fsub_score = annot_r['score']/hit['raw_score']

            ns_score, s_bit = (int(annot_r['score'] * score_scale + 0.5),
                               fsub_score * hit['bits'])

            qval = 0.0
            if (hit['evalue'] == 0.0):
                if (s_bit > 50.0):
                    qval = 3000.0
                else:
                    qval = -10.0 * (2.0*log(400.0) + s_bit)/log(10.0)
            else:
                qval = -10.0*log(hit['evalue'])*fsub_score/log(10.0)

            if qval < 0.0:
                qval = 0.0

            rx_str = 'XR'
            if (annot_r['target']):
                rx_str = "RX"
            annot_str += ';'.join(("|%s:%d-%d:%d-%d:s=%d"%(rx_str,
                                      annot_r['qa_start']+1,annot_r['qa_end']+1,
                                      annot_r['sa_start']+1,annot_r['sa_end']+1,ns_score),
                                   "b=%.1f"%(s_bit),"I=%.3f"%(annot_r['ident']),
                                   "Q=%.1f"%(qval),"C=%s"%(annot_r['descr'])))

        else:        # site annotation
            ann_type = annot_r['type'];
            site_str = "|%cX"%(ann_type)
            if (annot_r['target'] == 1):
                site_str = "|X%c"%(ann_type)
            elif (annot_r['target'] == 2):
                site_str = "|%c%c"%(ann_type, ann_type)

            annot_str += "%s:"%(site_str)
            annot_str += "%d%s%s%d%s"%(annot_r['qa_pos'], annot_r['q_res'], annot_r['m_symb'],
                                        annot_r['sa_pos'], annot_r['s_res'])

    return annot_str

def main(args):

    blosum62, blosum62_diag, g_open, g_ext = init_blosum62()

    if (args.query_file):
        # query_lib_r has a set of query sequences
        query_lib_r = parse_query_lib(args.query_file)
    else:
        sys.stderr.write("--query required\n")
        exit(1)

    tab_fields = "q_seqid s_seqid percid alen mismatch gopen q_start q_end s_start s_end evalue bits BTOP".split(' ')
    int_fields = "alen mismatch gopen q_start q_end s_start s_end".split(' ')
    float_fields = "percid evalue bits score".split(' ')

    if (args.have_qslen):
        tab_fields = "q_seqid q_len s_seqid s_len percid alen mismatch gopen q_start q_end s_start s_end evalue bits BTOP".split(' ')
        int_fields = "q_len s_len alen mismatch gopen q_start q_end s_start s_end".split(' ')

    # the fields that are displayed are listed here.  By default, all fields except score and BTOP are displayed.
    out_tab_fields = tab_fields[0:-1]
    in_tab_fields = tab_fields[0:-1]

    if (args.raw_out):
        out_tab_fields.append("raw_score")

    if (args.raw_in):
        in_tab_fields.append("score")

    ## always add BTOP
    in_tab_fields.append("BTOP")
    tab_fields = in_tab_fields

    if (args.out_fields):
        out_tab_fields = out_fields.split(" ")

    header_lines = []

    next_line = ""
    have_data = False

    hit_list = []
    q_hit_list = []

    for line in fileinput.input(args.files):
        if (line[0] == '#'):
            if (have_data):
                next_line = line
                have_data = False
                break
            else:
                header_lines.append(line)
            continue

        have_data = True
        line = line.strip('\n')
        if (line):
            this_data = dict(zip(tab_fields, line.split("\t")))
            for k in this_data.keys():
                if (k in int_fields):
                    this_data[k] = int(this_data[k])
                if (k in float_fields):
                    this_data[k] = float(this_data[k])
            hit_list.append(this_data)

    # get the query annotations
    q_hit_list = []
    if (args.q_ann_file):
        q_seqid = hit_list[0]['q_seqid']
        q_hit_list.append({'s_seq_id':q_seqid, 's_end':len(query_lib_r[q_seqid])})
        q_annots = get_file_annots(args.q_ann_file, q_hit_list)
        link_annots(q_hit_list, q_annots)

    elif (args.q_ann_script):
        args.q_ann_script = re.sub(r'\+',' ',args.q_ann_script)

        if (args.q_ann_script and shutil.which(args.q_ann_script.split(" ")[0])):
            q_seqid = hit_list[0]['q_seqid']
            q_hit_list.append({'s_seq_id':q_seqid, 's_end':len(query_lib_r[q_seqid])})
            q_annots = get_script_annots(args.q_ann_script, q_hit_list, ['s_seq_id','s_end'])
            link_annots(q_hit_list, q_annots)

    # get the subject annotations
    # first set up the list with sequence lengths
    if (args.ann_file or args.ann_script):
        s_len = 100000
        for hit in hit_list:
            hit['s_seq_id']=hit['s_seqid']
            if (not args.have_qslen):
                hit['s_end']=s_len
        
        if (args.ann_file):
            s_annots = get_file_annots(args.ann_file, hit_list)
            link_annots(hit_list, s_annots)
        elif (args.ann_script):
            args.ann_script = re.sub(r'\+',' ',args.ann_script)
            if (shutil.which(args.ann_script.split(" ")[0])):
                s_annots = get_script_annots(args.ann_script, hit_list,['s_seq_id','s_end'])
                link_annots(hit_list, s_annots)

    for line in header_lines:
        print(line, end='')

    header_lines = [next_line]

    # now get query annotation if available

    for hit in hit_list:
        list_covered = []

        # If I have an encoded aligment {BTOP} and a query sequence query_lib_r && query_lib_r[hit['q_seqid']]
        # then I can calculate sub-alignment scores
        if ('BTOP' in hit and query_lib_r and hit['q_seqid'] in query_lib_r):

            # calculate raw_score and mappings
            hit['raw_score'], q_map, s_map = alignment_score(query_lib_r[hit['q_seqid']],
                                                             hit,blosum62, blosum62_diag, g_open, g_ext)
            if ('score' not in hit):
                hit['score'] = hit['raw_score']

            # calculate sub-alignment scores in subject/library coordinates

            if ('domains' in hit and len(hit['domains'])>0):
                hit['aligned_domains'] = do_sub_alignment_stats(hit['domains'], s_map, q_map, hit['s_start'],hit['s_end'],hit['q_start'],hit['q_end'],
                                                                ('score','ident','sa_start', 'sa_end', 'qa_start', 'qa_end'))

            # calculate sub-alignment scores in query coordinates
            if (len(q_hit_list) > 0 and 'domains' in q_hit_list[0] and len(q_hit_list[0]['domains'])>0):
                hit['q_aligned_domains'] = do_sub_alignment_stats(q_hit_list[0]['domains'], q_map, s_map, hit['q_start'],hit['q_end'],hit['s_start'],hit['s_end'],
                                                                  ('score','ident','qa_start', 'qa_end', 'sa_start', 'sa_end'))
        ################
        ## final output display

        print("\t".join([str(hit[x]) for x in out_tab_fields]),end='') # show fields from original blast tabular file

        merged_annots_r = merge_annots(hit)                # merge the four possible annotation lists into one.

        if (len(merged_annots_r)>0):
            print("\t"+format_annot_info(merged_annots_r, hit),end='')
            if (args.dom_info):
                if (len(q_hit_list) > 0 and 'domains' in q_hit_list[0]):
                    print("\t"+format_dom_info(q_hit_list[0]['domains'], hit['domains']),end='')
                else:
                    print("\t"+format_dom_info([], hit['domains']),end='')
        elif (len(list_covered)>0):
            print("\t" + ";".join(list_covered))
            if (args.dom_info):
                print("\t"+format_dom_info(q_hit_list[0]['domains'], hit['domains']),end='')
        print()

    for line in header_lines:
        print(line,end="")


if __name__ == '__main__':

    print('# ' + ' '.join(sys.argv))

    parser=argparse.ArgumentParser(description='annot_blast_btop4.py : annotate blast tabular format with BTOP ')
    # not implemented
    # parser.add_argument('--matrix', help='scoring matrix',dest='matrix',action='store',default='BL62')
    parser.add_argument('--ann_script', help='script for subject annotations',dest='ann_script',action='store')
    parser.add_argument('--q_ann_script', help='script for query annotations',dest='q_ann_script',action='store')
    parser.add_argument('--ann_file', help='subject annotation file',dest='ann_file',action='store')
    parser.add_argument('--q_ann_file', help='query annotation file',dest='q_ann_file',action='store')
    parser.add_argument('--have_qslen', help='query/subject lenghts in tab file',dest='have_qslen',action='store_true',default=False)
    parser.add_argument('--dom_info', help='show unaligned domain coordinates',dest='dom_info',action='store_true',default=False)
    parser.add_argument('--sub2query', help='get query annots from self-subject',dest='sub_query',action='store_true',default=False)
    parser.add_argument('--query', help='file of query sequences',dest='query_file',action='store')
    parser.add_argument('--out_fields', help='names/order of output fields',dest='out_fields',action='store')
    parser.add_argument('--raw_score', help='raw score after bit score',dest='raw_in',action='store_true',default=True)
    parser.add_argument('--no_raw_score', help='raw score after bit score',dest='raw_in',action='store_false', default=True)
    parser.add_argument('--no-raw_score', help='raw score after bit score',dest='raw_in',action='store_false', default=True)
    parser.add_argument('--raw_score_out', help='display raw score',dest='raw_out',action='store_true',default=False)
    parser.add_argument('files', metavar='FILE', help='Blast tabular BTOP files to read', nargs='*')

    args=parser.parse_args()

    main(args)