File: m4topre.py

package info (click to toggle)
pbdagcon 0.3%2Bgit20180411.c14c422%2Bdfsg-8
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,004 kB
  • sloc: ansic: 8,522; cpp: 7,747; python: 668; makefile: 197; sh: 130
file content (209 lines) | stat: -rwxr-xr-x 5,954 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/python3
"""Super-simple converter from blasr m4 alignments to pbdagcon 'pre'
alignments. For use in the pre-assembler dagcon workflow.
"""
from __future__ import print_function

import sys
import heapq
import string # pylint: disable=W0402

from collections import namedtuple, defaultdict
import numpy as np
from pbcore.io.FastaIO import FastaReader

# qname tname score pctsimilarity qstrand qstart qend qseqlength tstrand tstart
# ... tend tseqlength mapqv
#
# store only fields we need
__m4fields__ = [0, 1, 2, 5, 6, 8, 9, 10, 11]
M4RECORD = namedtuple(
    'M4RECORD', 'qname tname score qstart qend tstrand tstart tend tseqlength')

__tuplfy__ = M4RECORD._make # pylint: disable=W0212

# dna compliment
__rc__ = string.maketrans('actgACTG', 'tgacTGAC')


def parse_m4(rec):
    """Parse in the m4 file, returning a list of records"""
    return [y for (x, y) in enumerate(rec.split()) if x in __m4fields__]


def rating(rec):
    """Rates the alignment for by length and score (revisit at some point)"""
    score = -int(rec.score)
    alen = int(rec.tend) - int(rec.tstart)
    return score + alen


def schwartzian(rec):
    """Provides a schwartzian transform for the given record, used for 
    sorting
    """
    flds = rec.split()
    return (flds[1], float(flds[2]), rec)


def sort_targ_score(recs):
    """Sorts the list in place by target id (string), then score (float)"""
    recs[:] = [schwartzian(x) for x in recs]
    recs.sort()
    recs[:] = [rec for (target, score, rec) in recs] # pylint: disable=W0612


def rescore(recs):
    """Rescore alignments using coverage based statistics"""
    prev = ""
    cov = np.zeros(1)
    for idx, rec in enumerate(recs):
        fields = rec.split()
        rec = __tuplfy__(fields)
        if rec.tname != prev:
            prev = rec.tname
            cov = np.zeros(int(rec.tseqlength), dtype=np.float16)

        if rec.tstrand:
            start = int(rec.tseqlength) - int(rec.tend)
            end = int(rec.tseqlength) - int(rec.tstart)
        else:
            start = int(rec.tstart)
            end = int(rec.tend)

        cov[start:end] += 1
        score = np.sum(1/cov[start:end])
        fields[2] = str(-score)
        recs[idx] = " ".join(fields)


def bestn_true(recstr, myq):
    """Checks if the record falls inside bestn (used when blasr is chunked)"""
    rec = __tuplfy__(recstr.split())
    rate = rating(rec)
    return rate in myq[rec.qname[32:]].top


class AlnLimiter(object): # pylint: disable=R0903
    """Functor that returns alignments until some count is reached. Alignments
    should be sorted. 
    """
    def __init__(self, limit=76):
        self.count = 0
        self.target = ''
        self.limit = limit

    def __call__(self, rec):
        target = rec.split()[1]
        if target != self.target:
            self.count = 0
            self.target = target
        self.count += 1
        return self.count < self.limit


class TopAlignments(object): # pylint: disable=R0903
    """Tracks the top alignments for a given query, used for bestn calc"""
    bestn = 10

    def __init__(self):
        self.top = [0] * TopAlignments.bestn

    def __call__(self):
        return  # noop

    def add(self, aln):
        """Adds an alignment to a bounded list, kicking out another if 
        necessary
        """
        heapq.heappushpop(self.top, aln)


def main(): # pylint: disable=R0914
    """Drives the program"""
    mym4 = sys.argv[1]
    allm4 = sys.argv[2]
    reads = sys.argv[3]
    TopAlignments.bestn = int(sys.argv[4])

    # tracks bestn
    my_queries = defaultdict(TopAlignments)
    my_m4recs = []

    # load my m4 chunk
    m4h = open(mym4)
    rec_add = my_m4recs.append
    for line in m4h:
        flds = parse_m4(line)
        rec = __tuplfy__(flds)
        rate = rating(rec)
        my_queries[rec.qname[32:]].add(rate)
        rec_add(' '.join(flds))

    m4h.close()

    # if we're chunked locate relevant alignments
    if mym4 != allm4:
        # assuming fofn here
        m4files = [x.rstrip() for x in open(allm4) if x.rstrip() != mym4]
        for m4f in m4files:
            m4h = open(m4f)
            for recstr in m4h:
                rec = __tuplfy__(parse_m4(recstr))
                if rec.qname[32:] in my_queries:
                    rate = rating(rec)
                    my_queries[rec.qname[32:]].add(rate)
            m4h.close()

        # remove alignments that fall outside of bestn
        my_m4recs[:] = [x for x in my_m4recs if bestn_true(x, my_queries)]

    # sort by target name/score
    sort_targ_score(my_m4recs)

    # rescore based on coverage
    rescore(my_m4recs)

    # sort one more time be new score
    sort_targ_score(my_m4recs)

    # take a max number of alignments for each target
    limiter = AlnLimiter()
    my_m4recs[:] = [x for x in filter(limiter, my_m4recs)]

    # load only related sequences
    seqs = {}
    frea = FastaReader(reads)
    for fent in frea:
        if fent.name[32:] in my_queries:
            seqs[fent.name] = fent.sequence

    # may or may not help
    del my_queries

    # generate pre-alignments
    for recstr in my_m4recs:
        rec = __tuplfy__(recstr.split())

        # Bug 24538, rare case missing self hit
        if rec.tname not in seqs:
            msg = "Warning: skipping query %s target %s\n"
            sys.stderr.write(msg % (rec.qname, rec.tname))
            continue

        qst = int(rec.qstart)
        qnd = int(rec.qend)
        qseq = seqs[rec.qname][qst:qnd]
        strand = '-' if rec.tstrand == '1' else '+'
        tst = int(rec.tstart)
        tnd = int(rec.tend)
        if strand == '+':
            tseq = seqs[rec.tname][tst:tnd]
        else:
            tseq = seqs[rec.tname].translate(__rc__)[::-1][tst:tnd]

        print(' '.join([rec.qname, rec.tname, strand,
                       rec.tseqlength, str(tst), str(tnd), qseq, tseq]))

if __name__ == '__main__':
    sys.exit(main())