1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
|
#!/usr/bin/env python
__version__ = "$Revision: 1.9 $"
from __future__ import division
import commands
import itertools
import os
import re
from Bio import Wise
_SCORE_MATCH = 4
_SCORE_MISMATCH = -1
_SCORE_GAP_START = -5
_SCORE_GAP_EXTENSION = -1
_CMDLINE_DNAL = ["dnal", "-alb", "-nopretty"]
def _build_dnal_cmdline(match, mismatch, gap, extension):
res = _CMDLINE_DNAL[:]
res.extend(["-match", str(match)])
res.extend(["-mis", str(mismatch)])
res.extend(["-gap", str(-gap)]) # negative: convert score to penalty
res.extend(["-ext", str(-extension)]) # negative: convert score to penalty
return res
_CMDLINE_FGREP_COUNT = "fgrep -c '%s' %s"
def _fgrep_count(pattern, file):
return int(commands.getoutput(_CMDLINE_FGREP_COUNT % (pattern, file)))
_re_alb_line2coords = re.compile(r"^\[([^:]+):[^\[]+\[([^:]+):")
def _alb_line2coords(line):
return tuple([int(coord)+1 # one-based -> zero-based
for coord
in _re_alb_line2coords.match(line).groups()])
def _get_coords(filename):
alb = file(filename)
start_line = None
end_line = None
for line in alb:
if line.startswith("["):
if not start_line:
start_line = line # rstrip not needed
else:
end_line = line
if end_line is None: # sequence is too short
return [(0, 0), (0, 0)]
return zip(*map(_alb_line2coords, [start_line, end_line])) # returns [(start0, end0), (start1, end1)]
def _any(seq, pred=bool):
"Returns True if pred(x) is True at least one element in the iterable"
return True in itertools.imap(pred, seq)
class Statistics(object):
"""
Calculate statistics from an ALB report
"""
def __init__(self, filename, match, mismatch, gap, extension):
self.matches = _fgrep_count('"SEQUENCE" %s' % match, filename)
self.mismatches = _fgrep_count('"SEQUENCE" %s' % mismatch, filename)
self.gaps = _fgrep_count('"INSERT" %s' % gap, filename)
if gap == extension:
self.extensions = 0
else:
self.extensions = _fgrep_count('"INSERT" %s' % extension, filename)
self.score = (match*self.matches +
mismatch*self.mismatches +
gap*self.gaps +
extension*self.extensions)
if _any([self.matches, self.mismatches, self.gaps, self.extensions]):
self.coords = _get_coords(filename)
else:
self.coords = [(0, 0), (0,0)]
def identity_fraction(self):
return self.matches/(self.matches+self.mismatches)
header = "identity_fraction\tmatches\tmismatches\tgaps\textensions"
def __str__(self):
return "\t".join([str(x) for x in (self.identity_fraction(), self.matches, self.mismatches, self.gaps, self.extensions)])
def align(pair, match=_SCORE_MATCH, mismatch=_SCORE_MISMATCH, gap=_SCORE_GAP_START, extension=_SCORE_GAP_EXTENSION, **keywds):
cmdline = _build_dnal_cmdline(match, mismatch, gap, extension)
temp_file = Wise.align(cmdline, pair, **keywds)
try:
return Statistics(temp_file.name, match, mismatch, gap, extension)
except AttributeError:
try:
keywds['dry_run']
return None
except KeyError:
raise
def main():
import sys
stats = align(sys.argv[1:3])
print "\n".join(["%s: %s" % (attr, getattr(stats, attr))
for attr in
("matches", "mismatches", "gaps", "extensions")])
print "identity_fraction: %s" % stats.identity_fraction()
print "coords: %s" % stats.coords
def _test(*args, **keywds):
import doctest, sys
doctest.testmod(sys.modules[__name__], *args, **keywds)
if __name__ == "__main__":
if __debug__:
_test()
main()
|