1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
|
#!/usr/bin/python3
"""
Returns all positions of a maf with any pwm score > threshold
The positions are projected onto human coordinates
"""
import sys
from bx import intervals
from bx.align import maf as align_maf
from bx.pwm.pwm_score_maf import MafBlockScorer
from . import position_weight_matrix as pwmx
def isnan(x):
return not x == x
def main():
if len(sys.argv) < 5:
print(f"{sys.argv[0]} bedfile inmaf spec1,spec2,... motif_file ", file=sys.stderr)
sys.exit(0)
# read in intervals
regions = {}
for line in open(sys.argv[1]):
if line.startswith("#"):
continue
fields = line.strip().split()
chrom, start, end = fields[0], int(fields[1]), int(fields[2])
try:
name = fields[3]
except IndexError:
name = None
if chrom not in regions:
regions[chrom] = intervals.Intersecter()
regions[chrom].add(start, end, name)
pwm = {}
for wm in pwmx.Reader(open(sys.argv[4])):
pwm[wm.id] = wm
print(wm.id, len(wm), file=sys.stderr)
inmaf = open(sys.argv[2])
threshold = 0.5
species = []
for sp in sys.argv[3].split(","):
species.append(sp)
for maf in align_maf.Reader(inmaf):
mafchrom = maf.components[0].src.split(".")[1]
mafstart = maf.components[0].start
mafend = maf.components[0].end
reftext = maf.components[0].text
# maf block scores for each matrix
for scoremax, width, headers in MafBlockScorer(pwm, species, maf):
blocklength = width
mafsrc, mafstart, mafend = headers[0]
mafchrom = mafsrc.split(".")[1]
# lists of scores for each position in scoremax
for mx_name, mx in scoremax.items():
for offset in range(blocklength):
# scan all species with threshold
for i in range(len(species)):
if mx[i][offset] > threshold:
refstart = mafstart + offset - reftext.count("-", 0, offset)
refend = refstart + len(pwm[mx_name])
data = " ".join([f"{mx[x][offset]:.2f}" for x in range(len(species))])
# quote the motif
r = regions[mafchrom].find(refstart, refend)
if mafchrom in regions and len(r) > 0:
region_label = r[0].value
else:
continue
v_name = mx_name.replace(" ", "_")
print(mafchrom, refstart, refend, region_label, v_name, data)
break
if __name__ == "__main__":
main()
|