File: bed_score_aligned_pwm.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (86 lines) | stat: -rwxr-xr-x 2,810 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/python3
"""
Returns all positions of a maf with any pwm score > threshold
The positions are projected onto human coordinates
"""

import sys

from bx import intervals
from bx.align import maf as align_maf
from bx.pwm.pwm_score_maf import MafBlockScorer
from . import position_weight_matrix as pwmx


def isnan(x):
    return not x == x


def main():
    if len(sys.argv) < 5:
        print(f"{sys.argv[0]} bedfile inmaf spec1,spec2,... motif_file ", file=sys.stderr)
        sys.exit(0)

    # read in intervals
    regions = {}
    for line in open(sys.argv[1]):
        if line.startswith("#"):
            continue
        fields = line.strip().split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        try:
            name = fields[3]
        except IndexError:
            name = None
        if chrom not in regions:
            regions[chrom] = intervals.Intersecter()
        regions[chrom].add(start, end, name)

    pwm = {}
    for wm in pwmx.Reader(open(sys.argv[4])):
        pwm[wm.id] = wm
        print(wm.id, len(wm), file=sys.stderr)

    inmaf = open(sys.argv[2])
    threshold = 0.5

    species = []

    for sp in sys.argv[3].split(","):
        species.append(sp)

    for maf in align_maf.Reader(inmaf):
        mafchrom = maf.components[0].src.split(".")[1]
        mafstart = maf.components[0].start
        mafend = maf.components[0].end
        reftext = maf.components[0].text

        # maf block scores for each matrix
        for scoremax, width, headers in MafBlockScorer(pwm, species, maf):
            blocklength = width
            mafsrc, mafstart, mafend = headers[0]
            mafchrom = mafsrc.split(".")[1]

            # lists of scores for each position in scoremax
            for mx_name, mx in scoremax.items():
                for offset in range(blocklength):
                    # scan all species with threshold
                    for i in range(len(species)):
                        if mx[i][offset] > threshold:
                            refstart = mafstart + offset - reftext.count("-", 0, offset)
                            refend = refstart + len(pwm[mx_name])

                            data = " ".join([f"{mx[x][offset]:.2f}" for x in range(len(species))])
                            # quote the motif
                            r = regions[mafchrom].find(refstart, refend)
                            if mafchrom in regions and len(r) > 0:
                                region_label = r[0].value
                            else:
                                continue
                            v_name = mx_name.replace(" ", "_")
                            print(mafchrom, refstart, refend, region_label, v_name, data)
                            break


if __name__ == "__main__":
    main()