File: div_snp_table_chr.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (158 lines) | stat: -rwxr-xr-x 4,950 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/python3

"""
FIXME!

usage: %prog feature.bed ar.bed snp.bed div_directory [options]
    -m, --mask=M: Mask AR and features with this file
    -s, --suffix=S: append suffix to chromosomes to get filenames from div_directory
    -l, --lens=l: Set chromosome ends using LEN file
"""

import sys

from bx.bitset import BinnedBitSet
from bx.bitset_builders import binned_bitsets_from_file
from bx.cookbook import doc_optparse


def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        lens = {}
        if options.lens:
            for line in open(options.lens):
                chrom, length = line.split()
                lens[chrom] = int(length)

        if options.suffix:
            suffix = options.suffix
        else:
            suffix = ""

        print("\nReading feature", end=" ", file=sys.stderr)
        interval_file = open(args[0])
        feature = binned_bitsets_from_file(interval_file, lens=lens)
        interval_file.close()
        # reuse interval file
        intervals = {}
        interval_file = open(args[0])
        for line in interval_file:
            fields = line.split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            if chrom not in intervals:
                intervals[chrom] = []
            intervals[chrom].append([start, end])
        interval_file.close()

        print("\nReading ar", end=" ", file=sys.stderr)
        ar = binned_bitsets_from_file(open(args[1]), lens=lens)

        print("\nReading snps", end=" ", file=sys.stderr)
        snp = binned_bitsets_from_file(open(args[2]), lens=lens)
        snp_mask = clone_inverted(snp)
        snp_copy = clone(snp)

        print("\nMasking AR", end=" ", file=sys.stderr)
        ar_mask = clone_inverted(ar)
        print(file=sys.stderr)

        dirname = args[3]

        if options.mask:
            mask = binned_bitsets_from_file(open(options.mask), lens=lens)
        else:
            mask = None
    except Exception:
        doc_optparse.exit()

    if mask:
        for chrom in mask.keys():
            if chrom in feature:
                feature[chrom].iand(mask[chrom])
            if chrom in ar:
                ar[chrom].iand(mask[chrom])

    # divergence and snp counts for all features
    feature_div_count = 0
    feature_snp_count = 0
    ar_div_count = 0
    ar_snp_count = 0

    # collect snp and div
    for chr in feature.keys():
        if chr not in snp:
            continue
        if chr not in ar:
            continue

        print(f"reading {chr} ...", end=" ", file=sys.stderr)
        try:
            div = binned_bitsets_from_file(open(dirname + "/%s.bed" % (chr + suffix)), lens=lens)
        except Exception:
            print(f"{chr}.bed not found", file=sys.stderr)
            continue

        div[chr].iand(snp_mask[chr])  # div/snp sites count snp-only
        div_copy = clone(div)

        print("AR:", chr, end=" ", file=sys.stderr)
        snp[chr].iand(ar[chr])
        div[chr].iand(ar[chr])
        snp_count = snp[chr].count_range(0, snp[chr].size)
        ar_snp_count += snp_count
        print(snp_count, end=" ", file=sys.stderr)
        try:
            div_count = div[chr].count_range(0, div[chr].size)
            ar_div_count += div_count
            print(div_count, file=sys.stderr)
        except Exception:
            print(chr, "failed", file=sys.stderr)

        div = div_copy
        snp[chr] = snp_copy[chr]
        print("feature:", chr, end=" ", file=sys.stderr)
        feature[chr].iand(ar_mask[chr])  # clip to non-AR only
        snp[chr].iand(feature[chr])
        div[chr].iand(feature[chr])
        feature_snp_count += snp[chr].count_range(0, snp[chr].size)
        print(snp[chr].count_range(0, snp[chr].size), div[chr].count_range(0, div[chr].size), file=sys.stderr)
        feature_div_count += div[chr].count_range(0, div[chr].size)
        print(snp[chr].count_range(0, snp[chr].size), div[chr].count_range(0, div[chr].size), file=sys.stderr)

        # Note: can loop over feature intervals here for individual counts
        if chr in intervals:
            for start, end in intervals[chr]:
                ind_div_count = div[chr].count_range(start, end - start)
                ind_snp_count = snp[chr].count_range(start, end - start)
                print(chr, start, end, ind_div_count, ind_snp_count)

    print("feature snp\t%d" % feature_snp_count)
    print("feature div\t%d" % feature_div_count)
    print("ar snp\t%d" % ar_snp_count)
    print("ar div\t%d" % ar_div_count)


# copies a dictionary of bitsets
def copybits(binnedbits):
    bitset = BinnedBitSet(binnedbits.size)
    bitset.ior(binnedbits)
    return bitset


def clone(bitsets):
    r = {}
    for k, b in bitsets.items():
        r[k] = copybits(b)
    return r


def clone_inverted(bitsets):
    r = {}
    for k, b in bitsets.items():
        r[k] = copybits(b)
        r[k].invert()
    return r


main()