1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
|
#!/usr/bin/python3
"""
WARNING: bz2/bz2t support and file cache support are new and not as well
tested.
usage: %prog maf_files [options] < interval_file
-s, --species=SPECIES: Comma separated list of species to include
-p, --prefix=PREFIX: Prefix to add to each interval chrom (usually reference species)
-C, --usecache: Use a cache that keeps blocks of the MAF files in memory (requires ~20MB per MAF)
"""
import sys
from numpy import zeros
import bx.align.maf
from bx.cookbook import doc_optparse
def main():
# Parse Command Line
options, args = doc_optparse.parse(__doc__)
try:
maf_files = args
species = options.species.split(",")
prefix = options.prefix
use_cache = bool(options.usecache)
if not prefix:
prefix = ""
except Exception:
doc_optparse.exit()
# Open indexed access to mafs
index = bx.align.maf.MultiIndexed(maf_files, parse_e_rows=True, use_cache=use_cache)
# Print header
print("#chr", "start", "end", end=" ")
for s in species:
print(s, end=" ")
print()
# Iterate over input ranges
for line in sys.stdin:
fields = line.split()
# Input is BED3+
chr, start, end = fields[0], int(fields[1]), int(fields[2])
length = end - start
assert length > 0, "Interval has length less than one"
# Prepend prefix if specified
src = prefix + chr
# Keep a bitset for each species noting covered pieces
aligned_bits = []
missing_bits = []
for s in species:
aligned_bits.append(zeros(length, dtype=bool))
missing_bits.append(zeros(length, dtype=bool))
# Find overlap with reference component
blocks = index.get(src, start, end)
# Determine alignability for each position
for block in blocks:
# Determine the piece of the human interval this block covers,
# relative to the start of the interval of interest
ref = block.get_component_by_src(src)
assert ref.strand == "+", "Reference species blocks must be on '+' strand"
rel_start = max(start, ref.start) - start
rel_end = min(end, ref.end) - start
# Check alignability for each species
for i, s in enumerate(species):
other = block.get_component_by_src_start(s)
# Species does not appear at all indicates unaligned (best we
# can do here?)
if other is None:
continue
# An empty component might indicate missing data, all other
# cases (even contiguous) we count as not aligned
if other.empty:
if other.synteny_empty == bx.align.maf.MAF_MISSING_STATUS:
missing_bits[i][rel_start:rel_end] = True
# Otherwise we have a local alignment with some text, call
# it aligned
else:
aligned_bits[i][rel_start:rel_end] = True
# Now determine the total alignment coverage of each interval
print(chr, start, end, end=" ")
for i, s in enumerate(species):
aligned = sum(aligned_bits[i])
missing = sum(missing_bits[i])
# An interval will be called missing if it is < 100bp and <50%
# present, or more than 100bp and less that 50bp present (yes,
# arbitrary)
if length < 100 and missing > (length / 2):
print("NA", end=" ")
elif length >= 100 and missing > 50:
print("NA", end=" ")
else:
print(aligned / (length - missing), end=" ")
print()
# Close MAF files
index.close()
if __name__ == "__main__":
main()
|