File: maf_build_index.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (82 lines) | stat: -rwxr-xr-x 2,747 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/python3

"""
Build an index file for a set of MAF alignment blocks.

If index_file is not provided maf_file.index is used.

usage: %prog maf_file index_file
    -s, --species=a,b,c: only index the position of the block in the listed species
"""

import os.path
from io import TextIOWrapper

import bx.align.maf
from bx import interval_index_file
from bx.cookbook import doc_optparse
from bx.misc.seekbzip2 import SeekableBzip2File
from bx.misc.seeklzop import SeekableLzopFile


def main():
    options, args = doc_optparse.parse(__doc__)

    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith(".bz2"):
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File(maf_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith(".lzo"):
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit(
                    "To index lzo compressed files first create a lzot file with lzop_build_offset_table."
                )
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableLzopFile(maf_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open(maf_file, "rb")
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = maf_file + ".index"
        if options.species:
            species = options.species.split(",")
        else:
            species = None
    except Exception:
        doc_optparse.exception()

    maf_in = TextIOWrapper(maf_in, encoding="ascii")
    maf_reader = bx.align.maf.Reader(maf_in, parse_e_rows=True)

    indexes = interval_index_file.Indexes()

    # Need to be a bit tricky in our iteration here to get the 'tells' right
    while True:
        pos = maf_reader.file.tell()
        block = next(maf_reader)
        if block is None:
            break
        for c in block.components:
            if species is not None and c.src.split(".")[0] not in species:
                continue
            indexes.add(c.src, c.forward_strand_start, c.forward_strand_end, pos, max=c.src_size)

    out = open(index_file, "wb")
    indexes.write(out)
    out.close()


if __name__ == "__main__":
    main()