File: analyze_indels.py

package info (click to toggle)
vg 1.30.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 267,848 kB
  • sloc: cpp: 446,974; ansic: 116,148; python: 22,805; cs: 17,888; javascript: 11,031; sh: 5,866; makefile: 4,039; java: 1,415; perl: 1,303; xml: 442; lisp: 242
file content (130 lines) | stat: -rwxr-xr-x 4,866 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/python3

"""
analyze_indels.py: take GAM JSON as input and produce tables of non-softclip
indel positions in the read and lengths
"""

import argparse
import os
import sys
import time
import subprocess
import collections
import json

    
def parse_args(args):
    """
    Takes in the command-line arguments list (args), and returns a nice argparse
    result with fields for all the options.
    
    Borrows heavily from the argparse documentation examples:
    <http://docs.python.org/library/argparse.html>
    """
    
    # Construct the parser (which is stored in parser)
    # Module docstring lives in __doc__
    # See http://python-forum.com/pythonforum/viewtopic.php?f=3&t=36847
    # And a formatter class so our examples in the docstring look good. Isn't it
    # convenient how we already wrapped it to 80 characters?
    # See http://docs.python.org/library/argparse.html#formatter-class
    parser = argparse.ArgumentParser(description=__doc__, 
        formatter_class=argparse.RawDescriptionHelpFormatter)
    
    parser.add_argument("--input", type=argparse.FileType('r'), default=sys.stdin,
                        help="JSON GAM to process")
    parser.add_argument("--positions", required=True, type=argparse.FileType('w'),
                        help="TSV of positions and counts to write")
    parser.add_argument("--lengths", required=True, type=argparse.FileType('w'),
                        help="TSV of lengths and counts to write")
    parser.add_argument("--mapqs", required=True, type=argparse.FileType('w'),
                        help="TSV of indel/gapless categories, MAPQs and counts to write")
    
    # The command line arguments start with the program name, which we don't
    # want to treat as an argument for argparse. So we remove it.
    args = args[1:]
        
    return parser.parse_args(args)

def main(args):
    """
    Parses command line arguments and do the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """
    
    options = parse_args(args) # This holds the nicely-parsed options object
    
    # These will track stats over all the indels.
    
    # Where does each indel start in read space?
    indel_starts = collections.Counter()
    # How long is each indel?
    indel_lengths = collections.Counter()
    # For reads with and without indels, what are your MAPQs?
    read_mapqs = {'indel': collections.Counter(), 'gapless': collections.Counter()}
    
    for line in options.input:
        # Load each line
        alignment = json.loads(line)
        
        # Parse the alignment using these counters
        read_offset = 0
        ref_offset = 0
        
        # Determine if the read is 'indel' or 'gapless'
        status = 'gapless'
        
        mappings = alignment.get("path", {}).get("mapping", []) 
        
        for i, mapping in enumerate(mappings):
            edits = mapping.get("edit", [])
            for j, edit in enumerate(edits):
                # Parse each edit
                from_length = edit.get("from_length", 0)
                to_length = edit.get("to_length", 0)
                
                # If we're the first or last edit, we're never an indel; we're a softclip instead
                is_border = (i == 0 and j == 0) or (i == len(mappings) - 1 and j == len(edits) - 1)
                
                # If the lengths differ and we aren't on the end of the read we're an indel.
                is_indel = from_length != to_length and not is_border
                
                if is_indel:
                    # Record this indel
                    indel_starts[read_offset] += 1
                    indel_lengths[to_length - from_length] += 1
                    # Mark that the read has an indel
                    status = 'indel'
                
                # Advance the position counters
                read_offset += to_length
                ref_offset += from_length
                
        # Record the read's MAPQ
        read_mapqs[status][alignment.get("mapping_quality", 0)] += 1
                
    # Now dump tables for plotting
    for value, count in indel_starts.items():
        options.positions.write("{}\t{}\n".format(value, count))
        
    for value, count in indel_lengths.items():
        options.lengths.write("{}\t{}\n".format(value, count))
      
    for status, counts in read_mapqs.items():
        for value, count in counts.items():
            options.mapqs.write("{}\t{}\t{}\n".format(status, value, count))


def entrypoint():
    """
    0-argument entry point for setuptools to call.
    """
    
    # Provide main with its arguments and handle exit codes
    sys.exit(main(sys.argv))
    
if __name__ == "__main__" :
    entrypoint()