File: bamtofastq.py

package info (click to toggle)
bamkit 0.0.1%2Bgit20170413.ccd079d-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 232 kB
  • sloc: python: 1,470; sh: 51; makefile: 11
file content (187 lines) | stat: -rwxr-xr-x 6,866 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env python3

# for tgi cluster:
#/gapp/x64linux/opt/pythonbrew/venvs/Python-2.7.6/gemini/bin/python
# for uva cluster:

import pysam
import sys
import argparse
from argparse import RawTextHelpFormatter
import string
from string import *

__author__ = "Ira Hall (ihall@genome.wustl.edu) and Colby Chiang (cc2qe@virginia.edu)"
__version__ = "$Revision: 0.0.1 $"
__date__ = "$Date: 2014-09-04 14:31 $"

def bamtofastq(bamlist, is_sam, readgroup, rename, header):
    for bamfile in bamlist:
        # get file and header
        if bamfile == None: 
            if is_sam:
                bam = pysam.Samfile("-", "r", check_sq=False)
            else:
                bam = pysam.Samfile('-', 'rb', check_sq=False)
        else:
            if is_sam:
                bam = pysam.Samfile(bamfile, 'r', check_sq=False)
            else:
                bam = pysam.Samfile(bamfile, "rb", check_sq=False)
        # parse readgroup string
        try:
            rg_list = readgroup.split(',')
        except AttributeError:
            rg_list = None

        d = {}
        counter = 0
        header_written = False

        for al in bam:
            # print the header to file if requested
            if (header is not None
                and not header_written):
                header.write(bam.text)
                header.close()
                header_written = True

            # must be primary read alignment, not secondary or supplementary
            if (al.is_secondary or al.flag & 2048 == 2048):
                continue

            # skip unpaired reads
            if (not al.is_paired):
                continue

            # must be in a user specified readgroup
            if rg_list and al.opt('RG') not in rg_list:
                continue

            # ensures the read is not hard-clipped. important
            # when the BAM doesn't have shorter hits flagged as
            # secondary
            if al.cigar is not None and 5 in [x[0] for x in al.cigar]:
                continue

            # add read name to dictionary if not already there
            key = al.qname
            if key not in d:
                d.setdefault(key,al)
            # print matched read pairs
            else:
                # RG:Z:ID
                try:
                    RG1 = d[key].opt('RG')
                except KeyError:
                    RG1 = ""
                try:
                    RG2 = al.opt('RG')
                except KeyError:
                    RG2 = ""

                counter += 1
                if rename:
                    al.qname = RG2 + '.' + str(counter)
                    d[key].qname = RG1 + '.' + str(counter)

                if al.is_read1:
                    printfastq_rg(al,1,RG2)
                    printfastq_rg(d[key],2,RG1)
                else:
                    printfastq_rg(d[key],1,RG1)
                    printfastq_rg(al,2,RG2)
                del d[key]
        if len(d) != 0:
            sys.stderr.write('Warning: %s unmatched name groups\n' % len(d))

#===================================================================================================================================================
# functions
#===================================================================================================================================================

def get_args():
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="\
bamtofastq.py\n\
author: " + __author__ + "\n\
version: " + __version__ + "\n\
description: Convert a coordinate sorted BAM file to FASTQ\n\
             (ignores unpaired reads)")
    # parser.add_argument('-i', '--input', metavar='BAM', required=False, help='Input BAM file')
    parser.add_argument('-r', '--readgroup', metavar='STR', default=None, required=False, help='Read group(s) to extract (comma separated)')
    parser.add_argument('-n', '--rename', required=False, action='store_true', help='Rename reads')
    parser.add_argument('-S', '--is_sam', required=False, action='store_true', help='Input is SAM format')
    parser.add_argument('-H', '--header', metavar='FILE', type=argparse.FileType('w'), default=None, required=False,
                        help='Write BAM header to file')
    parser.add_argument('input', metavar='BAM', nargs='*', type=str,
                        help='Input BAM file(s). If absent then defaults to stdin.')

    # parse the arguments
    args = parser.parse_args()

    # bail if no BAM file
    if len(args.input) == 0:
        if sys.stdin.isatty():
            parser.print_help()
            exit(1)
        else:
            args.input = [None]
    
    # send back the user input
    return args

def printfastq(al,read):
    if(al.is_reverse):
        print("@" + str(al.qname) + "/" + str(read) + "\n" + str(revcomp(al.seq)) + "\n" + "+" + "\n" + str(al.qual[::-1]))
    else: 
        print("@" + str(al.qname) + "/" + str(read) + "\n" + str(al.seq) + "\n" + "+" + "\n" + str(al.qual))

def printfastq_rg(al,read, rg):
    if(al.is_reverse):
        print("@" + str(al.qname) + "/" + str(read) + " " + "RG:Z:" + str(rg) + "\n" + str(revcomp(al.seq)) + "\n" + "+" + "\n" + str(al.qual[::-1]))
    else: 
        print("@" + str(al.qname) + "/" + str(read) + " " + "RG:Z:" + str(rg) + "\n" + str(al.seq) + "\n" + "+" + "\n" + str(al.qual))

def revcomp(seq):
    seq1 = seq.translate(maketrans("AGCTagct", "TCGAtcga"))
    seq2 = seq1[::-1]
    return seq2

#===================================================================================================================================================
# driver
#===================================================================================================================================================

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

def main():
    args = get_args()

    bamtofastq(args.input,
               args.is_sam,
               args.readgroup,
               args.rename,
               args.header)

if __name__ == "__main__":
    try:
        sys.exit(main())
    except IOError as e:
        if e.errno != 32:  # ignore SIGPIPE
            raise
    
# new features needed
# don't forget to select for primary alignments before using on bwa-mem files - DONE
# query memory usage to print out
# write output files based on library name
# include sample name as part of output file
# include sequencing center as part of output file
# stop opening and closing file the whole time - DONE
# close files at end
# add strict error checking ; need to catch all mistakes!!!!
# option for interleaved versus two fastq formats
# option for compressed versus uncompressed
# write out sequencing center - DONE
# need to make read-group based
# need to actually align output:
# need to have output directory prefix - default blank