1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
|
#!/usr/bin/env python
import sys
import getopt
import string
from optparse import OptionParser
import re
def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap):
if inFile == "stdin":
data = sys.stdin
else:
data = open(inFile, 'r')
for line in data:
split = 0
if line[0] == '@':
print(line.strip())
continue
samList = line.strip().split('\t')
sam = SAM(samList)
if includeDups==0 and (1024 & sam.flag)==1024:
continue
for el in sam.tags:
if "SA:" in el:
if(len(el.split(";")))<=numSplits:
split = 1
mate = el.split(",")
mateCigar = mate[3]
mateFlag = int(0)
if mate[2]=="-": mateFlag = int(16)
if split:
read1 = sam.flag & 64
if read1 == 64: tag = "_1"
else: tag="_2"
samList[0] = sam.query + tag
readCigar = sam.cigar
readCigarOps = extractCigarOps(readCigar,sam.flag)
readQueryPos = calcQueryPosFromCigar(readCigarOps)
mateCigarOps = extractCigarOps(mateCigar,mateFlag)
mateQueryPos = calcQueryPosFromCigar(mateCigarOps)
overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos)
nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap
nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap
mno = min(nonOverlap1, nonOverlap2)
if mno >= minNonOverlap:
print("\t".join(samList))
#--------------------------------------------------------------------------------------------------
# functions
#--------------------------------------------------------------------------------------------------
class SAM (object):
"""
__very__ basic class for SAM input.
"""
def __init__(self, samList = []):
if len(samList) > 0:
self.query = samList[0]
self.flag = int(samList[1])
self.ref = samList[2]
self.pos = int(samList[3])
self.mapq = int(samList[4])
self.cigar = samList[5]
self.matRef = samList[6]
self.matePos = int(samList[7])
self.iSize = int(samList[8])
self.seq = samList[9]
self.qual = samList[10]
self.tags = samList[11:]#tags is a list of each tag:vtype:value sets
self.valid = 1
else:
self.valid = 0
self.query = 'null'
def extractTagValue (self, tagID):
for tag in self.tags:
tagParts = tag.split(':', 2);
if (tagParts[0] == tagID):
if (tagParts[1] == 'i'):
return int(tagParts[2]);
elif (tagParts[1] == 'H'):
return int(tagParts[2],16);
return tagParts[2];
return None;
#-----------------------------------------------
cigarPattern = '([0-9]+[MIDNSHP])'
cigarSearch = re.compile(cigarPattern)
atomicCigarPattern = '([0-9]+)([MIDNSHP])'
atomicCigarSearch = re.compile(atomicCigarPattern)
def extractCigarOps(cigar,flag):
if (cigar == "*"):
cigarOps = []
elif (flag & 0x0010):
cigarOpStrings = cigarSearch.findall(cigar)
cigarOps = []
for opString in cigarOpStrings:
cigarOpList = atomicCigarSearch.findall(opString)
# print cigarOpList
# "struct" for the op and it's length
cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1])
# add to the list of cigarOps
cigarOps.append(cigar)
cigarOps = cigarOps
cigarOps.reverse()
##do in reverse order because negative strand##
else:
cigarOpStrings = cigarSearch.findall(cigar)
cigarOps = []
for opString in cigarOpStrings:
cigarOpList = atomicCigarSearch.findall(opString)
# "struct" for the op and it's length
cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1])
# add to the list of cigarOps
cigarOps.append(cigar)
# cigarOps = cigarOps
return(cigarOps)
def calcQueryPosFromCigar(cigarOps):
qsPos = 0
qePos = 0
qLen = 0
# if first op is a H, need to shift start position
# the opPosition counter sees if the for loop is looking at the first index of the cigar object
opPosition = 0
for cigar in cigarOps:
if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'):
qsPos += cigar.length
qePos += cigar.length
qLen += cigar.length
elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'):
qLen += cigar.length
elif cigar.op == 'M' or cigar.op == 'I':
qePos += cigar.length
qLen += cigar.length
opPosition += 1
d = queryPos(qsPos, qePos, qLen);
return d
class cigarOp (object):
"""
sturct to store a discrete CIGAR operations
"""
def __init__(self, opLength, op):
self.length = int(opLength)
self.op = op
class queryPos (object):
"""
struct to store the start and end positions of query CIGAR operations
"""
def __init__(self, qsPos, qePos, qLen):
self.qsPos = int(qsPos)
self.qePos = int(qePos)
self.qLen = int(qLen)
def calcQueryOverlap(s1,e1,s2,e2):
o = 1 + min(e1, e2) - max(s1, s2)
return max(0, o)
###############################################
class Usage(Exception):
def __init__(self, msg):
self.msg = msg
def main():
usage = """%prog -i <file>
extractSplitReads_BwaMem v0.1.0
Author: Ira Hall
Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates.
Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405.
"""
parser = OptionParser(usage)
parser.add_option("-i", "--inFile", dest="inFile",
help="A SAM file or standard input (-i stdin).",
metavar="FILE")
parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int",
help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2",
metavar="INT")
parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0,
help="Include alignments marked as duplicates. Default=False")
parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int",
help="minimum non-overlap between split alignments on the query (default=20)",
metavar="INT")
(opts, args) = parser.parse_args()
if opts.inFile is None:
parser.print_help()
print()
else:
try:
extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap)
except IOError as err:
sys.stderr.write("IOError " + str(err) + "\n");
return
if __name__ == "__main__":
sys.exit(main())
|