#!/usr/bin/python
import sys, glob, os, re, logging, errno
from optparse import OptionParser
from collections import defaultdict, namedtuple
from pbsuite.utils.FileHandlers import *
from pbsuite.jelly.Jelly import JellyProtocol
import networkx
USAGE = """
Consolidates all the reads that support a gap into a folder for assembly.
Extraction.py <Protocol.xml>"""
# Max number of gaps to hold in memory until all output is flushed.
# Tweak this based on the amount of memory you have available;
# benchmarking still needs to be done to guide this number.
MAXGAPHOLD = 20000
# Amount of flank to pull in from the reference for "Guided" Assembly
FLANKAMT = 2500
MAXFILES = 10000
TrimInfo = namedtuple("ReadwithTrim", "name start end")
class Extraction():
def __init__(self, args):
#ArgParse
self.__parseArgs__(args)
self.__initLog()
        #self.protocol was already built in __parseArgs__; don't re-parse it
        self.gapInfo = GapInfoFile(self.protocol.gapTable)
        self.gapGraph = GapGraph(self.gapInfo)
def __parseArgs__(self, argv):
parser = OptionParser(USAGE)
parser.add_option("--debug",action="store_true",default=False)
self.options, args = parser.parse_args(argv)
        if len(args) < 2:  #args[0] is the script name
parser.error("Expected one argument, the Protocol.xml")
self.protocol = JellyProtocol(args[1])
def __initLog(self):
"""Logging"""
logLevel = logging.DEBUG if self.options.debug else logging.INFO
logFormat = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig( stream=sys.stderr, level=logLevel, format=logFormat )
logging.info("Running %s" % " ".join(sys.argv) )
def __cleanReadName__(self, readName):
"""
Build a TrimInfo namedtuple for this read
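
        Illustrative examples (read names made up):
            "read_1##120#480##" -> TrimInfo(name="read_1", start=120, end=480)
            "read_2"            -> TrimInfo(name="read_2", start=0, end=None)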
"""
        getSub = re.compile(r"(.*)##(\d+)#(\d+)##$")
#Check for trims
        if readName.endswith('##'):  #endswith alone is a weak check, but trimmed names always carry this suffix
logging.debug("%s -- Missed Adapter" % readName)
name, start, end = getSub.match(readName).groups()
start = int(start)
end = int(end)
else:
name = readName
start = 0
end = None
return TrimInfo(name, start, end)
    def __loadGMLFiles__(self):
        """
        Iterates through all of the .gml files inside of the support directory,
        merges these graphs into the whole gapGraph, and creates a dictionary of
        {"readName": [(node, TrimInfo) or (source, target, TrimInfo), ...]}
        """
self.readSupport = defaultdict(list)
        #Parse the networkx version once, outside the loop; compare
        #(major, minor) tuples so versions like 1.10 aren't mistaken for 1.1
        #the way naive float parsing would
        truncNxVer = tuple(int(x) for x in networkx.__version__.split('.')[:2])
        for i in glob.glob(os.path.join(self.protocol.outDir, "support", "*.gml")):
            if truncNxVer >= (1, 7):
try:
inputGml = networkx.read_gml(i, relabel=True)
except ValueError:
logging.warning("GML file %s is empty" % i)
continue
            elif truncNxVer == (1, 1):
inputGml = networkx.read_gml(i)
else:
logging.warning("It is unknown if networkx version %s will work." % networkx.__version__)
logging.warning("If you get an error here, please report it!!!")
inputGml = networkx.read_gml(i)
for node in inputGml.nodes_iter():
for readName in inputGml.node[node]['extenders'].split(':'):
if readName == '':
continue
trimInfo = self.__cleanReadName__(readName)
self.gapGraph.add_extend(node, trimInfo.name)
self.readSupport[trimInfo.name].append((node, trimInfo))
for source, target, evidence in inputGml.edges_iter(data=True):
for readName in evidence['evidence'].split(':'):
if readName == '':
continue
trimInfo = self.__cleanReadName__(readName)
self.gapGraph.add_evidence(source, target, trimInfo.name)
self.readSupport[trimInfo.name].append((source, target, trimInfo))
self.readSupport = dict(self.readSupport)
def __loadReference__(self):
"""
Get the reference information into memory
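
        Entries are keyed on the last '|'-delimited token of the fasta header,
        e.g. (illustrative) "gi|123|ref0000001" is stored as "ref0000001"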
"""
myReference = FastaFile(self.protocol.reference)
self.reference = {}
if self.protocol.scaffoldQualName is not None:
myQual = QualFile(self.protocol.scaffoldQualName)
for key in myReference:
self.reference[key.split('|')[-1]] = FastqEntry(key.split('|')[-1], myReference[key], myQual[key])
else:
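            #No qual file given; fake a uniform quality string
            #('l' is chr(108), i.e. Phred 75 under the +33 encoding used here)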
for key in myReference:
self.reference[key.split('|')[-1]] = FastqEntry(key.split('|')[-1], myReference[key], 'l'*len(myReference[key]))
def __extractReads__(self):
"""
readsForExtraction = set(readSupport.keys())
for fastaFile to process:
readsToCollect = readsForExtraction & set(fastaFile.keys())
for usedRead in readsToCollect:
#check the gapFile holder from previous extraction
write to the file handle of the gaps in each used read
It might make it progressively more efficient to create set(readSupport.keys())^usedReads
readsForExtraction =
readsForExtraction = readsForExtraction^usedReads
done.
"""
self.gapOutputs = {}
self.supportFolder = os.path.join(self.protocol.outDir,"assembly")
try:
os.mkdir(self.supportFolder)
except OSError:
pass
outputQueue = defaultdict(list)# gapName: [readDataAsString]
numReads = 0
for inputFile in self.protocol.inputs:
logging.info("Parsing %s" % inputFile)
#Check if it is fasta/qual or fastq
if inputFile.lower().endswith(".fastq"):
inputReads = FastqFile(inputFile)
                #Re-key reads whose names contain spaces under the first token.
                #This permanently renames the underlying FastqEntry objects,
                #since we're pointing at the same object, but the text after
                #the space isn't needed anyway
                for i in inputReads.keys():
                    shortName = i.split(' ')[0]
                    inputReads[shortName] = inputReads[i]
                    inputReads[shortName].name = shortName
elif inputFile.lower().endswith(".fasta"):
inputReads = self.selectiveMergeFastaQual(inputFile, \
inputFile[:inputFile.rindex('.')]+".qual")
            else:
                logging.error("Input reads file %s doesn't end with .fasta or .fastq!" % inputFile)
                exit(1)
logging.info("Loaded %d Reads" % (len(inputReads.keys())))
parsed = 0
for usedRead in inputReads.keys():
try:
gaps = self.readSupport[usedRead]
except KeyError:
continue
parsed += 1
for gap in gaps:
#We have an extender, add the read into all edges files
if len(gap) == 2:
contigEnd, trimInfo = gap
start, end = trimInfo.start, trimInfo.end
                        #Count the non-contig edges on this end; if there are
                        #none, the read is pure extending evidence
                        numNodes = 0
for source, target in self.gapGraph.graph.edges(contigEnd):
if "Contig" not in self.gapGraph.graph.edge[source][target]['evidence'][0]:
numNodes += 1
                            #Sort the flanks so we don't make redundant gap names
                            source, target = sorted([source, target])
gapName = "%s_%s" % (source, target)
logging.debug("Writing %s to %s" % (usedRead, gapName))
numReads += 1
outputQueue[gapName].append(inputReads[usedRead].toString(start, end))
if numNodes == 0:
logging.debug("%s only has extending evidence" % (contigEnd))
logging.debug(self.gapGraph.graph.edges(contigEnd))
numReads += 1
outputQueue[contigEnd].append(inputReads[usedRead].toString(start, end))
#we have an edge, so just write to the gap
elif len(gap) == 3:
source, target, trimInfo = gap
                        source, target = sorted([source, target])
gapName = "%s_%s" % (source, target)
start, end = trimInfo.start, trimInfo.end
numReads += 1
outputQueue[gapName].append(inputReads[usedRead].toString(start, end))
                    if len(outputQueue) >= MAXGAPHOLD:
                        logging.info("Flushing Output Queue of %d gaps %d reads" % \
                                     (len(outputQueue), numReads))
self.flushQueue(outputQueue)
logging.info("Finshed Flush")
numReads = 0
                        del outputQueue
                        outputQueue = defaultdict(list)
logging.info("Parsed %d Reads" % (parsed))
self.flushQueue(outputQueue)
def flushQueue(self, outputQueue):
"""
flushes the entire queue - dumps a file at a time
closes up the file when it's done
"""
for gapName in outputQueue:
            try:
                #We've seen this gap before; gapOutputs holds the file's path
                #as a string, so reopen it for appending
                outFile = open(self.gapOutputs[gapName], 'a')
            except KeyError:
                #First time: openGapOut creates the directory and returns an
                #open file handle
                outFile = self.openGapOut(gapName)
            #Entries are already newline terminated
outFile.write("".join(outputQueue[gapName]))
outFile.close()
self.gapOutputs[gapName] = outFile.name
def openGapOut(self, gapName):
"""
prepares the gap's output
"""
#Create output file
basedir = os.path.join(self.supportFolder, gapName.replace('/','.'))
try:
os.mkdir(basedir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                logging.warning("%s already exists... overwriting results" % basedir)
            elif e.errno == errno.EACCES:
                logging.critical("%s cannot be created... insufficient file permissions" % basedir)
                exit(13)
fh = open(os.path.join(basedir,"input.fastq"),'w')
self.gapOutputs[gapName] = fh
        #Need to extract the seed contig flanks
for flankName in gapName.split('_'):
logging.debug("flankName %s" % flankName)
sequence = self.reference[flankName[:10]]
sequenceLength = len(sequence.seq)
#contig end
if flankName.count('.') == 0:
if flankName[-1] == '5':
seq = sequence.getSeq(flankName, 0, FLANKAMT)
elif flankName[-1] == '3':
s = max(sequenceLength-FLANKAMT, 0)
seq = sequence.getSeq(flankName, s, sequenceLength)
#captured gap
elif flankName.count('.') == 1:
ref, part = flankName.split('.')
part, end = part.split('e')
if end == '5':
                        #This is what will break during scaffold incorporation
                        #for a cap gap -- gap names like ref\d{7}_-1_0
gap = "%s_%s_%s" % (ref, int(part)-1, part)
start = self.gapInfo[gap].end
end = min(start+FLANKAMT, sequenceLength)
seq = sequence.getSeq(flankName, start, end)
                elif end == '3':
gap = "%s_%s_%s" % (ref, part, int(part)+1)
end = self.gapInfo[gap].start
start = max(end - FLANKAMT, 0)#Prevent - start index
seq = sequence.getSeq(flankName, start, end)
seq = self.cleanSeq(seq)
fh.write( seq )
return fh
    def cleanSeq(self, seq):
        """
        Replace IUPAC ambiguity codes with an arbitrary concrete base so
        downstream assembly doesn't have to handle them
        """
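        #Illustrative example: cleanSeq("ACMRT") -> "ACCAT"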
seq = seq.replace('M','C')
seq = seq.replace('R','A')
seq = seq.replace('W','T')
seq = seq.replace('S','G')
seq = seq.replace('Y','C')
seq = seq.replace('K','T')
seq = seq.replace('V','G')
seq = seq.replace('H','A')
return seq
def run(self):
#Merge GML files
logging.info("Opening GML Files")
self.__loadGMLFiles__()
        #Load the reference sequences that seed each gap's assembly
logging.info("Loading Reference Sequence")
self.__loadReference__()
        #Open each input file once, extracting only the necessary reads
logging.info("Extracting Reads")
self.__extractReads__()
self.gapGraph.saveBML(os.path.join(self.supportFolder,"masterSupport.bml"))
logging.info("Finished")
    def selectiveMergeFastaQual(self, fastaFn, qualFn):
        """
        Merges a fasta/qual pair, but only fully processes reads that appear
        in self.readSupport -- trying to speed Extraction up as much as possible
        """
        splRE = re.compile(r"\s+")
fasta = FastaFile(fastaFn)
qual = QualFile(qualFn, convert=False)
logging.info("Selective loading from %d reads" % (len(fasta.values())))
ret = {}
        for key in fasta:
            if key not in self.readSupport:
                #Read doesn't support any gap; skip it
                continue
            #Convert space-separated Phred scores to a Sanger (Phred+33) string
            q = "".join([chr(int(x)+33) for x in splRE.split(qual[key])])
            ret[key] = FastqEntry(key, fasta[key], q)
return ret
if __name__ == '__main__':
me = Extraction(sys.argv)
me.run()