1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
|
#!/usr/bin/python3
"""
Runs DWGSIM
usage: dwgsim_wrapper.py [options]
-e,--eRateFirstread=e: base/color error rate of the first read [0.020]
-E,--eRateSecondread=E: base/color error rate of the second read [0.020]
-d,--innerDist=d: inner distance between the two ends [500]
-s,--stdev=s: standard deviation [50]
-N,--numReads=N: number of read pairs [1000000]
-1,--loFirstread=1: length of the first read [70]
-2,--loSecondread=2: length of the second read [70]
-r,--mutRate=r: rate of mutations [0.0010]
-R,--fracIndels=R: fraction of mutations that are indels [0.10]
-X,--indelExt=X: probability an indel is extended [0.30]
-y,--randProb=y: probability of a random DNA read [0.10]
-n,--maxN=n: maximum number of Ns allowed in a given read [0]
-c,--platformType=c: generate reads for Illumina (nuc space), for SOLiD (in color space) or Ion Torrent
-S,--strand=S: strand 0: default, 1: same strand, 2: opposite strand
-f,--flowOrder=f: the flow order for Ion Torrent data [(null)]
-O,--splitOutput=O: which output needs to be reported, one: one single; two: two files; all: all output options
-H,--haploid=H: haploid mode
-i,--input=i: the reference genome FASTA
-3,--outputBFAST=3: the BFAST output FASTQ
-4,--outputBWA1=4: the first BWA output FASTQ
-5,--outputBWA2=5: the second BWA output FASTQ
-6,--outputMutations=6: the output mutations TXT
"""
import optparse, os, shutil, subprocess, sys, tempfile
def stop_err( msg ):
sys.stderr.write( '%s\n' % msg )
sys.exit()
def run_process ( cmd, name, tmp_dir, buffsize ):
try:
tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
tmp_stderr = open( tmp, 'wb' )
proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
returncode = proc.wait()
tmp_stderr.close()
# get stderr, allowing for case where it's very large
tmp_stderr = open( tmp, 'rb' )
stderr = ''
try:
while True:
stderr += tmp_stderr.read( buffsize )
if not stderr or len( stderr ) % buffsize != 0:
break
except OverflowError:
pass
tmp_stderr.close()
if returncode != 0:
raise Exception(stderr)
except Exception as e:
raise Exception('Error in \'' + name + '\'. \n' + str( e ))
def check_output ( output, canBeEmpty ):
if 0 < os.path.getsize( output ):
return True
elif False == canBeEmpty:
raise Exception('The output file is empty:' + output)
def __main__():
#Parse Command Line
parser = optparse.OptionParser()
parser.add_option( '-e', '--eRateFirstread', dest='eRateFirstread', type='float', help='base/color error rate of the first read' )
parser.add_option( '-E', '--eRateSecondread', dest='eRateSecondread', type='float', help='base/color error rate of the second read' )
parser.add_option( '-d', '--innerDist', dest='innerDist', type='int', help='inner distance between the two ends' )
parser.add_option( '-s', '--stdev', dest='stdev', type='float', help='standard deviation' )
parser.add_option( '-N', '--numReads', dest='numReads', type='int', help='number of read pairs' )
parser.add_option( '-r', '--mutRate', dest='mutRate', type='float', help='rate of mutations' )
parser.add_option( '-R', '--fracIndels', dest='fracIndels', type='float', help='fraction of mutations that are indels' )
parser.add_option( '-X', '--indelExt', dest='indelExt', type='float', help='probability an indel is extended' )
parser.add_option( '-y', '--randProb', dest='randProb', type='float', help='probability of a random DNA read' )
parser.add_option( '-n', '--maxN', dest='maxN', type='int', help='maximum number of Ns allowed in a given read' )
parser.add_option( '-c', '--platformType', dest='platformType', type='int', help='Platform to simulate: 0: Illumina; 1: solid; 2: Ion Torrent' )
parser.add_option( '-S', '--strand', dest='strand', type='choice', default='0', choices=('0', '1', '2'), help='strand 0: default, 1: same strand, 2: opposite strand' )
parser.add_option( '-f', '--flowOrder', dest='flowOrder', type='int', default='2')
parser.add_option( '-H', '--haploid', action='store_true', dest='haploid', default=False, help='haploid mode' )
parser.add_option( '-i', '--input', dest='input', help='The reference genome fasta' )
parser.add_option( '-O', dest='splitoutput', type='choice', default='two',choices=('two', 'one', 'all'), help='one: one single; two: two files; all: all output options' )
parser.add_option( '-1', '--loFirstread', dest='loFirstread', type='int', help='length of the first read' )
parser.add_option( '-2', '--loSecondread', dest='loSecondread', type='int', help='length of the second read' )
parser.add_option( '-6', '--outputMutations', dest='outputMutations', help='the output mutations TXT' )
parser.add_option( '-4', '--outputBWA1', dest='outputBWA1', help='the first BWA output FASTQ' )
parser.add_option( '-5', '--outputBWA2', dest='outputBWA2', help='the second BWA output FASTQ' )
parser.add_option( '-3', '--outputBFAST', dest='outputBFAST', help='the BFAST output FASTQ' )
(options, args) = parser.parse_args()
# output version # of tool
try:
tmp = tempfile.NamedTemporaryFile().name
tmp_stdout = open( tmp, 'wb' )
proc = subprocess.Popen( args='dwgsim 2>&1', shell=True, stdout=tmp_stdout )
tmp_stdout.close()
returncode = proc.wait()
stdout = None
for line in open( tmp_stdout.name, 'rb' ):
if line.lower().find( 'version' ) >= 0:
stdout = line.strip()
break
if stdout:
sys.stdout.write( '%s\n' % stdout )
else:
raise Exception
except:
sys.stdout.write( 'Could not determine DWGSIM version\n' )
buffsize = 1048576
# make temp directory for dwgsim, requires trailing slash
tmp_dir = '%s/' % tempfile.mkdtemp()
#'generic' options used in all dwgsim commands here
try:
reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name
os.symlink( options.input, reference_filepath )
assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.'
tmp_dir = '%s/' % tempfile.mkdtemp()
dwgsim_output_prefix = "dwgsim_output_prefix"
dwgsim_cmd = 'dwgsim -e %s -E %s -d %s -s %s -N %s -1 %s -2 %s -r %s -R %s -X %s -y %s -c %s -n %s' % \
(options.eRateFirstread, \
options.eRateSecondread, \
options.innerDist, \
options.stdev, \
options.numReads, \
options.loFirstread, \
options.loSecondread, \
options.mutRate, \
options.fracIndels, \
options.indelExt, \
options.randProb, \
options.platformType, \
options.maxN)
if options.haploid:
dwgsim_cmd = dwgsim_cmd + ' -H'
if options.platformType == "2":
dwgsim_cmd = dwgsim_cmd + options.flowOrder
dwgsim_cmd = dwgsim_cmd + ' ' + options.input
dwgsim_cmd = dwgsim_cmd + ' ' + tmp_dir + dwgsim_output_prefix
# temporary file to check the command being launched
tmpfile = open('/tmp/dwgsim.log', 'w')
tmpfile.write("%s" % dwgsim_cmd)
# need to nest try-except in try-finally to handle 2.4
try:
# launch dwgsim + accept mutations.txt output
run_process ( dwgsim_cmd, 'dwgsim', tmp_dir, buffsize )
cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.mutations.txt' + ' ' + options.outputMutations
run_process ( cmd, 'mv #1', tmp_dir, buffsize )
check_output ( options.outputMutations, True )
# check which output required and move files accordingly
tmpfile.write("\nsplitoutput: %s \ncondition one: %s \ncondition two: %s \ncondition all: %s\n" % (options.splitoutput, str((options.splitoutput == 'one')), str((options.splitoutput == 'two')), str((options.splitoutput == 'all'))))
if any( [options.splitoutput == 'two' , options.splitoutput == 'all'] ) : # two files
cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read1.fastq' + ' ' + options.outputBWA1
run_process ( cmd, 'mv #3', tmp_dir, buffsize )
cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read2.fastq' + ' ' + options.outputBWA2
run_process ( cmd, 'mv #4', tmp_dir, buffsize )
tmpfile.write("\nmoved!!: %s" % options.splitoutput)
# check that there are results in the output file
check_output ( options.outputBWA1, False )
check_output ( options.outputBWA2, False )
if any( [options.splitoutput == 'one' , options.splitoutput == 'all'] ):
cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bfast.fastq' + ' ' + options.outputBFAST
run_process ( cmd, 'mv #2', tmp_dir, buffsize )
# check that there are results in the output file
check_output ( options.outputBFAST, False )
sys.stdout.write( 'DWGSIM successful' )
except Exception as e:
stop_err( 'DWGSIM failed.\n' + str( e ) )
finally:
# clean up temp dir
if os.path.exists( tmp_dir ):
shutil.rmtree( tmp_dir )
if __name__=="__main__": __main__()
|