File: dwgsim_wrapper.py

package info (click to toggle)
dwgsim 0.1.12-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,500 kB
  • sloc: ansic: 3,505; python: 334; perl: 243; xml: 194; makefile: 42; sh: 37
file content (182 lines) | stat: -rw-r--r-- 9,484 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/python3

"""
Runs DWGSIM

usage: dwgsim_wrapper.py [options]
    -e,--eRateFirstread=e: base/color error rate of the first read [0.020]
    -E,--eRateSecondread=E: base/color error rate of the second read [0.020]
    -d,--innerDist=d: inner distance between the two ends [500]
    -s,--stdev=s: standard deviation [50]
    -N,--numReads=N: number of read pairs [1000000]
    -1,--loFirstread=1: length of the first read [70]
    -2,--loSecondread=2: length of the second read [70]
    -r,--mutRate=r: rate of mutations [0.0010]
    -R,--fracIndels=R: fraction of mutations that are indels [0.10]
    -X,--indelExt=X: probability an indel is extended [0.30]
    -y,--randProb=y: probability of a random DNA read [0.10]
    -n,--maxN=n: maximum number of Ns allowed in a given read [0]
    -c,--platformType=c: generate reads for Illumina (nuc space), for SOLiD (in color space) or Ion Torrent 
    -S,--strand=S: strand 0: default, 1: same strand, 2: opposite strand
    -f,--flowOrder=f: the flow order for Ion Torrent data [(null)]
    -O,--splitOutput=O: which output needs to be reported, one: one single; two: two files; all: all output options
    -H,--haploid=H: haploid mode
    -i,--input=i: the reference genome FASTA
    -3,--outputBFAST=3: the BFAST output FASTQ
    -4,--outputBWA1=4: the first BWA output FASTQ
    -5,--outputBWA2=5: the second BWA output FASTQ
    -6,--outputMutations=6: the output mutations TXT
"""

import optparse, os, shutil, subprocess, sys, tempfile

def stop_err( msg ):
    sys.stderr.write( '%s\n' % msg )
    sys.exit()

def run_process ( cmd, name, tmp_dir, buffsize ):
    try:
        tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
        tmp_stderr = open( tmp, 'wb' )
        proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
        returncode = proc.wait()
        tmp_stderr.close()
        # get stderr, allowing for case where it's very large
        tmp_stderr = open( tmp, 'rb' )
        stderr = ''
        try:
            while True:
                stderr += tmp_stderr.read( buffsize )
                if not stderr or len( stderr ) % buffsize != 0:
                    break
        except OverflowError:
            pass
        tmp_stderr.close()
        if returncode != 0:
            raise Exception(stderr)
    except Exception as e:
        raise Exception('Error in \'' + name + '\'. \n' + str( e ))

def check_output ( output, canBeEmpty ):
    if 0 < os.path.getsize( output ):
        return True
    elif False == canBeEmpty:
        raise Exception('The output file is empty:' + output)

def __main__():
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-e', '--eRateFirstread', dest='eRateFirstread', type='float', help='base/color error rate of the first read' )
    parser.add_option( '-E', '--eRateSecondread', dest='eRateSecondread', type='float', help='base/color error rate of the second read' )
    parser.add_option( '-d', '--innerDist', dest='innerDist', type='int', help='inner distance between the two ends' )
    parser.add_option( '-s', '--stdev', dest='stdev', type='float', help='standard deviation' )
    parser.add_option( '-N', '--numReads', dest='numReads', type='int', help='number of read pairs' )
    parser.add_option( '-r', '--mutRate', dest='mutRate', type='float', help='rate of mutations' )
    parser.add_option( '-R', '--fracIndels', dest='fracIndels', type='float', help='fraction of mutations that are indels' )
    parser.add_option( '-X', '--indelExt', dest='indelExt', type='float', help='probability an indel is extended' )
    parser.add_option( '-y', '--randProb', dest='randProb', type='float', help='probability of a random DNA read' )
    parser.add_option( '-n', '--maxN', dest='maxN', type='int', help='maximum number of Ns allowed in a given read' )
    parser.add_option( '-c', '--platformType', dest='platformType', type='int', help='Platform to simulate: 0: Illumina; 1: solid; 2: Ion Torrent' )
    parser.add_option( '-S', '--strand', dest='strand', type='choice', default='0', choices=('0', '1', '2'), help='strand 0: default, 1: same strand, 2: opposite strand' )
    parser.add_option( '-f', '--flowOrder', dest='flowOrder', type='int', default='2')
    parser.add_option( '-H', '--haploid', action='store_true', dest='haploid', default=False, help='haploid mode' )
    parser.add_option( '-i', '--input', dest='input', help='The reference genome fasta' )
    parser.add_option( '-O', dest='splitoutput', type='choice', default='two',choices=('two', 'one', 'all'), help='one: one single; two: two files; all: all output options' )
    parser.add_option( '-1', '--loFirstread', dest='loFirstread', type='int', help='length of the first read' )
    parser.add_option( '-2', '--loSecondread', dest='loSecondread', type='int', help='length of the second read' )
    parser.add_option( '-6', '--outputMutations', dest='outputMutations', help='the output mutations TXT' )
    parser.add_option( '-4', '--outputBWA1', dest='outputBWA1', help='the first BWA output FASTQ' )
    parser.add_option( '-5', '--outputBWA2', dest='outputBWA2', help='the second BWA output FASTQ' )
    parser.add_option( '-3', '--outputBFAST', dest='outputBFAST', help='the BFAST output FASTQ' )
							
    (options, args) = parser.parse_args()

    # output version # of tool
    try:
        tmp = tempfile.NamedTemporaryFile().name
        tmp_stdout = open( tmp, 'wb' )
        proc = subprocess.Popen( args='dwgsim 2>&1', shell=True, stdout=tmp_stdout )
        tmp_stdout.close()
        returncode = proc.wait()
        stdout = None
        for line in open( tmp_stdout.name, 'rb' ):
            if line.lower().find( 'version' ) >= 0:
                stdout = line.strip()
                break
        if stdout:
            sys.stdout.write( '%s\n' % stdout )
        else:
            raise Exception
    except:
        sys.stdout.write( 'Could not determine DWGSIM version\n' )

    buffsize = 1048576

    # make temp directory for dwgsim, requires trailing slash
    tmp_dir = '%s/' % tempfile.mkdtemp()

    #'generic' options used in all dwgsim commands here

    try:
        reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name
        os.symlink( options.input, reference_filepath )
        assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.'
        tmp_dir = '%s/' % tempfile.mkdtemp()
        dwgsim_output_prefix = "dwgsim_output_prefix"
        dwgsim_cmd = 'dwgsim -e %s -E %s -d %s -s %s -N %s -1 %s -2 %s -r %s -R %s -X %s -y %s -c %s -n %s' % \
                (options.eRateFirstread, \
                options.eRateSecondread, \
                options.innerDist, \
                options.stdev, \
                options.numReads, \
                options.loFirstread, \
                options.loSecondread, \
                options.mutRate, \
                options.fracIndels, \
                options.indelExt, \
                options.randProb, \
                options.platformType, \
                options.maxN)
        if options.haploid:
            dwgsim_cmd = dwgsim_cmd + ' -H'
        if options.platformType == "2":
            dwgsim_cmd = dwgsim_cmd + options.flowOrder
        dwgsim_cmd = dwgsim_cmd + ' ' + options.input
        dwgsim_cmd = dwgsim_cmd + ' ' + tmp_dir + dwgsim_output_prefix
        
        # temporary file to check the command being launched
        tmpfile = open('/tmp/dwgsim.log', 'w')
        tmpfile.write("%s" % dwgsim_cmd)

        # need to nest try-except in try-finally to handle 2.4
        try:
			# launch dwgsim + accept mutations.txt output
			run_process ( dwgsim_cmd, 'dwgsim', tmp_dir, buffsize )
			cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.mutations.txt' + ' ' + options.outputMutations
			run_process ( cmd, 'mv #1', tmp_dir, buffsize )
			check_output ( options.outputMutations, True )
			# check which output required and move files accordingly
			tmpfile.write("\nsplitoutput: %s \ncondition one: %s \ncondition two: %s \ncondition all: %s\n" % (options.splitoutput, str((options.splitoutput == 'one')), str((options.splitoutput == 'two')), str((options.splitoutput == 'all'))))
			if  any( [options.splitoutput == 'two' , options.splitoutput == 'all'] ) :	# two files
				cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read1.fastq' + ' ' + options.outputBWA1
				run_process ( cmd, 'mv #3', tmp_dir, buffsize )
				cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read2.fastq' + ' ' + options.outputBWA2
				run_process ( cmd, 'mv #4', tmp_dir, buffsize )
				tmpfile.write("\nmoved!!: %s" % options.splitoutput)
				# check that there are results in the output file
				check_output ( options.outputBWA1, False )
				check_output ( options.outputBWA2, False )
			if any( [options.splitoutput == 'one' , options.splitoutput == 'all'] ):
				cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bfast.fastq' + ' ' + options.outputBFAST
				run_process ( cmd, 'mv #2', tmp_dir, buffsize )
				# check that there are results in the output file
				check_output ( options.outputBFAST, False )
			sys.stdout.write( 'DWGSIM successful' )
        except Exception as e:
			stop_err( 'DWGSIM failed.\n' + str( e ) )
    finally:
        # clean up temp dir
        if os.path.exists( tmp_dir ):
            shutil.rmtree( tmp_dir )

if __name__=="__main__": __main__()