1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
|
#!/usr/bin/env python3
import os
import argparse
import simulate_reads
import plot
import shutil
import sys
def copyhead(unassigned, amount, outfile):
infile = open(unassigned)
outfile = open(outfile, 'w')
i = 0
while i < amount * 4:
outfile.write(infile.readline())
i += 1
outfile.close()
def simulation(reffile1, reffile2, name1, name2, temppath, real_readfiles, gapsize, amount, chunksize, se):
print('Step 1/5')
if not se:
if not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_1.fasta')) and not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_2_+' + str(amount) + '.fasta')):
simulate_reads.main(reffile1, reffile2, name1, name2, temppath, real_readfiles, se, amount, gapsize, chunksize)
else:
print('Simulated reads of %s and %s exist. Skipping.' % (name1, name2))
else:
if not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '.fasta')):
simulate_reads.main(reffile1, reffile2, name1, name2, temppath, real_readfiles, se, amount, gapsize, chunksize)
else:
print('Simulated reads of %s and %s exist. Skipping.' % (name1, name2))
if not se:
if not os.path.isfile(os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq')):
copyhead(real_readfiles[0], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq'))
copyhead(real_readfiles[1], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_2.fastq'))
else:
if not os.path.isfile(os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq')):
copyhead(real_readfiles[0], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq'))
def training(kmers, name1, name2, temppath, threads, amount, se, skriptpath):
print('Step 2/5')
print('Calculating transition matrix')
if not se:
simulated11 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '_1.fasta')
simulated12 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '_2.fasta')
simulated21 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '_1.fasta')
simulated22 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '_2.fasta')
for k in kmers:
if not os.path.isfile(os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt')):
os.system('java -jar ' + os.path.join(skriptpath, 'trainer.jar') + ' -a1 ' + simulated11 + ' -a2 ' + simulated12 + ' -b1 ' + simulated21 + ' -b2 ' + simulated22 + ' -na ' + name1 + ' -nb ' + name2 + ' -o ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -T -t ' + str(threads) + ' -k ' + str(k))
else:
print('Transition matrix for k = %i exists. Skipping.' % k)
else:
simulated11 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '.fasta')
simulated21 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '.fasta')
for k in kmers:
if not os.path.isfile(os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt')):
os.system('java -jar ' + os.path.join(skriptpath, 'trainer.jar') + ' -a1 ' + simulated11 + ' -b1 ' + simulated21 + ' -na ' + name1 + ' -nb ' + name2 + ' -o ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -T -t ' + str(threads) + ' -k ' + str(k))
else:
print('Transition matrix for k = %i exists. Skipping.' % k)
def assign_simulated(kmers, temppath, name1, name2, threads, amount, se, skriptpath):
print('Step 3/5')
print('Assigning subset of simulated reads')
if not se:
for k in kmers:
if not os.path.isfile(os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_1.fasta')):
os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_1.fasta') + ' -2 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_2.fasta') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_1.fasta') + ' -o2 ' + os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_2.fasta') + ' -t ' + str(threads))
else:
print('Assigned simulated reads for k = %i exist. Skipping' % k)
else:
for k in kmers:
if not os.path.isfile(os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta')):
os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '.fasta') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta') + ' -t 1')
else:
print('Assigned simulated reads for k = %i exist. Skipping' % k)
def assign_real(kmers, temppath, name1, name2, threads, amount, se, skriptpath):
print('Step 4/5')
print('Assigning subset of real reads')
if not se:
for k in kmers:
if not os.path.isfile(os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_1.fastq')):
os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq') + ' -2 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_2.fastq') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_1.fastq') + ' -o2 ' + os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_2.fastq') + ' -t ' + str(threads))
else:
print('Assigned reads for k = %i exist. Skipping.' % k)
else:
for k in kmers:
if not os.path.isfile(os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq')):
os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq') + ' -d ' + os.path.join(
temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq') + ' -t ' + str(threads))
else:
print('Assigned reads for k = %i exist. Skipping.' % k)
def do_plots(kmers, outpath, temppath, name1, name2, amount, se, filetype):
if not se:
for k in kmers:
if not os.path.isfile(
os.path.join(outpath, 'ROCplot_k' + str(k) + '_' + str(amount) + '_PE.png')) or not os.path.isfile(
os.path.join(outpath, 'score_histogram_k' + str(k) + '_' + str(amount) + '_PE.png')) or not os.path.isfile(
os.path.join(outpath, 'fitted_histograms_k' + str(k) + '_' + str(amount) + '_PE.png')):
plot.main(k, outpath, name1, name2, os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_1.fasta'), os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_1.fastq'), amount, se, filetype)
else:
print('Plots for k = %i exist. Skipping' % k)
else:
for k in kmers:
if not os.path.isfile(os.path.join(outpath, 'ROCplot_k' + str(k) + '_' + str(amount) + '_SE.png')) or not os.path.isfile(os.path.join(outpath, 'score_histogram_k' + str(k) + '_' + str(amount) + '_SE.png')) or not os.path.isfile(os.path.join(outpath, 'fitted_histograms_k' + str(k) + '_' + str(amount) + '_SE.png')):
plot.main(k, outpath, name1, name2, os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta'), os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq'), amount, se, filetype)
else:
print('Plots for k = %i exist. Skipping.' % k)
def assign_all(cutoff_higher, cutoff_lower, kmers, amount, unassigned1, unassigned2, se, temppath, outpath, name1, name2, filetype, skriptpath):
if cutoff_higher:
if not se:
os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -2 ' + unassigned2 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_PE.txt') + ' -o1 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str(kmers[0]) + '_1.' + str(filetype)) + ' -o2 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str(kmers[0]) + '_2.' + str(filetype)) + ' -ch ' + str(cutoff_higher))
else:
os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_SE.txt') + ' -o1 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str(kmers[0]) + '.' + str(filetype)) + ' -ch ' + str(cutoff_higher))
elif cutoff_lower:
if not se:
os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -2 ' + unassigned2 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_PE.txt') + ' -o1 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(kmers[0]) + '_1.' + str(filetype)) + ' -o2 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(kmers[0]) + '_2.' + str(filetype)) + ' -cl ' + str(cutoff_lower))
else:
os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_SE.txt') + ' -o1 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(kmers[0]) + '.' + str(filetype)) + ' -cl ' + str(cutoff_lower))
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--reffile1', help='Reference file of species 1 in fasta-format. Should pair with name1', required=True)
parser.add_argument('-R', '--reffile2', help='Reference file of species 2 in fasta-format. Should pair with name2', required=True)
parser.add_argument('-n', '--name1', help='Name of species 1', default='species1', required=False)
parser.add_argument('-N', '--name2', help='Name of species 2', default='species2', required=False)
parser.add_argument('-1', '--unassigned1', help='Fasta- or fastq-file containing mixed reads.', required=True)
parser.add_argument('-2', '--unassigned2', help='Fasta- or fastq-file containing mixed reads, only required in paired end mode.', default=None, required=False)
parser.add_argument('-k', '--kmersizes', help='Order of Markov-Chain/kmer length. Enter as range (e.g. 4:8) or list (e.g. 4,6,8) or integer (e.g. 8). Default = 8', default='8', required=False)
parser.add_argument('-o', '--outpath', help='Folder to write results to. Default = $name1_$name2/ in your working directory', required=False)
parser.add_argument('-a', '--amount', help='Number of reads to be simulated, default = 50000', default=50000, required=False)
parser.add_argument('-t', '--threads', help='Number of Threads to use', default=1, required=False)
parser.add_argument('-x', '--chunksize', help='Size of chunks created at a time for simulation, default = 100000. Only change if you know what you are doing!', default=100000, required=False)
parser.add_argument('-g', '--gapsize', help='Estimated size of gapsize in case of paired end reads, default = 1', default=1, required=False)
parser.add_argument('-c', '--cutoff_lower', help='Lower cutoff: Output only reads with a score lower than or equal to this value, use m1 for -1')
parser.add_argument('-C', '--cutoff_higher', help='Higher cutoff: Output only reads with a score higher than or equal to this value, use m1 for -1')
parser.add_argument('-d', '--delete_temp', help='\Delete temporary files. Calculations will start from beginning next time.', action='store_true', default=False)
parser.add_argument('-f', '--filetype', help='Type of your input reads. fasta or fastq, default = fastq', default='fastq', required=False)
args = parser.parse_args()
name1 = args.name1
name2 = args.name2
if not args.outpath:
outpath = os.path.join(os.getcwd(), str(name1) + '_' + str(name2))
else:
outpath = args.outpath
outpath = os.path.join(os.getcwd(), outpath)
temppath = os.path.join(outpath, 'temp')
graphicspath = os.path.join(outpath, 'graphics')
skriptpath = os.path.dirname(os.path.realpath(sys.argv[0]))
if not os.path.exists(temppath):
os.makedirs(temppath)
if not os.path.exists(graphicspath):
os.makedirs(graphicspath)
amount = int(args.amount)
kmersarg = args.kmersizes
if ':' in kmersarg:
kmers = range(int(kmersarg.split(':')[0]), int(kmersarg.split(':')[1]) + 1)
elif ',' in kmersarg:
kmers = []
for each in kmersarg.split(','):
kmers.append(int(each))
else:
kmers = [int(kmersarg)]
threads = int(args.threads)
chunksize = int(args.chunksize)
gapsize = int(args.gapsize)
reffile1 = args.reffile1
reffile2 = args.reffile2
unassigned1 = args.unassigned1
unassigned2 = args.unassigned2
cutoff_lower = args.cutoff_lower
cutoff_higher = args.cutoff_higher
filetype = args.filetype
if not unassigned2:
se = True
else:
se = False
# ##Step1: Read Simulation
simulation(reffile1, reffile2, name1, name2, temppath, [unassigned1, unassigned2], gapsize, amount, chunksize, se)
###Step2: Training
training(kmers, name1, name2, temppath, threads, amount, se, skriptpath)
###Step3: Assign simulated reads
assign_simulated(kmers, temppath, name1, name2, 1, amount, se, skriptpath)
###Step4: Assign real reads
assign_real(kmers, temppath, name1, name2, threads, amount, se, skriptpath)
###Step5: Generate Graphics:
do_plots(kmers, graphicspath, temppath, name1, name2, amount, se, filetype)
###Final Step: Assign all reads
if ((cutoff_higher and not cutoff_lower) or (not cutoff_higher and cutoff_lower)) and len(kmers) == 1:
print('Assigning reads. This may take a while.')
assign_all(cutoff_higher, cutoff_lower, kmers, amount, unassigned1, unassigned2, se, temppath, outpath, name1,
name2, filetype, skriptpath)
else:
print('To assign your reads, please choose a cutoff as well as one kmer-size.')
if args.delete_temp:
shutil.rmtree(os.path.join(temppath))
if __name__ == '__main__':
main()
|