File: RAMBOK.py

package info (click to toggle)
rambo-k 1.21%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 3,144 kB
sloc: java: 1,054; python: 730; makefile: 16
file content (219 lines) | stat: -rwxr-xr-x 16,188 bytes
parent folder | download | duplicates (3)
#!/usr/bin/env python3
import os
import argparse
import simulate_reads
import plot
import shutil
import sys


def copyhead(unassigned, amount, outfile):
    infile = open(unassigned)
    outfile = open(outfile, 'w')
    i = 0
    while i < amount * 4:
        outfile.write(infile.readline())
        i += 1
    outfile.close()


def simulation(reffile1, reffile2, name1, name2, temppath, real_readfiles, gapsize, amount, chunksize, se):
    print('Step 1/5')
    if not se:
        if not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_1.fasta')) and not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_2_+' + str(amount) + '.fasta')):
            simulate_reads.main(reffile1, reffile2, name1, name2, temppath, real_readfiles, se, amount, gapsize, chunksize)
        else:
            print('Simulated reads of %s and %s exist. Skipping.' % (name1, name2))
    else:
        if not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '.fasta')):
            simulate_reads.main(reffile1, reffile2, name1, name2, temppath, real_readfiles, se, amount, gapsize, chunksize)
        else:
            print('Simulated reads of %s and %s exist. Skipping.' % (name1, name2))
    if not se:
        if not os.path.isfile(os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq')):
            copyhead(real_readfiles[0], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq'))
            copyhead(real_readfiles[1], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_2.fastq'))
    else:
        if not os.path.isfile(os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq')):
            copyhead(real_readfiles[0], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq'))


def training(kmers, name1, name2, temppath, threads, amount, se, skriptpath):
    print('Step 2/5')
    print('Calculating transition matrix')
    if not se:
        simulated11 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '_1.fasta')
        simulated12 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '_2.fasta')
        simulated21 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '_1.fasta')
        simulated22 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '_2.fasta')
        for k in kmers:
            if not os.path.isfile(os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt')):
                os.system('java -jar ' + os.path.join(skriptpath, 'trainer.jar') + ' -a1 ' + simulated11 + ' -a2 ' + simulated12 + ' -b1 ' + simulated21 + ' -b2 ' + simulated22 + ' -na ' + name1 + ' -nb ' + name2 + ' -o ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -T -t ' + str(threads) + ' -k ' + str(k))
            else:
                print('Transition matrix for k = %i exists. Skipping.' % k)
    else:
        simulated11 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '.fasta')
        simulated21 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '.fasta')
        for k in kmers:
            if not os.path.isfile(os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt')):
                os.system('java -jar ' + os.path.join(skriptpath, 'trainer.jar') + ' -a1 ' + simulated11 + ' -b1 ' + simulated21 + ' -na ' + name1 + ' -nb ' + name2 + ' -o ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -T -t ' + str(threads) + ' -k ' + str(k))
            else:
                print('Transition matrix for k = %i exists. Skipping.' % k)


def assign_simulated(kmers, temppath, name1, name2, threads, amount, se, skriptpath):
    print('Step 3/5')
    print('Assigning subset of simulated reads')

    if not se:
        for k in kmers:
            if not os.path.isfile(os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_1.fasta')):
                os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_1.fasta') + ' -2 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_2.fasta') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_1.fasta') + ' -o2 ' + os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_2.fasta') + ' -t ' + str(threads))
            else:
                print('Assigned simulated reads for k = %i exist. Skipping' % k)
    else:
        for k in kmers:
            if not os.path.isfile(os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta')):
                os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '.fasta') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta') + ' -t 1')
            else:
                print('Assigned simulated reads for k = %i exist. Skipping' % k)


def assign_real(kmers, temppath, name1, name2, threads, amount, se, skriptpath):
    print('Step 4/5')
    print('Assigning subset of real reads')
    if not se:
        for k in kmers:
            if not os.path.isfile(os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_1.fastq')):
                os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq') + ' -2 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_2.fastq') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_1.fastq') + ' -o2 ' + os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_2.fastq') + ' -t ' + str(threads))
            else:
                print('Assigned reads for k = %i exist. Skipping.' % k)
    else:
        for k in kmers:
            if not os.path.isfile(os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq')):
                os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq') + ' -d ' + os.path.join(
                    temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq') + ' -t ' + str(threads))
            else:
                print('Assigned reads for k = %i exist. Skipping.' % k)


def do_plots(kmers, outpath, temppath, name1, name2, amount, se, filetype):
    if not se:
        for k in kmers:
            if not os.path.isfile(
                    os.path.join(outpath, 'ROCplot_k' + str(k) + '_' + str(amount) + '_PE.png')) or not os.path.isfile(
                    os.path.join(outpath, 'score_histogram_k' + str(k) + '_' + str(amount) + '_PE.png')) or not os.path.isfile(
                    os.path.join(outpath, 'fitted_histograms_k' + str(k) + '_' + str(amount) + '_PE.png')):
                plot.main(k, outpath, name1, name2, os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_1.fasta'), os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_1.fastq'), amount, se, filetype)
            else:
                print('Plots for k = %i exist. Skipping' % k)

    else:
        for k in kmers:
            if not os.path.isfile(os.path.join(outpath, 'ROCplot_k' + str(k) + '_' + str(amount) + '_SE.png')) or not os.path.isfile(os.path.join(outpath, 'score_histogram_k' + str(k) + '_' + str(amount) + '_SE.png')) or not os.path.isfile(os.path.join(outpath, 'fitted_histograms_k' + str(k) + '_' + str(amount) + '_SE.png')):
                plot.main(k, outpath, name1, name2, os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta'), os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq'), amount, se, filetype)
            else:
                print('Plots for k = %i exist. Skipping.' % k)


def assign_all(cutoff_higher, cutoff_lower, kmers, amount, unassigned1, unassigned2, se, temppath, outpath, name1, name2, filetype, skriptpath):
    if cutoff_higher:
        if not se:
            os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -2 ' + unassigned2 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_PE.txt') + ' -o1 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str(kmers[0]) + '_1.' + str(filetype)) + ' -o2 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str(kmers[0]) + '_2.' + str(filetype)) + ' -ch ' + str(cutoff_higher))
        else:
            os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_SE.txt') + ' -o1 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str(kmers[0]) + '.' + str(filetype)) + ' -ch ' + str(cutoff_higher))
    elif cutoff_lower:
        if not se:
            os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -2 ' + unassigned2 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_PE.txt') + ' -o1 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(kmers[0]) + '_1.' + str(filetype)) + ' -o2 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(kmers[0]) + '_2.' + str(filetype)) + ' -cl ' + str(cutoff_lower))
        else:
            os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_SE.txt') + ' -o1 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(kmers[0]) + '.' + str(filetype)) + ' -cl ' + str(cutoff_lower))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reffile1', help='Reference file of species 1 in fasta-format. Should pair with name1', required=True)
    parser.add_argument('-R', '--reffile2', help='Reference file of species 2 in fasta-format. Should pair with name2', required=True)
    parser.add_argument('-n', '--name1', help='Name of species 1', default='species1', required=False)
    parser.add_argument('-N', '--name2', help='Name of species 2', default='species2', required=False)
    parser.add_argument('-1', '--unassigned1', help='Fasta- or fastq-file containing mixed reads.', required=True)
    parser.add_argument('-2', '--unassigned2', help='Fasta- or fastq-file containing mixed reads, only required in paired end mode.', default=None, required=False)
    parser.add_argument('-k', '--kmersizes', help='Order of Markov-Chain/kmer length. Enter as range (e.g. 4:8) or list (e.g. 4,6,8) or integer (e.g. 8). Default = 8', default='8', required=False)
    parser.add_argument('-o', '--outpath', help='Folder to write results to. Default = $name1_$name2/ in your working directory', required=False)
    parser.add_argument('-a', '--amount', help='Number of reads to be simulated, default = 50000', default=50000, required=False)
    parser.add_argument('-t', '--threads', help='Number of Threads to use', default=1, required=False)
    parser.add_argument('-x', '--chunksize', help='Size of chunks created at a time for simulation, default = 100000. Only change if you know what you are doing!', default=100000, required=False)
    parser.add_argument('-g', '--gapsize', help='Estimated size of gapsize in case of paired end reads, default = 1', default=1, required=False)
    parser.add_argument('-c', '--cutoff_lower', help='Lower cutoff:  Output only reads with a score lower than or equal to this value, use m1 for -1')
    parser.add_argument('-C', '--cutoff_higher', help='Higher cutoff: Output only reads with a score higher than or equal to this value, use m1 for -1')
    parser.add_argument('-d', '--delete_temp', help='\Delete temporary files. Calculations will start from beginning next time.', action='store_true', default=False)
    parser.add_argument('-f', '--filetype', help='Type of your input reads. fasta or fastq, default = fastq', default='fastq', required=False)
    args = parser.parse_args()
    name1 = args.name1
    name2 = args.name2
    if not args.outpath:
        outpath = os.path.join(os.getcwd(), str(name1) + '_' + str(name2))
    else:
        outpath = args.outpath
        outpath = os.path.join(os.getcwd(), outpath)
    temppath = os.path.join(outpath, 'temp')
    graphicspath = os.path.join(outpath, 'graphics')
    skriptpath = os.path.dirname(os.path.realpath(sys.argv[0]))
    if not os.path.exists(temppath):
        os.makedirs(temppath)
    if not os.path.exists(graphicspath):
        os.makedirs(graphicspath)
    amount = int(args.amount)
    kmersarg = args.kmersizes
    if ':' in kmersarg:
        kmers = range(int(kmersarg.split(':')[0]), int(kmersarg.split(':')[1]) + 1)
    elif ',' in kmersarg:
        kmers = []
        for each in kmersarg.split(','):
            kmers.append(int(each))
    else:
        kmers = [int(kmersarg)]
    threads = int(args.threads)
    chunksize = int(args.chunksize)
    gapsize = int(args.gapsize)
    reffile1 = args.reffile1
    reffile2 = args.reffile2
    unassigned1 = args.unassigned1
    unassigned2 = args.unassigned2
    cutoff_lower = args.cutoff_lower
    cutoff_higher = args.cutoff_higher
    filetype = args.filetype
    if not unassigned2:
        se = True
    else:
        se = False

    # ##Step1: Read Simulation
    simulation(reffile1, reffile2, name1, name2, temppath, [unassigned1, unassigned2], gapsize, amount, chunksize, se)

    ###Step2: Training
    training(kmers, name1, name2, temppath, threads, amount, se, skriptpath)

    ###Step3: Assign simulated reads
    assign_simulated(kmers, temppath, name1, name2, 1, amount, se, skriptpath)

    ###Step4: Assign real reads
    assign_real(kmers, temppath, name1, name2, threads, amount, se, skriptpath)

    ###Step5: Generate Graphics:
    do_plots(kmers, graphicspath, temppath, name1, name2, amount, se, filetype)

    ###Final Step: Assign all reads

    if ((cutoff_higher and not cutoff_lower) or (not cutoff_higher and cutoff_lower)) and len(kmers) == 1:
        print('Assigning reads. This may take a while.')
        assign_all(cutoff_higher, cutoff_lower, kmers, amount, unassigned1, unassigned2, se, temppath, outpath, name1,
                   name2, filetype, skriptpath)
    else:
        print('To assign your reads, please choose a cutoff as well as one kmer-size.')

    if args.delete_temp:
        shutil.rmtree(os.path.join(temppath))


if __name__ == '__main__':
    main()