1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
|
#!/usr/bin/env python
# file: pandaseq.py
# Application controller for pandaseq (v2.4)
# https://github.com/neufeld/pandaseq
#
from cogent.app.parameters import ValuedParameter, FlagParameter
from cogent.app.util import CommandLineApplication, ResultPath, \
ApplicationError
import os
import tempfile
__author__ = "Michael Robeson"
__copyright__ = "Copyright 2007-2013, The Cogent Project"
__credits__ = ["Michael Robeson"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Michael Robeson"
__email__ = "robesonms@ornl.gov"
__status__ = "Development"
class PandaSeq(CommandLineApplication):
"""pandaseq application controller for joining paired-end reads """
_command = 'pandaseq'
_parameters = {
# pandaseq 2.4 <andre@masella.name>
# Usage: pandaseq -f forward.fastq -r reverse.fastq [-6] [-a] [-B]
# [-C module1 -C module2 ...] [-d flags] [-F] [-j] [-L maxlen]
# [-l minlen] [-N] [-o minoverlap] [-p forwardprimer]
# [-q reverseprimer] [-T threads] [-t threshold] > assembled.fastq
# -6 Use PHRED+64 (CASAVA 1.3-1.7) instead of PHRED+33 (CASAVA 1.8+).
'-6':FlagParameter(Prefix='-', Name='6'),
# -a Strip the primers after assembly, rather than before.
'-a':FlagParameter(Prefix='-', Name='a'),
# -B Allow unbarcoded sequences (try this for BADID errors).
'-B':FlagParameter(Prefix='-', Name='B'),
# -C module Load a sequence validation module.
'-C':FlagParameter(Prefix='-', Name='C'),
# -d flags Control the logging messages. Capital to enable; small to disable.
# (R)econstruction detail.
# Sequence (b)uilding information.
# (F)ile processing.
# (k)-mer table construction.
# Show every (m)ismatch.
'-d':ValuedParameter(Prefix='-', Delimiter=' ', Name='d'),
# Optional (s)tatistics.
# -f Input FASTQ file containing forward reads.
'-f':ValuedParameter(Prefix='-', Delimiter=' ', Name='f'),
# -F Output FASTQ instead of FASTA.
'-F':FlagParameter(Prefix='-', Name='F'),
# -j Input files are bzipped.
'-j':FlagParameter(Prefix='-', Name='j'),
# -k kmers The number of k-mers in the table.
'-k':ValuedParameter(Prefix='-', Delimiter=' ', Name='k'),
# -L maxlen Maximum length for a sequence
'-L':ValuedParameter(Prefix='-', Delimiter=' ', Name='L'),
# -l minlen Minimum length for a sequence
'-l':ValuedParameter(Prefix='-', Delimiter=' ', Name='l'),
# -N Eliminate all sequences with unknown nucleotides in the output.
'-N':FlagParameter(Prefix='-', Name='N'),
# -o minoverlap Minimum overlap between forward and reverse reads (default = 1)
'-o':ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),
# -p Forward primer sequence or number of bases to be removed.
'-p':ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),
# -q Reverse primer sequence or number of bases to be removed.
'-q':ValuedParameter(Prefix='-', Delimiter=' ', Name='q'),
# -r Input FASTQ file containing reverse reads.
'-r':ValuedParameter(Prefix='-', Delimiter=' ', Name='r'),
# -T thread Run with a number of parallel threads.
'-T':ValuedParameter(Prefix='-', Delimiter=' ', Name='T'),
# -t The minimum probability that a sequence must have to match a primer.
# (default = 6.000000e-01)
'-t':ValuedParameter(Prefix='-', Delimiter=' ', Name='t'),
}
# No _get_results_path needed as all results (the merged paired-ends)
# are sent to STDOUT.
def getHelp(self):
"""pandaseq help"""
help_Str = """
For basic help, type the following at the command line:
'pandaseq' or 'pandaseq -h'
Website:
https://github.com/neufeld/pandaseq
"""
def join_paired_end_reads_pandaseq(
reads1_infile_path,
reads2_infile_path,
outfile_label='pandaseq',
phred_64=False,
fastq=True,
threads=1,
params={},
working_dir=tempfile.gettempdir(),
SuppressStderr=True,
SuppressStdout=False,
HALT_EXEC=False):
""" Runs pandaseq with default parameters to assemble paired-end reads.
-reads1_infile_path : reads1.fastq infile path
-reads2_infile_path : reads2.fastq infile path
-outfile_label : base name for outfile
-fastq : output assembly as fastq (True)
or Fasta (False)
-threads : number of cpus to use. Default 1.
-phred_64 : if you are using phred 64 scores instead of
phred 33
-params : other optional pandaseq parameters
"""
abs_r1_path = os.path.abspath(reads1_infile_path)
abs_r2_path = os.path.abspath(reads2_infile_path)
infile_paths = [abs_r1_path, abs_r2_path]
# check / make absolute infile paths
for p in infile_paths:
if not os.path.exists(p):
raise IOError, 'Infile not found at: %s' % p
# required by pandaseq to assemble
params['-f'] = abs_r1_path
params['-r'] = abs_r2_path
params['-T'] = threads
# set up controller
pandaseq_app = PandaSeq(
params=params,
WorkingDir=working_dir,
SuppressStderr=SuppressStderr,
SuppressStdout=SuppressStdout,
HALT_EXEC=HALT_EXEC)
# Fastq?
if fastq:
pandaseq_app.Parameters['-F'].on()
# if using phred 64:
if phred_64:
pandaseq_app.Parameters['-6'].on()
# run assembler
result = pandaseq_app()
# write STDOUT (assembly) to file
# NOTE: res['StdOut'] will be empty after this.
# We do this so that the actual output remains saved to
# disk and can be accessed outside python.
#assembled_output_file_name = get_tmp_filename(prefix='assembled_pandaseq_')
assembled_output_file_name = os.path.join(working_dir, outfile_label) + '_joined.fastq'
pandaseq_outfile_handle = open(assembled_output_file_name,'w')
for line in result['StdOut']:
pandaseq_outfile_handle.write(line)
pandaseq_outfile_handle.close()
result.cleanUp()
# We store ouput file path within a dictionary in order to kepp the
# expected output between various paired-end assembly wrappers
# identical.
path_dict = {}
path_dict['Assembled'] = assembled_output_file_name
# sanity check that files actually exist in path lcoations
for path in path_dict.values():
if not os.path.exists(path):
raise IOError, 'Output file not found at: %s' % path
return path_dict
|