File: pandaseq.py

package info (click to toggle)
python-cogent 1.9-14
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 19,752 kB
  • sloc: python: 137,485; makefile: 149; sh: 64
file content (199 lines) | stat: -rw-r--r-- 6,531 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python
# file: pandaseq.py

# Application controller for pandaseq (v2.4) 
# https://github.com/neufeld/pandaseq
# 

from cogent.app.parameters import ValuedParameter, FlagParameter
from cogent.app.util import CommandLineApplication, ResultPath, \
    ApplicationError
import os 
import tempfile

__author__ = "Michael Robeson"
__copyright__ = "Copyright 2007-2013, The Cogent Project"
__credits__ = ["Michael Robeson"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Michael Robeson"
__email__ = "robesonms@ornl.gov"
__status__ = "Development"


class PandaSeq(CommandLineApplication):
    """pandaseq application controller for joining paired-end reads """
    _command = 'pandaseq'
    _parameters = {
    # pandaseq 2.4 <andre@masella.name>
    # Usage: pandaseq -f forward.fastq -r reverse.fastq [-6] [-a] [-B] 
    #    [-C module1 -C module2 ...] [-d flags] [-F] [-j] [-L maxlen] 
    #    [-l minlen] [-N] [-o minoverlap] [-p forwardprimer] 
    #    [-q reverseprimer] [-T threads] [-t threshold] > assembled.fastq

   # -6  Use PHRED+64 (CASAVA 1.3-1.7) instead of PHRED+33 (CASAVA 1.8+).
    '-6':FlagParameter(Prefix='-', Name='6'),

    # -a  Strip the primers after assembly, rather than before.
    '-a':FlagParameter(Prefix='-', Name='a'),

    # -B  Allow unbarcoded sequences (try this for BADID errors).
    '-B':FlagParameter(Prefix='-', Name='B'),

    # -C module   Load a sequence validation module.
    '-C':FlagParameter(Prefix='-', Name='C'),

    # -d flags    Control the logging messages. Capital to enable; small to disable.
    #    (R)econstruction detail.
    #    Sequence (b)uilding information.
    #    (F)ile processing.
    #    (k)-mer table construction.
    #    Show every (m)ismatch.
    '-d':ValuedParameter(Prefix='-', Delimiter=' ', Name='d'),

    #    Optional (s)tatistics.
    # -f  Input FASTQ file containing forward reads.
    '-f':ValuedParameter(Prefix='-', Delimiter=' ', Name='f'),

    # -F  Output FASTQ instead of FASTA.
    '-F':FlagParameter(Prefix='-', Name='F'),

    # -j  Input files are bzipped.
    '-j':FlagParameter(Prefix='-', Name='j'),

    # -k kmers    The number of k-mers in the table.
    '-k':ValuedParameter(Prefix='-', Delimiter=' ', Name='k'),

    # -L maxlen   Maximum length for a sequence
    '-L':ValuedParameter(Prefix='-', Delimiter=' ', Name='L'),

    # -l minlen   Minimum length for a sequence
    '-l':ValuedParameter(Prefix='-', Delimiter=' ', Name='l'),

    # -N  Eliminate all sequences with unknown nucleotides in the output.
    '-N':FlagParameter(Prefix='-', Name='N'),

    # -o minoverlap   Minimum overlap between forward and reverse reads (default = 1)
    '-o':ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),

    # -p  Forward primer sequence or number of bases to be removed.
    '-p':ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),

    # -q  Reverse primer sequence or number of bases to be removed.
    '-q':ValuedParameter(Prefix='-', Delimiter=' ', Name='q'),

    # -r  Input FASTQ file containing reverse reads.
    '-r':ValuedParameter(Prefix='-', Delimiter=' ', Name='r'),

    # -T thread   Run with a number of parallel threads.
    '-T':ValuedParameter(Prefix='-', Delimiter=' ', Name='T'),

    # -t  The minimum probability that a sequence must have to match a primer.
    #     (default = 6.000000e-01)
    '-t':ValuedParameter(Prefix='-', Delimiter=' ', Name='t'),
    }


    # No _get_results_path needed as all results (the merged paired-ends)
    # are sent to STDOUT.

    def getHelp(self):
        """pandaseq help"""
        help_Str = """
        For basic help, type the following at the command line:
            'pandaseq' or 'pandaseq -h'

        Website:
            https://github.com/neufeld/pandaseq
        """


def join_paired_end_reads_pandaseq(
    reads1_infile_path,
    reads2_infile_path,
    outfile_label='pandaseq',
    phred_64=False,
    fastq=True,
    threads=1,
    params={},
    working_dir=tempfile.gettempdir(),
    SuppressStderr=True,
    SuppressStdout=False,
    HALT_EXEC=False):
    """ Runs pandaseq with default parameters to assemble paired-end reads.
        -reads1_infile_path : reads1.fastq infile path
        -reads2_infile_path : reads2.fastq infile path
        -outfile_label : base name for outfile
        -fastq : output assembly as fastq (True)
                 or Fasta (False)
        -threads : number of cpus to use. Default 1.
        -phred_64 : if you are using phred 64 scores instead of
                    phred 33
        -params : other optional pandaseq parameters
    """

    abs_r1_path = os.path.abspath(reads1_infile_path)
    abs_r2_path = os.path.abspath(reads2_infile_path)
    
    infile_paths = [abs_r1_path, abs_r2_path]
    # check / make absolute infile paths
    for p in infile_paths:
        if not os.path.exists(p):
            raise IOError, 'Infile not found at: %s' % p
    
    # required by pandaseq to assemble
    params['-f'] = abs_r1_path
    params['-r'] = abs_r2_path
    params['-T'] = threads
  
    # set up controller
    pandaseq_app = PandaSeq(
        params=params,
        WorkingDir=working_dir,
        SuppressStderr=SuppressStderr,
        SuppressStdout=SuppressStdout,
        HALT_EXEC=HALT_EXEC)
    
    # Fastq?
    if fastq:
        pandaseq_app.Parameters['-F'].on()

    # if using phred 64:
    if phred_64:
        pandaseq_app.Parameters['-6'].on()


    # run assembler
    result = pandaseq_app()
    
    # write STDOUT (assembly) to file 
    # NOTE: res['StdOut'] will be empty after this.
    # We do this so that the actual output remains saved to
    # disk and can be accessed outside python.
    #assembled_output_file_name = get_tmp_filename(prefix='assembled_pandaseq_')
    assembled_output_file_name = os.path.join(working_dir, outfile_label) + '_joined.fastq'
    pandaseq_outfile_handle = open(assembled_output_file_name,'w')

    for line in result['StdOut']:
        pandaseq_outfile_handle.write(line)
    pandaseq_outfile_handle.close()

    result.cleanUp()

    # We store ouput file path within a dictionary in order to kepp the 
    # expected output between various paired-end assembly wrappers
    # identical.
    path_dict = {}
    path_dict['Assembled'] = assembled_output_file_name

    # sanity check that files actually exist in path lcoations
    for path in path_dict.values():
        if not os.path.exists(path):
            raise IOError, 'Output file not found at: %s' % path

    return path_dict