File: clearcut.py

package info (click to toggle)
python-cogent 1.9-14
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 19,752 kB
  • sloc: python: 137,485; makefile: 149; sh: 64
file content (398 lines) | stat: -rw-r--r-- 13,771 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
#!/usr/bin/env python
"""Provides an application controller for the commandline version of:
Clearcut v1.0.8
"""
from cogent.app.parameters import FlagParameter, ValuedParameter, \
    MixedParameter
from cogent.app.util import CommandLineApplication, ResultPath, get_tmp_filename
from cogent.core.alignment import SequenceCollection, Alignment
from cogent.core.moltype import DNA, RNA, PROTEIN
from cogent.parse.tree import DndParser
from cogent.core.tree import PhyloNode
from cogent.util.dict2d import Dict2D
from cogent.format.table import phylipMatrix

__author__ = "Jeremy Widmann"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Jeremy Widmann"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Jeremy Widmann"
__email__ = "jeremy.widmann@colorado.edu"
__status__ = "Development"

MOLTYPE_MAP = {'DNA':'-D',
               'RNA':'-D',
               'PROTEIN':'-P',
               }

class Clearcut(CommandLineApplication):
    """ clearcut application controller 
   
    The parameters are organized by function to give some idea of how the 
    program works. However, no restrictions are put on any combinations 
    of parameters. Misuse of parameters can lead to errors or otherwise
    strange results.
    """
    #General options.
    _general = {\
        # --verbose.  More Output. (Default:OFF)
        '-v':FlagParameter('-',Name='v'),
        # --quiet.  Silent operation. (Default: ON)
        '-q':FlagParameter('-',Name='q',Value=True),
        # --seed=<seed>.  Explicitly set the PRNG seed to a specific value.
        '-s':ValuedParameter('-',Name='s',Delimiter='='),
        # --norandom.  Attempt joins deterministically.  (Default: OFF)
        '-r':FlagParameter('-',Name='r'),
        # --shuffle.  Randomly shuffle the distance matrix.  (Default: OFF)
        '-S':FlagParameter('-',Name='S'),
        #--neighbor.  Use traditional Neighbor-Joining algorithm. (Default: OFF)
        '-N':FlagParameter('-',Name='N'),
        
        }
         

    # Input file is distance matrix or alignment.  Default expects distance
    # matrix.  Output file is tree created by clearcut.
    _input = {\
        # --in=<infilename>.  Input file
        '--in':ValuedParameter('--',Name='in',Delimiter='=',IsPath=True),
        # --stdin.  Read input from STDIN. 
        '-I':FlagParameter('-',Name='I'),
        # --distance.  Input file is a distance matrix. (Default: ON)
        '-d':FlagParameter('-',Name='d',Value=True),
        # --alignment.  Input file is a set of aligned sequences.
        #     (Default: OFF)
        '-a':FlagParameter('-',Name='a'),
        # --DNA.  Input alignment are DNA sequences.
        '-D':FlagParameter('-',Name='D'),
        # --protein.  Input alignment are protein sequences.
        '-P':FlagParameter('-',Name='P'),
        }
  
  
    #Correction model for computing distance matrix (Default: NO Correction):
    _correction={\
        # --jukes.  Use Jukes-Cantor correction for computing distance matrix.
        '-j':FlagParameter('-',Name='j'),
        # --kimura.  Use Kimura correction for distance matrix.
        '-k':FlagParameter('-',Name='k'),
        
        }
    
    _output={\
        # --out=<outfilename>.  Output file
        '--out':ValuedParameter('--',Name='out',Delimiter='=',IsPath=True),
        # --stdout.  Output tree to STDOUT.
        '-O':FlagParameter('-',Name='O'),
        # --matrixout=<file> Output distance matrix to specified file.
        '-m':ValuedParameter('-',Name='m',Delimiter='='),
        # --ntrees=<n>.  Output n trees.  (Default: 1)
        '-n':ValuedParameter('-',Name='n',Delimiter='='),
        # --expblen.  Exponential notation for branch lengths. (Default: OFF)
        '-e':FlagParameter('-',Name='e'),
        # --expdist.  Exponential notation in distance output. (Default: OFF)
        '-E':FlagParameter('-',Name='E'),
        
        }

    
        #NOT SUPPORTED
        #'-h':FlagParameter('-','h'),       #Help
        #'-V':FlagParameter('-','V'),       #Version


    _parameters = {}
    _parameters.update(_general)
    _parameters.update(_input)
    _parameters.update(_correction)
    _parameters.update(_output)
 
    _command = 'clearcut'
   
    def getHelp(self):
        """Method that points to the Clearcut documentation."""
        help_str =\
        """
        See Clearcut homepage at:
        http://bioinformatics.hungry.com/clearcut/
        """
        return help_str
   
    def _input_as_multiline_string(self, data):
        """Writes data to tempfile and sets -infile parameter

        data -- list of lines
        """
        if data:
            self.Parameters['--in']\
                .on(super(Clearcut,self)._input_as_multiline_string(data))
        return ''

    def _input_as_lines(self,data):
        """Writes data to tempfile and sets -infile parameter

        data -- list of lines, ready to be written to file
        """
        if data:
            self.Parameters['--in']\
                .on(super(Clearcut,self)._input_as_lines(data))
        return ''

    def _input_as_seqs(self,data):
        """writes sequences to tempfile and sets -infile parameter

        data -- list of sequences

        Adds numbering to the sequences: >1, >2, etc.
        """
        lines = []
        for i,s in enumerate(data):
            #will number the sequences 1,2,3,etc.
            lines.append(''.join(['>',str(i+1)]))
            lines.append(s)
        return self._input_as_lines(lines)

    def _input_as_string(self,data):
        """Makes data the value of a specific parameter
    
        This method returns the empty string. The parameter will be printed
        automatically once set.
        """
        if data:
            self.Parameters['--in'].on(data)
        return ''
    
    def _tree_filename(self):
        """Return name of file containing the alignment
        
        prefix -- str, prefix of alignment file.
        """
        if self.Parameters['--out']:
            aln_filename = self._absolute(self.Parameters['--out'].Value)
        else:
            raise ValueError, "No tree output file specified."
        return aln_filename

    def _get_result_paths(self,data):
        """Return dict of {key: ResultPath}
        """
        result = {}
        if self.Parameters['--out'].isOn():
            out_name = self._tree_filename()
            result['Tree'] = ResultPath(Path=out_name,IsWritten=True)
        return result      


        
#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS


def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.
    
    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Clearcut app controller.
    
    Result will be an Alignment object.
    """
    #Clearcut does not support alignment
    raise NotImplementedError, """Clearcut does not support alignment."""
    
def align_and_build_tree(seqs, moltype, best_tree=False, params={}):
    """Returns an alignment and a tree from Sequences object seqs.
    
    seqs: SequenceCollection object, or data that can be used to build one.
    
    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.

    The result will be a tuple containing an Alignment object and a
    cogent.core.tree.PhyloNode object (or None for the alignment and/or tree
    if either fails).
    """
    #Clearcut does not support alignment
    raise NotImplementedError, """Clearcut does not support alignment."""
    
def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\
    working_dir='/tmp'):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.
        -  Clearcut only accepts aligned sequences.  Alignment object used to
        handle unaligned sequences.
    
    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
        gives incorrect results if RNA is passed in.  'U' is treated as an 
        incorrect character and is excluded from distance calculations.

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.

    The result will be an cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    params['--out'] = get_tmp_filename(working_dir)
    
    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir=working_dir, SuppressStdout=True,\
                   SuppressStderr=True)
    #Input is an alignment
    app.Parameters['-a'].on()
    #Turn off input as distance matrix
    app.Parameters['-d'].off()
    
    #If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA
    
    if best_tree:
        app.Parameters['-N'].on()
    
    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()    

    # Setup mapping. Clearcut clips identifiers. We will need to remap them.
    # Clearcut only accepts aligned sequences.  Let Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln,MolType=moltype)
    #get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    #create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())
    
    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del(seq_aln, app, result, int_map, int_keys, params)

    return tree
    
def add_seqs_to_alignment(seqs, aln, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: an cogent.core.sequence.Sequence object, or data that can be used
    to build one.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one

    params: dict of parameters to pass in to the Clearcut app controller.
    """
    #Clearcut does not support alignment
    raise NotImplementedError, """Clearcut does not support alignment."""

def align_two_alignments(aln1, aln2, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.

    params: dict of parameters to pass in to the Clearcut app controller.
    """
    #Clearcut does not support alignment
    raise NotImplementedError, """Clearcut does not support alignment."""

    
def build_tree_from_distance_matrix(matrix, best_tree=False, params={},\
    working_dir='/tmp'):
    """Returns a tree from a distance matrix.

    matrix: a square Dict2D object (cogent.util.dict2d)
    
    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.

    The result will be an cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    params['--out'] = get_tmp_filename(working_dir)
    
    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir=working_dir, SuppressStdout=True,\
                   SuppressStderr=True)
    #Turn off input as alignment
    app.Parameters['-a'].off()
    #Input is a distance matrix
    app.Parameters['-d'].on()
    
    if best_tree:
        app.Parameters['-N'].on()
    
    # Turn the dict2d object into the expected input format
    matrix_input, int_keys = _matrix_input_from_dict2d(matrix)

    # Collect result
    result = app(matrix_input)
    
    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)

    # reassign to original names
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del(app, result, params)

    return tree

def _matrix_input_from_dict2d(matrix):
    """makes input for running clearcut on a matrix from a dict2D object"""
    #clearcut truncates names to 10 char- need to rename before and 
    #reassign after
    
    #make a dict of env_index:full name
    int_keys = dict([('env_' + str(i), k) for i,k in \
            enumerate(sorted(matrix.keys()))])
    #invert the dict
    int_map = {}
    for i in int_keys:
        int_map[int_keys[i]] = i

    #make a new dict2D object with the integer keys mapped to values instead of
    #the original names
    new_dists = []
    for env1 in matrix:
        for env2 in matrix[env1]:
            new_dists.append((int_map[env1], int_map[env2], matrix[env1][env2]))
    int_map_dists = Dict2D(new_dists)
    
    #names will be fed into the phylipTable function - it is the int map names
    names = sorted(int_map_dists.keys())
    rows = []
    #populated rows with values based on the order of names
    #the following code will work for a square matrix only
    for index, key1 in enumerate(names):
        row = []
        for key2 in names:
            row.append(str(int_map_dists[key1][key2]))
        rows.append(row)
    input_matrix = phylipMatrix(rows, names)
    #input needs a trailing whitespace or it will fail!
    input_matrix += '\n'
   
    return input_matrix, int_keys