File: cd_hit.py

package info (click to toggle)
python-cogent 1.9-14
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 19,752 kB
  • sloc: python: 137,485; makefile: 149; sh: 64
file content (338 lines) | stat: -rw-r--r-- 12,036 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#!/usr/bin/env python
"""Application controller for CD-HIT v3.1.1"""

import shutil
from os import remove
from cogent.app.parameters import ValuedParameter
from cogent.app.util import CommandLineApplication, ResultPath,\
        get_tmp_filename
from cogent.core.moltype import RNA, DNA, PROTEIN
from cogent.core.alignment import SequenceCollection
from cogent.parse.fasta import MinimalFastaParser

__author__ = "Daniel McDonald"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Daniel McDonald"
__email__ = "mcdonadt@colorado.edu"
__status__ = "Development"

class CD_HIT(CommandLineApplication):
    """cd-hit Application Controller

    Use this version of CD-HIT if your MolType is PROTEIN
    """

    _command = 'cd-hit'
    _input_handler = '_input_as_multiline_string'
    _parameters = {
        # input input filename in fasta format, required
        '-i':ValuedParameter('-',Name='i',Delimiter=' ',IsPath=True),

        # output filename, required
        '-o':ValuedParameter('-',Name='o',Delimiter=' ',IsPath=True),

        # sequence identity threshold, default 0.9
        # this is the default cd-hit's "global sequence identity" calc'd as :
        # number of identical amino acids in alignment
        # divided by the full length of the shorter sequence
        '-c':ValuedParameter('-',Name='c',Delimiter=' '),

        # use global sequence identity, default 1
        # if set to 0, then use local sequence identity, calculated as :
        # number of identical amino acids in alignment
        # divided by the length of the alignment
        # NOTE!!! don't use -G 0 unless you use alignment coverage controls
        # see options -aL, -AL, -aS, -AS
        '-g':ValuedParameter('-',Name='g',Delimiter=' '),

        # band_width of alignment, default 20
        '-b':ValuedParameter('-',Name='b',Delimiter=' '),

        # max available memory (Mbyte), default 400
        '-M':ValuedParameter('-',Name='M',Delimiter=' '),

        # word_length, default 8, see user's guide for choosing it
        '-n':ValuedParameter('-',Name='n',Delimiter=' '),

        # length of throw_away_sequences, default 10
        '-l':ValuedParameter('-',Name='l',Delimiter=' '),

        # tolerance for redundance, default 2
        '-t':ValuedParameter('-',Name='t',Delimiter=' '),

        # length of description in .clstr file, default 20
        # if set to 0, it takes the fasta defline and stops at first space
        '-d':ValuedParameter('-',Name='d',Delimiter=' '),

        # length difference cutoff, default 0.0
        # if set to 0.9, the shorter sequences need to be
        # at least 90% length of the representative of the cluster
        '-s':ValuedParameter('-',Name='s',Delimiter=' '),

        # length difference cutoff in amino acid, default 999999
        # f set to 60, the length difference between the shorter sequences
        # and the representative of the cluster can not be bigger than 60
        '-S':ValuedParameter('-',Name='S',Delimiter=' '),

        # alignment coverage for the longer sequence, default 0.0
        # if set to 0.9, the alignment must covers 90% of the sequence
        '-aL':ValuedParameter('-',Name='aL',Delimiter=' '),

        # alignment coverage control for the longer sequence, default 99999999
        # if set to 60, and the length of the sequence is 400,
        # then the alignment must be >= 340 (400-60) residues
        '-AL':ValuedParameter('-',Name='AL',Delimiter=' '),

        # alignment coverage for the shorter sequence, default 0.0
        # if set to 0.9, the alignment must covers 90% of the sequence
        '-aS':ValuedParameter('-',Name='aS',Delimiter=' '),

        # alignment coverage control for the shorter sequence, default 99999999
        # if set to 60, and the length of the sequence is 400,
        # then the alignment must be >= 340 (400-60) residues
        '-AS':ValuedParameter('-',Name='AS',Delimiter=' '),

        # 1 or 0, default 0, by default, sequences are stored in RAM
        # if set to 1, sequence are stored on hard drive
        # it is recommended to use -B 1 for huge databases
        '-B':ValuedParameter('-',Name='B',Delimiter=' '),

        # 1 or 0, default 0
        # if set to 1, print alignment overlap in .clstr file
        '-p':ValuedParameter('-',Name='p',Delimiter=' '),

        # 1 or 0, default 0
        # by cd-hit's default algorithm, a sequence is clustered to the first 
        # cluster that meet the threshold (fast cluster). If set to 1, the program
        # will cluster it into the most similar cluster that meet the threshold
        # (accurate but slow mode)
        # but either 1 or 0 won't change the representatives of final clusters
        '-g':ValuedParameter('-',Name='g',Delimiter=' '),

        # print this help
        '-h':ValuedParameter('-',Name='h',Delimiter=' ')
    }
    _synonyms = {'Similarity':'-c'}
 
    def getHelp(self):
        """Method that points to documentation"""
        help_str =\
        """
        CD-HIT is hosted as an open source project at:
        http://www.bioinformatics.org/cd-hit/

        The following papers should be cited if this resource is used:

        Clustering of highly homologous sequences to reduce thesize of large 
        protein database", Weizhong Li, Lukasz Jaroszewski & Adam Godzik
        Bioinformatics, (2001) 17:282-283

        Tolerating some redundancy significantly speeds up clustering of large
        protein databases", Weizhong Li, Lukasz Jaroszewski & Adam Godzik 
        Bioinformatics, (2002) 18:77-82
        """
        return help_str

    def _input_as_multiline_string(self, data):
        """Writes data to tempfile and sets -i parameter

        data -- list of lines
        """
        if data:
            self.Parameters['-i']\
                    .on(super(CD_HIT,self)._input_as_multiline_string(data))
        return ''

    def _input_as_lines(self, data):
        """Writes data to tempfile and sets -i parameter

        data -- list of lines, ready to be written to file
        """
        if data:
            self.Parameters['-i']\
                    .on(super(CD_HIT,self)._input_as_lines(data))
        return ''

    def _input_as_seqs(self, data):
        """Creates a list of seqs to pass to _input_as_lines

        data -- list like object of sequences
        """
        lines = []
        for i,s in enumerate(data):
            # will number the sequences 1,2,3, etc...
            lines.append(''.join(['>',str(i+1)]))
            lines.append(s)
        return self._input_as_lines(lines)

    def _input_as_string(self, data):
        """Makes data the value of a specific parameter"""
        if data:
            self.Parameters['-i'].on(str(data))
        return ''

    def _get_seqs_outfile(self):
        """Returns the absolute path to the seqs outfile"""
        if self.Parameters['-o'].isOn():
            return self.Parameters['-o'].Value
        else:
            raise ValueError, "No output file specified"

    def _get_clstr_outfile(self):
        """Returns the absolute path to the clstr outfile"""
        if self.Parameters['-o'].isOn():
            return ''.join([self.Parameters['-o'].Value, '.clstr'])
        else:
            raise ValueError, "No output file specified"

    def _get_result_paths(self, data):
        """Return dict of {key: ResultPath}"""
        result = {}
        result['FASTA'] = ResultPath(Path=self._get_seqs_outfile())
        result['CLSTR'] = ResultPath(Path=self._get_clstr_outfile())
        return result

class CD_HIT_EST(CD_HIT):
    """cd-hit Application Controller

    Use this version of CD-HIT if your MolType is PROTEIN
    """

    _command = 'cd-hit-est'
    _input_handler = '_input_as_multiline_string'
    _parameters = CD_HIT._parameters
    _parameters.update({\
        # 1 or 0, default 0, by default only +/+ strand alignment
        # if set to 1, do both +/+ & +/- alignments
        '-r':ValuedParameter('-',Name='r',Delimiter=' ')
        })

def cdhit_clusters_from_seqs(seqs, moltype, params=None):
    """Returns the CD-HIT clusters given seqs

    seqs        : dict like collection of sequences
    moltype     : cogent.core.moltype object
    params      : cd-hit parameters

    NOTE: This method will call CD_HIT if moltype is PROTIEN,
        CD_HIT_EST if moltype is RNA/DNA, and raise if any other
        moltype is passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seqs.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)
    
    # setup params and make sure the output argument is set
    if params is None:
        params = {}
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # call the correct version of cd-hit base on moltype
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    elif moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"

    # grab result
    res = app(int_map.toFasta())
    clusters = parse_cdhit_clstr_file(res['CLSTR'].readlines())

    remapped_clusters = []
    for c in clusters:
        curr = [int_keys[i] for i in c]
        remapped_clusters.append(curr)

    # perform cleanup
    res.cleanUp()
    shutil.rmtree(working_dir)
    try:
	remove(params['-o'] + '.bak.clstr')
    except OSError:
	pass #maybe there was no file

    return remapped_clusters

def cdhit_from_seqs(seqs, moltype, params=None):
    """Returns the CD-HIT results given seqs

    seqs    : dict like collection of sequences
    moltype : cogent.core.moltype object
    params  : cd-hit parameters

    NOTE: This method will call CD_HIT if moltype is PROTIEN,
        CD_HIT_EST if moltype is RNA/DNA, and raise if any other
        moltype is passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)

    # setup params and make sure the output argument is set
    if params is None:
        params = {}
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # call the correct version of cd-hit base on moltype
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    elif moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"

    # grab result
    res = app(seqs.toFasta())
    new_seqs = dict(MinimalFastaParser(res['FASTA'].readlines()))

    # perform cleanup
    res.cleanUp()
    shutil.rmtree(working_dir)
    try:
	remove(params['-o'] + '.bak.clstr')
    except OSError:
	pass #maybe there was no file

    return SequenceCollection(new_seqs, MolType=moltype)

def clean_cluster_seq_id(id):
    """Returns a cleaned cd-hit sequence id

    The cluster file has sequence ids in the form of:
    >some_id...
    """
    return id[1:-3]

def parse_cdhit_clstr_file(lines):
    """Returns a list of list of sequence ids representing clusters"""
    clusters = []
    curr_cluster = []

    for l in lines:
        if l.startswith('>Cluster'):
            if not curr_cluster:
                continue
            clusters.append(curr_cluster)
            curr_cluster = []
        else:
            curr_cluster.append(clean_cluster_seq_id(l.split()[2]))

    if curr_cluster:
        clusters.append(curr_cluster)

    return clusters