File: test_cd_hit.py

package info (click to toggle)
python-cogent 1.9-14
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 19,752 kB
  • sloc: python: 137,485; makefile: 149; sh: 64
file content (185 lines) | stat: -rw-r--r-- 7,367 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python

from os import getcwd, rmdir
from cogent.core.moltype import PROTEIN, DNA
from cogent.util.unit_test import TestCase, main
from cogent.app.cd_hit import CD_HIT, CD_HIT_EST, cdhit_from_seqs, \
        cdhit_clusters_from_seqs, clean_cluster_seq_id, parse_cdhit_clstr_file

__author__ = "Daniel McDonald"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Daniel McDonald"
__email__ = "mcdonadt@colorado.edu"
__status__ = "Development"

class CD_HIT_Tests(TestCase):
    """Tests for the CD-HIT application controller"""

    def test_base_command(self):
        """CD_HIT BaseCommand should return the correct BaseCommand"""
        # Every expected command starts with a cd into the working
        # directory; no WorkingDir is passed here, so that is getcwd().
        prefix = 'cd "' + getcwd() + '/"; '
        c = CD_HIT()
        self.assertEqual(c.BaseCommand, prefix + 'cd-hit')
        c.Parameters['-i'].on('seq.txt')
        self.assertEqual(c.BaseCommand, prefix + 'cd-hit -i "seq.txt"')
        # -c is expected ahead of -i even though it is switched on later
        c.Parameters['-c'].on(0.8)
        self.assertEqual(c.BaseCommand,
            prefix + 'cd-hit -c 0.8 -i "seq.txt"')

    def test_changing_working_dir(self):
        """CD_HIT BaseCommand should change according to WorkingDir"""
        # WorkingDir supplied to the constructor
        c = CD_HIT(WorkingDir='/tmp/cdhit_test')
        try:
            self.assertEqual(c.BaseCommand,
                'cd "/tmp/cdhit_test/"; cd-hit')
        finally:
            # Removing the dir is proof that it was created: if the dir is
            # not there, an OSError will be raised. Doing this in a finally
            # block keeps a failed assertion from leaving the dir behind.
            rmdir('/tmp/cdhit_test')

        # WorkingDir assigned after construction
        c = CD_HIT()
        c.WorkingDir = '/tmp/cdhit_test2'
        try:
            self.assertEqual(c.BaseCommand,
                'cd "/tmp/cdhit_test2/"; cd-hit')
        finally:
            # same proof-of-creation and cleanup as above
            rmdir('/tmp/cdhit_test2')

class CD_HIT_EST_Tests(TestCase):
    """Tests for the CD-HIT-EST application controller"""

    def test_base_command(self):
        """CD_HIT_EST BaseCommand should return the correct BaseCommand"""
        # Every expected command starts with a cd into the working
        # directory; no WorkingDir is passed here, so that is getcwd().
        prefix = 'cd "' + getcwd() + '/"; '
        c = CD_HIT_EST()
        self.assertEqual(c.BaseCommand, prefix + 'cd-hit-est')
        c.Parameters['-i'].on('seq.txt')
        self.assertEqual(c.BaseCommand, prefix + 'cd-hit-est -i "seq.txt"')
        # -c is expected ahead of -i even though it is switched on later
        c.Parameters['-c'].on(0.8)
        self.assertEqual(c.BaseCommand,
            prefix + 'cd-hit-est -c 0.8 -i "seq.txt"')

    def test_changing_working_dir(self):
        """CD_HIT_EST BaseCommand should change according to WorkingDir"""
        # WorkingDir supplied to the constructor
        c = CD_HIT_EST(WorkingDir='/tmp/cdhitest_test')
        try:
            self.assertEqual(c.BaseCommand,
                'cd "/tmp/cdhitest_test/"; cd-hit-est')
        finally:
            # Removing the dir is proof that it was created: if the dir is
            # not there, an OSError will be raised. Doing this in a finally
            # block keeps a failed assertion from leaving the dir behind.
            rmdir('/tmp/cdhitest_test')

        # WorkingDir assigned after construction
        c = CD_HIT_EST()
        c.WorkingDir = '/tmp/cdhitest_test2'
        try:
            self.assertEqual(c.BaseCommand,
                'cd "/tmp/cdhitest_test2/"; cd-hit-est')
        finally:
            # same proof-of-creation and cleanup as above
            rmdir('/tmp/cdhitest_test2')

class CD_HIT_SupportMethodTests(TestCase):
    """Tests for supporting methods"""

    def test_clean_cluster_seq_id(self):
        """clean_cluster_seq_id returns a cleaned sequence id"""
        # the leading '>' and trailing '...' must both be gone afterwards
        self.assertEqual(clean_cluster_seq_id(">foobar..."), "foobar")

    def test_parse_cdhit_clstr_file(self):
        """parse_cdhit_clstr_file returns the correct clusters"""
        # one inner list per ">Cluster N" section, ids in file order
        expected_clusters = [
            ['seq0'],
            ['seq1', 'seq10', 'seq3', 'seq23', 'seq145'],
            ['seq7', 'seq17', 'seq69', 'seq1231'],
        ]
        clstr_lines = cdhit_clstr_file.split('\n')
        self.assertEqual(parse_cdhit_clstr_file(clstr_lines),
                         expected_clusters)

# Fixture: ten FASTA-formatted nucleotide records, named
# cdhit_test_seqs_0 through cdhit_test_seqs_9, as one multi-line string.
# NOTE(review): no test visible in this file reads this constant; presumably
# it is input for the cdhit_from_seqs-style helpers imported above -- confirm.
dna_seqs = """>cdhit_test_seqs_0
AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
>cdhit_test_seqs_1
ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
>cdhit_test_seqs_2
CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
>cdhit_test_seqs_3
CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
>cdhit_test_seqs_4
GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
>cdhit_test_seqs_5
CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
>cdhit_test_seqs_6
CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
>cdhit_test_seqs_7
ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
>cdhit_test_seqs_8
CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
>cdhit_test_seqs_9
GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA"""

# Fixture: FASTA text with the six records named cdhit_test_seqs_0, _1, _2,
# _4, _5 and _7 (records _3, _6, _8 and _9 of dna_seqs are absent).
# NOTE(review): presumably the sequences expected back after clustering
# dna_seqs; no test visible in this file reads this constant -- confirm.
dna_expected = """>cdhit_test_seqs_0
AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
>cdhit_test_seqs_1
ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
>cdhit_test_seqs_2
CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
>cdhit_test_seqs_4
GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
>cdhit_test_seqs_5
CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
>cdhit_test_seqs_7
ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT"""

# Fixture: ten FASTA-formatted amino-acid records, named seq1 through seq10,
# as one multi-line string.
# NOTE(review): no test visible in this file reads this constant; presumably
# it is input for the cdhit_from_seqs-style helpers imported above -- confirm.
protein_seqs = """>seq1
MGNKWSKSWPQVRDRMRRAAPAPAADGVGAVSQDLAKHGAITSSNTAATNDDCAWLEAQTEEEVGFPVRPQVPLRPMTYK
>seq2
MGGKWSKSSIVGWSTVRERMRKTPPAADGVGAVSQDLDKHGAVTSSNTAFNNPDCAWLEAQEDEDVGFPVRPQVPLRPT
>seq3
MGGKWSKSSIVGWPAIRERMRRARPAADRVGTQPAADGVGAVSQDLARHGAVTSSNTSHNNPDCAWLEAQEEEEVGVR
>seq4
MGKIWSKSSIVGWPEIRERMRRQRPHEPAVEPAVGVGAASQDLANRGALTTSNTRTNNPTVAWVEAQEEEGEVVRPQ
>seq5
MGKIWSKSSLVGWPEIRERMRRQTQEPAVEPAVGAGAASQDLANRGAITIRNTRDNNESIAWLEAQEEEFPVRPQV
>seq6
MGKIWSKSSLVGWPEIRERIRRQTPEPAVGVGAVSQDLANRGAITTSNTKDNNQTVAWLEAQEEPVRPQVPLRPM
>seq7
MGNALRKGKFEGWAAVRERMRRTRTFPESEPCAPGVGQISRELAARGGIPSSHTPQNNESHQEEEVGFPVAPQV
>seq8
MGNAWSKSKFAGWSEVRDRMRRSSSDPQQPCAPGVGAVSRELATRGGISSSALAFLDSHKDEDVGFPVRPQVP
>seq9
MGNVLGKDKFKGWAAVRERMRKTSSDPDPQPCAPGVGPVSRELSYTPQNNAALAFLESHEDEDVGFPVXPQV
>seq10
MGNVLGKDKFKGWSAVRERMRKTSPEPEPCAPGVRGGISNSHTPQNNAALAFLESHQDEDVGFPVRPQVPL"""

# Fixture: FASTA text with the eight records named seq1-seq5 and seq7-seq9
# (records seq6 and seq10 of protein_seqs are absent).
# NOTE(review): presumably the sequences expected back after clustering
# protein_seqs; no test visible in this file reads this constant -- confirm.
protein_expected = """>seq1
MGNKWSKSWPQVRDRMRRAAPAPAADGVGAVSQDLAKHGAITSSNTAATNDDCAWLEAQTEEEVGFPVRPQVPLRPMTYK
>seq2
MGGKWSKSSIVGWSTVRERMRKTPPAADGVGAVSQDLDKHGAVTSSNTAFNNPDCAWLEAQEDEDVGFPVRPQVPLRPT
>seq3
MGGKWSKSSIVGWPAIRERMRRARPAADRVGTQPAADGVGAVSQDLARHGAVTSSNTSHNNPDCAWLEAQEEEEVGVR
>seq4
MGKIWSKSSIVGWPEIRERMRRQRPHEPAVEPAVGVGAASQDLANRGALTTSNTRTNNPTVAWVEAQEEEGEVVRPQ
>seq5
MGKIWSKSSLVGWPEIRERMRRQTQEPAVEPAVGAGAASQDLANRGAITIRNTRDNNESIAWLEAQEEEFPVRPQV
>seq7
MGNALRKGKFEGWAAVRERMRRTRTFPESEPCAPGVGQISRELAARGGIPSSHTPQNNESHQEEEVGFPVAPQV
>seq8
MGNAWSKSKFAGWSEVRDRMRRSSSDPQQPCAPGVGAVSRELATRGGISSSALAFLDSHKDEDVGFPVRPQVP
>seq9
MGNVLGKDKFKGWAAVRERMRKTSSDPDPQPCAPGVGPVSRELSYTPQNNAALAFLESHEDEDVGFPVXPQV"""

# Fixture: sample cluster-file text that test_parse_cdhit_clstr_file splits
# on '\n' and passes to parse_cdhit_clstr_file. It has three ">Cluster N"
# sections holding 1, 5 and 4 member lines; each member line carries an
# index, a length with an "aa" suffix, a ">id..." token and either "*" or
# "at NN%". Trailing spaces on the lines are part of the literal -- keep them.
cdhit_clstr_file = """>Cluster 0 
0       2799aa, >seq0... * 
>Cluster 1 
0       2214aa, >seq1... at 80% 
1       2215aa, >seq10... at 84% 
2       2217aa, >seq3... * 
3       2216aa, >seq23... at 84% 
4       527aa, >seq145... at 63% 
>Cluster 2 
0       2202aa, >seq7... at 60% 
1       2208aa, >seq17... * 
2       2207aa, >seq69... at 73% 
3       2208aa, >seq1231... at 69%"""


# Only when run as a script: call main, imported from cogent.util.unit_test.
if __name__ == "__main__":
    main()