File: test_kegg_ko.py

package info (click to toggle)
python-cogent 1.5.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 16,424 kB
  • ctags: 24,343
  • sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (361 lines) | stat: -rw-r--r-- 17,141 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#!/usr/bin/env python

"""Test code for kegg_ko.py in cogent.parse."""

# The docstring must be the first statement of the module (after the shebang
# comment) for Python to store it in ``__doc__``; placed after the metadata
# assignments it is only a discarded string expression.

# Project metadata for this test module.
__author__ = "Jesse Zaneveld"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Jesse Zaneveld"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Jesse Zaneveld"
__email__ = "zaneveld@gmail.com"
__status__ = "Production"
from cogent.util.unit_test import TestCase, main

from cogent.parse.kegg_ko import kegg_label_fields,\
  parse_kegg_taxonomy, ko_record_iterator, ko_record_splitter,\
  ko_default_parser, ko_first_field_parser, delete_comments,\
  ko_colon_fields, ko_colon_delimited_parser, _is_new_kegg_rec_group,\
  group_by_end_char, class_lines_to_fields, ko_class_parser, parse_ko,\
  parse_ko_file, make_tab_delimited_line_parser



class ParseKOTests(TestCase):
    """Tests for the KEGG KO parsing functions imported from cogent.parse.kegg_ko.

    Each test feeds literal KO lines (or the module-level TEST_KO_LINES
    fixture) to one parser function and compares the result with a
    hand-written expectation.
    """

    # NOTE(review): this method's name lacks the ``test`` prefix, so it is
    # presumably never collected by the unittest runner.  Before renaming it,
    # confirm the column indexes passed below: the sample line carries its
    # colon-delimited values in columns 1 and 4, not in [0, 2, 3, 5].
    def make_tab_delimited_line_parser(self):
        """make_tab_delimited_line_parser should generate line parser"""
        line = "good\tbad:good\tgood\tgood\tbad:good\tgood"
        # Inside the method body this name resolves to the imported function,
        # not to this method (methods are not in the enclosing lookup scope).
        parse_fn = make_tab_delimited_line_parser([0, 2, 3, 5])
        obs = parse_fn(line)
        exp = "good\tgood\tgood\tgood\tgood\tgood"
        self.assertEqual(obs, exp)

    def test_kegg_label_fields(self):
        """kegg_label_fields should return fields from line"""
        # Format is species:gene_id [optional gene_name]; description.
        # Note that the '>' should already be stripped by the Fasta Parser
        label_one_ko = "stm:STM0001  thrL; thr operon leader peptide ; K08278 thr operon leader peptide"
        label_two_kos = "stm:STM0002  thrA; bifunctional aspartokinase I/homeserine dehydrogenase I (EC:2.7.2.4 1.1.1.13); K00003 homoserine dehydrogenase [EC:1.1.1.3]; K00928 aspartate kinase [EC:2.7.2.4]"

        obs = kegg_label_fields(label_one_ko)
        exp = ('stm:STM0001', 'stm', 'STM0001', 'thrL',
               'thr operon leader peptide ; K08278 thr operon leader peptide')
        self.assertEqual(obs, exp)

        obs = kegg_label_fields(label_two_kos)
        exp = ('stm:STM0002', 'stm', 'STM0002', 'thrA',
               'bifunctional aspartokinase I/homeserine dehydrogenase I (EC:2.7.2.4 1.1.1.13); K00003 homoserine dehydrogenase [EC:1.1.1.3]; K00928 aspartate kinase [EC:2.7.2.4]')
        self.assertEqual(obs, exp)

    def test_ko_record_iterator(self):
        """ko_record_iterator should iterate over KO records"""
        recs = list(ko_record_iterator(TEST_KO_LINES))

        # The fixture holds three records; the first spans 31 lines.
        self.assertEqual(len(recs), 3)
        self.assertEqual(len(recs[0]), 31)

        # First and last line of the first record.
        self.assertEqual(recs[0][0],
                         'ENTRY       K01559                      KO\n')
        self.assertEqual(recs[0][-1],
                         '            RCI: RCIX1162 RCIX2396\n')

        # First and last line of the last record.
        self.assertEqual(recs[-1][0],
                         'ENTRY       K01561                      KO\n')
        self.assertEqual(recs[-1][-1],
                         '            MSE: Msed_1088\n')

    def test_ko_record_splitter(self):
        """ko_record_splitter should split ko lines into a dict of groups"""
        recs = list(ko_record_iterator(TEST_KO_LINES))
        split_recs = ko_record_splitter(recs[0])

        exp_genes = [
            'GENES       AFM: AFUA_4G13070\n',
            '            PHA: PSHAa2393\n',
            '            ABO: ABO_0668\n',
            '            BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n',
            '            MPT: Mpe_A2274\n',
            '            BBA: Bd0910(catD)\n',
            '            GBE: GbCGDNIH1_0998 GbCGDNIH1_1171\n',
            '            FNU: FN1345\n',
            '            RBA: RB13257\n',
            '            HMA: rrnAC1925(mhpC)\n',
            '            RCI: RCIX1162 RCIX2396\n',
        ]
        self.assertEqual(exp_genes, split_recs["GENES"])

        exp_class = [
            'CLASS       Metabolism; Biosynthesis of Secondary Metabolites; Limonene and\n',
            '            pinene degradation [PATH:ko00903]\n',
            '            Metabolism; Xenobiotics Biodegradation and Metabolism; Caprolactam\n',
            '            degradation [PATH:ko00930]\n',
            '            Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
            '            1,1,1-Trichloro-2,2-bis(4-chlorophenyl)ethane (DDT) degradation\n',
            '            [PATH:ko00351]\n',
            '            Metabolism; Xenobiotics Biodegradation and Metabolism; Benzoate\n',
            '            degradation via CoA ligation [PATH:ko00632]\n',
            '            Metabolism; Xenobiotics Biodegradation and Metabolism; Benzoate\n',
            '            degradation via hydroxylation [PATH:ko00362]\n',
        ]
        # This expectation was previously built but never compared, so the
        # CLASS group went unchecked.
        self.assertEqual(exp_class, split_recs["CLASS"])

    def test_ko_default_parser(self):
        """ko_default parser should strip out newlines and join lines together"""
        # Applies to 'NAME' and 'DEFINITION' lines
        name_lines = ['NAME        E3.8.1.2\n']
        self.assertEqual(ko_default_parser(name_lines), 'E3.8.1.2')

        definition_lines = \
            ['DEFINITION  2-haloacid dehalogenase [EC:3.8.1.2]\n']
        self.assertEqual(ko_default_parser(definition_lines),
                         '2-haloacid dehalogenase [EC:3.8.1.2]')

    def test_ko_first_field_parser(self):
        """ko_first_field_parser should strip newlines, join lines, keep first field"""
        entry_lines = ['ENTRY       K01559                      KO\n']
        obs = ko_first_field_parser(entry_lines)
        self.assertEqual(obs, 'K01559')

    def test_delete_comments(self):
        """delete_comments should delete parenthetical comments from lines"""
        test_line = \
            "bifunctional aspartokinase I/homeserine dehydrogenase I (EC:2.7.2.4 1.1.1.13);"
        exp = "bifunctional aspartokinase I/homeserine dehydrogenase I ;"
        self.assertEqual(delete_comments(test_line), exp)

        # Nested parentheses should be removed as a whole.
        nested_test_line = "text(comment1(comment2));"
        self.assertEqual(delete_comments(nested_test_line), "text;")

    def test_ko_colon_fields(self):
        """ko_colon_fields should convert lines to (key, [list of values])"""
        test_lines = \
            ['            BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n']
        obs = ko_colon_fields(test_lines)
        exp = ('BXE', ['Bxe_B0037', 'Bxe_C0683', 'Bxe_C1002', 'Bxe_C1023'])
        self.assertEqual(obs, exp)

        # The same commented line, parsed with and without comment removal.
        test_lines = ['            HMA: rrnAC1925(mhpC)\n']

        obs = ko_colon_fields(test_lines, without_comments=True)
        self.assertEqual(obs, ('HMA', ['rrnAC1925']))

        obs = ko_colon_fields(test_lines, without_comments=False)
        self.assertEqual(obs, ('HMA', ['rrnAC1925(mhpC)']))

    def test_ko_colon_delimited_parser(self):
        """ko_colon_delimited_parser should return a dict of id: values"""
        test_lines = [
            'GENES       AFM: AFUA_4G13070\n',
            '            PHA: PSHAa2393\n',
            '            ABO: ABO_0668\n',
            '            BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n',
            '            MPT: Mpe_A2274\n',
            '            BBA: Bd0910(catD)\n',
            '            GBE: GbCGDNIH1_0998 GbCGDNIH1_1171\n',
            '            FNU: FN1345\n',
            '            RBA: RB13257\n',
            '            HMA: rrnAC1925(mhpC)\n',
            '            RCI: RCIX1162 RCIX2396\n',
        ]
        exp_bxe = ['Bxe_B0037', 'Bxe_C0683', 'Bxe_C1002', 'Bxe_C1023']

        obs = ko_colon_delimited_parser(test_lines, without_comments=True)
        self.assertEqual(obs['BXE'], exp_bxe)
        self.assertEqual(obs['PHA'], ['PSHAa2393'])
        # Check that comments are stripped
        self.assertEqual(obs['BBA'], ['Bd0910'])

        obs = ko_colon_delimited_parser(test_lines, without_comments=False)
        # Lines without comments shouldn't be affected
        self.assertEqual(obs['BXE'], exp_bxe)
        self.assertEqual(obs['PHA'], ['PSHAa2393'])
        # Comments should be preserved
        self.assertEqual(obs['BBA'], ['Bd0910(catD)'])

    def test_is_new_kegg_rec_group(self):
        """_is_new_kegg_rec_group should check for irregular field terminators in KEGG"""
        # Placeholder: no assertions yet, so this test always passes.
        # TODO: add cases covering the unusual KEGG fields this helper handles.
        pass

    def test_group_by_end_char(self):
        """group_by_end_char should yield successive lines that end with a given
        char, plus the last group of lines"""
        class_lines = [
            'CLASS       Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
            '            gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n',
            '            Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
            '             1,2-Dichloroethane degradation [PATH:ko00631]\n',
        ]
        exp = [
            ['CLASS       Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
             '            gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n'],
            ['            Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
             '             1,2-Dichloroethane degradation [PATH:ko00631]\n'],
        ]
        # Compare the whole output at once: a per-item loop would pass
        # silently if fewer groups than expected were produced.
        obs = list(group_by_end_char(class_lines))
        self.assertEqual(obs, exp)

    def test_class_lines_to_fields(self):
        """class_lines_to_fields should split groups of lines for one KO class
        definition"""
        class_lines1 = [
            'CLASS      Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
            '            gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n',
        ]
        class_lines2 = [
            '            Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
            '            1,2-Dichloroethane degradation [PATH:ko00631]\n',
        ]

        obs = class_lines_to_fields(class_lines1)
        exp = ('PATH:ko00361',
               ('Metabolism',
                'Xenobiotics Biodegradation and Metabolism',
                'gamma-Hexachlorocyclohexane degradation'))
        self.assertEqual(obs, exp)

        obs = class_lines_to_fields(class_lines2)
        exp = ('PATH:ko00631',
               ('Metabolism',
                'Xenobiotics Biodegradation and Metabolism',
                '1,2-Dichloroethane degradation'))
        self.assertEqual(obs, exp)

    def test_ko_class_parser(self):
        """ko_class_parser should return fields"""
        # Kept as a tuple of lines, as in the original test input.
        class_lines = (
            'CLASS       Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
            '            gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n',
            '            Metabolism; Xenobiotics Biodegradation and Metabolism;\n',
            '             1,2-Dichloroethane degradation [PATH:ko00631]\n',
        )
        exp = [
            ('PATH:ko00361',
             ('Metabolism',
              'Xenobiotics Biodegradation and Metabolism',
              'gamma-Hexachlorocyclohexane degradation')),
            ('PATH:ko00631',
             ('Metabolism',
              'Xenobiotics Biodegradation and Metabolism',
              '1,2-Dichloroethane degradation')),
        ]
        # Compare the whole output at once so that missing entries are
        # reported instead of being skipped by a per-item loop.
        obs = list(ko_class_parser(class_lines))
        self.assertEqual(obs, exp)

    def test_parse_ko(self):
        """parse_ko should parse a ko record into fields """
        # For each entry we expect a dict
        results = list(parse_ko(TEST_KO_LINES))

        self.assertEqual(results[0]["ENTRY"], "K01559")
        self.assertEqual(results[1]["ENTRY"], "K01560")
        self.assertEqual(results[2]["ENTRY"], "K01561")

        self.assertEqual(results[0]["NAME"], "E3.7.1.-")
        self.assertEqual(results[1]["NAME"], "E3.8.1.2")
        self.assertEqual(results[2]["NAME"], "E3.8.1.3")

        # The first record has no DEFINITION line in the fixture.
        self.assertEqual(results[0].get("DEFINITION"), None)
        self.assertEqual(results[1]["DEFINITION"],
                         "2-haloacid dehalogenase [EC:3.8.1.2]")
        self.assertEqual(results[2]["DEFINITION"],
                         "haloacetate dehalogenase [EC:3.8.1.3]")

        self.assertEqual(len(results[0]["CLASS"]), 5)
        self.assertEqual(results[0]["CLASS"][4],
                         ('PATH:ko00362',
                          ('Metabolism',
                           'Xenobiotics Biodegradation and Metabolism',
                           'Benzoate degradation via hydroxylation')))

        self.assertEqual(results[0]["DBLINKS"],
                         {'RN': ['R04488', 'R05100', 'R05363',
                                 'R05365', 'R06371', 'R07515',
                                 'R07831']})
        self.assertEqual(results[1]["DBLINKS"],
                         {'GO': ['0018784'],
                          'RN': ['R05287'],
                          'COG': ['COG1011']})
        self.assertEqual(results[2]["DBLINKS"],
                         {'GO': ['0018785'], 'RN': ['R05287']})

        # Parenthetical comments such as "(catD)" are absent from the
        # expected gene IDs.
        self.assertEqual(results[0]["GENES"],
                         {'AFM': ['AFUA_4G13070'],
                          'FNU': ['FN1345'],
                          'GBE': ['GbCGDNIH1_0998', 'GbCGDNIH1_1171'],
                          'PHA': ['PSHAa2393'],
                          'BBA': ['Bd0910'],
                          'ABO': ['ABO_0668'],
                          'MPT': ['Mpe_A2274'],
                          'RCI': ['RCIX1162', 'RCIX2396'],
                          'BXE': ['Bxe_B0037', 'Bxe_C0683',
                                  'Bxe_C1002', 'Bxe_C1023'],
                          'HMA': ['rrnAC1925'],
                          'RBA': ['RB13257']})
        

               
# Fixture shared by the tests above: three KO flat-file records (K01559,
# K01560 and K01561) stored as a list with one element per input line.  Each
# record ends with a '///' line.  The trailing backslash inside each string
# literal continues the literal onto the next source line, so every element
# starts at column 0 of that line and leading spaces there are part of the
# data -- do not re-indent this block.
# It is defined after the test class, which is fine because the tests only
# look the name up when they run.
TEST_KO_LINES = ['ENTRY       K01559                      KO\n', '\
NAME        E3.7.1.-\n', '\
PATHWAY     ko00351  1,1,1-Trichloro-2,2-bis(4-chlorophenyl)ethane (DDT)\n', '\
                            degradation\n', '\
            ko00362  Benzoate degradation via hydroxylation\n', '\
            ko00632  Benzoate degradation via CoA ligation\n', '\
            ko00903  Limonene and pinene degradation\n', '\
            ko00930  Caprolactam degradation\n', '\
CLASS       Metabolism; Biosynthesis of Secondary Metabolites; Limonene and\n', '\
            pinene degradation [PATH:ko00903]\n', '\
            Metabolism; Xenobiotics Biodegradation and Metabolism; Caprolactam\n', '\
            degradation [PATH:ko00930]\n', '\
            Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
            1,1,1-Trichloro-2,2-bis(4-chlorophenyl)ethane (DDT) degradation\n', '\
            [PATH:ko00351]\n', '\
            Metabolism; Xenobiotics Biodegradation and Metabolism; Benzoate\n', '\
            degradation via CoA ligation [PATH:ko00632]\n', '\
            Metabolism; Xenobiotics Biodegradation and Metabolism; Benzoate\n', '\
            degradation via hydroxylation [PATH:ko00362]\n', '\
DBLINKS     RN: R04488 R05100 R05363 R05365 R06371 R07515 R07831\n', '\
GENES       AFM: AFUA_4G13070\n', '\
            PHA: PSHAa2393\n', '\
            ABO: ABO_0668\n', '\
            BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n', '\
            MPT: Mpe_A2274\n', '\
            BBA: Bd0910(catD)\n', '\
            GBE: GbCGDNIH1_0998 GbCGDNIH1_1171\n', '\
            FNU: FN1345\n', '\
            RBA: RB13257\n', '\
            HMA: rrnAC1925(mhpC)\n', '\
            RCI: RCIX1162 RCIX2396\n', '\
///\n', '\
ENTRY       K01560                      KO\n', '\
NAME        E3.8.1.2\n', '\
DEFINITION  2-haloacid dehalogenase [EC:3.8.1.2]\n', '\
PATHWAY     ko00361  gamma-Hexachlorocyclohexane degradation\n', '\
            ko00631  1,2-Dichloroethane degradation\n', '\
CLASS       Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
            gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n', '\
            Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
            1,2-Dichloroethane degradation [PATH:ko00631]\n', '\
DBLINKS     RN: R05287\n', '\
            COG: COG1011\n', '\
            GO: 0018784\n', '\
GENES       NCR: NCU03617\n', '\
            ANI: AN5830.2 AN7918.2\n', '\
            AFM: AFUA_2G07750 AFUA_5G14640 AFUA_8G05870\n', '\
            AOR: AO090001000019 AO090003001435 AO090011000921\n', '\
            PST: PSPTO_0247(dehII)\n', '\
            PSP: PSPPH_1747(dehII1) PSPPH_5028(dehII2)\n', '\
            ATU: Atu0797 Atu3405(hadL)\n', '\
            ATC: AGR_C_1458 AGR_L_2834\n', '\
            RET: RHE_CH00996(ypch00330) RHE_PF00342(ypf00173)\n', '\
            MSE: Msed_0732\n', '\
///\n', '\
ENTRY       K01561                      KO\n', '\
NAME        E3.8.1.3\n', '\
DEFINITION  haloacetate dehalogenase [EC:3.8.1.3]\n', '\
PATHWAY     ko00361  gamma-Hexachlorocyclohexane degradation\n', '\
            ko00631  1,2-Dichloroethane degradation\n', '\
CLASS       Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
            gamma-Hexachlorocyclohexane degradation [PATH:ko00361]\n', '\
            Metabolism; Xenobiotics Biodegradation and Metabolism;\n', '\
            1,2-Dichloroethane degradation [PATH:ko00631]\n', '\
DBLINKS     RN: R05287\n', '\
            GO: 0018785\n', '\
GENES       RSO: RSc0256(dehH)\n', '\
            REH: H16_A0197\n', '\
            BPS: BPSL0329\n', '\
            BPM: BURPS1710b_0537(dehH)\n', '\
            BPD: BURPS668_0347\n', '\
            STO: ST2570\n', '\
            MSE: Msed_1088\n', '\
///\n']


# When executed as a script, call main() as imported from
# cogent.util.unit_test at the top of this file.
if __name__=="__main__":
    main()