File: kegg_ko.py (from python-cogent 1.5.3-2)

#!/usr/bin/env python
"""A parser for the KEGG 'ko' file, which contains information on KEGG
orthology groups and their associated pathways.
"""

from sys import argv
from string import strip
from os import listdir,path 
from optparse import OptionParser
from datetime import datetime
import tarfile

__author__ = "Jesse Zaneveld"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Jesse Zaneveld", "Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Jesse Zaneveld"
__email__ = "zaneveld@gmail.com"
__status__ = "Development"


"""A parser for the KEGG 'ko' file 
containing information on KEGG orthology groups and their associated pathways.
"""

def parse_ko_file(filepath, dir_prefix=None, debug=True):
    """Parse the KEGG KO file and output several tab-delimited files.

    filepath - the full filepath to the input KO file from KEGG
    dir_prefix - the directory to which tab-delimited output files will be saved
    debug - if set to True, print debugging output to the screen
    """
    
    lines = open(filepath,"U")
    ko_gene_fname = 'ko_to_gene.tab'
    ko_fname = 'ko.tab'
    ko_pathway_fname = 'ko_to_pathway.tab'
    pathway_fname = 'pathway.tab'
    ko_cog_fname = 'ko_to_cog.tab'
    ko_cazy_fname = 'ko_to_cazy.tab'
    ko_go_fname = 'ko_to_go.tab'
    fnames = [ko_gene_fname, ko_fname, ko_pathway_fname,\
              pathway_fname, ko_cog_fname, ko_cazy_fname,\
              ko_go_fname]
    
    if dir_prefix:
        fnames = [dir_prefix + '/' + f for f in fnames]
    
    if debug:
        for res_fp in fnames:
            print "Outputting parsed info to: %s" %(res_fp) 
    
    ko_gene, ko, ko_pathway, pathway, ko_cog, ko_cazy, ko_go = \
        [open(i, 'w') for i in fnames]

    #figure out what fields we want (and get them), and get pathway data
    fields = ['ENTRY', 'NAME', 'DEFINITION']
    ko_to_pathway = {}
    for rec in parse_ko(lines):
        ko.write('\t'.join([rec.get(f,'') for f in fields]))
        ko.write('\n')
        entry = rec['ENTRY']
        if 'GENES' not in rec:
            continue    #apparently, some records don't have genes...
        genes = rec['GENES']

        for species, gene_list in genes.items():
            for g in gene_list:
                ko_gene.write('%s\t%s:%s\n' % (entry, species.lower(), g))
        
        if 'CLASS' not in rec:
            continue    #apparently they also lack classes...
        
        ko_to_pathway[entry] = rec['CLASS']

        # Reset the cross-reference lists for each record so that values
        # from a previous record cannot leak through when DBLINKS is absent.
        cogs = cazy = go = None

        dblinks = rec.get('DBLINKS', None)
        if dblinks:
            cogs = dblinks.get('COG', None)
            cazy = dblinks.get('CAZy', None)
            go   = dblinks.get('GO', None)

        if cogs:
            for c in cogs:
                ko_cog.write("%s\t%s\n" % (entry, c))
        if go:
            for g in go:
                ko_go.write("%s\t%s\n" % (entry, g))
        if cazy:
            for c in cazy:
                ko_cazy.write("%s\t%s\n" % (entry, c))
    #postprocess the ko_to_pathway data to find out what the pathway terms
    #are and to write them out into a join file
    max_terms = 10
    unique_recs = {}    #will hold tuple(fields) -> unique_id
    curr_uid = 0
    #use ko_id for the loop variable so the ko output file handle isn't shadowed
    for ko_id, classes in ko_to_pathway.items():
        for (id_, fields) in classes:
            if fields not in unique_recs:
                unique_recs[fields] = curr_uid
                fields_for_output = fields[:]
                if len(fields_for_output) > max_terms:
                    fields_for_output = fields_for_output[:max_terms]
                elif len(fields_for_output) < max_terms:
                    fields_for_output += \
                        ('',)*(max_terms - len(fields_for_output))
                pathway.write('\t'.join((str(curr_uid), str(id_)) +\
                    fields_for_output) + '\n')
                curr_uid += 1
            uid = unique_recs[fields]
            ko_pathway.write(str(ko_id) + '\t' + str(uid) + '\n')


def make_tab_delimited_line_parser(columns_to_convert):
    """Generates a function that parses a tab-delimited line
    
    columns_to_convert:  a list of column indexes whose values should have
    their database prefix stripped by splitting on ':' and taking the second
    entry (e.g. to convert listings like GO:0008150 to 0008150 or
    ncbi-gi:14589889 to 14589889)"""

    def parse_tab_delimited_line(line):
        """Parse a tab-delimited line, keeping only the second ':'-delimited
        item in each of the columns listed in columns_to_convert."""
        fields = line.split("\t")
        for index in columns_to_convert:
            fields[index] = fields[index].split(":")[1]
        return "\t".join(fields)

    return parse_tab_delimited_line
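
# Illustrative usage sketch (not part of the original module); the column
# index and line below are made up, but show the intended prefix-stripping:
#
#   >>> parse = make_tab_delimited_line_parser([1])
#   >>> parse("K01559\tGO:0008150\tsome description")
#   'K01559\t0008150\tsome description'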

def ko_default_parser(lines):
    """Handles default KEGG KO entry lines.

    lines -- default format of space-separated lines. Examples include
             the NAME and DEFINITION entries.

    Strips out newlines and joins lines together, dropping the field label."""

    return ' '.join(map(strip, lines)).split(None, 1)[1]
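
# Hedged example (hypothetical lines, not copied from a real KO record):
# a wrapped DEFINITION entry is joined and the field label dropped:
#
#   ko_default_parser(['DEFINITION  alcohol dehydrogenase\n',
#                      '            [EC:1.1.1.1]\n'])
#   == 'alcohol dehydrogenase [EC:1.1.1.1]'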

def ko_first_field_parser(lines):
    """Handles KEGG KO entries where only the first field is of interest

    For example, ENTRY fields like:
    'ENTRY       K01559             KO\n'
    
    Strips out newlines and joins lines together for the first field only."""
    return ' '.join(map(strip, lines)).split()[1]
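
# Hedged example, using the ENTRY line from the docstring above:
#
#   ko_first_field_parser(['ENTRY       K01559             KO\n'])  == 'K01559'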

def delete_comments(line):
    """Deletes comments in parentheses from a line."""
    fields = line.split(')')
    result = []
    for f in fields:
        if '(' in f:
            result.append(f.split('(',1)[0])
        else:
            result.append(f)
    return ''.join(result)
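
# Hedged example (made-up line): parenthesised text is removed, while the
# whitespace around it is left exactly as it was (note the doubled space):
#
#   delete_comments('BXE: Bxe_B0037 (fragment) Bxe_C0683')
#   == 'BXE: Bxe_B0037  Bxe_C0683'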

def ko_colon_fields(lines, without_comments=True):
    """Converts line to (key, [list of values])
    
    lines -- colon fields such as DBLINKS or GENES 
    in the KEGG KO file.

    Example:
    '            BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n'
    """
    merged = ' '.join(map(strip, lines))
    if without_comments:
        merged = delete_comments(merged)
    key, remainder = merged.split(':',1)
    vals = remainder.split()
    return key, vals
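
# Hedged example, reusing the GENES-style line from the docstring above:
#
#   ko_colon_fields(['            BXE: Bxe_B0037 Bxe_C0683 Bxe_C1002 Bxe_C1023\n'])
#   == ('BXE', ['Bxe_B0037', 'Bxe_C0683', 'Bxe_C1002', 'Bxe_C1023'])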

def ko_colon_delimited_parser(lines, without_comments=True):
    """For lines of the form LABEL: id: values.

    Returns dict of id:values.
    """
    first_line = lines[0]
    without_first_field = first_line.split(None, 1)[1]
    data_start = len(first_line) - len(without_first_field)
    result = {}
    curr = []
    for line in lines:
        line = line[data_start:]
        if line[0] != ' ':  #start of new block
            if curr:
                key, vals = ko_colon_fields(curr, without_comments)
                result[key] = vals
                curr = []
        curr.append(line)
    if curr:
        key, vals = ko_colon_fields(curr, without_comments)
        result[key] = vals
    return result
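
# Hedged sketch of a two-species GENES block (lines are made up but follow
# the KEGG layout: a 12-character label column, then 'SPECIES: gene ...'):
#
#   ko_colon_delimited_parser(['GENES       BXE: Bxe_B0037 Bxe_C0683\n',
#                              '            BPS: BPSS1545\n'])
#   == {'BXE': ['Bxe_B0037', 'Bxe_C0683'], 'BPS': ['BPSS1545']}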

def _is_new_kegg_rec_group(prev, curr):
    """Check for irregular record group terminators"""
    # Note: 'Bindng' (alongside 'Binding') is presumably intentional,
    # matching a misspelling that occurs in the source data.
    return (curr[0].isupper()
        and not prev.endswith(';')
        and not curr.startswith('CoA biosynthesis')
        and not prev.endswith(' and')
        and not prev.endswith('-')
        and not prev.endswith(' in')
        and not prev.endswith(' type')
        and not prev.endswith('Bindng')
        and not prev.endswith('Binding'))

def group_by_end_char(lines, end_char = ']', \
    is_new_rec=_is_new_kegg_rec_group):
    """Yields successive groups of lines that end with the specified char.
    
    Note: also returns the last group of lines whether or not the end char
    is present.
    """
    curr_lines = []
    prev_line = ''
    for line in lines:
        stripped = line.strip()
        #unfortunately, not all records in kegg actually end with the
        #terminator, so need to check for termination condition
        if is_new_rec(prev_line, stripped):
            if curr_lines:
                yield curr_lines
            curr_lines = []
        #if the line ends with the character we're looking for, assume we've
        #found a new record
        if stripped.endswith(end_char):
            yield curr_lines + [line]
            curr_lines = []
        else:
            curr_lines.append(line)
        prev_line = stripped
    if curr_lines:
        yield curr_lines
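
# Hedged example with made-up CLASS-style lines: the first pathway fits on
# one line and ends with ']'; the second wraps onto a second line, so the
# generator yields two groups:
#
#   lines = ['  Metabolism; Carbohydrate Metabolism; Glycolysis [PATH:ko00010]\n',
#            '  Metabolism; Energy Metabolism; Methane metabolism\n',
#            '  [PATH:ko00680]\n']
#   list(group_by_end_char(lines))  == [[lines[0]], [lines[1], lines[2]]]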

def class_lines_to_fields(lines):
    """Converts a list of lines in a single pathway within one KO class definition.
    """
    rec = ' '.join(map(strip, lines))
    #need to split off the class declaration if it is present
    if rec.startswith('CLASS'):
        rec = rec.split(None,1)[1]
    #figure out if it has an id and process accordingly
    if rec.endswith(']'):
        rec, class_id = rec.rsplit('[', 1)
        class_id = class_id[:-1]
    else:
        class_id = None
    rec_fields = map(strip, rec.split(';'))
    return class_id, tuple(rec_fields)
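
# Hedged example (one made-up pathway from a CLASS block):
#
#   line = 'CLASS       Metabolism; Carbohydrate Metabolism; Glycolysis [PATH:ko00010]\n'
#   class_lines_to_fields([line])
#   == ('PATH:ko00010', ('Metabolism', 'Carbohydrate Metabolism', 'Glycolysis'))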

def ko_class_parser(lines, without_comments='ignored'):
    """For the CLASS declaration lines.

    These take the form of multi-line semicolon-delimited fields (where
    each field is a successive entry in the KEGG pathway hierarchy), ending
    in a field of the form [PATH:ko00071].

    Strategy:
    - iterate over groups of lines that end in ] (each represents one pathway)
    - for each line:
        - split off and extract the pathway id
        - split the rest of the terms on semicolon
        - return a tuple of (pathway_id, [terms_in_order])
    
    Don't consolidate the terms in this parser because each KO group has
    its own class declaration so we would have to merge them for each class:
    instead, merge at higher level.
    """
    for group in group_by_end_char(lines):
        yield class_lines_to_fields(group)

def parse_ko(lines):
    """Parses a KO record into fields."""
    
    # Group the field labels by category so that the same parser
    # can be reused on similar entries.
    
    default_fields = ['NAME', 'DEFINITION']
    colon_fields = ['DBLINKS', 'GENES']
    first_field_only = ['ENTRY']
    class_fields = ['CLASS']
    
    for rec in ko_record_iterator(lines):
        split_fields = ko_record_splitter(rec)
        result = {}
        for k, v in split_fields.items():
            if k in default_fields:
                result[k] = ko_default_parser(v)
            elif k in colon_fields:
                result[k] = ko_colon_delimited_parser(v)
            elif k in first_field_only:
                result[k] = ko_first_field_parser(v)
            elif k in class_fields:
                result[k] = list(ko_class_parser(v))
        yield result
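
# A hedged sketch of what one yielded record looks like (field values are
# illustrative, not copied from KEGG): ENTRY, NAME and DEFINITION are plain
# strings, GENES and DBLINKS become dicts, CLASS becomes a list of
# (pathway_id, fields) tuples, and any other field labels are ignored:
#
#   {'ENTRY': 'K01559',
#    'NAME': '...',
#    'DEFINITION': '...',
#    'GENES': {'BXE': ['Bxe_B0037'], 'BPS': ['BPSS1545']},
#    'CLASS': [('PATH:ko00010',
#               ('Metabolism', 'Carbohydrate Metabolism', 'Glycolysis'))]}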

# Supporting functions for parse_ko: a lightweight, standalone KO record parser
def ko_record_iterator(lines):
    """Iterates over KO records, delimited by '///'"""
    curr = []
    for line in lines:
        if line.startswith('///') and curr:
            yield curr
            curr = []
        else:
            curr.append(line)
    if curr:
        yield curr
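
# Hedged example: records are separated by '///' lines, so two minimal
# made-up records come back as two lists of lines:
#
#   list(ko_record_iterator(['ENTRY       K00001\n', '///\n',
#                            'ENTRY       K00002\n']))
#   == [['ENTRY       K00001\n'], ['ENTRY       K00002\n']]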

def ko_record_splitter(lines):
    """Splits KO lines into dict of groups keyed by type."""
    result = {}
    curr_label = None
    curr = []
    for line in lines:
        if line[0] != ' ':
            if curr_label is not None:
                result[curr_label] = curr
            fields = line.split(None, 1)
            
            # Annoyingly, can have blank REFERENCE lines
            # Lacking PMID, these still have auth/title info, however...
            if len(fields) == 1:
                curr_label = fields[0]
                curr_line = ''
            else:    
                curr_label, curr_line = fields
            curr = [line]
        else:
            curr.append(line)
    if curr:
        result[curr_label] = curr
    return result
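
# Hedged example (made-up record lines): unindented lines open a new group
# keyed by their first word; indented continuation lines join the current
# group:
#
#   ko_record_splitter(['ENTRY       K01559             KO\n',
#                       'GENES       BXE: Bxe_B0037\n',
#                       '            BPS: BPSS1545\n'])
#   == {'ENTRY': ['ENTRY       K01559             KO\n'],
#       'GENES': ['GENES       BXE: Bxe_B0037\n',
#                 '            BPS: BPSS1545\n']}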

if __name__ == '__main__':
    from sys import argv
    filename = argv[1]
    out_dir = argv[2]
    parse_ko_file(filename, \
      dir_prefix = out_dir, \
      debug = True)