File: kegg_fasta.py

package info (click to toggle)
python-cogent 1.5.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 16,424 kB
  • ctags: 24,343
  • sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (49 lines) | stat: -rw-r--r-- 1,403 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python

from string import strip
from cogent.parse.fasta import MinimalFastaParser

# Package metadata: plain module-level string/list constants.
__author__ = "Jesse Zaneveld"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Jesse Zaneveld", "Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Jesse Zaneveld"
__email__ = "zaneveld@gmail.com"
__status__ = "Production"

# NOTE: the string below is a bare expression statement, not the module
# docstring -- it comes after the imports and assignments, so it does not
# populate __doc__. It is kept as a human-readable description only.
"""
Parser for KEGG fasta files 

This code is useful for parsing the KEGG .nuc or .pep files
"""

def parse_fasta(lines):
    """Generate one tab-delimited text record per KEGG FASTA sequence.

    lines is handed unchanged to MinimalFastaParser, which supplies
    (label, sequence) pairs.  Each label is broken up by
    kegg_label_fields and the record is built from those five fields,
    then the sequence, then a final "\\n" element, all joined with tabs.
    Because the newline is joined as its own element, every yielded
    string ends in a tab followed by a newline.
    """
    for label, seq in MinimalFastaParser(lines):
        columns = list(kegg_label_fields(label))
        columns.append(seq)
        # The newline is a joined column, not a suffix, so it is preceded
        # by a tab in the output.
        columns.append("\n")
        yield '\t'.join(columns)

def kegg_label_fields(line):
    """Splits line into KEGG label fields.

    Format is species:gene_id [optional gene_name]; description.

    Arguments:
        line -- label text.  The first whitespace-delimited token is the
        identifier and must contain a ':'; anything after it is treated
        as the free-text remainder.

    Returns the tuple (id_, species, gene_id, gene_name, description).
    gene_name is '' when the remainder contains no ';', and both
    gene_name and description are '' when there is no remainder.  Only
    the first ';' separates gene_name from description, so later
    semicolons stay in the description.

    Raises IndexError if line is empty or all whitespace, and ValueError
    if the identifier contains no ':'.
    """
    # str.strip is called as a method inside list comprehensions instead of
    # mapping string.strip: the results are identical, but this does not
    # rely on the deprecated string-module function or on map() returning
    # an indexable list.
    fields = [field.strip() for field in line.split(None, 1)]
    id_ = fields[0]
    species, gene_id = [part.strip() for part in id_.split(':', 1)]
    # Defaults for labels that carry nothing beyond the identifier.
    gene_name = description = ''
    if len(fields) > 1:
        description = fields[1]
        if ';' in description:
            gene_name, description = [
                part.strip() for part in description.split(';', 1)]
    return id_, species, gene_id, gene_name, description



if __name__ == '__main__':
    # Command-line use: print one record per sequence found in the KEGG
    # FASTA file named by the first argument.
    from sys import argv
    filename = argv[1]
    # The context manager closes the input file once iteration finishes or
    # if parsing raises part-way through.
    with open(filename) as fasta_file:
        for result_line in parse_fasta(fasta_file):
            # strip() drops the trailing tab/newline that parse_fasta
            # appends.  The parenthesised single-argument form prints the
            # same text whether print is a statement or a function.
            print(result_line.strip())