File: macsim.py

package info (click to toggle)
python-cogent 1.4.1-1.2
  • links: PTS, VCS
  • area: non-free
  • in suites: squeeze
  • size: 13,260 kB
  • ctags: 20,087
  • sloc: python: 116,163; ansic: 732; makefile: 74; sh: 9
file content (41 lines) | stat: -rw-r--r-- 1,326 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python

from cogent.core import annotation, moltype

# Module-level metadata: authorship, credits, licence, version and
# maintainer contact details.
__author__ = "Peter Maxwell"
__copyright__ = "Copyright 2007-2009, The Cogent Project"
__credits__ = ["Raymond Sammut", "Peter Maxwell", "Gavin Huttley",
                    "Rob Knight"]
__license__ = "GPL"
__version__ = "1.4.1"
__maintainer__ = "Peter Maxwell"
__email__ = "pm67nz@gmail.com"
__status__ = "Production"

#<?xml version="1.0"?>
#<!DOCTYPE macsim SYSTEM "http://www-bio3d-igbmc.u-strasbg.fr/macsim.dtd">

# As used by BAliBASE

def _element_text(parent, tag):
    """Return the character data of the first ``tag`` element under ``parent``.

    Every text and CDATA child of the element is concatenated, so content
    that the DOM holds as several nodes (a CDATA section, text interrupted
    by a comment, a document that was never normalised) is returned in full
    rather than cut off after the first node.  Comments and any other
    non-text children are skipped.  An element without character data gives
    an empty string.

    Raises IndexError if ``parent`` has no descendant element named ``tag``.
    """
    element = parent.getElementsByTagName(tag)[0]
    return ''.join([child.nodeValue for child in element.childNodes
                    if child.nodeType in (child.TEXT_NODE,
                                          child.CDATA_SECTION_NODE)])


def MacsimParser(doc):
    """Yield ``(name, sequence)`` pairs from a parsed MACSIM XML document.

    doc: a DOM node supporting ``getElementsByTagName`` (the API offered by,
        for instance, xml.dom.minidom documents).  Only the first <macsim>
        element, and the first <alignment> inside it, are read.

    For each <sequence> record of that alignment:
      - the name is the text of its <seq-name>, stripped of surrounding
        whitespace;
      - the sequence is the text of its <seq-data>, upper-cased and with all
        whitespace removed;
      - the molecular type is moltype.PROTEIN when the record's ``seq-type``
        attribute equals 'protein' (case-insensitively), and moltype.DNA for
        any other value, including a missing attribute.

    This is a generator: nothing is read until it is iterated.

    Raises IndexError (during iteration) if the <macsim> or <alignment>
    element, or a record's <seq-name> or <seq-data>, is absent.
    """
    macsim = doc.getElementsByTagName('macsim')[0]
    alignment = macsim.getElementsByTagName('alignment')[0]
    for record in alignment.getElementsByTagName('sequence'):
        raw_name = _element_text(record, 'seq-name')
        raw_seq = _element_text(record, 'seq-data')

        # DOM text may be unicode; cast to str as the original code did.
        # split()/join drops the line breaks and indentation embedded in
        # the sequence data.
        residues = ''.join(str(raw_seq).upper().split())
        name = str(raw_name).strip()

        seq_type = str(record.getAttribute('seq-type')).lower()
        if seq_type == 'protein':
            mol_type = moltype.PROTEIN
        else:
            mol_type = moltype.DNA

        yield (name, mol_type.makeSequence(residues, Name=name))