File: unigene.py

package info (click to toggle)
python-cogent 1.5.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 16,424 kB
  • ctags: 24,343
  • sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (112 lines) | stat: -rw-r--r-- 3,964 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
"""Parsers for the various files in the UniGene database.
"""
from cogent.parse.record import MappedRecord, ByPairs, semi_splitter, \
    equal_pairs, LineOrientedConstructor, list_adder, int_setter
from cogent.parse.record_finder import GbFinder
from string import maketrans, strip

__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Development"

def _read_sts(line):
    """Builds a MappedRecord from the body of an STS line (label removed).

    Infuriatingly, STS lines are not semicolon-delimited, and spaces appear
    in places they shouldn't, so every '=' is replaced by a space and the
    whitespace-separated tokens are handed to ByPairs to build the record.
    This was the case as of 10/9/03: expect this 'feature' to be unstable!
    """
    tokens = line.replace('=', ' ').split()
    pairs = list(ByPairs(tokens))
    return MappedRecord(pairs)

def _read_expression(line):
    """Returns the result of semi_splitter on a semicolon-delimited line."""
    expressions = semi_splitter(line)
    return expressions

class UniGeneSeqRecord(MappedRecord):
    """Record type returned by _read_seq for one sequence line."""
    # Raw field labels paired with the descriptive names used for them.
    Aliases = {
        'ACC': 'Accession',
        'CLONE': 'CloneId',
        'END': 'End',
        'LID': 'LibraryId',
        'SEQTYPE': 'SequenceType',
        'TRACE': 'Trace',
        'EST': 'EstId',
        'NID': 'NucleotideId',
        'PID': 'ProteinId',
    }

class UniGeneProtSimRecord(MappedRecord):
    """Record type returned by _read_protsim for one PROTSIM line."""
    # Raw field labels paired with the descriptive names used for them.
    # PROTSIM lines spell the protein-id label in upper case (PROTID), like
    # every other label here, so the upper-case key is the one that matches
    # real data. The historical mixed-case 'ProtId' key is retained so that
    # any caller relying on it keeps working.
    # NOTE(review): assumes MappedRecord resolves aliases on lookup, so
    # record['PROTID'] still works after aliasing -- confirm in
    # cogent.parse.record.
    Aliases = {
        'ORG': 'Species',
        'PROTGI': 'ProteinGi',
        'PROTID': 'ProteinId',
        'ProtId': 'ProteinId',
        'PCT': 'PercentSimilarity',
        'ALN': 'AlignmentScore',
    }

def _read_seq(line):
    """Builds a UniGeneSeqRecord from a sequence line.

    The line is split first with semi_splitter and each resulting piece is
    then passed through equal_pairs.

    BEWARE: first level delimiter is ';' and second level delimiter is '=', but
    '=' can also appear inside the _value_ of the second level!
    """
    key_value_pairs = [equal_pairs(item) for item in semi_splitter(line)]
    return UniGeneSeqRecord(key_value_pairs)

def _read_protsim(line):
    """Builds a UniGeneProtSimRecord from a protsim line.

    The line is split first with semi_splitter and each resulting piece is
    then passed through equal_pairs.

    BEWARE: first level delimiter is ';' and second level delimiter is '=', but
    '=' can also appear inside the _value_ of the second level!
    """
    key_value_pairs = [equal_pairs(item) for item in semi_splitter(line)]
    return UniGeneProtSimRecord(key_value_pairs)

class UniGene(MappedRecord):
    """Holds data for a UniGene record."""
    # Multi-valued fields, each paired with an empty list.
    # NOTE(review): presumably MappedRecord uses these as per-record
    # defaults -- confirm in cogent.parse.record.
    Required = {'STS': [], 'PROTSIM': [], 'SEQUENCE': [], 'EXPRESS': []}
    # Raw field labels paired with the descriptive names used for them.
    # The cytoband label is spelled CYTOBAND in UniGene data; the original
    # table only had the transposed spelling 'CTYOBAND', so the alias never
    # applied. The misspelled key is retained for backward compatibility.
    # NOTE(review): assumes MappedRecord resolves aliases on lookup, so
    # record['CYTOBAND'] still works after aliasing -- confirm in
    # cogent.parse.record.
    Aliases = {
        'STS': 'Sts',
        'PROTSIM': 'ProteinSimilarities',
        'SEQUENCE': 'SequenceIds',
        'SCOUNT': 'SequenceCount',
        'CYTOBAND': 'CytoBand',
        'CTYOBAND': 'CytoBand',
        'EXPRESS': 'ExpressedIn',
        'CHROMOSOME': 'Chromosome',
        'ID': 'UniGeneId',
        'TITLE': 'UniGeneTitle',
        'LOCUSLINK': 'LocusLinkId',
    }

def _expressions_setter(obj, field, val):
    """Stores semi_splitter(val) on obj under the attribute named by field."""
    expressions = semi_splitter(val)
    setattr(obj, field, expressions)

def _sts_adder(obj, field, val):
    """Parses val with _read_sts and adds the record to field via list_adder"""
    sts_record = _read_sts(val)
    list_adder(obj, field, sts_record)

def _seq_adder(obj, field, val):
    """Parses val with _read_seq and adds the record to field via list_adder"""
    seq_record = _read_seq(val)
    list_adder(obj, field, seq_record)

def _protsim_adder(obj, field, val):
    """Parses val with _read_protsim and adds the record via list_adder"""
    protsim_record = _read_protsim(val)
    list_adder(obj, field, protsim_record)
 
# Constructor object that UniGeneParser calls on each record's lines.
LinesToUniGene = LineOrientedConstructor()
# Field labels that need special handling, keyed to the handler to call.
LinesToUniGene.FieldMap = {
    # stored through int_setter
    'LOCUSLINK': int_setter,
    'SCOUNT': int_setter,
    # stored as a list of expressions
    'EXPRESS': _expressions_setter,
    # parsed into a sub-record and appended to a list
    'PROTSIM': _protsim_adder,
    'SEQUENCE': _seq_adder,
    'STS': _sts_adder,
}
LinesToUniGene.Constructor = UniGene

def UniGeneParser(lines):
    """Yields one UniGene record per group of lines found by GbFinder.

    Each group is converted with LinesToUniGene, and the '//' entry is
    removed from the result before it is yielded.
    """
    for record_lines in GbFinder(lines):
        unigene = LinesToUniGene(record_lines)
        # clean up delimiter
        del unigene['//']
        yield unigene

if __name__ == '__main__':
    # Script mode: parse the UniGene file named by the first command-line
    # argument, writing one dot per record and the record count at the end.
    from sys import argv, stdout
    filename = argv[1]
    count = 0
    # The with-block closes the input file even if parsing raises; the
    # original left the handle open.
    with open(filename) as infile:
        for record in UniGeneParser(infile):
            stdout.write('.')
            stdout.flush()
            count += 1
    # A single parenthesised argument prints identically under the Python 2
    # print statement.
    print("read %s records" % count)