File: test_ncbi.py

package info (click to toggle)
python-cogent 1.5.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 16,424 kB
  • ctags: 24,343
  • sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (247 lines) | stat: -rw-r--r-- 11,120 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#!/usr/bin/env python
"""Tests of data retrieval from NCBI."""
from cogent.util.unit_test import TestCase, main
from cogent.db.ncbi import EUtils, ESearch, EFetch, ELink, ESearchResultParser,\
    ELinkResultParser, get_primary_ids, ids_to_taxon_ids, \
    taxon_lineage_extractor, taxon_ids_to_lineages, taxon_ids_to_names, \
    taxon_ids_to_names_and_lineages, \
    get_unique_lineages, get_unique_taxa, parse_taxonomy_using_elementtree_xml_parse
from string import strip
from StringIO import StringIO

__author__ = "Mike Robeson"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Mike Robeson", "Rob Knight", "Zongzhi Liu"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Mike Robeson"
__email__ = "mike.robeson@colorado.edu"
__status__ = "Production"

class EUtilsTests(TestCase):
    """Tests of the EUtils class."""
    def test_simple_get(self):
        """EUtils simple access of an item should work"""
        g = EUtils(db='protein',rettype='gp')
        result = g['NP_003320'].read()
        assert result.startswith('LOCUS')
        assert 'NP_003320' in result

    def test_get_slice(self):
        """EUtils access of a slice should work"""
        g = EUtils(db='protein',rettype='gp', retmax=1)
        result = g['NP_003320':'NP_003322'].read()
        lines = result.splitlines()
        is_locus = lambda x: x.startswith('LOCUS')
        loci = filter(is_locus, lines)
        self.assertEqual(len(loci), 3)

        #EUtils access of a slice should work, while limiting 
        #the esearch term length
        g = EUtils(db='protein',rettype='gp', retmax=1, url_limit=2)
        result = g['NP_003320':'NP_003322'].read()
        lines = result.splitlines()
        is_locus = lambda x: x.startswith('LOCUS')
        loci = filter(is_locus, lines)
        self.assertEqual(len(loci), 3)


    def test_get_list(self):
        """EUtils access of a list should work"""
        g = EUtils(db='protein',rettype='gp')
        result = g['NP_003320','NP_003321','NP_003322'].read()
        lines = result.splitlines()
        is_locus = lambda x: x.startswith('LOCUS')
        loci = filter(is_locus, lines)
        self.assertEqual(len(loci), 3)

        #EUtils access of a slice should work, while limiting
        #the esearch term length
        g = EUtils(db='protein',rettype='gp',url_limit=2)
        result = g['NP_003320','NP_003321','NP_003322'].read()
        lines = result.splitlines()
        is_locus = lambda x: x.startswith('LOCUS')
        loci = filter(is_locus, lines)
        self.assertEqual(len(loci), 3)


#    def test_get_from_taxonomy_db(self):
#        """EUtils access from taxonomy database should work"""
#        #note: this is more fragile than the nucleotide databases
#        g = EUtils(db='taxonomy', rettype='Brief', retmode='text')
#        ids = '9606[taxid] OR 28901[taxid]'
#        result = sorted(g[ids].read().splitlines())
#        self.assertEqual(result, ['Homo sapiens', 'Salmonella enterica'])

    def test_get_from_taxonomy_db(self):
        """EUtils access from taxonomy database should work"""
        #note: this is more fragile than the nucleotide databases
        g = EUtils(db='taxonomy', rettype='xml', retmode='xml')
        ids = '9606[taxid] OR 28901[taxid]'
        fh = StringIO()
        fh.write(g[ids].read())
        fh.seek(0)
        data = parse_taxonomy_using_elementtree_xml_parse(fh)
        result = sorted([item['ScientificName'] for item in data])
        self.assertEqual(result, ['Homo sapiens', 'Salmonella enterica'])

        
    def test_query(self):
        """EUtils access via a query should work"""
        g = EUtils(db='protein', rettype='gi', retmax=100)
        result = g['homo[organism] AND erf1[ti]'].read().splitlines()
        assert '5499721' in result  #gi of human eRF1
        #note: successfully retrieved 841,821 ids on a query for 'rrna',
        #although it took about 40 min so not in the tests. RK 1/3/07.

    def test_query_retmax(self):
        """EUtils should join results taken retmax at a time"""
        g = EUtils(db='protein', rettype='gi', retmax=3, DEBUG=False)
        result = g['homo[organism] AND myh7'].read().splitlines()
        assert len(result) > 1
        assert '83304912' in result  #gi of human myh7

    def test_query_max_recs(self):
        """EUtils should stop query at max_recs when max_recs < retmax"""
        g = EUtils(db='protein', rettype='gi', max_recs=5, DEBUG=False,
            retmax=100)
        result = g['homo[organism] AND myh7'].read().splitlines()
        self.assertEqual(len(result), 5)

    def test_query_max_recs_gt_retmax(self):
        """EUtils should stop query at max_recs when max_recs > retmax"""
        g = EUtils(db='protein', rettype='gi', max_recs=5, DEBUG=False,
            retmax=3)
        result = g['homo[organism] AND myh7'].read().splitlines()
        self.assertEqual(len(result), 5)


class ESearchTests(TestCase):
    """Tests of the ESearch class: gets primary ids from search."""
    def test_simple_search(self):
        """ESearch Access via a query should return accessions"""
        s = ESearch(db='protein', rettype='gi', retmax=1000,
            term='homo[organism] AND myh7')
        result = s.read()
        parsed = ESearchResultParser(result)
        assert '83304912' in parsed.IdList  #gi of human cardiac beta myh7

class ELinkTests(TestCase):
    """Tests of the ELink class: converts ids between databases"""
    def test_simple_elink(self):
        """ELink should retrieve a link from a single id"""
        l = ELink(db='taxonomy', dbfrom='protein', id='83304912')
        result = l.read()
        parsed = ELinkResultParser(result)
        self.assertEqual(parsed, ['9606'])  #human sequence

    def test_multiple_elink(self):
        """ELink should find unique links in a set of ids"""
        l = ELink(db='taxonomy', dbfrom='protein', 
            id='83304912 115496169 119586556 111309484')
        result = l.read()
        parsed = ELinkResultParser(result)
        self.assertEqual(sorted(parsed), ['10090', '9606'])  
        #human and mouse sequences

class EFetchTests(TestCase):
    """Tests of the EFetch class: gets records using primary ids."""
    def test_simple_efetch(self):
        """EFetch should return records from list of ids"""
        f = EFetch(db='protein', rettype='fasta', retmode='text', 
            id='111309484')
        result = f.read().splitlines()
        assert result[0].startswith('>')
        assert result[1].startswith('madaemaafg'.upper())

class NcbiTests(TestCase):
    """Tests of top-level convenience wrappers."""
    def setUp(self):
        """Define some lengthy data."""
        self.mouse_taxonomy = map(strip, 'cellular organisms; Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; Coelomata; Deuterostomia; Chordata; Craniata; Vertebrata; Gnathostomata; Teleostomi; Euteleostomi; Sarcopterygii; Tetrapoda; Amniota; Mammalia; Theria; Eutheria; Euarchontoglires; Glires; Rodentia; Sciurognathi; Muroidea; Muridae; Murinae; Mus; Mus'.split(';'))
        self.human_taxonomy = map(strip, 'cellular organisms; Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; Coelomata; Deuterostomia; Chordata; Craniata; Vertebrata; Gnathostomata; Teleostomi; Euteleostomi; Sarcopterygii; Tetrapoda; Amniota; Mammalia; Theria; Eutheria; Euarchontoglires; Primates; Haplorrhini; Simiiformes; Catarrhini; Hominoidea; Hominidae; Homininae; Homo'.split(';'))

    def test_get_primary_ids(self):
        """get_primary_ids should get primary ids from query"""
        res = get_primary_ids('homo[orgn] AND myh7[ti]', retmax=5, max_recs=7)
        self.assertEqual(len(res), 7)
        res = get_primary_ids('homo[orgn] AND myh7[ti]', retmax=5, max_recs=2)
        self.assertEqual(len(res), 2)
        res = get_primary_ids('homo[orgn] AND myh7[ti]', retmax=100)
        assert '115496168' in res

    def test_ids_to_taxon_ids(self):
        """ids_to_taxonomy should get taxon ids from primary ids"""
        ids = ['83304912', '115496169', '119586556', '111309484']
        result = ids_to_taxon_ids(ids, db='protein')
        self.assertEqual(sorted(result), ['10090', '9606'])

    def test_taxon_lineage_extractor(self):
        """taxon_lineage_extractor should find lineage lines"""
        lines = """ignore
        <Lineage>xxx;yyy</Lineage>
        ignore

        <Lineage>aaa;bbb</Lineage>
        """
        self.assertEqual(list(taxon_lineage_extractor(lines.splitlines())),
            [['xxx','yyy'],['aaa','bbb']])

    def test_parse_taxonomy_using_elementtree_xml_parse(self):
        """parse_taxonomy_using_elementtree_xml_parse should return taxonomy associated information"""
        g = EUtils(db='taxonomy', rettype='xml', retmode='xml')
        ids = '28901[taxid]'
        fh = StringIO()
        fh.write(g[ids].read())
        fh.seek(0)
        data = parse_taxonomy_using_elementtree_xml_parse(fh)[0]
        obs = (data['Lineage'],data['TaxId'],data['ScientificName'],\
               data['Rank'])
        exp = ('cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; Enterobacteriaceae; Salmonella',\
              '28901','Salmonella enterica','species')
        self.assertEqual(obs,exp)

    
    def test_taxon_ids_to_lineages(self):
        """taxon_ids_to_lineages should return lineages from taxon ids"""
        taxon_ids = ['10090', '9606']
        result = [self.mouse_taxonomy, self.human_taxonomy]
        self.assertEqualItems(list(taxon_ids_to_lineages(taxon_ids)), result)

    
#    def test_taxon_ids_to_names(self):
#        """taxon_ids_to_names should return names from taxon ids"""
#        taxon_ids = ['10090', '9606']
#        result = set(['Mus musculus', 'Homo sapiens'])
#        self.assertEqual(set(taxon_ids_to_names(taxon_ids)), result)

    def test_taxon_ids_to_names(self):
        """taxon_ids_to_names should return names from taxon ids"""
        taxon_ids = ['10090', '9606']
        result = set(['Mus musculus', 'Homo sapiens'])
        self.assertEqual(set(taxon_ids_to_names(taxon_ids)), result)

    def test_taxon_ids_to_names_and_lineages(self):
        """taxon_ids_to_names should return names/lineages from taxon ids"""
        taxon_ids = ['10090', '9606']
        exp = [('10090', 'Mus musculus', '; '.join(self.mouse_taxonomy)),
                  ('9606', 'Homo sapiens', '; '.join(self.human_taxonomy))]
        obs = list(taxon_ids_to_names_and_lineages(taxon_ids))
        self.assertEqualItems(obs, exp)

    def test_get_unique_lineages(self):
        """get_unique_lineages should return all lineages from a query"""
        result = get_unique_lineages('angiotensin[ti] AND rodents[orgn]')
        
        assert tuple(self.mouse_taxonomy) in result
        assert len(result) > 2

    def test_get_unique_taxa(self):
        """get_unique_taxa should return all taxa from a query"""
        result = get_unique_taxa('angiotensin[ti] AND primate[orgn]')
        assert 'Homo sapiens' in result
        assert len(result) > 2
        

if __name__ == '__main__':
    main()