File: test_compara.py

package info (click to toggle)
python-cogent 1.4.1-1.2
  • links: PTS, VCS
  • area: non-free
  • in suites: squeeze
  • size: 13,260 kB
  • ctags: 20,087
  • sloc: python: 116,163; ansic: 732; makefile: 74; sh: 9
file content (193 lines) | stat: -rw-r--r-- 10,852 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
from __future__ import division

import os
from cogent.util.unit_test import TestCase, main
from cogent.db.ensembl.host import HostAccount, get_ensembl_account
from cogent.db.ensembl.compara import Compara

__author__ = "Gavin Huttley, Hua Ying"
__copyright__ = "Copyright 2007-2009, The Cogent Project"
__credits__ = ["Gavin Huttley", "hua Ying"]
__license__ = "GPL"
__version__ = "1.4.1"
__maintainer__ = "Gavin Huttley"
__email__ = "Gavin.Huttley@anu.edu.au"
__status__ = "alpha"

# Ensembl release that every Compara instance in this module is built against
Release = 56

# Choose the database account: fall back to the one supplied by
# get_ensembl_account unless ENSEMBL_ACCOUNT is set, in which case its value
# is split on whitespace into "<username> <password>" for a server at
# 127.0.0.1.
if 'ENSEMBL_ACCOUNT' not in os.environ:
    account = get_ensembl_account(release=Release)
else:
    username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount('127.0.0.1', username, password)

def calc_slope(x1, y1, x2, y2):
    """returns (y2 - y1) / (x2 - x1), substituting 1 for either difference
    when it is exactly zero (so identical coordinates give a slope of 1)"""
    rise = y2 - y1
    run = x2 - x1
    if rise == 0:
        rise = 1
    if run == 0:
        run = 1
    return rise / run

class ComparaTestBase(TestCase):
    """Base test case exposing one Compara instance as the class attribute
    `comp`; it is created when the class body executes and is shared by all
    subclasses."""
    comp = Compara(
        ['human', 'mouse', 'rat', 'platypus'],
        Release=Release,
        account=account)

class TestCompara(ComparaTestBase):
    """Queries run against the Compara instance inherited from
    ComparaTestBase (human, mouse, rat, platypus)."""

    def test_query_genome(self):
        """compara should attach valid genome attributes by common name"""
        brca2 = list(self.comp.Mouse.getGenesMatching(Symbol="brca2"))[0]
        self.assertEqual(brca2.Symbol.lower(), 'brca2')

    def test_get_related_genes(self):
        """should correctly return the related gene regions from each genome"""
        brca2 = list(self.comp.Human.getGenesMatching(StableId="ENSG00000139618"))[0]
        orthologs = self.comp.getRelatedGenes(gene_region=brca2,
                Relationship="ortholog_one2one")
        self.assertEqual("ortholog_one2one", orthologs.Relationships[0])

    def test_get_related_genes2(self):
        """should handle case where gene is absent from one of the genomes"""
        clec2d = list(self.comp.Mouse.getGenesMatching(
                      StableId='ENSMUSG00000030157'))[0]
        orthologs = self.comp.getRelatedGenes(gene_region=clec2d,
                        Relationship='ortholog_one2many')
        self.assertEqual(len(orthologs.Members), 3)

    def test_get_collection(self):
        """sequence collection of the one2one orthologs should hold a first
        sequence longer than 1000"""
        brca2 = list(self.comp.Human.getGenesMatching(StableId="ENSG00000139618"))[0]
        orthologs = self.comp.getRelatedGenes(gene_region=brca2,
                        Relationship="ortholog_one2one")
        collection = orthologs.getSeqCollection()
        self.assertTrue(len(collection.Seqs[0]) > 1000)

    def test_getting_alignment(self):
        """alignment of the first syntenic region for a mouse gene should be
        longer than 1000"""
        mid = "ENSMUSG00000041147"
        brca2 = list(self.comp.Mouse.getGenesMatching(StableId=mid))[0]
        result = list(self.comp.getSyntenicRegions(region=brca2,
                        align_method='PECAN', align_clade='vertebrates'))[0]
        aln = result.getAlignment(feature_types='gene')
        self.assertTrue(len(aln) > 1000)

    def test_generate_method_clade_data(self):
        """should correctly determine the align_method align_clade options for
        a group of species"""
        # we should correctly infer the method_species_links, which is a
        # cogent.util.Table instance
        shape = self.comp.method_species_links.Shape
        # comparing Shape > (0, 0) directly is a lexicographic tuple
        # comparison, so a table with zero rows but some columns would still
        # pass; require every dimension to be non-empty instead
        self.assertTrue(min(shape) > 0)

    def test_get_syntenic_returns_nothing(self):
        """should yield no SyntenicRegion (an empty list) for a region with a
        golden-path assembly gap"""
        start = 100000
        end = start + 100000
        related = list(self.comp.getSyntenicRegions(Species='mouse',
                        CoordName='1', Start=start, End=end,
                        align_method='PECAN', align_clade='vertebrates'))
        self.assertEqual(related, [])

    def test_get_species_set(self):
        """should return the correct set of species"""
        expect = set(['Homo sapiens', 'Ornithorhynchus anatinus',
                      'Mus musculus', 'Rattus norvegicus'])
        brca2 = list(self.comp.Human.getGenesMatching(StableId="ENSG00000139618"))[0]
        orthologs = self.comp.getRelatedGenes(gene_region=brca2,
                        Relationship="ortholog_one2one")
        self.assertEqual(orthologs.getSpeciesSet(), expect)

    def test_pool_connection(self):
        """exercising ability to specify pool connection"""
        # construction with pool_recycle only has to complete without raising,
        # so the instance itself is not kept
        Compara(['chimp', 'dog'], Release=Release, account=account,
                pool_recycle=1000)
        
    

class TestSyntenicRegions(TestCase):
    """Syntenic region queries against a human/chimp/macaque Compara
    instance, created once when the class body executes."""
    comp = Compara(['human', 'chimp', 'macaque'], account=account,
                  Release=Release)

    def test_correct_alignments(self):
        """should return the correct alignments"""
        # following cases have a mixture of strand between ref seq and others
        # each case is [query coordinates, {sequence name: aligned sequence}]
        coords_expected = [
            [{'CoordName': 4, 'End': 78099, 'Species': 'human', 'Start': 77999, 'Strand':-1},
              {'Homo sapiens:chromosome:4:77999-78099:-1':
               'ATGTAAATCAAAACCAAAGTCTGCATTTATTTGCGGAAAGAGATGCTACATGTTCAAAGATAAATATGGAACATTTTTTAAAAGCATTCATGACTTAGAA',
               'Macaca mulatta:chromosome:1:3891064-3891163:1':
               'ATGTCAATCAAAACCAAAGTCTGTATTTATTTGCAGAAAGAGATACTGCATGTTCAAAGATAAATATGGAAC-TTTTTAAAAAGCATTAATGACTTATAC',
               'Pan troglodytes:chromosome:4:102056-102156:-1':
               'ATGTAAATCAAAACCAAAGTCTGCATTTATTTGCGGAAAGAGATGCTACATGTTCAAAGATAAATATGGAACATTTTTAAAAAGCATTCATGACTTAGAA'}],
            [{'CoordName': 18, 'End': 213739, 'Species': 'human', 'Start': 213639, 'Strand':-1},
                {'Homo sapiens:chromosome:18:213639-213739:-1':
                'ATAAGCATTTCCCTTTAGGGCTCTAAGATGAGGTCATCATCGTTTTTAATCCTGAAGAAGGGCTACTGAGTGAGTGCAGATTATTCGGTAAACACT----CTTA',
                 'Macaca mulatta:chromosome:18:13858303-13858397:1':
                 '------GTTTCCCTTTAGGGCTCTAAGATGAGGTCATCATTGTTTTTAATCCTGAAGAAGGGCTACTGA----GTGCAGATTATTCTGTAAATGTGCTTACTTG',
                 'Pan troglodytes:chromosome:18:16601082-16601182:1':
                 'ATAAGCATTTCCCTTTAGGGCTCTAAGATGAGGTCATCATCGTTTTTAATCCTGAAGAAGGGCTACTGA----GTGCAGATTATTCTGTAAACACTCACTCTTA'}],
            [{'CoordName': 5, 'End': 204974, 'Species': 'human', 'Start': 204874, 'Strand':1},
                {'Homo sapiens:chromosome:5:204874-204974:1':
                 'AACACTTGGTATTT----CCCCTTTATGGAGTGAGAGAGATCTTTAAAATATAAACCCTTGATAATATAATATTACTACTTCCTATTA---CCTGTTATGCAGTTCT',
                 'Macaca mulatta:chromosome:6:1297736-1297840:-1':
                 'AACTCTTGGTGTTTCCTTCCCCTTTATGG---GAGAGAGATCTTTAAAATAAAAAACCTTGATAATATAATATTACTACTTTCTATTATCATCTGTTATGCAGTTCT',
                 'Pan troglodytes:chromosome:5:335911-336011:1':
                 'AACACTTGGTAGTT----CCCCTTTATGGAGTGAGAGAGATCTTTAAAATATAAACCCTTGATAATATAATATTACTACTTTCTATTA---CCTGTTATGCAGTTCT'}],
            [{'CoordName': 18, 'End': 203270, 'Species': 'human', 'Start': 203170, 'Strand':-1},
                {'Homo sapiens:chromosome:18:203170-203270:-1':
                 'GGAATAATGAAAGCAATTGTGAGTTAGCAATTACCTTCAAAGAATTACATTTCTTATACAAAGTAAAGTTCATTACTAACCTTAAGAACTTTGGCATTCA',
                 'Macaca mulatta:chromosome:18:13869026-13869126:1':
                 'GGAATAATGAAAGCAATTGTAAGTTAGCAATTACCTTCAAAGAATTACATTTCTTATACAAAGTAAAGTTCATTACTAACCTTAAGAACTTTGGCATTCA',
                 'Pan troglodytes:chromosome:18:16611584-16611684:1':
                 'GGAATAATGAAAGCAATTGTAAGTTAGCAATTACCTTCAAAGAATTACATTTCTTATACAAAGTAAAGTTCATTACTAACCTTAAGAACTTTGGCATTCA'}],
            [{'CoordName': 2, 'End': 46445, 'Species': 'human', 'Start': 46345, 'Strand':-1},
                {'Homo sapiens:chromosome:2:46345-46445:-1':
                 'CTACCACTCGAGCGCGTCTCCGCTGGACCCGGAACCCCGGTCGGTCCATTCCCCGCGAAGATGCGCGCCCTGGCGGCCCTGAGCGCGCCCCCGAACGAGC',
                 'Macaca mulatta:chromosome:13:43921-44021:-1':
                 'CTGCCACTCCAGCGCGTCTCCGCTGCACCCGGAGCGCCGGCCGGTCCATTCCCCGCGAGGATGCGCGCCCTGGCGGCCCTGAACACGTCGGCGAGAGAGC',
                 'Pan troglodytes:chromosome:2a:36792-36892:-1':
                 'CTACCACTCGAGCGCGTCTCCGCTGGACCCGGAACCCCAGTCGGTCCATTCCCCGCGAAGATGCGCGCCCTGGCGGCCCTGAACGCGCCCCCGAACGAGC'}],
            [{'CoordName': 18, 'End': 268049, 'Species': 'human', 'Start': 267949, 'Strand':-1},
                {'Homo sapiens:chromosome:18:267949-268049:-1':
                 'GCGCAGTGGCGGGCACGCGCAGCCGAGAAGATGTCTCCGACGCCGCCGCTCTTCAGTTTGCCCGAAGCGCGGACGCGGTTTACGGTGAGCTGTAGAGGGG',
                 'Macaca mulatta:chromosome:18:13805604-13805703:1':
                 'GCGCAG-GGCGGGCACGCGCAGCCGAGAAGATGTCTCCGACGCCGCCGCTCTTCAGTTTGCCCGAAGCGCGGACGCGGTTTACGGTGAGCTGTAGGCGGG',
                 'Pan troglodytes:chromosome:18:16546800-16546900:1':
                 'GCGCAGTGGCGGGCACGCGCAGCCGAGAAGATGTCTCCGACGCCGCCGCTCTTCAGTTTGCCCGAAGCGCGGACGCGGTTTACGGTGAGCTGTAGCGGGG'}],
            [{'CoordName': 16, 'End': 107443, 'Species': 'human', 'Start': 107343, 'Strand':-1},
                {'Homo sapiens:chromosome:16:107343-107443:-1':
                 'AAGAAGCAAACAGGTTTATTTTATACAGTGGGCCAGGCCGTGGGTCTGCCATGTGACTAGGGCATTTGGACCTAGGGAGAGGTCAGTCTCAGGCCAAGTA',
                 'Pan troglodytes:chromosome:16:48943-49032:-1':
                 'AAGAAGCAAACAGGTTTATTTTATACACTGGGCCAGGCCGTGGGTCTGCCATGTGACTAGGGAATTTGGACC-----------CAGTCTCAGGCCAAGTA'}]
            ]

        def fields_by_species(names):
            """maps the leading (species) field of each ':'-delimited name to
            the full list of that name's fields"""
            split_names = [name.split(':') for name in names]
            return dict([(fields[0], fields) for fields in split_names])

        # NOTE(review): the [1:] slice skips the first expected case; the
        # reason is not visible in this file -- confirm before changing it
        for coord, expect in coords_expected[1:]:
            syntenic = list(
                self.comp.getSyntenicRegions(method_clade_id=435, **coord))[0]
            # check the slope computed from the expected and returned
            # coordinates is ~ 1
            got_names = fields_by_species(syntenic.getAlignment().Names)
            exp_names = fields_by_species(expect)
            for species in exp_names:
                # name fields: index 2 is the chromosome, 3 is 'start-end'
                exp_chrom = exp_names[species][2]
                got_chrom = got_names[species][2]
                self.assertEqual(exp_chrom, got_chrom)
                exp_start, exp_end = map(int, exp_names[species][3].split('-'))
                got_start, got_end = map(int, got_names[species][3].split('-'))
                slope = calc_slope(exp_start, exp_end, got_start, got_end)
                self.assertFloatEqual(abs(slope), 1.0)

    def test_failing_region(self):
        """should correctly handle queries where multiple Ensembl have
        genome block associations for multiple coord systems"""
        gene = list(self.comp.Human.getGenesMatching(
                                    StableId='ENSG00000188554'))[0]
        # this should simply not raise any exceptions; list() forces the
        # query to be fully evaluated and the result is deliberately discarded
        list(self.comp.getSyntenicRegions(region=gene,
                                align_method='PECAN',
                                align_clade='vertebrates'))
        
    

# when executed as a script, hand control to the main() imported from
# cogent.util.unit_test at the top of this file
if __name__ == "__main__":
    main()