# Tests of the Compara interface provided by cogent.db.ensembl.compara
from __future__ import division
import os
from cogent.util.unit_test import TestCase, main
from cogent.db.ensembl.host import HostAccount, get_ensembl_account
from cogent.db.ensembl.compara import Compara
__author__ = "Gavin Huttley, Hua Ying"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Gavin Huttley", "hua Ying"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Gavin Huttley"
__email__ = "Gavin.Huttley@anu.edu.au"
__status__ = "alpha"
# Ensembl release that every Compara instance in these tests is built from.
Release = 68

# Connection settings can be supplied through the ENSEMBL_ACCOUNT environment
# variable as whitespace separated fields: "host username password [port]".
# Without the variable, get_ensembl_account(release=Release) supplies the
# account.
if 'ENSEMBL_ACCOUNT' not in os.environ:
    account = get_ensembl_account(release=Release)
else:
    args = os.environ['ENSEMBL_ACCOUNT'].split()
    host, username, password = args[:3]
    # the port is optional, so it is only forwarded when a 4th field exists
    kwargs = dict(port=int(args[3])) if len(args) > 3 else {}
    account = HostAccount(host, username, password, **kwargs)
def calc_slope(x1, y1, x2, y2):
    """computes the slope of the line through (x1, y1) and (x2, y2)

    A delta of exactly 0 on either axis is replaced by 1, so two identical
    points give a slope of 1 and the division is never by zero.

    Arguments:
        - x1, y1: coordinates of the first point
        - x2, y2: coordinates of the second point

    Returns delta_y / delta_x, computed after the substitution above.
    """
    delta_y = y2 - y1
    delta_x = x2 - x1
    # a conditional expression states the substitution directly, rather than
    # indexing a two element list with a bool
    delta_y = 1 if delta_y == 0 else delta_y
    delta_x = 1 if delta_x == 0 else delta_x
    # true division: this module does `from __future__ import division`
    return delta_y / delta_x
class ComparaTestBase(TestCase):
    """Base test case giving derived classes a shared Compara instance."""
    # class attribute: created once, when the class body is executed, and
    # reached by the tests through self.comp
    comp = Compara(
        ['human', 'mouse', 'rat', 'platypus'],
        account=account,
        Release=Release)
class TestCompara(ComparaTestBase):
    """Tests of gene, homology and synteny queries made through Compara.

    Unless a test builds its own instance, queries go through the ``comp``
    class attribute inherited from ComparaTestBase.
    """

    def test_query_genome(self):
        """compara should attach valid genome attributes by common name"""
        brca2 = self.comp.Mouse.getGeneByStableId("ENSMUSG00000041147")
        self.assertEqual(brca2.Symbol.lower(), 'brca2')

    def test_get_related_genes(self):
        """should correctly return the related gene regions from each genome"""
        brca2 = self.comp.Mouse.getGeneByStableId("ENSMUSG00000041147")
        orthologs = self.comp.getRelatedGenes(gene_region=brca2,
                                    Relationship="ortholog_one2one")
        self.assertEqual("ortholog_one2one", orthologs.Relationships[0])

    def test_get_related_genes2(self):
        """should handle case where gene is absent from one of the genomes"""
        clec2d = self.comp.Mouse.getGeneByStableId(
                                    StableId='ENSMUSG00000030157')
        orthologs = self.comp.getRelatedGenes(gene_region=clec2d,
                                    Relationship='ortholog_one2many')
        # comp is built from 4 species; fewer members than that are expected
        self.assertTrue(len(orthologs.Members) < 4)

    def test_get_collection(self):
        """related genes should be retrievable as a sequence collection"""
        brca2 = self.comp.Human.getGeneByStableId(StableId="ENSG00000139618")
        orthologs = self.comp.getRelatedGenes(gene_region=brca2,
                                    Relationship="ortholog_one2one")
        collection = orthologs.getSeqCollection()
        self.assertTrue(len(collection.Seqs[0]) > 1000)

    def test_getting_alignment(self):
        """a syntenic region should provide an alignment"""
        mid = "ENSMUSG00000041147"
        brca2 = self.comp.Mouse.getGeneByStableId(StableId=mid)
        # only the first syntenic region returned is examined
        result = list(self.comp.getSyntenicRegions(region=brca2,
                    align_method='PECAN', align_clade='vertebrates'))[0]
        aln = result.getAlignment(feature_types='gene')
        self.assertTrue(len(aln) > 1000)

    def test_generate_method_clade_data(self):
        """should correctly determine the align_method align_clade options for
        a group of species"""
        # we should correctly infer the method_species_links, which is a
        # cogent.util.Table instance
        shape = self.comp.method_species_links.Shape
        # each dimension is checked on its own: a tuple comparison such as
        # Shape > (0, 0) is lexicographic, so a table with no rows but some
        # columns, e.g. (0, 5), would also satisfy it
        self.assertTrue(shape[0] > 0)
        self.assertTrue(shape[1] > 0)

    def test_no_method_clade_data(self):
        """generate a Table with no rows if no alignment data"""
        compara = Compara(['S.cerevisiae'], Release=Release, account=account)
        self.assertEqual(compara.method_species_links.Shape[0], 0)

    def test_get_syntenic_returns_nothing(self):
        """should correctly return None for a SyntenicRegion with golden-path
        assembly gap"""
        start = 100000
        end = start + 100000
        related = list(self.comp.getSyntenicRegions(Species='mouse',
                    CoordName='1', Start=start, End=end,
                    align_method='PECAN', align_clade='vertebrates'))
        self.assertEqual(related, [])

    def test_get_species_set(self):
        """should return the correct set of species"""
        expect = set(['Homo sapiens', 'Ornithorhynchus anatinus',
                      'Mus musculus', 'Rattus norvegicus'])
        brca2 = self.comp.Human.getGeneByStableId(StableId="ENSG00000139618")
        orthologs = self.comp.getRelatedGenes(gene_region=brca2,
                                    Relationship="ortholog_one2one")
        self.assertEqual(orthologs.getSpeciesSet(), expect)

    def test_pool_connection(self):
        """exercising ability to specify pool connection"""
        # nothing is asserted: construction with pool_recycle must simply
        # not raise, so the instance is not kept
        Compara(['chimp', 'dog'], Release=Release, account=account,
                pool_recycle=1000)
class TestSyntenicRegions(TestCase):
    """Tests of syntenic region queries on a human/chimp/macaque Compara."""
    # class attribute: created once, when the class body is executed
    comp = Compara(['human', 'chimp', 'macaque'], account=account,
                   Release=Release)

    def _names_by_species(self, names):
        """returns a dict mapping the first ':' delimited field of each name
        to the list of all that name's ':' delimited fields"""
        split_names = [name.split(':') for name in names]
        return dict([(fields[0], fields) for fields in split_names])

    def test_correct_alignments(self):
        """should return the correct alignments"""
        # following cases have a mixture of strand between ref seq and others
        # Each case is [query coordinates, {sequence name: aligned sequence}].
        # Only the sequence names (chromosome and coordinates) are checked
        # below; the aligned sequences themselves are not compared.
        coords_expected = [
            [{'CoordName': 4, 'End': 78099, 'Species': 'human', 'Start': 77999, 'Strand':-1},
             {'Homo sapiens:chromosome:4:77999-78099:-1':
              'ATGTAAATCAAAACCAAAGTCTGCATTTATTTGCGGAAAGAGATGCTACATGTTCAAAGATAAATATGGAACATTTTTTAAAAGCATTCATGACTTAGAA',
              'Macaca mulatta:chromosome:1:3891064-3891163:1':
              'ATGTCAATCAAAACCAAAGTCTGTATTTATTTGCAGAAAGAGATACTGCATGTTCAAAGATAAATATGGAAC-TTTTTAAAAAGCATTAATGACTTATAC',
              'Pan troglodytes:chromosome:4:102056-102156:-1':
              'ATGTAAATCAAAACCAAAGTCTGCATTTATTTGCGGAAAGAGATGCTACATGTTCAAAGATAAATATGGAACATTTTTAAAAAGCATTCATGACTTAGAA'}],
            [{'CoordName': 18, 'End': 213739, 'Species': 'human', 'Start': 213639, 'Strand':-1},
             {'Homo sapiens:chromosome:18:213639-213739:-1':
              'ATAAGCATTTCCCTTTAGGGCTCTAAGATGAGGTCATCATCGTTTTTAATCCTGAAGAAGGGCTACTGAGTGAGTGCAGATTATTCGGTAAACACT----CTTA',
              'Macaca mulatta:chromosome:18:13858303-13858397:1':
              '------GTTTCCCTTTAGGGCTCTAAGATGAGGTCATCATTGTTTTTAATCCTGAAGAAGGGCTACTGA----GTGCAGATTATTCTGTAAATGTGCTTACTTG',
              'Pan troglodytes:chromosome:18:16601082-16601182:1':
              'ATAAGCATTTCCCTTTAGGGCTCTAAGATGAGGTCATCATCGTTTTTAATCCTGAAGAAGGGCTACTGA----GTGCAGATTATTCTGTAAACACTCACTCTTA'}],
            [{'CoordName': 5, 'End': 204974, 'Species': 'human', 'Start': 204874, 'Strand':1},
             {'Homo sapiens:chromosome:5:204874-204974:1':
              'AACACTTGGTATTT----CCCCTTTATGGAGTGAGAGAGATCTTTAAAATATAAACCCTTGATAATATAATATTACTACTTCCTATTA---CCTGTTATGCAGTTCT',
              'Macaca mulatta:chromosome:6:1297736-1297840:-1':
              'AACTCTTGGTGTTTCCTTCCCCTTTATGG---GAGAGAGATCTTTAAAATAAAAAACCTTGATAATATAATATTACTACTTTCTATTATCATCTGTTATGCAGTTCT',
              'Pan troglodytes:chromosome:5:335911-336011:1':
              'AACACTTGGTAGTT----CCCCTTTATGGAGTGAGAGAGATCTTTAAAATATAAACCCTTGATAATATAATATTACTACTTTCTATTA---CCTGTTATGCAGTTCT'}],
            [{'CoordName': 18, 'End': 203270, 'Species': 'human', 'Start': 203170, 'Strand':-1},
             {'Homo sapiens:chromosome:18:203170-203270:-1':
              'GGAATAATGAAAGCAATTGTGAGTTAGCAATTACCTTCAAAGAATTACATTTCTTATACAAAGTAAAGTTCATTACTAACCTTAAGAACTTTGGCATTCA',
              'Pan troglodytes:chromosome:18:16611584-16611684:1':
              'GGAATAATGAAAGCAATTGTAAGTTAGCAATTACCTTCAAAGAATTACATTTCTTATACAAAGTAAAGTTCATTACTAACCTTAAGAACTTTGGCATTCA'}],
            [{'CoordName': 2, 'End': 46445, 'Species': 'human', 'Start': 46345, 'Strand':-1},
             {'Homo sapiens:chromosome:2:46345-46445:-1':
              'CTACCACTCGAGCGCGTCTCCGCTGGACCCGGAACCCCGGTCGGTCCATTCCCCGCGAAGATGCGCGCCCTGGCGGCCCTGAGCGCGCCCCCGAACGAGC',
              'Macaca mulatta:chromosome:13:43921-44021:-1':
              'CTGCCACTCCAGCGCGTCTCCGCTGCACCCGGAGCGCCGGCCGGTCCATTCCCCGCGAGGATGCGCGCCCTGGCGGCCCTGAACACGTCGGCGAGAGAGC',
              'Pan troglodytes:chromosome:2a:36792-36892:-1':
              'CTACCACTCGAGCGCGTCTCCGCTGGACCCGGAACCCCAGTCGGTCCATTCCCCGCGAAGATGCGCGCCCTGGCGGCCCTGAACGCGCCCCCGAACGAGC'}],
            [{'CoordName': 18, 'End': 268049, 'Species': 'human', 'Start': 267949, 'Strand':-1},
             {'Homo sapiens:chromosome:18:267949-268049:-1':
              'GCGCAGTGGCGGGCACGCGCAGCCGAGAAGATGTCTCCGACGCCGCCGCTCTTCAGTTTGCCCGAAGCGCGGACGCGGTTTACGGTGAGCTGTAGAGGGG',
              'Macaca mulatta:chromosome:18:13805604-13805703:1':
              'GCGCAG-GGCGGGCACGCGCAGCCGAGAAGATGTCTCCGACGCCGCCGCTCTTCAGTTTGCCCGAAGCGCGGACGCGGTTTACGGTGAGCTGTAGGCGGG',
              'Pan troglodytes:chromosome:18:16546800-16546900:1':
              'GCGCAGTGGCGGGCACGCGCAGCCGAGAAGATGTCTCCGACGCCGCCGCTCTTCAGTTTGCCCGAAGCGCGGACGCGGTTTACGGTGAGCTGTAGCGGGG'}],
            [{'CoordName': 16, 'End': 107443, 'Species': 'human', 'Start': 107343, 'Strand':-1},
             {'Homo sapiens:chromosome:16:107343-107443:-1':
              'AAGAAGCAAACAGGTTTATTTTATACAGTGGGCCAGGCCGTGGGTCTGCCATGTGACTAGGGCATTTGGACCTAGGGAGAGGTCAGTCTCAGGCCAAGTA',
              'Pan troglodytes:chromosome:16:48943-49032:-1':
              'AAGAAGCAAACAGGTTTATTTTATACACTGGGCCAGGCCGTGGGTCTGCCATGTGACTAGGGAATTTGGACC-----------CAGTCTCAGGCCAAGTA'}]
            ]
        # NOTE(review): the [1:] slice means the first case above is never
        # exercised -- confirm whether skipping it is deliberate
        for coord, expect in coords_expected[1:]:
            # NOTE(review): 548 is presumably the id of the alignment
            # method/clade these expectations were generated with -- confirm
            # only the first syntenic region returned is examined
            syntenic = list(
                self.comp.getSyntenicRegions(method_clade_id=548, **coord))[0]
            # names look like 'species:chromosome:<chrom>:<start>-<end>:<strand>'
            got_names = self._names_by_species(syntenic.getAlignment().Names)
            exp_names = self._names_by_species(expect)
            for species in exp_names:
                exp_chrom = exp_names[species][2]
                got_chrom = got_names[species][2]
                self.assertEqual(exp_chrom.lower(), got_chrom.lower())
                exp_start, exp_end = map(int, exp_names[species][3].split('-'))
                got_start, got_end = map(int, got_names[species][3].split('-'))
                # check the slope computed from the expected and returned
                # coordinates is ~ 1
                slope = calc_slope(exp_start, exp_end, got_start, got_end)
                self.assertFloatEqual(abs(slope), 1.0, eps=1e-3)

    def test_failing_region(self):
        """should correctly handle queries where multiple Ensembl have
        genome block associations for multiple coord systems"""
        gene = self.comp.Human.getGeneByStableId(StableId='ENSG00000188554')
        # this should simply not raise any exceptions, so the query is
        # exhausted and its result is not kept
        list(self.comp.getSyntenicRegions(region=gene,
                                          align_method='PECAN',
                                          align_clade='vertebrates'))
# calls main (imported from cogent.util.unit_test) only when this file is
# executed as a script, not when it is imported
if __name__ == "__main__":
    main()
|