1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
|
#!/usr/bin/env python
"""Unit tests for unigene-specific classes
"""
from unittest import TestCase
from cogent3.parse.record_finder import GbFinder
from cogent3.parse.unigene import (
LinesToUniGene,
UniGeneProtSimRecord,
UniGeneSeqRecord,
_read_expression,
_read_seq,
_read_sts,
)
class unigeneTests(TestCase):
    """Tests the top-level UniGene line readers and ``LinesToUniGene``."""

    def _assert_records_equal(self, observed, expected):
        """Assert two record sequences match in length and item by item.

        ``zip`` silently stops at the shorter input, so comparing with a
        bare ``zip`` loop would pass even if records were missing from
        (or extra in) ``observed``. Checking the length first closes that
        gap while keeping per-item failure messages.
        """
        observed = list(observed)
        expected = list(expected)
        self.assertEqual(len(observed), len(expected))
        for obs, exp in zip(observed, expected):
            self.assertEqual(obs, exp)

    def test_read_sts(self):
        """_read_sts should perform correct conversions"""
        observed = _read_sts("ACC=RH128467 UNISTS=211775\n")
        self.assertEqual(observed, {"ACC": "RH128467", "UNISTS": "211775"})

    def test_read_expression(self):
        """_read_expression should perform correct conversions"""
        observed = _read_expression("embryo ; whole body ; mammary gland ; brain\n")
        self.assertEqual(
            observed, ["embryo", "whole body", "mammary gland", "brain"]
        )

    def test_read_seq(self):
        """_read_seq should perform correct conversions"""
        # reset the found fields, since we can't guarantee order of test
        # execution and it's persistent class data
        UniGeneSeqRecord.found_fields = {}

        # minimal line: a single KEY=VALUE pair
        self.assertEqual(
            _read_seq("ACC=BC025044.1\n"),
            UniGeneSeqRecord({"ACC": "BC025044.1"}),
        )

        # full line: several '; '-separated KEY=VALUE pairs
        full_line = (
            "ACC=AI842963.1; NID=g5477176; CLONE=UI-M-AO1-aem-f-10-0-UI; "
            "END=3'; LID=1944; SEQTYPE=EST; TRACE=158501677\n"
        )
        self.assertEqual(
            _read_seq(full_line),
            UniGeneSeqRecord(
                {
                    "ACC": "AI842963.1",
                    "NID": "g5477176",
                    "CLONE": "UI-M-AO1-aem-f-10-0-UI",
                    "END": "3'",
                    "LID": "1944",
                    "SEQTYPE": "EST",
                    "TRACE": "158501677",
                }
            ),
        )

    def test_LinesToUniGene(self):
        """LinesToUniGene should give expected results on sample data"""
        # Two records, each terminated by '//'. The text is kept flush-left
        # because every character inside the literal is parser input.
        fake_file = """ID Mm.1
TITLE S100 calcium binder
GENE S100a10
CYTOBAND 3 41.7 cM
LOCUSLINK 20194
EXPRESS embryo ; whole body ; mammary gland ; brain
CHROMOSOME 3
STS ACC=RH128467 UNISTS=211775
STS ACC=M16465 UNISTS= 178878
PROTSIM ORG=Homo sapiens; PROTGI=107251; PROTID=pir:JC1139; PCT=91; ALN=97
PROTSIM ORG=Mus musculus; PROTGI=116487; PROTID=sp:P08207; PCT=100; ALN=97
PROTSIM ORG=Rattus norvegicus; PROTGI=116489; PROTID=sp:P05943; PCT=94; ALN=94
SCOUNT 5
SEQUENCE ACC=BC025044.1; NID=g19263549; PID=g19263550; SEQTYPE=mRNA
SEQUENCE ACC=AA471893.1; NID=g2199884; CLONE=IMAGE:872193; END=5'; LID=539; SEQTYPE=EST
SEQUENCE ACC=AI842963.1; NID=g5477176; CLONE=UI-M-AO1-aem-f-10-0-UI; END=3'; LID=1944; SEQTYPE=EST; TRACE=158501677
SEQUENCE ACC=CB595147.1; NID=g29513003; CLONE=IMAGE:30300703; END=5'; LID=12885; MGC=6677832; SEQTYPE=EST
SEQUENCE ACC=BY144053.1; NID=g26280109; CLONE=L930184D22; END=5'; LID=12267; SEQTYPE=EST
//
ID Mm.5
TITLE homeo box A10
GENE Hoxa10
CYTOBAND 6 26.33 cM
LOCUSLINK 15395
EXPRESS kidney ; colon ; mammary gland
CHROMOSOME 6
PROTSIM ORG=Caenorhabditis elegans; PROTGI=7510074; PROTID=pir:T31611; PCT=30; ALN=326
SCOUNT 1
SEQUENCE ACC=AW990320.1; NID=g8185938; CLONE=IMAGE:1513482; END=5'; LID=1043; SEQTYPE=EST; TRACE=94472873
//
"""
        records = list(GbFinder(fake_file.split("\n")))
        self.assertEqual(len(records), 2)
        first, second = [LinesToUniGene(record) for record in records]

        # ---- first record: scalar fields ----
        self.assertEqual(first.ID, "Mm.1")
        self.assertEqual(first.TITLE, "S100 calcium binder")
        self.assertEqual(first.GENE, "S100a10")
        self.assertEqual(first.CYTOBAND, "3 41.7 cM")
        self.assertEqual(first.CHROMOSOME, "3")
        self.assertEqual(first.LOCUSLINK, 20194)
        self.assertEqual(
            first.EXPRESS, ["embryo", "whole body", "mammary gland", "brain"]
        )
        # the second STS input line has a space after 'UNISTS='; the
        # expected value carries no such space
        self.assertEqual(
            first.STS,
            [
                {"ACC": "RH128467", "UNISTS": "211775"},
                {"ACC": "M16465", "UNISTS": "178878"},
            ],
        )

        # ---- first record: protein similarity lines ----
        prot_sim_fields = [
            {
                "ORG": "Homo sapiens",
                "PROTGI": "107251",
                "PROTID": "pir:JC1139",
                "PCT": "91",
                "ALN": "97",
            },
            {
                "ORG": "Mus musculus",
                "PROTGI": "116487",
                "PROTID": "sp:P08207",
                "PCT": "100",
                "ALN": "97",
            },
            {
                "ORG": "Rattus norvegicus",
                "PROTGI": "116489",
                "PROTID": "sp:P05943",
                "PCT": "94",
                "ALN": "94",
            },
        ]
        exp_prot_sim = [UniGeneProtSimRecord(fields) for fields in prot_sim_fields]
        self._assert_records_equal(first.PROTSIM, exp_prot_sim)
        self.assertEqual(first.SCOUNT, 5)

        # ---- first record: sequence lines ----
        seq_fields = [
            {
                "ACC": "BC025044.1",
                "NID": "g19263549",
                "PID": "g19263550",
                "SEQTYPE": "mRNA",
            },
            {
                "ACC": "AA471893.1",
                "NID": "g2199884",
                "END": "5'",
                "CLONE": "IMAGE:872193",
                "LID": "539",
                "SEQTYPE": "EST",
            },
            {
                "ACC": "AI842963.1",
                "NID": "g5477176",
                "CLONE": "UI-M-AO1-aem-f-10-0-UI",
                "END": "3'",
                "LID": "1944",
                "SEQTYPE": "EST",
                "TRACE": "158501677",
            },
            {
                "ACC": "CB595147.1",
                "NID": "g29513003",
                "CLONE": "IMAGE:30300703",
                "END": "5'",
                "LID": "12885",
                "MGC": "6677832",
                "SEQTYPE": "EST",
            },
            {
                "ACC": "BY144053.1",
                "NID": "g26280109",
                "CLONE": "L930184D22",
                "END": "5'",
                "LID": "12267",
                "SEQTYPE": "EST",
            },
        ]
        exp_seqs = [UniGeneSeqRecord(fields) for fields in seq_fields]
        self._assert_records_equal(first.SEQUENCE, exp_seqs)

        # ---- second record ----
        self.assertEqual(second.ID, "Mm.5")
        self.assertEqual(second.TITLE, "homeo box A10")
        self.assertEqual(second.GENE, "Hoxa10")
        self.assertEqual(second.CYTOBAND, "6 26.33 cM")
        self.assertEqual(second.LOCUSLINK, 15395)
        self.assertEqual(second.EXPRESS, ["kidney", "colon", "mammary gland"])
        self.assertEqual(second.CHROMOSOME, "6")
        self.assertEqual(
            second.PROTSIM,
            [
                UniGeneProtSimRecord(
                    {
                        "ORG": "Caenorhabditis elegans",
                        "PROTGI": "7510074",
                        "PROTID": "pir:T31611",
                        "PCT": "30",
                        "ALN": "326",
                    }
                )
            ],
        )
        self.assertEqual(second.SCOUNT, 1)
        # the second record has no STS lines in the input
        self.assertEqual(second.STS, [])
        self.assertEqual(
            second.SEQUENCE,
            [
                UniGeneSeqRecord(
                    {
                        "ACC": "AW990320.1",
                        "NID": "g8185938",
                        "CLONE": "IMAGE:1513482",
                        "END": "5'",
                        "LID": "1043",
                        "SEQTYPE": "EST",
                        "TRACE": "94472873",
                    }
                )
            ],
        )
        # test that the synonym mapping works OK
        self.assertEqual(second.SequenceIds[0].NucleotideId, "g8185938")
|