1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
#!/usr/bin/env python
"""Tests for the UniProt parser on UniProt XML files.
"""
import os
import copy
import unittest
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
# UniprotIO leaves its ElementTree attribute as None when its own import of an
# ElementTree implementation failed; without one there is nothing to test, so
# raise the missing-dependency error instead of defining the test cases.
if SeqIO.UniprotIO.ElementTree is None:
    from Bio import MissingPythonDependencyError
    raise MissingPythonDependencyError(
        "No ElementTree module was found. "
        "Use Python 2.5+, lxml or elementtree if you "
        "want to use Bio.SeqIO.UniprotIO.")
from seq_tests_common import compare_reference, compare_record
class TestUniprot(unittest.TestCase):
    """Tests for the "uniprot-xml" SeqIO parser.

    The parser output is checked against hard-coded expected values for one
    record, and against the plain text "swiss" parser (and a FASTA file) for
    the remaining example files under the SwissProt directory.
    """

    def test_uni001(self):
        """Parsing Uniprot file uni001"""
        datafile = os.path.join('SwissProt', 'uni001')

        # The context manager closes the handle even if parsing raises.
        with open(datafile) as test_handle:
            seq_record = SeqIO.read(test_handle, "uniprot-xml")

        self.assertTrue(isinstance(seq_record, SeqRecord))

        # Test a couple of things on the record -- this is not exhaustive.
        self.assertEqual(seq_record.id, "Q91G55")
        self.assertEqual(seq_record.name, "043L_IIV6")
        self.assertEqual(seq_record.description, "Uncharacterized protein 043L")
        self.assertEqual(repr(seq_record.seq), "Seq('MDLINNKLNIEIQKFCLDLEKKYNINYNNLIDLWFNKESTERLIKCEVNLENKI...IPI', ProteinAlphabet())")

        # NOTE: attribute-style checks on seq_record.accessions,
        # seq_record.organism_classification and record.seqinfo used to be
        # commented out here (seq_record.accessions does not exist); the
        # accessions are checked through the annotations dictionary below.

        self.assertEqual(len(seq_record.features), 1)
        self.assertEqual(repr(seq_record.features[0]), "SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(116)), type='chain', id='PRO_0000377969')")

        annotations = seq_record.annotations
        references = annotations['references']
        self.assertEqual(len(references), 2)
        first_ref = references[0]
        self.assertEqual(first_ref.authors, 'Jakob N.J., Mueller K., Bahr U., Darai G.')
        self.assertEqual(first_ref.title, 'Analysis of the first complete DNA sequence of an invertebrate iridovirus: coding strategy of the genome of Chilo iridescent virus.')
        self.assertEqual(first_ref.journal, 'Virology 286:182-196(2001)')
        self.assertEqual(first_ref.comment, 'journal article | 2001 | Scope: NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA] | ')

        self.assertEqual(len(seq_record.dbxrefs), 11)
        self.assertEqual(seq_record.dbxrefs[0], 'DOI:10.1006/viro.2001.0963')

        self.assertEqual(annotations['sequence_length'], 116)
        self.assertEqual(annotations['sequence_checksum'], '4A29B35FB716523C')
        self.assertEqual(annotations['modified'], '2009-07-07')
        self.assertEqual(annotations['accessions'], ['Q91G55'])
        self.assertEqual(annotations['taxonomy'], ['Viruses', 'dsDNA viruses, no RNA stage', 'Iridoviridae', 'Iridovirus'])
        self.assertEqual(annotations['sequence_mass'], 13673)
        self.assertEqual(annotations['dataset'], 'Swiss-Prot')
        self.assertEqual(annotations['gene_name_ORF'], ['IIV6-043L'])
        self.assertEqual(annotations['version'], 21)
        self.assertEqual(annotations['sequence_modified'], '2001-12-01')
        self.assertEqual(annotations['keywords'], ['Complete proteome', 'Virus reference strain'])
        self.assertEqual(annotations['organism_host'], ['Acheta domesticus', 'House cricket', 'Chilo suppressalis', 'striped riceborer', 'Gryllus bimaculatus', 'Two-spotted cricket', 'Gryllus campestris', 'Spodoptera frugiperda', 'Fall armyworm'])
        self.assertEqual(annotations['created'], '2009-06-16')
        self.assertEqual(annotations['organism_name'], ['Chilo iridescent virus'])
        self.assertEqual(annotations['organism'], 'Invertebrate iridescent virus 6 (IIV-6)')
        self.assertEqual(annotations['recommendedName_fullName'], ['Uncharacterized protein 043L'])
        self.assertEqual(annotations['sequence_version'], 1)
        self.assertEqual(annotations['proteinExistence'], ['Predicted'])

    def _read_id_list(self, path):
        """Return the whitespace-stripped lines of the file at path as a list."""
        with open(path) as handle:
            return [line.strip() for line in handle]

    def _compare_references(self, old_refs, new_refs):
        """Compare two reference lists after smoothing known format differences.

        The reference objects are modified in place before being handed to
        compare_reference: titles, journal, medline_id and comment are
        normalised so that only genuine differences are reported.
        """
        self.assertEqual(len(old_refs), len(new_refs))
        for r1, r2 in zip(old_refs, new_refs):
            # Tweak for line breaks in plain text SwissProt
            r1.title = r1.title.replace("- ", "-")
            r2.title = r2.title.replace("- ", "-")
            r1.journal = r1.journal.rstrip(".")  # Should parser do this?
            r1.medline_id = ""  # Missing in UniProt XML? TODO - check
            # Lots of extra comments in UniProt XML
            r1.comment = ""
            r2.comment = ""
            if not r2.journal:
                r1.journal = ""
            compare_reference(r1, r2)

    def compare_txt_xml(self, old, new):
        """Check a "swiss" record (old) against its "uniprot-xml" twin (new).

        Compares id, name, length and sequence, then every annotation key the
        two records share, then the feature locations.

        Raises TypeError if a shared annotation has different types in the two
        records, and ValueError if its values differ in a way that is not
        explicitly tolerated; other mismatches fail through the usual
        unittest assertions.
        """
        self.assertEqual(old.id, new.id)
        self.assertEqual(old.name, new.name)
        self.assertEqual(len(old), len(new))
        self.assertEqual(str(old.seq), str(new.seq))

        # Only keys present in both records are comparable. Sorting makes the
        # first reported mismatch deterministic from run to run.
        shared_keys = sorted(set(old.annotations).intersection(new.annotations))
        for key in shared_keys:
            old_value = old.annotations[key]
            new_value = new.annotations[key]
            if key == "references":
                self._compare_references(old_value, new_value)
                continue
            if old_value == new_value:
                continue
            if key in ["date"]:
                # TODO - Why is this a list vs str?
                continue
            if type(old_value) is not type(new_value):
                raise TypeError("%s gives %s vs %s"
                                % (key, old_value, new_value))
            if key in ["organism"]:
                # Exact equality was handled above; the text record may carry
                # extra trailing text after the name given in the XML record.
                if old_value.startswith(new_value + " "):
                    continue
                raise ValueError("%s gives %s vs %s"
                                 % (key, old_value, new_value))
            if isinstance(old_value, list) \
                    and sorted(old_value) == sorted(new_value):
                # Same entries, different order.
                continue
            raise ValueError("%s gives %s vs %s"
                             % (key, old_value, new_value))

        self.assertEqual(len(old.features), len(new.features),
                         "Features in %s, %i vs %i" %
                         (old.id, len(old.features), len(new.features)))
        for f1, f2 in zip(old.features, new.features):
            # Comparing the repr covers the position types as well as the
            # coordinates (a looser nofuzzy_start / nofuzzy_end comparison
            # was previously disabled here).
            self.assertEqual(repr(f1.location), repr(f2.location),
                             "%s %s vs %s %s" %
                             (f1.location, f1.type, f2.location, f2.type))

    def test_Q13639(self):
        """Compare SwissProt text and uniprot XML versions of Q13639."""
        old = SeqIO.read("SwissProt/Q13639.txt", "swiss")
        new = SeqIO.read("SwissProt/Q13639.xml", "uniprot-xml")
        self.compare_txt_xml(old, new)

    def test_multi_ex(self):
        """Compare SwissProt text and uniprot XML versions of several examples."""
        txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
        xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
        fas_list = list(SeqIO.parse("SwissProt/multi_ex.fasta", "fasta"))
        ids = self._read_id_list("SwissProt/multi_ex.list")
        self.assertEqual(len(txt_list), len(ids))
        self.assertEqual(len(txt_list), len(fas_list))
        self.assertEqual(len(txt_list), len(xml_list))
        for txt_rec, xml_rec, fas_rec, expected_id in zip(txt_list, xml_list,
                                                          fas_list, ids):
            self.assertEqual(txt_rec.id, expected_id)
            # The FASTA identifier is pipe-separated and must contain the id.
            self.assertTrue(txt_rec.id in fas_rec.id.split("|"))
            self.assertEqual(str(txt_rec.seq), str(fas_rec.seq))
            self.compare_txt_xml(txt_rec, xml_rec)

    def test_multi_ex_index(self):
        """Index SwissProt text and uniprot XML versions of several examples."""
        txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
        xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
        ids = self._read_id_list("SwissProt/multi_ex.list")
        txt_index = SeqIO.index("SwissProt/multi_ex.txt", "swiss")
        xml_index = SeqIO.index("SwissProt/multi_ex.xml", "uniprot-xml")
        self.assertEqual(sorted(txt_index), sorted(ids))
        self.assertEqual(sorted(xml_index), sorted(ids))
        # Check SeqIO.parse() versus SeqIO.index() for plain text "swiss"
        for old in txt_list:
            new = txt_index[old.id]
            compare_record(old, new)
        # Check SeqIO.parse() versus SeqIO.index() for XML "uniprot-xml"
        for old in xml_list:
            new = xml_index[old.id]
            compare_record(old, new)
if __name__ == "__main__":
    # When executed as a script, run the tests through a text runner with
    # verbosity level 2.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))
|