
|
# Copyright 2006-2014 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for Bio.AlignIO.ClustalIO"""
import unittest
from Bio._py3k import StringIO
from Bio.AlignIO.ClustalIO import ClustalIterator, ClustalWriter
# Fixture: a two-sequence alignment with a "CLUSTAL W (1.81)" header line.
# This is a truncated version of the example in Tests/cw02.aln
# Notice the inclusion of sequence numbers (right hand side) at the end of
# each sequence line.
aln_example1 = \
"""CLUSTAL W (1.81) multiple sequence alignment
gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
* *: :: :. :* : :. : . :* :: .
gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
: ** **:... *.*** ..
gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
.:* * *: .* :* : :* .*
gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
*::. . .:: :*..* :* .* .. . : . :
gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
*. .:: : .
""" # noqa for pep8 W291 trailing whitespace
# Fixture: a nine-row alignment with a "CLUSTAL X (1.83)" header line.
# This example is a truncated version of the dataset used here:
# http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm
# with the last record (HISJ_E_COLI) repeated twice (deliberate torture test)
aln_example2 = \
"""CLUSTAL X (1.83) multiple sequence alignment
V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
: . : :.
V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
** .: *::::. : :. . ..:
V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
*.: . * . * *: :
""" # noqa for pep8 W291 trailing whitespace
# Fixture: a three-sequence alignment (Test1seq, AT3G20900.1-SEQ and
# AT3G20900.1-CDS) whose header line reads "CLUSTAL 2.0.9", i.e. a version
# string with three dotted components and no "W"/"X" program letter.
aln_example3 = \
"""CLUSTAL 2.0.9 multiple sequence alignment
Test1seq ------------------------------------------------------------
AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT
AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA
AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA
AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
AT3G20900.1-CDS ------------------------------------------------------------
Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
AT3G20900.1-CDS ------------------------------------------------------ATGAAC
*
Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT
* *** ***** * * ** ****************************
Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
******* ** * **** ***************************************
Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT
AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
**************************************** *******************
Test1seq GCTGGGGATGGAGAGGGAACAGAGTT-
AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG
AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG
*************************
""" # noqa for pep8 W291 trailing whitespace
# Fixture: a minimal two-sequence alignment whose header line starts with
# "Kalign (2.0) alignment in ClustalW format" instead of "CLUSTAL".
aln_example4 = \
"""Kalign (2.0) alignment in ClustalW format
Test1seq GCTGGGGATGGAGAGGGAACAGAGTT-
AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG
"""
class TestClustalIO(unittest.TestCase):
    """Exercise ClustalIterator parsing and ClustalWriter round trips."""

    def test_one(self):
        """Parse the CLUSTAL W 1.81 example; check ids and the first sequence."""
        alignments = list(ClustalIterator(StringIO(aln_example1)))
        self.assertEqual(1, len(alignments))
        self.assertEqual(alignments[0]._version, "1.81")
        alignment = alignments[0]
        self.assertEqual(2, len(alignment))
        self.assertEqual(alignment[0].id, "gi|4959044|gb|AAD34209.1|AF069")
        self.assertEqual(alignment[1].id, "gi|671626|emb|CAA85685.1|")
        self.assertEqual(str(alignment[0].seq),
                         "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN"
                         "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW"
                         "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT"
                         "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE"
                         "VPTTRAQRRA")

    def test_two(self):
        """Parse the CLUSTAL X 1.83 example; check the duplicated last row."""
        alignments = list(ClustalIterator(StringIO(aln_example2)))
        self.assertEqual(1, len(alignments))
        self.assertEqual(alignments[0]._version, "1.83")
        alignment = alignments[0]
        self.assertEqual(9, len(alignment))
        self.assertEqual(alignment[-1].id, "HISJ_E_COLI")
        self.assertEqual(str(alignment[-1].seq),
                         "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG"
                         "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS"
                         "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV")

    def test_cat_one_two(self):
        """Parse two concatenated examples as two separate alignments."""
        alignments = list(ClustalIterator(StringIO(aln_example2 + aln_example1)))
        self.assertEqual(2, len(alignments))
        self.assertEqual(9, len(alignments[0]))
        self.assertEqual(137, alignments[0].get_alignment_length())
        self.assertEqual(2, len(alignments[1]))
        self.assertEqual(210, alignments[1].get_alignment_length())

    def test_empy(self):
        """Checking empty file."""
        self.assertEqual(0, len(list(ClustalIterator(StringIO("")))))

    def test_write_read(self):
        """Checking write/read of several alignments through one handle."""
        alignments = list(ClustalIterator(StringIO(aln_example1))) \
            + list(ClustalIterator(StringIO(aln_example2))) * 2
        handle = StringIO()
        self.assertEqual(3, ClustalWriter(handle).write_file(alignments))
        handle.seek(0)
        round_tripped = list(ClustalIterator(handle))
        # Compare the counts first: if the reader yielded nothing, a bare loop
        # over its output would run zero assertions and pass vacuously.
        self.assertEqual(len(alignments), len(round_tripped))
        for original, parsed in zip(alignments, round_tripped):
            self.assertEqual(parsed.get_alignment_length(),
                             original.get_alignment_length())

    def test_write_read_single(self):
        """Testing write/read when there is only one sequence."""
        alignment = next(ClustalIterator(StringIO(aln_example1)))
        # Now take just the first row as a new alignment:
        alignment = alignment[0:1]
        handle = StringIO()
        ClustalWriter(handle).write_file([alignment])
        handle.seek(0)
        round_tripped = list(ClustalIterator(handle))
        # Exactly one alignment was written, so exactly one must come back;
        # this also guarantees the checks below actually run.
        self.assertEqual(1, len(round_tripped))
        parsed = round_tripped[0]
        self.assertEqual(parsed.get_alignment_length(),
                         alignment.get_alignment_length())
        self.assertEqual(len(parsed), 1)

    def test_three(self):
        """Parse the CLUSTAL 2.0.9 example and check the header version."""
        alignments = list(ClustalIterator(StringIO(aln_example3)))
        self.assertEqual(1, len(alignments))
        self.assertEqual(alignments[0]._version, "2.0.9")

    def test_kalign_header(self):
        """Make sure we can parse the Kalign header."""
        # next() returns a single alignment; its length is the number of rows.
        alignment = next(ClustalIterator(StringIO(aln_example4)))
        self.assertEqual(2, len(alignment))
# Run the tests with verbose output when this file is executed as a script.
if __name__ == "__main__":
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))
|