# Copyright 2006-2014 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Tests for Bio.AlignIO.ClustalIO"""

import unittest

from Bio._py3k import StringIO

from Bio.AlignIO.ClustalIO import ClustalIterator, ClustalWriter

# This is a truncated version of the example in Tests/cw02.aln
# Notice the inclusion of sequence numbers (right hand side)
aln_example1 = \
"""CLUSTAL W (1.81) multiple sequence alignment


gi|4959044|gb|AAD34209.1|AF069      MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
gi|671626|emb|CAA85685.1|           ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
                                              * *: ::    :.   :*  :  :. : . :*  ::   .

gi|4959044|gb|AAD34209.1|AF069      LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
gi|671626|emb|CAA85685.1|           VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
                                    :   **                  **:...   *.*** ..         

gi|4959044|gb|AAD34209.1|AF069      LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
gi|671626|emb|CAA85685.1|           WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
                                     .:*   * *: .* :*        : :* .*                  

gi|4959044|gb|AAD34209.1|AF069      SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
gi|671626|emb|CAA85685.1|           -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
                                     *::.  .    .:: :*..*  :* .*   .. .  :    .  :    

gi|4959044|gb|AAD34209.1|AF069      VPTTRAQRRA 210
gi|671626|emb|CAA85685.1|           VAYVKTFQGP 151
                                    *. .:: : .
"""  # noqa for pep8 W291 trailing whitespace

# This example is a truncated version of the dataset used here:
# http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm
# with the last record repeated twice (deliberate toture test)
aln_example2 = \
"""CLUSTAL X (1.83) multiple sequence alignment


V_Harveyi_PATH                 --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
B_subtilis_YXEM                MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
B_subtilis_GlnH_homo_YCKK      MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
YA80_HAEIN                     MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
FLIY_ECOLI                     MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
E_coli_GlnH                    --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
Deinococcus_radiodurans        -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
HISJ_E_COLI                    MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
HISJ_E_COLI                    MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
                                         : .                                 : :.

V_Harveyi_PATH                 MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
B_subtilis_YXEM                ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
B_subtilis_GlnH_homo_YCKK      TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
YA80_HAEIN                     TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
FLIY_ECOLI                     LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
E_coli_GlnH                    TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
Deinococcus_radiodurans        MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
HISJ_E_COLI                    TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
HISJ_E_COLI                    TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
                                     **       .:  *::::.   : :.   .        ..:   

V_Harveyi_PATH                 LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
B_subtilis_YXEM                LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
B_subtilis_GlnH_homo_YCKK      LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
YA80_HAEIN                     LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
FLIY_ECOLI                     LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
E_coli_GlnH                    LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
Deinococcus_radiodurans        LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
HISJ_E_COLI                    LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
HISJ_E_COLI                    LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
                               *.: . *        .  *     *:          :

"""  # noqa for pep8 W291 trailing whitespace


aln_example3 = \
"""CLUSTAL 2.0.9 multiple sequence alignment


Test1seq             ------------------------------------------------------------
AT3G20900.1-SEQ      ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC
AT3G20900.1-CDS      ------------------------------------------------------------
                                                                                 

Test1seq             -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT
AT3G20900.1-SEQ      ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT
AT3G20900.1-CDS      ------------------------------------------------------------
                                                                                 

Test1seq             ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
AT3G20900.1-SEQ      ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
AT3G20900.1-CDS      ------------------------------------------------------------
                                                                                 

Test1seq             AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA
AT3G20900.1-SEQ      AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA
AT3G20900.1-CDS      ------------------------------------------------------------
                                                                                 

Test1seq             CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA
AT3G20900.1-SEQ      CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA
AT3G20900.1-CDS      ------------------------------------------------------------
                                                                                 

Test1seq             AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
AT3G20900.1-SEQ      CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
AT3G20900.1-CDS      ------------------------------------------------------------
                                                                                 

Test1seq             GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
AT3G20900.1-SEQ      GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
AT3G20900.1-CDS      ------------------------------------------------------------
                                                                                 

Test1seq             CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
AT3G20900.1-SEQ      CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
AT3G20900.1-CDS      ------------------------------------------------------ATGAAC
                                                                             *   

Test1seq             TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
AT3G20900.1-SEQ      GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
AT3G20900.1-CDS      AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT
                         *  *** ***** *   *  **      ****************************

Test1seq             AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
AT3G20900.1-SEQ      AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
AT3G20900.1-CDS      AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
                     ******* **   * ****  ***************************************

Test1seq             GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT
AT3G20900.1-SEQ      GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
AT3G20900.1-CDS      GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
                     **************************************** *******************

Test1seq             GCTGGGGATGGAGAGGGAACAGAGTT-
AT3G20900.1-SEQ      GCTGGGGATGGAGAGGGAACAGAGTAG
AT3G20900.1-CDS      GCTGGGGATGGAGAGGGAACAGAGTAG
                     *************************  
"""  # noqa for pep8 W291 trailing whitespace

aln_example4 = \
"""Kalign (2.0) alignment in ClustalW format

Test1seq             GCTGGGGATGGAGAGGGAACAGAGTT-
AT3G20900.1-SEQ      GCTGGGGATGGAGAGGGAACAGAGTAG

"""


class TestClustalIO(unittest.TestCase):

    def test_one(self):
        alignments = list(ClustalIterator(StringIO(aln_example1)))
        self.assertEqual(1, len(alignments))
        self.assertEqual(alignments[0]._version, "1.81")
        alignment = alignments[0]
        self.assertEqual(2, len(alignment))
        self.assertEqual(alignment[0].id, "gi|4959044|gb|AAD34209.1|AF069")
        self.assertEqual(alignment[1].id, "gi|671626|emb|CAA85685.1|")
        self.assertEqual(str(alignment[0].seq),
                         "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN"
                         "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW"
                         "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT"
                         "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE"
                         "VPTTRAQRRA")

    def test_two(self):
        alignments = list(ClustalIterator(StringIO(aln_example2)))
        self.assertEqual(1, len(alignments))
        self.assertEqual(alignments[0]._version, "1.83")
        alignment = alignments[0]
        self.assertEqual(9, len(alignment))
        self.assertEqual(alignment[-1].id, "HISJ_E_COLI")
        self.assertEqual(str(alignment[-1].seq),
                         "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG"
                         "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS"
                         "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV")

    def test_cat_one_two(self):
        alignments = list(ClustalIterator(StringIO(aln_example2 + aln_example1)))
        self.assertEqual(2, len(alignments))
        self.assertEqual(9, len(alignments[0]))
        self.assertEqual(137, alignments[0].get_alignment_length())
        self.assertEqual(2, len(alignments[1]))
        self.assertEqual(210, alignments[1].get_alignment_length())

    def test_empy(self):
        """Checking empty file."""
        self.assertEqual(0, len(list(ClustalIterator(StringIO("")))))

    def test_write_read(self):
        """Checking write/read."""
        alignments = list(ClustalIterator(StringIO(aln_example1))) \
                   + list(ClustalIterator(StringIO(aln_example2))) * 2
        handle = StringIO()
        self.assertEqual(3, ClustalWriter(handle).write_file(alignments))
        handle.seek(0)
        for i, a in enumerate(ClustalIterator(handle)):
            self.assertEqual(a.get_alignment_length(), alignments[i].get_alignment_length())

    def test_write_read_single(self):
        """Testing write/read when there is only one sequence."""
        alignment = next(ClustalIterator(StringIO(aln_example1)))
        # Now thae just the first row as a new alignment:
        alignment = alignment[0:1]
        handle = StringIO()
        ClustalWriter(handle).write_file([alignment])
        handle.seek(0)
        for i, a in enumerate(ClustalIterator(handle)):
            self.assertEqual(a.get_alignment_length(), alignment.get_alignment_length())
            self.assertEqual(len(a), 1)

    def test_three(self):
        alignments = list(ClustalIterator(StringIO(aln_example3)))
        self.assertEqual(1, len(alignments))
        self.assertEqual(alignments[0]._version, "2.0.9")

    def test_kalign_header(self):
        """Make sure we can parse the Kalign header."""
        alignments = next(ClustalIterator(StringIO(aln_example4)))
        self.assertEqual(2, len(alignments))

if __name__ == "__main__":
    runner = unittest.TextTestRunner(verbosity=2)
    unittest.main(testRunner=runner)
