#!/usr/bin/env python
# Copyright 2010 by Andrea Pierleoni
# Revisions copyright 2010-2015 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Test for the Uniprot parser on Uniprot XML files."""
import os
import unittest

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Left as None if the import within UniProtIO fails
if SeqIO.UniprotIO.ElementTree is None:
    from Bio import MissingPythonDependencyError
    raise MissingPythonDependencyError("No ElementTree module was found. "
                                       "Use Python 2.5+, lxml or elementtree "
                                       "if you want to use "
                                       "Bio.SeqIO.UniprotIO.")

from seq_tests_common import compare_reference, compare_record


class TestUniprot(unittest.TestCase):

    def test_uni001(self):
        "Parsing Uniprot file uni001"
        filename = 'uni001'
        # test the record parser

        datafile = os.path.join('SwissProt', filename)

        with open(datafile) as test_handle:
            seq_record = SeqIO.read(test_handle, "uniprot-xml")

        self.assertTrue(isinstance(seq_record, SeqRecord))

        # test a couple of things on the record -- this is not exhaustive
        self.assertEqual(seq_record.id, "Q91G55")
        self.assertEqual(seq_record.name, "043L_IIV6")
        self.assertEqual(seq_record.description, "Uncharacterized protein 043L")
        self.assertEqual(repr(seq_record.seq), "Seq('MDLINNKLNIEIQKFCLDLEKKYNINYNNLIDLWFNKESTERLIKCEVNLENKI...IPI', ProteinAlphabet())")

        # self.assertEqual(seq_record.accessions, ['Q91G55']) #seq_record.accessions does not exist
        # self.assertEqual(seq_record.organism_classification, ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Mammalia', 'Eutheria', 'Primates', 'Catarrhini', 'Hominidae', 'Homo'])
        # self.assertEqual(record.seqinfo, (348, 39676, '75818910'))

        self.assertEqual(len(seq_record.features), 1)
        self.assertEqual(repr(seq_record.features[0]), "SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(116)), type='chain', id='PRO_0000377969')")

        self.assertEqual(len(seq_record.annotations['references']), 2)
        self.assertEqual(seq_record.annotations['references'][0].authors, 'Jakob N.J., Mueller K., Bahr U., Darai G.')
        self.assertEqual(seq_record.annotations['references'][0].title, 'Analysis of the first complete DNA sequence of an invertebrate iridovirus: coding strategy of the genome of Chilo iridescent virus.')
        self.assertEqual(seq_record.annotations['references'][0].journal, 'Virology 286:182-196(2001)')
        self.assertEqual(seq_record.annotations['references'][0].comment, 'journal article | 2001 | Scope: NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA] | ')

        self.assertEqual(len(seq_record.dbxrefs), 11)
        self.assertEqual(seq_record.dbxrefs[0], 'DOI:10.1006/viro.2001.0963')

        self.assertEqual(seq_record.annotations['sequence_length'], 116)
        self.assertEqual(seq_record.annotations['sequence_checksum'], '4A29B35FB716523C')
        self.assertEqual(seq_record.annotations['modified'], '2009-07-07')
        self.assertEqual(seq_record.annotations['accessions'], ['Q91G55'])
        self.assertEqual(seq_record.annotations['taxonomy'], ['Viruses', 'dsDNA viruses, no RNA stage', 'Iridoviridae', 'Iridovirus'])
        self.assertEqual(seq_record.annotations['sequence_mass'], 13673)
        self.assertEqual(seq_record.annotations['dataset'], 'Swiss-Prot')
        self.assertEqual(seq_record.annotations['gene_name_ORF'], ['IIV6-043L'])
        self.assertEqual(seq_record.annotations['version'], 21)
        self.assertEqual(seq_record.annotations['sequence_modified'], '2001-12-01')
        self.assertEqual(seq_record.annotations['keywords'], ['Complete proteome', 'Virus reference strain'])
        self.assertEqual(seq_record.annotations['organism_host'], ['Acheta domesticus', 'House cricket', 'Chilo suppressalis', 'striped riceborer', 'Gryllus bimaculatus', 'Two-spotted cricket', 'Gryllus campestris', 'Spodoptera frugiperda', 'Fall armyworm'])
        self.assertEqual(seq_record.annotations['created'], '2009-06-16')
        self.assertEqual(seq_record.annotations['organism_name'], ['Chilo iridescent virus'])
        self.assertEqual(seq_record.annotations['organism'], 'Invertebrate iridescent virus 6 (IIV-6)')
        self.assertEqual(seq_record.annotations['recommendedName_fullName'], ['Uncharacterized protein 043L'])
        self.assertEqual(seq_record.annotations['sequence_version'], 1)
        self.assertEqual(seq_record.annotations['proteinExistence'], ['Predicted'])

    def test_uni003(self):
        "Parsing Uniprot file uni003"
        filename = 'uni003'
        # test the record parser

        datafile = os.path.join('SwissProt', filename)

        test_handle = open(datafile)
        seq_record = SeqIO.read(test_handle, "uniprot-xml")
        test_handle.close()

        self.assertTrue(isinstance(seq_record, SeqRecord))

        # test general record entries
        self.assertEqual(seq_record.id, "O44185")
        self.assertEqual(seq_record.name, "FLP13_CAEEL")
        self.assertEqual(seq_record.description,
                         "FMRFamide-like neuropeptides 13")
        self.assertEqual(repr(seq_record.seq),
                         "Seq('MMTSLLTISMFVVAIQAFDSSEIRMLDEQYDTKNPFFQF"
                         "LENSKRSDRPTRAMD...GRK', ProteinAlphabet())")

        self.assertEqual(len(seq_record.annotations['references']), 7)
        self.assertEqual(seq_record.annotations['references'][5].authors,
                         'Kim K., Li C.')
        self.assertEqual(seq_record.annotations['references'][5].title,
                         'Expression and regulation of an FMRFamide-related '
                         'neuropeptide gene family in Caenorhabditis elegans.')
        self.assertEqual(seq_record.annotations['references'][5].journal,
                         'J. Comp. Neurol. 475:540-550(2004)')
        self.assertEqual(seq_record.annotations['references'][5].comment,
                         'journal article | 2004 | Scope: TISSUE SPECIFICITY, '
                         'DEVELOPMENTAL STAGE | ')

        self.assertEqual(seq_record.annotations["accessions"], ['O44185'])
        self.assertEqual(seq_record.annotations["created"], "2004-05-10")
        self.assertEqual(seq_record.annotations["dataset"], "Swiss-Prot")
        self.assertEqual(seq_record.annotations["gene_name_ORF"], ['F33D4.3'])
        self.assertEqual(seq_record.annotations["gene_name_primary"], "flp-13")
        self.assertEqual(seq_record.annotations["keywords"],
                         ['Amidation', 'Cleavage on pair of basic residues',
                          'Complete proteome', 'Direct protein sequencing',
                          'Neuropeptide', 'Reference proteome', 'Repeat',
                          'Secreted', 'Signal'])
        self.assertEqual(seq_record.annotations["modified"], "2012-11-28")
        self.assertEqual(seq_record.annotations["organism"],
                         "Caenorhabditis elegans")
        self.assertEqual(seq_record.annotations["proteinExistence"],
                         ['evidence at protein level'])
        self.assertEqual(seq_record.annotations["recommendedName_fullName"],
                         ['FMRFamide-like neuropeptides 13'])
        self.assertEqual(seq_record.annotations["sequence_length"], 160)
        self.assertEqual(seq_record.annotations["sequence_checksum"],
                         "BE4C24E9B85FCD11")
        self.assertEqual(seq_record.annotations["sequence_mass"], 17736)
        self.assertEqual(seq_record.annotations["sequence_modified"], "1998-06-01")
        self.assertEqual(seq_record.annotations["sequence_precursor"], "true")
        self.assertEqual(seq_record.annotations["sequence_version"], 1)
        self.assertEqual(seq_record.annotations["taxonomy"],
                         ['Eukaryota', 'Metazoa', 'Ecdysozoa', 'Nematoda',
                          'Chromadorea', 'Rhabditida', 'Rhabditoidea', 'Rhabditidae',
                          'Peloderinae', 'Caenorhabditis'])
        self.assertEqual(seq_record.annotations["type"],
                         ['ECO:0000006', 'ECO:0000001'])
        self.assertEqual(seq_record.annotations["version"], 74)

        # test comment entries
        self.assertEqual(seq_record.annotations["comment_allergen"],
                         ['Causes an allergic reaction in human.'])
        self.assertEqual(seq_record.annotations["comment_alternativeproducts_isoform"],
                         ['Q8W1X2-1', 'Q8W1X2-2'])
        self.assertEqual(seq_record.annotations["comment_biotechnology"],
                         ['Green fluorescent protein has been engineered to '
                          'produce a vast number of variously colored '
                          'mutants, fusion proteins, and biosensors. '
                          'Fluorescent proteins and its mutated allelic '
                          'forms, blue, cyan and yellow have become a useful '
                          'and ubiquitous tool for making chimeric proteins, '
                          'where they function as a fluorescent protein tag. '
                          'Typically they tolerate N- and C-terminal fusion '
                          'to a broad variety of proteins. They have been '
                          'expressed in most known cell types and are used '
                          'as a noninvasive fluorescent marker in living '
                          'cells and organisms. They enable a wide range of '
                          'applications where they have functioned as a cell '
                          'lineage tracer, reporter of gene expression, or as '
                          'a measure of protein-protein interactions.',
                          'Can also be used as a molecular thermometer, '
                          'allowing accurate temperature measurements in '
                          'fluids. The measurement process relies on the '
                          'detection of the blinking of GFP using '
                          'fluorescence correlation spectroscopy.'])
        self.assertEqual(seq_record.annotations["comment_catalyticactivity"],
                         ['ATP + acetyl-CoA + HCO(3)(-) = ADP + phosphate '
                          '+ malonyl-CoA.',
                          'ATP + biotin-[carboxyl-carrier-protein] '
                          '+ CO(2) = ADP + phosphate + '
                          'carboxy-biotin-[carboxyl-carrier-protein].'])
        self.assertEqual(seq_record.annotations["comment_caution"],
                         ['Could be the product of a pseudogene. The '
                          'existence of a transcript at this locus is '
                          'supported by only one sequence submission '
                          '(PubMed:2174397).'])
        self.assertEqual(seq_record.annotations["comment_cofactor"],
                         ['Biotin (By similarity).', 'Binds 2 manganese ions '
                          'per subunit (By similarity).'])
        self.assertEqual(seq_record.annotations["comment_developmentalstage"],
                         ['Expressed from the comma stage of embryogenesis, '
                          'during all larval stages, and in low levels in '
                          'adults.'])
        self.assertEqual(seq_record.annotations["comment_disease"],
                         ['Defects in MC2R are the cause of glucocorticoid '
                          'deficiency type 1 (GCCD1) [MIM:202200]; also known '
                          'as familial glucocorticoid deficiency type 1 '
                          '(FGD1). GCCD1 is an autosomal recessive disorder '
                          'due to congenital insensitivity or resistance to '
                          'adrenocorticotropin (ACTH). It is characterized by '
                          'progressive primary adrenal insufficiency, without '
                          'mineralocorticoid deficiency.'])
        self.assertEqual(seq_record.annotations["comment_disruptionphenotype"],
                         ['Mice display impaired B-cell development which '
                          'does not progress pass the progenitor stage.'])
        self.assertEqual(seq_record.annotations["comment_domain"],
                         ['Two regions, an N-terminal (aa 96-107) and a '
                          'C-terminal (aa 274-311) are required for binding '
                          'FGF2.'])
        self.assertEqual(seq_record.annotations["comment_enzymeregulation"],
                         ['By phosphorylation. The catalytic activity is '
                          'inhibited by soraphen A, a polyketide isolated '
                          'from the myxobacterium Sorangium cellulosum and '
                          'a potent inhibitor of fungal growth.'])
        self.assertEqual(seq_record.annotations["comment_function"],
                         ['FMRFamides and FMRFamide-like peptides are '
                          'neuropeptides. AADGAPLIRF-amide and '
                          'APEASPFIRF-amide inhibit muscle tension in somatic '
                          'muscle. APEASPFIRF-amide is a potent inhibitor of '
                          'the activity of dissected pharyngeal myogenic '
                          'muscle system.'])
        self.assertEqual(seq_record.annotations["comment_induction"],
                         ['Repressed in presence of fatty acids. Repressed '
                          '3-fold by ''lipid precursors, inositol and '
                          'choline, and also controlled by regulatory '
                          'factors INO2, INO4 and OPI1.'])
        self.assertEqual(seq_record.annotations["comment_interaction_intactId"],
                         ['EBI-356720', 'EBI-746969', 'EBI-720116'])
        self.assertEqual(seq_record.annotations["comment_massspectrometry"],
                         ['88..98:1032|MALDI', '100..110:1133.7|MALDI'])
        self.assertEqual(seq_record.annotations["comment_miscellaneous"],
                         ['Present with 20200 molecules/cell in log phase '
                          'SD medium.'])
        self.assertEqual(seq_record.annotations["comment_onlineinformation"],
                         ['NIEHS-SNPs@http://egp.gs.washington.edu/data/api5/'])
        self.assertEqual(seq_record.annotations["comment_pathway"],
                         ['Lipid metabolism; malonyl-CoA biosynthesis; '
                         'malonyl-CoA from acetyl-CoA: step 1/1.'])
        self.assertEqual(seq_record.annotations["comment_RNAediting"],
                         ['Partially edited. RNA editing generates receptor '
                          'isoforms that differ in their ability to interact '
                          'with the phospholipase C signaling cascade in a '
                          'transfected cell line, suggesting that this RNA '
                          'processing event may contribute to the modulation '
                          'of serotonergic neurotransmission in the central '
                          'nervous system.'])
        self.assertEqual(seq_record.annotations["comment_PTM"],
                         ['Acetylation at Lys-251 impairs '
                          'antiapoptotic function.'])
        self.assertEqual(seq_record.annotations["comment_pharmaceutical"],
                         ['Could be used as a possible therapeutic agent for '
                          'treating rheumatoid arthritis.'])
        self.assertEqual(seq_record.annotations["comment_polymorphism"],
                         ['Position 23 is polymorphic; the frequencies in '
                          'unrelated Caucasians are 0.87 for Cys and 0.13 '
                          'for Ser.'])
        self.assertEqual(seq_record.annotations["comment_similarity"],
                         ['Belongs to the FARP (FMRFamide related peptide) '
                          'family.'])
        self.assertEqual(seq_record.annotations["comment_subcellularlocation_location"],
                         ['Secreted'])
        self.assertEqual(seq_record.annotations["comment_subunit"],
                         ['Homodimer.'])
        self.assertEqual(seq_record.annotations["comment_tissuespecificity"],
                         ['Each flp gene is expressed in a distinct set of '
                          'neurons. Flp-13 is expressed in the ASE sensory '
                          'neurons, the DD motor neurons, the 15, M3 and M5 '
                          'cholinergic pharyngeal motoneurons, and the ASG, '
                          'ASK and BAG neurons.'])
        self.assertEqual(seq_record.annotations["comment_toxicdose"],
                         ['LD(50) is 50 ug/kg in mouse by '
                          'intracerebroventricular injection '
                          'and 600 ng/g in Blatella germanica.'])

    def test_sp016(self):
        "Parsing SwissProt file sp016"
        filename = 'sp016'
        # test the record parser

        datafile = os.path.join('SwissProt', filename)

        test_handle = open(datafile)
        seq_record = SeqIO.read(test_handle, "swiss")
        test_handle.close()

        self.assertTrue(isinstance(seq_record, SeqRecord))

        # test ProteinExistence (the numerical value describing the evidence for the existence of the protein)
        self.assertEqual(seq_record.annotations['protein_existence'], 1)
        # test Sequence version
        self.assertEqual(seq_record.annotations['sequence_version'], 1)
        # test Entry version
        self.assertEqual(seq_record.annotations['entry_version'], 93)

    def test_sp002(self):
        "Parsing SwissProt file sp002"
        filename = 'sp002'
        # test the record parser

        datafile = os.path.join('SwissProt', filename)

        test_handle = open(datafile)
        seq_record = SeqIO.read(test_handle, "swiss")
        test_handle.close()

        self.assertTrue(isinstance(seq_record, SeqRecord))

        # test Sequence version
        self.assertEqual(seq_record.annotations['sequence_version'], 34)
        # test Entry version
        self.assertEqual(seq_record.annotations['entry_version'], 36)

    def compare_txt_xml(self, old, new):
        self.assertEqual(old.id, new.id)
        self.assertEqual(old.name, new.name)
        self.assertEqual(len(old), len(new))
        self.assertEqual(str(old.seq), str(new.seq))
        for key in set(old.annotations).intersection(new.annotations):
            if key == "references":
                self.assertEqual(len(old.annotations[key]),
                                 len(new.annotations[key]))
                for r1, r2 in zip(old.annotations[key], new.annotations[key]):
                    # Tweak for line breaks in plain text SwissProt
                    r1.title = r1.title.replace("- ", "-")
                    r2.title = r2.title.replace("- ", "-")
                    r1.journal = r1.journal.rstrip(".")  # Should parser do this?
                    r1.medline_id = ""  # Missing in UniPort XML? TODO - check
                    # Lots of extra comments in UniProt XML
                    r1.comment = ""
                    r2.comment = ""
                    if not r2.journal:
                        r1.journal = ""
                    compare_reference(r1, r2)
            elif old.annotations[key] == new.annotations[key]:
                pass
            elif key in ["date"]:
                # TODO - Why is this a list vs str?
                pass
            elif not isinstance(old.annotations[key], type(new.annotations[key])):
                raise TypeError("%s gives %s vs %s" %
                                (key, old.annotations[key], new.annotations[key]))
            elif key in ["organism"]:
                if old.annotations[key] == new.annotations[key]:
                    pass
                elif old.annotations[key].startswith(new.annotations[key] + " "):
                    pass
                else:
                    raise ValueError(key)
            elif isinstance(old.annotations[key], list) \
            and sorted(old.annotations[key]) == sorted(new.annotations[key]):
                pass
            else:
                raise ValueError("%s gives %s vs %s" %
                                 (key, old.annotations[key], new.annotations[key]))
        self.assertEqual(len(old.features), len(new.features),
                         "Features in %s, %i vs %i" %
                         (old.id, len(old.features), len(new.features)))
        for f1, f2 in zip(old.features, new.features):
            """
            self.assertEqual(f1.location.nofuzzy_start, f2.location.nofuzzy_start,
                             "%s %s vs %s %s" %
                             (f1.location, f1.type, f2.location, f2.type))
            self.assertEqual(f1.location.nofuzzy_end, f2.location.nofuzzy_end,
                             "%s %s vs %s %s" %
                             (f1.location, f1.type, f2.location, f2.type))
            """
            self.assertEqual(repr(f1.location), repr(f2.location),
                             "%s %s vs %s %s" %
                             (f1.location, f1.type, f2.location, f2.type))

    def test_Q13639(self):
        """Compare SwissProt text and uniprot XML versions of Q13639."""
        old = SeqIO.read("SwissProt/Q13639.txt", "swiss")
        new = SeqIO.read("SwissProt/Q13639.xml", "uniprot-xml")
        self.compare_txt_xml(old, new)

    def test_H2CNN8(self):
        """Compare SwissProt text and uniprot XML versions of H2CNN8."""
        old = SeqIO.read("SwissProt/H2CNN8.txt", "swiss")
        new = SeqIO.read("SwissProt/H2CNN8.xml", "uniprot-xml")
        self.compare_txt_xml(old, new)

    def test_F2CXE6(self):
        """Compare SwissProt text and uniprot XML versions of F2CXE6."""
        # This evil record has a semi-colon in the genem name,
        # GN   Name=HvPIP2;8 {ECO:0000313|EMBL:BAN04711.1};
        # <gene><name type="primary" evidence="3">HvPIP2;8</name></gene>
        old = SeqIO.read("SwissProt/F2CXE6.txt", "swiss")
        new = SeqIO.read("SwissProt/F2CXE6.xml", "uniprot-xml")
        self.compare_txt_xml(old, new)
        # TODO - Why the mismatch gene_name vs gene_name_primary?
        # TODO - Handle evidence codes on GN line (see GitHub isse #416)
        self.assertEqual(old.annotations["gene_name"], 'Name=HvPIP2;8 {ECO:0000313|EMBL:BAN04711.1};')
        self.assertEqual(new.annotations["gene_name_primary"], 'HvPIP2;8')
        self.assertEqual(old.name, 'F2CXE6_HORVD')
        self.assertEqual(new.name, 'F2CXE6_HORVD')

    def test_P84001(self):
        """Parse mass spec structured comment with unknown loc"""
        xml = list(SeqIO.parse("SwissProt/P84001.xml", "uniprot-xml"))[0]
        self.assertEqual(xml.id, 'P84001')
        self.assertEqual(len(xml.annotations['comment_massspectrometry']), 1)
        self.assertEqual(xml.annotations['comment_massspectrometry'][0], 'undefined:9571|Electrospray')

    def test_multi_ex(self):
        """Compare SwissProt text and uniprot XML versions of several examples."""
        txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
        xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
        fas_list = list(SeqIO.parse("SwissProt/multi_ex.fasta", "fasta"))
        with open("SwissProt/multi_ex.list") as handle:
            ids = [x.strip() for x in handle]
        self.assertEqual(len(txt_list), len(ids))
        self.assertEqual(len(txt_list), len(fas_list))
        self.assertEqual(len(txt_list), len(xml_list))
        for txt, xml, fas, id in zip(txt_list, xml_list, fas_list, ids):
            self.assertEqual(txt.id, id)
            self.assertIn(txt.id, fas.id.split("|"))
            self.assertEqual(str(txt.seq), str(fas.seq))
            self.compare_txt_xml(txt, xml)

    def test_multi_ex_index(self):
        """Index SwissProt text and uniprot XML versions of several examples."""
        txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
        xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
        with open("SwissProt/multi_ex.list") as handle:
            ids = [x.strip() for x in handle]
        txt_index = SeqIO.index("SwissProt/multi_ex.txt", "swiss")
        xml_index = SeqIO.index("SwissProt/multi_ex.xml", "uniprot-xml")
        self.assertEqual(sorted(txt_index), sorted(ids))
        self.assertEqual(sorted(xml_index), sorted(ids))
        # Check SeqIO.parse() versus SeqIO.index() for plain text "swiss"
        for old in txt_list:
            new = txt_index[old.id]
            compare_record(old, new)
        # Check SeqIO.parse() versus SeqIO.index() for XML "uniprot-xml"
        for old in xml_list:
            new = xml_index[old.id]
            compare_record(old, new)
        txt_index.close()
        xml_index.close()

    def test_submittedName_allowed(self):
        """Checks if parser supports new XML Element (submittedName)."""
        for entry in SeqIO.parse(open("SwissProt/R5HY77.xml"), "uniprot-xml"):
            self.assertEqual(entry.id, "R5HY77")
            self.assertEqual(entry.description, "Elongation factor Ts")


if __name__ == "__main__":
    runner = unittest.TextTestRunner(verbosity=2)
    unittest.main(testRunner=runner)