#!/usr/bin/env python

from string import split, strip
from os import popen, remove
from glob import glob
from cogent.util.unit_test import TestCase, main
from cogent.parse.blast import QMEBlast9
from cogent.app.blast import seqs_to_stream,\
    make_subject_match_scorer, make_shotgun_scorer, keep_everything_scorer, \
    ids_from_seq_lower_threshold, PsiBlast, psiblast_n_neighbors

__author__ = "Micah Hamady"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Micah Hamady", "Rob Knight", "Catherine Lozupone"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Micah Hamady"
__email__ = "hamady@colorado.edu"
__status__ = "Prototype"

class BlastTests(TestCase):
    """Tests of top-level functions"""

    def setUp(self):
        """Define some standard data"""
        self.rec = """# BLASTP 2.2.10 [Oct-19-2004]
# Iteration: 1
# Query: ece:Z4181
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4181	ece:Z4181	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4181	ecs:ECs3717	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4181	cvi:CV2421	41.67	72	42	0	39	110	29	100	2e-06	52.8
# BLASTP 2.2.10 [Oct-19-2004]
# Iteration: 2
# Query: ece:Z4181
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4181	ece:Z4181	100.00	110	0	0	1	110	1	110	3e-54	 211
ece:Z4181	ecs:ECs3717	100.00	110	0	0	1	110	1	110	3e-54	 211
ece:Z4181	cvi:CV2421	41.67	72	42	0	39	110	29	100	2e-08	59.0
ece:Z4181	sfl:CP0138	33.98	103	57	2	8	110	6	97	6e-06	50.5
ece:Z4181	spt:SPA2730	37.50	72	45	0	39	110	30	101	1e-05	49.8
ece:Z4181	sec:SC2804	37.50	72	45	0	39	110	30	101	1e-05	49.8
ece:Z4181	stm:STM2872	37.50	72	45	0	39	110	30	101	1e-05	49.8
# BLASTP 2.2.10 [Oct-19-2004]
# Iteration: 1
# Query: ece:Z4182
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4182	ece:Z4182	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4182	ecs:ECs3718	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4182	cvi:CV2422	41.67	72	42	0	39	110	29	100	2e-06	52.8""".split('\n')

        self.rec2 = """# BLASTP 2.2.10 [Oct-19-2004]
# Iteration: 1
# Query: ece:Z4181
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4181	ece:Z4181	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4181	ecs:ECs3717	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4181	spt:SPA2730	37.50	72	45	0	39	110	30	101	1e-05	49.8
# BLASTP 2.2.10 [Oct-19-2004]
# Iteration: 2
# Query: ece:Z4181
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4181	ecs:ECs3717	100.00	110	0	0	1	110	1	110	3e-54	 211
ece:Z4181	cvi:CV2421	41.67	72	42	0	39	110	29	100	2e-08	59.0
# BLASTP 2.2.10 [Oct-19-2004]
# Iteration: 1
# Query: ece:Z4182
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4182	ece:Z4182	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4182	cvi:CV2421	41.67	72	42	0	39	110	29	100	2e-06	52.8""".split('\n')

        self.rec3 = """# BLASTP 2.2.10 [Oct-19-2004]
# BLASTP 2.2.10 [Oct-19-2004]
# Query: ece:Z4181
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4181	ece:Z4182	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4181	ecs:ECs3717	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4181	spt:SPA2730	37.50	72	45	0	39	110	30	101	1e-05	49.8
# BLASTP 2.2.10 [Oct-19-2004]
# Iteration: 1
# Query: ece:Z4182
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4182	ece:Z4182	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4182	cvi:CV2421	41.67	72	42	0	39	110	29	100	2e-06	52.8
# BLASTP 2.2.10 [Oct-19-2004]
# Query: ece:Z4183
# Database: db/everything.faa
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
ece:Z4183	ece:Z4182	100.00	110	0	0	1	110	1	110	3e-47	 187
ece:Z4183	ecs:ECs3717	100.00	110	0	0	1	110	1	110	3e-54	 211
ece:Z4183	cvi:CV2421	41.67	72	42	0	39	110	29	100	2e-08	59.0""".split('\n')

        self.query_1 = """>gi|100002553| Bd2556c Bd2556c two-component system sensor histidine kinase 3092017:3094158 reverse MW:81963
MRLKNRLNNWISIRMGMVIVIFLGVSCGSMRSSTPPPAKDRLTEIDSLERLLPDCPTIASTLPLLRRLAFLYQQQSEMKVYNERLYENAMAVDSISVAYLGLKNLAEYYYDQSVRDSLEYYCSLVDSIAKARHEYPNVLFDVKSLSSQDLLWLGNYELAMSEAMDLYRLASNLDHRYGLLRCSETLGLIYQRIRRDSDAVVSFQESLDLLKDIKDVPDIMDTKVRLTSYQLESSVRTKQYASTERILGQYMALLDEQYKIYQEKNDLLSIKREYWLLYSFYTSFYLSQGDLENAKRSLDQASSYADSNWVEGDYAINTYLTVKARYHKAAGDIPLALHCINEVLETERLPEDIQFKADILKEQGQLGEVMALYDELYSTLTKRRGTSFLRQVNQLRTLHELHEKELKETELKEAGQRIARKQDLLIFILSISVVLLILLYVLFLYYRHLRSLKNQLQREKELLLESQRQLIKEKTRAEEASLMKSAFLANMSHEVRTPLNAIVGFSGLLVEPSTDEEERKEYSSIIRNNTDLMLNLVNDVLDLSRMETGDLHFDIKDHLLLVCCQMALESVRHRIPDGVKLTFSPAGEPIVVHVDNLRLQQLLTNLLTNAAKFTEKGEINLSFQLEPDRKKVCIAVTDTGAGIPLEKQATIFNRFEKLDDYKPGVGLGLSICLLIAERLDGALFIDSSYTDGARFVLILSCEIDSSIYNPPIEV"""

        self.query_2 = """>gi|100002557| Bd2560c Bd2560c conserved hypothetical protein 3097971:3098210 reverse MW:8927
MGKNQLIHGNEFHLLKQAEIHKATGKLVESLNLAAGSTGGFDIYKVVEAYFTDLEKRKEINDLLGISEPCETRVTEECFS
"""

        self.fasta_recs = """>gi|100002550| Bd2553c Bd2553c conserved hypothetical protein 3090609:3091013 reverse MW:14682
MMDFISVPLVVGIVCAGIYGLFELFVRKRERLAIIEKIGDKLDTSAFDGKLGLPNYMRNFSFSSLKAGCLLAGIGLGLLVGFIINMCMATNSYYDDGWYRHEVAGTAYGASVLLFGGIGLIIAFVIELKLGKNNK
>gi|100002551| Bd2554 Bd2554 RNA polymerase ECF-type sigma factor 3091112:3091717 forward MW:23408
LLPQVVTYLPGLRPLSTMELYTDTYYIQRIQAGDVACFACLLDKYSRPIHSLILKVVRSQEEAEELAQDTFMKVFKNLASFKGDCSFSTWIYRIAYNTAISSVRKKRYEFLAIEETTLENVSEEEITNLFGQTESTEQVQRLEVALEQLLPDERALILLFYWKEKTIEELVSITGLTASNIKVKLHRIRKKLFVLLNGMDHE
>gi|100002552| Bd2555 Bd2555 conserved hypothetical protein 3091713:3092066 forward MW:13332
MSKINTNKEQPDLLGDLFKRIPEEELPASFRSNVMRQIMLESAKAKKRDERFSLLAAIVASLIMISLAIVSFVYMEIPKIAIPTISTSALAFYLYIGAITLILLLADYKLRNLFHKKG
>gi|100002553| Bd2556c Bd2556c two-component system sensor histidine kinase 3092017:3094158 reverse MW:81963
MRLKNRLNNWISIRMGMVIVIFLGVSCGSMRSSTPPPAKDRLTEIDSLERLLPDCPTIASTLPLLRRLAFLYQQQSEMKVYNERLYENAMAVDSISVAYLGLKNLAEYYYDQSVRDSLEYYCSLVDSIAKARHEYPNVLFDVKSLSSQDLLWLGNYELAMSEAMDLYRLASNLDHRYGLLRCSETLGLIYQRIRRDSDAVVSFQESLDLLKDIKDVPDIMDTKVRLTSYQLESSVRTKQYASTERILGQYMALLDEQYKIYQEKNDLLSIKREYWLLYSFYTSFYLSQGDLENAKRSLDQASSYADSNWVEGDYAINTYLTVKARYHKAAGDIPLALHCINEVLETERLPEDIQFKADILKEQGQLGEVMALYDELYSTLTKRRGTSFLRQVNQLRTLHELHEKELKETELKEAGQRIARKQDLLIFILSISVVLLILLYVLFLYYRHLRSLKNQLQREKELLLESQRQLIKEKTRAEEASLMKSAFLANMSHEVRTPLNAIVGFSGLLVEPSTDEEERKEYSSIIRNNTDLMLNLVNDVLDLSRMETGDLHFDIKDHLLLVCCQMALESVRHRIPDGVKLTFSPAGEPIVVHVDNLRLQQLLTNLLTNAAKFTEKGEINLSFQLEPDRKKVCIAVTDTGAGIPLEKQATIFNRFEKLDDYKPGVGLGLSICLLIAERLDGALFIDSSYTDGARFVLILSCEIDSSIYNPPIEV
>gi|100002554| Bd2557c Bd2557c two-component system sensor histidine kinase 3094158:3095507 reverse MW:51247
LERKYNGEGKIFPVKRHRCLMSCYYCELYTMKGNSGKAQAYLDQATAYLDSSFGDRVEAQYLRTKSFYYWKEKDYRHALSAVNLALKINRDLDKLEMKKAVLQSSGQLQEAVTIYEEIINKTETINTDAFDRQIEQLRVLNDLNDLEKQDRELKLKSEQEALKQKQIVVSIGLLLVLMGLLYMLWRIYMHTKRLRNELLQEKDSLTASEKQLRVVTKEAEAANKKKSAFIANISHEVRTPLNAIVGFSELLASSEYSEEEKIRFAGEVNHSSELLLNLVNDVLDLSRLESGKIKFSVKPNDLVACCQRALDSIRHRVKPGVRLTFTPSIESYTLNTDALRLQQLLTNLLSNAAKFTSEGEINLSFTVDEGKEEVCFSVTDTGCGIPEDKCEKIFERFEKLDDFIQGTGLGLSVCQIISEQLNGSLSVDISYKDGARFVFIHPTNLIETPI
>gi|100002555| Bd2558c Bd2558c hypothetical protein 3095527:3095985 reverse MW:17134
LRGKNIHLGRVGCNYGKLLIFIDIYFVSLRIVSDKSMSRGFLRKSSVNTFIGIVWILFAVGTSAQNAVSKFRADSIRQSLSRIQKPQDKIPLLKELIGLYWQLPEEVLALKEIIDIAMPLDSIGIVYDAMAGLSRYYPAIRTFVRVGGALETV
>gi|100002556| Bd2559 Bd2559 30S ribosomal protein S1 3096095:3097882 forward MW:67092
MENLKNIQPVEDFNWDAFEQGETYTEVSKDDLVKTYDETLNTVKDKEVVMGTVTSMNKREVVVNIGFKSDGVVPMSEFRYNPDLKIGDEVEVYIESQEDKKGQLILSHKKARATRSWDRVNEALEKDEIIKGYIKCRTKGGMIVDVFGIEAFLPGSQIDVKPIRDYDVFVGKTMEFKIVKINQEFKNVVVSHKALIEAELEQQKKDIISKLEKGQVLEGTVKNITSYGVFIDLGGVDGLIHITDLSWGRVSHPEEIVQLDQKINVVILDFDDEKKRIALGLKQLTPHPWDALDTNLKVGDKVKGKVVVMADYGAFIEIAPGVEGLIHVSEMSWTQHLRSAQDFMKVGDEIEAVILTLDRDERKMSLGIKQLKADPWENIEERFPVGSRHAAKVRNFTNFGVFVEIEEGVDGLIHISDLSWTKKIKHPSEFTQIGAEIEVQVLEIDKENRRLSLGHKQLEENPWDVFETIFTVGSIHEGTIIEVLDKGAVISLPYGVEGFATPKHLVKEDGSQAQVDEKLSFKVIEFNKEAKRIILSHSRIFEDEQKGAKATSEKKASSKRGGKKEEESGMVTGPVEKTTLGDIEELAALKEKLSGK
>gi|100002557| Bd2560c Bd2560c conserved hypothetical protein 3097971:3098210 reverse MW:8927
MGKNQLIHGNEFHLLKQAEIHKATGKLVESLNLAAGSTGGFDIYKVVEAYFTDLEKRKEINDLLGISEPCETRVTEECFS
>gi|100002558| Bd2561 Bd2561 phosphoglycolate phosphatase 3098389:3099033 forward MW:24182
MKKLVIFDLDGTLLNTIADLAHSTNHALRQNGFPTHDVKEYNFFVGNGINKLFERALPEGEKTAENILKVREEFLKHYDLHNTDRSVPYPGVPELLALLQERGIKLAVASNKYQAATRKLIAHFFPSIQFTEVLGQREGVKAKPDPSIVNEIVERASISKESTLYVGDSDVDMQTAINSEVTSCGVTWGFRPRTELEKYAPDHIAEKAEDILKFI
>gi|100002559| Bd2562 Bd2562 conserved hypothetical protein 3099382:3100299 forward MW:35872
MSGNIKKIVEPNSGIDYSLEKDFKIFTLSKELPITTYPSYIRLGIVIYCVKGNAKIDIYSNKHIITPKELIIILPGQLVALTDVSVDFQIRYFTITESFYSDILSGISRFSPHFFFYMRQHYYFKMEDVETLSFVDFFELLIRKAVDPENQYRRESVILLLRILFLDIYNHYKVNSLDSTATIDVHKKELTHKFFQLVMSNYKVNRSVTFYANSLCITPKYLTMVVKEVSGKSAKDWITEYMILELKGLLTNSTLNIQEIVEKTQFSNQSSLGRFFRRHTGLSPLQYRKKYLTTEQRTNFSKNNTI
"""

    def test_seqs_to_stream(self):
        """seqs_to_stream should iterate over seqs"""
        sts = seqs_to_stream
        self.assertEqual(list(sts('>a\nTG\n>b\nWW\n', \
            '_input_as_multiline_string')),\
            [['>a','TG'],['>b','WW']])
        #skipping test for file open
        self.assertEqual(list(sts(['TG','WW'], '_input_as_seqs')), \
            [['>0','TG'],['>1','WW']])
        self.assertEqual(list(sts(['>a','TG','>b','WW'], \
            '_input_as_lines')),\
            [['>a','TG'],['>b','WW']])
        self.assertRaises(TypeError, sts, 'abc', 'xyz')
   
    def test_make_subject_match_scorer(self):
        """make_subject_match_scorer should keep ids matching n queries"""
        qm1 = make_subject_match_scorer(1)
        qm3 = make_subject_match_scorer(3)
        qm5 = make_subject_match_scorer(5)
        qmes = wrap_qmes(QMEBlast9(self.rec3))
        self.assertEqualItems(qm1(qmes), ['ece:Z4181','ece:Z4182','ece:Z4183'])
        self.assertEqualItems(qm3(qmes), ['ece:Z4181','ece:Z4183'])
        self.assertEqualItems(qm5(qmes), [])
   
    def test_make_shotgun_scorer(self):
        """make_shotgun_scorer should keep ids matching n queries"""
        sg1 = make_shotgun_scorer(1)
        sg2 = make_shotgun_scorer(2)
        sg3 = make_shotgun_scorer(3)
        sg4 = make_shotgun_scorer(4)
        sg5 = make_shotgun_scorer(5)
        qmes = wrap_qmes(QMEBlast9(self.rec3))
        self.assertEqualItems(sg1(qmes), keep_everything_scorer(qmes))
        self.assertEqualItems(sg2(qmes), \
            ['ece:Z4181','ece:Z4182','ece:Z4183','cvi:CV2421','ecs:ECs3717'])
        self.assertEqualItems(sg3(qmes), \
            ['ece:Z4181','ece:Z4182','ece:Z4183'])
        self.assertEqualItems(sg4(qmes), \
            ['ece:Z4182'])
        self.assertEqualItems(sg5(qmes), [])
     
    def test_keep_everything_scorer(self):
        """keep_everything_scorer should keep all ids found."""
        k = keep_everything_scorer(wrap_qmes(QMEBlast9(self.rec2)))
        self.assertEqualItems(k, \
            ['ece:Z4181','ecs:ECs3717','spt:SPA2730','cvi:CV2421','ece:Z4182'])


def wrap_qmes(qmes):
    """Converts qmes into a dict of {q:{m:e}}"""
    d = {}
    for q, m, e in qmes:
        if q not in d:
            d[q] = {}
        d[q][m] = e
    return d

if __name__ == "__main__":
    main()
