File: wublast.py

package info (click to toggle)
python-biopython 1.42-2
links: PTS
area: main
in suites: etch, etch-m68k
size: 17,584 kB
ctags: 12,272
sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (404 lines) | stat: -rw-r--r-- 15,283 bytes
parent folder | download | duplicates (2)
from Martel import *
from Martel import Time
from Bio import Std
import ncbiblast

wublast_version = Re("2\.[^-]+-WashU")

# BLASTN 2.0a19MP-WashU [05-Feb-1998] [Build decunix3.2 01:53:29 05-Feb-1998]
# BLASTP 2.0a19MP-WashU [05-Feb-1998] [Build sol2.5-ultra 01:47:24 05-Feb-1998]
blastn_appheader = replace_groups(ncbiblast.blastn_appheader,
                                  [("appversion", wublast_version)])
blastp_appheader = replace_groups(ncbiblast.blastp_appheader,
                                  [("appversion", wublast_version)])
blastx_appheader = replace_groups(ncbiblast.blastx_appheader,
                                  [("appversion", wublast_version)])
tblastn_appheader = replace_groups(ncbiblast.tblastn_appheader,
                                  [("appversion", wublast_version)])
tblastx_appheader = replace_groups(ncbiblast.tblastx_appheader,
                                   [("appversion", wublast_version)])

spaces_line = ncbiblast.spaces_line
Number = ncbiblast.Number

# Database:  YeastORF-P
#            6183 sequences; 2,900,438 total letters.

# Database:  Non-redundant GenBank CDS translations+PDB+SwissProt+SPupdate+PIR
#            307,320 sequences; 92,696,426 total letters.

# Database: mgpep
#            468 sequences; 170,400 total letters

# Database:  plantPept
#            6418 sequences; 2,370,771 total letters.

# Database: Non-redundant GenBank CDS
# translations+PDB+SwissProt+SPupdate+PIR
#            301,269 sequences; 90,873,415 total letters

query_database = (Str("Database:") + Opt(Spaces()) +
                  Std.database_name(UntilEol()) + AnyEol() +
                  Spaces() +
                  Std.database_num_sequences(Number(),
                                           {"bioformat:decode": "int.comma"}) +
                  Str(" sequences;") + Spaces() + 
                  Std.database_num_letters(Number(),
                                           {"bioformat:decode": "int.comma"}) +
                  Spaces() + Str("total letters.") + ToEol())
#                        notice the "."  -----^^^


#                                                                      Smallest
#                                                                        Sum
#                                                               High  Probability
# Sequences producing High-scoring Segment Pairs:              Score  P(N)      N
#  
# sp|P09429|HMG1_HUMAN HIGH MOBILITY GROUP PROTEIN HMG1 (HM...  1144  8.6e-117  1
# sp|P10103|HMG1_BOVIN HIGH MOBILITY GROUP PROTEIN HMG1 (HM...  1140  2.3e-116  1

to_table = (SkipLinesUntil(Str("Sequences producing High-scoring Segment")) +
            ToEol() +
            AnyEol())

table_entry = (AssertNot(AnyEol()) +
               Std.search_table_description(Re(r".{60}")) + Spaces() +
               Std.search_table_value(Float(), {"name": "score",
                                                "bioformat:decode": "float"}) +
               Spaces() +
               Std.search_table_value(Float(), {"name": "P",
                                                "bioformat:decode": "float"}) +
               Spaces() +
               Std.search_table_value(Digits(), {"name": "N",
                                              "bioformat:decode": "float"}) +
               AnyEol())
     
header = replace_groups(ncbiblast.header,
                        (("query_database", query_database),
                         ("to_table", to_table),
                         ("table_entry", table_entry)))

# >sp|P09429|HMG1_HUMAN HIGH MOBILITY GROUP PROTEIN HMG1 (HMG-1).
#             Length = 214
#  
#  Score = 1144 (402.7 bits), Expect = 8.6e-117, P = 8.6e-117
#  Identities = 214/214 (100%), Positives = 214/214 (100%)
#  
# Query:     1 GKGDPKKPRGKMSSYAFFVQTCREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFE 60
#              GKGDPKKPRGKMSSYAFFVQTCREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFE
# Sbjct:     1 GKGDPKKPRGKMSSYAFFVQTCREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFE 60

hit_descr = ncbiblast.hit_descr
hit_length = ncbiblast.hit_length


#  Score = 1144 (402.7 bits), Expect = 8.6e-117, P = 8.6e-117
#  Score = 79 (27.8 bits), Expect = 0.0088, Sum P(2) = 0.0088
#  Score = 65 (22.9 bits), Expect = 3.1, Sum P(3) = 0.96

blastp_score = (Re(r" *Score = *") +
                Std.hsp_value(Digits(), {"name": "bits",
                                         "bioformat:decode": "int",
                                         }) +
                # XXX Skipping the "402.7 bits" field, what's it called?
                ToSep(sep = "=") +
                Spaces() +
                Std.hsp_value(Float(), {"name": "expect",
                                         "bioformat:decode": "float",
                                         }) +
                # What should I do with the (2)?, or the (3)?
                (Str(", P =") | Re(r", Sum P\(\d+\) =")) + 
                Spaces() +
                Std.hsp_value(Float(), {"name": "P",
                                         "bioformat:decode": "float",
                                         }) +
                ToEol()
                )

#  Identities = 214/214 (100%), Positives = 214/214 (100%)
identities = (Re(r" *Identities = ") +
              Std.hsp_value(Digits(), {"name": "identical",
                                       "bioformat:decode": "int",
                                       }) +
              Str("/") +
              
              Std.hsp_value(Digits(), {"name": "length",
                                       "bioformat:decode": "int",
                                       }) +
              ToSep(sep = ",") +
              Str(" Positives =") +
              Spaces() +
              Std.hsp_value(Digits(), {"name": "positives",
                                       "bioformat:decode": "int",
                                       }) +
              ToEol())

blastp_hsp_header = blastp_score + identities + spaces_line


######################### blastn_hit


# >U51677 Human non-histone chromatin protein HMG1 (HMG1) gene, complete cds.
#         Length = 2575
#  
#   Plus Strand HSPs:
#  
#  Score = 12875 (1931.8 bits), Expect = 0.0, P = 0.0
#  Identities = 2575/2575 (100%), Positives = 2575/2575 (100%), Strand = Plus / Plus
#  
# Query:     1 ATGGGCAAAGGAGATCCTAAGAAGCCGAGAGGCAAAATGTCATCATATGCATTTTTTGTG 60
#              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# Sbjct:     1 ATGGGCAAAGGAGATCCTAAGAAGCCGAGAGGCAAAATGTCATCATATGCATTTTTTGTG 60
# ....

# Don't need to parse this since it comes from the identities line
blastn_strand =Re(r" *(Plus|Minus) Strand HSPs:\R")

blastn_score = blastp_score

blastn_identities = (
    Re(r" *Identities = ") +
    Std.hsp_value(Digits(), {"name": "identical",
                             "bioformat:decode": "int",
                             }) +
    Str("/") +
    
    Std.hsp_value(Digits(), {"name": "length",
                             "bioformat:decode": "int",
                             }) +
    ToSep(sep = ",") +
    Str(" Positives =") +
    Spaces() +
    Std.hsp_value(Digits(), {"name": "positives",
                             "bioformat:decode": "int",
                             }) +
    ToSep(sep = ",") + Str(" ") +
    ncbiblast.strand
    # that 'strand' already includes a newline
    )


blastn_hsp_header = (Opt(blastn_strand + spaces_line) +
                     blastn_score + blastn_identities + spaces_line)

######################### blastx

#                                                                      Smallest
#                                                                        Sum
#                                                      Reading  High  Probability
# Sequences producing High-scoring Segment Pairs:        Frame Score  P(N)      N
#  
# sp|P09429|HMG1_HUMAN HIGH MOBILITY GROUP PROTEIN HMG1 ... +2   311  5.0e-108  4
# sp|P10103|HMG1_BOVIN HIGH MOBILITY GROUP PROTEIN HMG1 ... +2   311  1.3e-107  4
# sp|P07155|HMG1_MOUSE HIGH MOBILITY GROUP PROTEIN HMG1 ... +2   311  2.7e-107  4                                        

blastx_table_entry = (
    AssertNot(AnyEol()) +
    Std.search_table_description(Re(r".{57}")) + Spaces() +
    Std.search_table_value(Integer(), {"name": "frame",
                                       "bioformat:decode": "int"}) +
    Spaces() +
    Std.search_table_value(Float(), {"name": "score",
                                     "bioformat:decode": "float"}) +
    Spaces() +
    Std.search_table_value(Float(), {"name": "P",
                                     "bioformat:decode": "float"}) +
    Spaces() +
    Std.search_table_value(Digits(), {"name": "N",
                                      "bioformat:decode": "int"}) +
    AnyEol())


blastx_to_database = (
 Str("  Translating both strands of query sequence in all 6 reading frames") +
 AnyEol() +
 AnyEol())

blastx_identities = (
    Re(r" *Identities = ") +
    Std.hsp_value(Digits(), {"name": "identical",
                             "bioformat:decode": "int",
                             }) +
    Str("/") +
    
    Std.hsp_value(Digits(), {"name": "length",
                             "bioformat:decode": "int",
                             }) +
    ToSep(sep = ",") +
    Str(" Positives =") +
    Spaces() +
    Std.hsp_value(Digits(), {"name": "positives",
                             "bioformat:decode": "int",
                             }) +
    ToSep(sep = "=") + Str(" ") +
    Std.hsp_frame(Integer(), {"which": "query"}) +
    AnyEol()
    )

blastx_score = blastp_score
blastx_strand = blastn_strand

blastx_hsp_header = (Opt(blastx_strand + spaces_line) +
                     blastx_score + blastx_identities + spaces_line)


#############################
######################### tblastn
tblastn_strand =Re(r" *(Plus|Minus) Strand HSPs:\R")

tblastn_score = blastp_score

tblastn_identities = (
    Re(r" *Identities = ") +
    Std.hsp_value(Digits(), {"name": "identical",
                             "bioformat:decode": "int",
                             }) +
    Str("/") +
    
    Std.hsp_value(Digits(), {"name": "length",
                             "bioformat:decode": "int",
                             }) +
    ToSep(sep = ",") +
    Str(" Positives =") +
    Spaces() +
    Std.hsp_value(Digits(), {"name": "positives",
                             "bioformat:decode": "int",
                             }) +
    ToSep(sep = ",") + 
    ncbiblast.frame
    # that 'strand' already includes a newline
    )


tblastn_hsp_header = (Opt(tblastn_strand + spaces_line) +
                     tblastn_score + tblastn_identities + spaces_line)



#############################
# May have to skip the WARNING section
ending_start = (Opt(Str("WARNING") + ToEol() +
                    Rep(AssertNot(Str(">")) +
                        AssertNot(Str("Parameters")) +
                        ToEol())) +
                Str("Parameters:") + ToEol())

parameters = SkipLinesUntil(Str("Statistics:"))

_nv = ncbiblast._nv
int_stat = ncbiblast.int_stat

timestamp = Time.make_expression(
    "%(Mon) %(Jan) %(day) %(hour):%(minute):%(second) %(YYYY)")

statistics = (
    Str("Statistics:") + AnyEol() +
    spaces_line +
        #  Database:  /dbEST1/wublast/database/swissprot/sprot.fa
    _nv("  Database: ",
        Std.search_statistic(UntilEol(), {"name": "database"})) +
    
        #    Title:  swissprot
    _nv("    Title: ",
        Std.search_statistic(UntilEol(), {"name": "database_title"})) +
    
        #    Release date:  unknown
    _nv("    Release date: ",
        Std.search_statistic(UntilEol(), {"name": "release_date"})) +
    
        #    Posted date:  11:37 AM GMT Feb 21, 2000
    _nv("    Posted date: ",
        Std.search_statistic(UntilEol(), {"name": "posted_date"})) +
    
        #    Format:  BLAST
    _nv("    Format: ",
        Std.search_statistic(UntilEol(), {"name": "format"})) +
    
        #  # of letters in database:  26,335,942
    _nv("  # of letters in database:  ",
        int_stat("num_letters_in_database", comma = 1)) +
    
        #  # of sequences in database:  72,615
    _nv("  # of sequences in database:  ",
        int_stat("num_sequences_in_database", comma = 1)) +

        #  # of database sequences satisfying E:  1119
    _nv("  # of database sequences satisfying E:  ",
        int_stat("num_sequences_satisying_E")) +

        #  No. of states in DFA:  549 (54 KB)
    _nv("  No. of states in DFA: ",
        Std.search_statistic(UntilEol(), {"name": "num_dfa_states"})) +
        
        #  Total size of DFA:  117 KB (128 KB)
    _nv("  Total size of DFA: ",
        Std.search_statistic(UntilEol(), {"name": "total_dfa_size"})) +

        #  Time to generate neighborhood:  0.01u 0.00s 0.01t  Elapsed: 00:00:00
    _nv("  Time to generate neighborhood: ",
        Std.search_statistic(UntilEol(),
                             {"name": "neighborhood_generation_time"})) +

        #  No. of threads or processors used:  2
    Opt(
    	_nv("  No. of threads or processors used:  ",
        	int_stat("num_threads"))) +

        #  Search cpu time:  38.04u 0.15s 38.19t  Elapsed: 00:00:33
    _nv("  Search cpu time: ",
        Std.search_statistic(UntilEol(), {"name": "search_cpu_time"})) +
        
        #  Total cpu time:  38.96u 0.29s 39.25t  Elapsed: 00:00:34
    _nv("  Total cpu time: ",
        Std.search_statistic(UntilEol(), {"name": "total_time"})) +

        #  Start:  Mon Feb 21 14:33:13 2000   End:  Mon Feb 21 14:33:47 2000
    _nv("  Start:  ",
        Std.search_statistic(timestamp, {"name": "start_time"}) +
        Spaces() + Str("End:  ") +
        Std.search_statistic(timestamp, {"name": "end_time"})) +
    Opt(spaces_line +
            #WARNINGS ISSUED:  2
        _nv("WARNINGS ISSUED:  ",
            int_stat("num_warnings")))
    )

ending = ending_start + parameters + statistics
blastp_ending = ending_start + parameters + statistics
blastn_ending = ending_start + parameters + statistics
blastx_ending = ending_start + parameters + statistics

format = replace_groups(ncbiblast.format,
                        (("to_table", to_table),
                         ("table_entry", table_entry),
                         ("ending", ending)))

blastp = replace_groups(format,
                        (("appheader", blastp_appheader),
                         ("hsp_header", blastp_hsp_header)))

blastn = replace_groups(format,
                        (("appheader", blastn_appheader),
                         ("hsp_header", blastn_hsp_header)))

blastx = replace_groups(format,
                        (("appheader", blastx_appheader),
                         ("table_entry", blastx_table_entry),
                         ("to_database", blastx_to_database),
                         ("hsp_header", blastx_hsp_header)))
tblastn = replace_groups(format,
                        (("appheader", tblastn_appheader),
                         ("table_entry", blastx_table_entry),
                         ("hsp_header", tblastn_hsp_header)))
			 

def main(outf):
    from xml.sax import saxutils
    parser = blastn.make_parser(debug_level = 0)
    parser.setContentHandler(saxutils.XMLGenerator())
    parser.parse(outf)
    #parser.parse("wu-sh_blastp.out")
    
if __name__ == "__main__":
    main()