File: blastSequences.R

package info (click to toggle)
r-bioc-annotate 1.84.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,624 kB
  • sloc: makefile: 2
file content (131 lines) | stat: -rw-r--r-- 5,181 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
.blastSequencesToDNAMultipleAlignment <- function(xml) {
   loadNamespace("Biostrings")
   loadNamespace("IRanges")
   qseq <- xpathSApply(xml, "//Hsp_qseq", xmlValue)
   hseq <- xpathSApply(xml, "//Hsp_hseq", xmlValue)
   res <- vector("list", length(qseq))
   for(i in seq_along(qseq)){
     res[[i]] <- Biostrings::DNAMultipleAlignment(
         c(hseq[[i]],qseq[[i]]),
         rowmask=as(IRanges::IRanges(), "NormalIRanges"),
         colmask=as(IRanges::IRanges(), "NormalIRanges"))
   }
   res
}

.blastSequencesToDataFrame <- function(xml) {
    if (xpathSApply(xml, "count(//Hit)") == 0L) {
        message("'blastSequences' returned 0 matches")
        return(data.frame())
    }

    iter <- xml["//Iteration"]
    iterlen <- sapply(iter, xpathSApply, "count(.//Hsp)")
    iterdf <- xmlToDataFrame(iter, stringsAsFactors=FALSE)

    hit <- xml["//Hit"]
    hitlen <- sapply(hit, xpathSApply, "count(.//Hsp)")
    hitdf <- xmlToDataFrame(hit, stringsAsFactors=FALSE)
    hitdf <- hitdf[, names(hitdf) != "Hit_hsps", drop=FALSE]

    hsp <- xmlToDataFrame(xml["//Hsp"] , stringsAsFactors=FALSE)

    df <- cbind(
        iterdf[rep(seq_len(nrow(iterdf)), iterlen),, drop=FALSE],
        hitdf[rep(seq_len(nrow(hitdf)), hitlen),, drop=FALSE],
        hsp)
    rownames(df) <- NULL
    df
}

.tryParseResult <- function(baseUrl, rid, rtoe, timeout) {
    message("estimated response time ", rtoe, " seconds")
    start <- Sys.time()
    end <- Sys.time() + timeout
    url <- sprintf("%s?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=%s",
                   baseUrl, rid)
    Sys.sleep(min(rtoe, timeout))
    repeat {
        elapsed <- as.double(Sys.time() - start, units="secs")
        ## RCurl::getURL(url, followlocation=TRUE) has issues.
        ## See getURL2() in R/query.R
        result <- as(htmlParse(getURL2(url),
                               error = xmlErrorCumulator(immediate=FALSE)),
                     "character")

        if (grepl("Status=FAILED", result))
            stop("BLAST search failed")
        else if  (grepl("Status=UNKNOWN", result))
            stop("BLAST search expired")
        else if (grepl("Status=READY", result)) {
            url <- sprintf("%s?RID=%s&FORMAT_TYPE=XML&CMD=Get", baseUrl, rid)
            ## RCurl::getURL(url, followlocation=TRUE) has issues.
            ## See getURL2() in R/query.R
            result <- xmlParse(getURL2(url),
                               error = xmlErrorCumulator(immediate=FALSE))
            return(result)
        } else if (grepl("Status=WAITING", result)) {
            message(sprintf("elapsed time %.0f seconds", elapsed))
            if (Sys.time() > end && interactive()) {
                msg <- sprintf("wait another %d seconds? [y/n] ", timeout)
                repeat {
                    ans <- substr(trimws(tolower(readline(msg))), 1, 1)
                    if (ans %in% c("y", "n"))
                        break
                }
                if (ans == "n")
                    break
                end <- Sys.time() + timeout
            }
            Sys.sleep(10)
        } else
            stop("BLAST search unknown response") 
    }
    msg <- sprintf("'blastSequences' timeout after %.0f seconds",
                   elapsed)
    stop(msg, call.=FALSE)
}

## Using the REST-ish API described at
## http://www.ncbi.nlm.nih.gov/blast/Doc/node2.html
blastSequences <- function(x, database="nr",
                           hitListSize="10",
                           filter="L",
                           expect="10",
                           program="blastn",
                           timeout=40,
                           as=c("DNAMultipleAlignment", "data.frame", "XML"))
{
    PARSE <- switch(match.arg(as),
                    DNAMultipleAlignment=.blastSequencesToDNAMultipleAlignment,
                    data.frame=.blastSequencesToDataFrame,
                    XML=identity)
    ## TODO: lots of argument checking and testing.  Also,
    ## depending on which program string is used we need to make the correct
    ## kind of object at the end (so blastn means DNAMultipleAlignment, and
    ## blastp means AAMultipleAlignment etc.

    ## So:
    ## 1) get online values these parameters can be
    ## 2) document those
    ## 3) restrict their vals in the code here.
    ## 4) for program, use this to determine what object is returned.
    
    ## assemble the query
    baseUrl <- "https://www.ncbi.nlm.nih.gov/blast/Blast.cgi"
    query <- paste("QUERY=", URLencode(as.character(x)), "&DATABASE=",database,
                   "&HITLIST_SIZE=",hitListSize,"&FILTER=",filter,
                   "&EXPECT=",expect,"&PROGRAM=",program, sep="")
    url0 <- sprintf("%s?%s&CMD=Put", baseUrl, query)
    ## RCurl::getURL(url, followlocation=TRUE) has issues.
    ## See getURL2() in R/query.R
    post <- htmlParse(getURL2(url0))
    
    x <- post[['string(//comment()[contains(., "QBlastInfoBegin")])']]
    rid <- sub(".*RID = ([[:alnum:]]+).*", "\\1", x)
    rtoe <- as.integer(sub(".*RTOE = ([[:digit:]]+).*", "\\1", x))
    result <- .tryParseResult(baseUrl, rid, rtoe, timeout)
    PARSE(result)
}

## took 11.5 minutes to do a blast...  (ugh)