File: EZRetrieve.py

package info (click to toggle)
python-biopython 1.42-2
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 17,584 kB
  • ctags: 12,272
  • sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (61 lines) | stat: -rw-r--r-- 1,855 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""This module contains code to access EZRetrieve.

Functions:
retrieve_single  Retrieve a single sequence from EZRetrieve.
parse_single     Parse the results from EZRetrieve into FASTA format.

"""

def retrieve_single(id, from_, to, retrieve_by=None, organism=None,
                    parse_results=1):
    """Retrieve a single sequence from the EZRetrieve web service.

    The query is sent as an HTTP POST to the EZRetrieve CGI.

    Arguments:
    id             Identifier to look up; sent as the "input" form field
                   after conversion with str().
    from_          Sent as the "from" form field after str() conversion
                   (presumably the start of the requested region).
    to             Sent as the "to" form field after str() conversion
                   (presumably the end of the requested region).
    retrieve_by    Kind of identifier: "GenBank" (default), "UniGene",
                   "LocusLink" or "IMAGE".  Matched case-insensitively.
    organism       "Hs" (default), "Mm" or "Rn".  Case-sensitive.
    parse_results  If true (default), return the result of parse_single
                   on the response; otherwise return the raw response
                   body exactly as read from the connection.

    Raises AssertionError if organism or retrieve_by is not recognised.

    """
    # Python 3 moved these helpers into submodules; fall back to the
    # Python 2 layout so the function works on either interpreter.
    try:
        from urllib.parse import urlencode
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlencode, urlopen

    CGI = "http://siriusb.umdnj.edu:18080/EZRetrieve/single_r_run.jsp"

    org2value = {"Hs": "0", "Mm": "1", "Rn": "2"}
    organism = organism or "Hs"
    # Raise explicitly rather than with an assert statement so that the
    # check is still performed when Python runs with -O.
    if organism not in org2value:
        raise AssertionError("Unknown organism: %r" % (organism,))

    acctype2value = {
        "genbank": "0",
        "unigene": "1",
        "locuslink": "2",
        "image": "3",
    }
    retrieve_by = (retrieve_by or "GenBank").lower()
    if retrieve_by not in acctype2value:
        raise AssertionError("Unknown retrieve_by: %r" % (retrieve_by,))

    params = {
        "input": str(id),
        "from": str(from_),
        "to": str(to),
        "org": org2value[organism],
        "AccType": acctype2value[retrieve_by],
    }
    # urlencode output is pure ASCII; Python 3 requires the POST body to
    # be bytes, and on Python 2 this encode leaves the str unchanged.
    options = urlencode(params).encode("ascii")
    handle = urlopen(CGI, options)
    try:
        if parse_results:
            return parse_single(handle)
        return handle.read()
    finally:
        # Always release the connection, even if parsing fails.
        handle.close()

def parse_single(handle):
    """Return a FASTA-formatted string for the sequence.

    handle is any object with a read() method returning the HTML page
    produced by EZRetrieve (text, or bytes as returned by Python 3's
    urlopen).

    The record is taken from the first "<b>>" marker up to the next
    "<br><br>" (or to the end of the page if that terminator is
    missing).  HTML tags are removed and every run of whitespace is
    collapsed to a single newline; the result ends with one newline.

    Raises AssertionError if the page contains an "Error: " message
    (the message text is used as the exception argument) or if no
    sequence marker can be found.

    """
    import re

    results = handle.read()
    if isinstance(results, bytes) and not isinstance(results, str):
        # Python 3 network handles yield bytes.  latin-1 maps every byte
        # to exactly one character, so decoding cannot fail and offsets
        # are preserved.
        # NOTE(review): the service's real charset is not visible here.
        results = results.decode("latin-1")
    # Lower-cased copy used only to locate tags case-insensitively;
    # slices are always taken from the original text.
    lresults = results.lower()

    i = results.find("Error: ")
    if i >= 0:
        j = lresults.find("<br>", i)
        if j < 0:
            # No closing <br>: report the rest of the page instead of
            # masking the server error with a ValueError.
            j = len(results)
        errmsg = results[i:j].strip()
        raise AssertionError(errmsg)

    i = lresults.find("<b>>")
    # Explicit raise so the check is not stripped when running with -O.
    if i < 0:
        raise AssertionError("Couldn't find sequence.")
    j = lresults.find("<br><br>", i)
    if j < 0:
        # Missing terminator: take everything to the end.  Slicing with
        # -1 would silently drop the final character.
        j = len(results)
    seqdata = results[i:j]
    reobj = re.compile(r"<[^>]*>", re.IGNORECASE | re.DOTALL)
    seqdata = reobj.sub("", seqdata)
    seqdata = re.sub(r"\s+", r"\n", seqdata)
    seqdata = seqdata.strip() + "\n"
    return seqdata