File: query_pubmed.py

package info (click to toggle)
python-biopython 1.85%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 126,372 kB
  • sloc: xml: 1,047,995; python: 332,722; ansic: 16,944; sql: 1,208; makefile: 140; sh: 81
file content (89 lines) | stat: -rwxr-xr-x 2,452 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python

# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Query PubMed and print MEDLINE format results."""

import getopt
import sys

from Bio import Entrez


def print_usage():
    """Print the command-line help message to standard output.

    The text lists every option the script's getopt spec ("hcd:") accepts,
    including ``-d``, which is still parsed but has no effect.
    """
    print(
        """query_pubmed.py [-h] [-c] [-d delay] query

This script sends a query to PubMed (via the NCBI Entrez webservice*)
and prints the MEDLINE formatted results to the screen.

Arguments:
    -h           Print out this help message.
    -c           Count the hits, and don't print them out.
    -d delay     Ignored (the delay parameter no longer has any effect).

* http://www.ncbi.nlm.nih.gov/Entrez/
"""
    )


if __name__ == "__main__":
    # Parse the command line: -h (help), -c (count only), -d <arg> (ignored).
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hcd:")
    except getopt.error as x:
        # A malformed command line is an error: report it on stderr and
        # exit with a non-zero status so calling scripts can detect it.
        sys.stderr.write(f"{x}\n")
        sys.exit(2)

    show_help = False
    count_only = False
    for opt, arg in optlist:
        if opt == "-h":
            show_help = True
        elif opt == "-c":
            count_only = True
        elif opt == "-d":
            sys.stderr.write("The delay parameter is now ignored\n")

    # Honour -h before validating the positional arguments, so an explicit
    # help request always succeeds, with or without a query.
    if show_help:
        print_usage()
        sys.exit(0)

    if len(args) != 1:  # Exactly one query string is required.
        print_usage()
        sys.exit(2)
    query = args[0]

    print(f"Doing a PubMed search for {query!r}...")

    # The history server is only needed when the records are fetched
    # afterwards; a plain count does not require it.
    if count_only:
        handle = Entrez.esearch(db="pubmed", term=query)
    else:
        handle = Entrez.esearch(db="pubmed", term=query, usehistory="Y")
    try:
        search_results = Entrez.read(handle)
    finally:
        handle.close()

    # "Count" is the total number of hits.  "IdList" only holds the first
    # page of identifiers (ESearch's retmax defaults to 20), so its length
    # under-reports any larger result set.
    count = int(search_results["Count"])
    print(f"Found {count:d} citations")

    if count_only:
        sys.exit(0)

    # Download the records in small batches via the history server, writing
    # each batch out as soon as it arrives.
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    batch_size = 3
    for start in range(0, count, batch_size):
        fetch_handle = Entrez.efetch(
            db="pubmed",
            rettype="medline",
            retmode="text",
            retstart=start,
            retmax=batch_size,
            webenv=webenv,
            query_key=query_key,
        )
        try:
            data = fetch_handle.read()
        finally:
            fetch_handle.close()
        sys.stdout.write(data)
        sys.stdout.flush()