File: fetch.rb

package info (click to toggle)
ruby-bio 2.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,108 kB
  • sloc: ruby: 68,331; perl: 13; makefile: 11; sh: 1
file content (251 lines) | stat: -rw-r--r-- 8,673 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#
# = bio/io/biofetch.rb - BioFetch access module
#
# Copyright::	Copyright (C) 2002, 2005 Toshiaki Katayama <k@bioruby.org>,
#               Copyright (C) 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk>
# License::     The Ruby License
#
# == DESCRIPTION
#
# Using EBI Dbfetch server
#
#   ebi_server = Bio::Fetch::EBI.new
#   puts ebi_server.fetch('embl', 'J00231')
#   puts ebi_server.fetch('embl', 'J00231', 'raw')
#   puts ebi_server.fetch('embl', 'J00231', 'html')
#
# Getting metadata from EBI Dbfetch server
#
#   puts ebi_server.databases
#   puts ebi_server.formats('embl')
#   puts ebi_server.maxids
#
# Using EBI Dbfetch server without creating a Bio::Fetch::EBI instance
#
#   puts Bio::Fetch::EBI.query('ena_sequence', 'J00231')
#   puts Bio::Fetch::EBI.query('ena_sequence', 'J00231', 'raw', 'fasta')
#
# Using a BioFetch server with specifying URL
#
#   server = Bio::Fetch.new('http://www.ebi.ac.uk/Tools/dbfetch/dbfetch')
#   puts server.fetch('ena_sequence', 'J00231')
#   puts server.fetch('ena_sequence', 'J00231', 'raw', 'fasta')
# 

require 'uri'
require 'cgi'
require 'bio/command'

module Bio
  # = DESCRIPTION
  # The Bio::Fetch class provides an interface to dbfetch servers. Given
  # a database name and an accession number, these servers return the associated
  # record. For example, for the embl database on the EBI, that would be a
  # nucleic or amino acid sequence.
  #
  # Possible dbfetch servers include:
  # * http://www.ebi.ac.uk/Tools/dbfetch/dbfetch
  #
  # Note that the old URL http://www.ebi.ac.uk/cgi-bin/dbfetch is still alive,
  # probably for compatibility, but using the new URL is recommended.
  #
  # Historically, there were other dbfetch servers including:
  # * http://bioruby.org/cgi-bin/biofetch.rb (default before BioRuby 1.4)
  # But they are unavailable now.
  #
  #
  # If you're behind a proxy server, be sure to set your HTTP_PROXY
  # environment variable accordingly.
  #
  # = USAGE
  #  require 'bio'
  #  
  #  # Retrieve the sequence of accession number M33388 from the EMBL
  #  # database.
  #  server = Bio::Fetch::EBI.new  #uses EBI server
  #  puts server.fetch('ena_sequence','M33388')
  #
  #  # database name "embl" can also be used though it is not officially listed
  #  puts server.fetch('embl','M33388')
  #
  #  # Do the same thing with explicitly giving the URL.
  #  server = Bio::Fetch.new(Bio::Fetch::EBI::URL)  #uses EBI server
  #  puts server.fetch('ena_sequence','M33388')
  #
  #  # Do the same thing without creating a Bio::Fetch::EBI object.
  #  puts Bio::Fetch::EBI.query('ena_sequence','M33388')
  #
  #  # To know what databases are available on the dbfetch server:
  #  server = Bio::Fetch::EBI.new
  #  puts server.databases
  #
  #  # Some databases provide their data in different formats (e.g. 'fasta',
  #  # 'genbank' or 'embl'). To check which formats are supported by a given
  #  # database:
  #  puts server.formats('embl')
  #
  class Fetch
  
    # Bio::Fetch::EBI is a client of EBI Dbfetch
    # (http://www.ebi.ac.uk/Tools/dbfetch/dbfetch).
    #
    # An instance of this class works the same as:
    #  obj = Bio::Fetch.new("http://www.ebi.ac.uk/Tools/dbfetch/dbfetch")
    #  obj.database = "ena_sequence"
    #
    # See the documents of Bio::Fetch for more details.
    class EBI < Fetch

      # EBI Dbfetch server URL
      URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch".freeze

      # Creates a client whose default database is "ena_sequence".
      # For the usage, see the document of Bio::Fetch.new.
      #
      # ---
      # *Arguments*:
      # * _url_: URL of dbfetch server (default: Bio::Fetch::EBI::URL)
      # *Returns*:: Bio::Fetch::EBI object
      def initialize(url = URL)
        @database = "ena_sequence"
        # Bare "super" forwards the (possibly defaulted) url to Bio::Fetch#initialize.
        super
      end

      # Shortcut for using EBI Dbfetch server. You can fetch an entry
      # without creating an instance of Bio::Fetch::EBI. This method uses
      # EBI Dbfetch server http://www.ebi.ac.uk/Tools/dbfetch/dbfetch .
      # 
      # Example:
      #   puts Bio::Fetch::EBI.query('refseq','NM_123456')
      #   puts Bio::Fetch::EBI.query('ena_sequence','J00231')
      #
      # ---
      # *Arguments*:
      # * _database_: name of database to query (see Bio::Fetch#databases to get list of supported databases)
      # * _id_: single ID, ID list separated by commas or white space, or an Array of IDs
      # * _style_: [raw|html] (default = 'raw')
      # * _format_: name of output format (see Bio::Fetch#formats)
      def self.query(*args)
        new.fetch(*args)
      end
    end #class EBI

    # Create a new Bio::Fetch server object that can subsequently be queried
    # using the Bio::Fetch#fetch method.
    #
    # You must specify _url_ of a server.
    # The preset default server is deprecated.
    #
    # If you want to use a server without explicitly specifying the URL,
    # use Bio::Fetch::EBI.new that uses EBI Dbfetch server.
    #
    # ---
    # *Arguments*:
    # * _url_: URL of dbfetch server. (no default value)
    # *Returns*:: Bio::Fetch object
    # *Raises*:: ArgumentError when _url_ is nil or false
    def initialize(url = nil)
      unless url
        raise ArgumentError, "No server URL is given in Bio::Fetch.new. The default server URL value have been deprecated. You must explicitly specify the url or use Bio::Fetch::EBI for using EBI Dbfetch."
      end
      @url = url
    end
  
    # The default database to query
    #--
    # This will be used by the get_by_id method
    #++
    attr_accessor :database
  
    # Get raw database entry by id, querying the default database
    # (Bio::Fetch#database). This method lets the Bio::Registry class
    # use Bio::Fetch objects.
    #
    # ---
    # *Arguments*:
    # * _id_: single ID, ID list separated by commas or white space, or an Array of IDs
    # *Returns*:: the value returned by Bio::Fetch#fetch
    # *Raises*:: ArgumentError when no default database has been set
    def get_by_id(id)
      # Fail early with a clear message instead of an obscure error from
      # escaping a nil database name.
      unless @database
        raise ArgumentError, "No default database is set. Set Bio::Fetch#database before calling get_by_id."
      end
      fetch(@database, id)
    end
  
    # Fetch a database entry as specified by database (db), entry id (id),
    # 'raw' text or 'html' (style), and format.
    #
    # Examples:
    #   server = Bio::Fetch.new('http://www.ebi.ac.uk/Tools/dbfetch/dbfetch')
    #   puts server.fetch('embl','M33388','raw','fasta')
    #   puts server.fetch('refseq','NM_12345','html','embl')
    #   puts server.fetch('embl',['J00231','M33388'])
    # ---
    # *Arguments*:
    # * _database_: name of database to query (see Bio::Fetch#databases to get list of supported databases)
    # * _id_: single ID, ID list separated by commas or white space, or an Array of IDs (joined with commas)
    # * _style_: [raw|html] (default = 'raw')
    # * _format_: name of output format (see Bio::Fetch#formats)
    # *Returns*:: the value returned by Bio::Command.read_uri for the query URL
    def fetch(db, id, style = 'raw', format = nil)
      # An Array of IDs is sent as a single comma-separated "id" parameter.
      id = id.join(',') if id.is_a?(Array)

      query = [ [ 'db',    db ],
                [ 'id',    id ],
                [ 'style', style ] ]
      # The "format" parameter is only sent when explicitly requested.
      query.push([ 'format', format ]) if format
  
      _get(query)
    end
  
    # Using this method, the user can ask a dbfetch server what databases
    # it supports. This would normally be the first step you'd take when
    # you use a dbfetch server for the first time.
    # Example:
    #  server = Bio::Fetch::EBI.new
    #  puts server.databases # returns [ "aa", "aax", "bl", "cpd", ... ]
    #
    # This method works for EBI Dbfetch server (and for the bioruby dbfetch
    # server). Not all servers support this method.
    # ---
    # *Returns*:: array of database names
    def databases
      _get_single('info', 'dbs').strip.split(/\s+/)
    end
  
    # Lists the formats that are available for a given database. Like the
    # Bio::Fetch#databases method, not all servers support this method.
    # This method is available on the EBI Dbfetch server (and on the bioruby
    # dbfetch server).
    #
    # Example:
    #  server = Bio::Fetch::EBI.new()
    #  puts server.formats('embl') # returns [ "default", "annot", ... ]
    # ---
    # *Arguments*:
    # * _database_:: name of database you want the supported formats for
    #   (default: the value of Bio::Fetch#database)
    # *Returns*:: array of formats, or nil when no database is given and
    # no default database is set
    def formats(database = @database)
      return nil unless database

      query = [ [ 'info', 'formats' ],
                [ 'db',   database  ] ]
      _get(query).strip.split(/\s+/)
    end
  
    # A dbfetch server will only return entries up to a given maximum number.
    # This method retrieves that number from the server. As for the databases
    # and formats methods, not all servers support the maxids method.
    # This method is available on the EBI Dbfetch server (and on the bioruby
    # dbfetch server).
    #
    # Example:
    #  server = Bio::Fetch::EBI.new
    #  puts server.maxids # currently returns 200
    # ---
    # *Arguments*: none
    # *Returns*:: number
    def maxids
      _get_single('info', 'maxids').to_i
    end

    private

    # (private) query to the server.
    # ary must be nested array, e.g. [ [ key0, val0 ], [ key1, val1 ], ... ]
    #
    # Keys and values are converted with to_s before being escaped, so
    # Integers and Symbols are accepted (nil becomes an empty string).
    def _get(ary)
      query = ary.map do |key, val|
        "#{CGI.escape(key.to_s)}=#{CGI.escape(val.to_s)}"
      end.join('&')
      Bio::Command.read_uri(@url + '?' + query)
    end

    # (private) query with single parameter
    def _get_single(key, val)
      _get([ [ key, val ] ])
    end

  end

end # module Bio