File: db.rb

package info (click to toggle)
ruby-bio 2.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,108 kB
  • sloc: ruby: 68,331; perl: 13; makefile: 11; sh: 1
file content (330 lines) | stat: -rw-r--r-- 8,396 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# frozen_string_literal: true
#
# = bio/db.rb - common API for database parsers
#
# Copyright::  Copyright (C) 2001, 2002, 2005
#              Toshiaki Katayama <k@bioruby.org>
# License::    The Ruby License
#
# $Id: db.rb,v 0.38 2007/05/08 17:02:13 nakao Exp $
#
# == On-demand parsing and cache
#
# The flatfile parsers (sub classes of the Bio::DB) split the original entry
# into a Hash and store the hash in the @orig instance variable.  Detailed
# parsing is delayed until a method is called that requires further parsing
# of the content of the @orig hash.  Fully parsed data is cached separately
# in another hash, @data.
#
# == Guidelines for developers creating a new database class
#
# --- Bio::DB.new(entry)
#
# The 'new' method should accept the entire entry in one String and
# return the parsed database object.
#
# --- Bio::DB#entry_id
#
# Database classes should implement the following methods if appropriate:
#
# * entry_id
# * definition
#
# Every sub class should define the following constants if appropriate:
#
# * DELIMITER (RS)
#   * entry separator of the flatfile of the database.
#   * RS (= record separator) is an alias for the DELIMITER in short.
#
# * TAGSIZE
#   * length of the tag field in the FORTRAN-like format.
#
#       |<- tag       ->||<- data                           ---->|
#       ENTRY_ID         A12345
#       DEFINITION       Hoge gene of the Pokemonia pikachuae
#
# === Template of the sub class
# 
#   module Bio
#   class Hoge < DB
# 
#     DELIMITER	= RS = "\n//\n"
#     TAGSIZE		= 12		# You can omit this line if not needed
# 
#     def initialize(entry)
#     end
# 
#     def entry_id
#     end
# 
#   end # class Hoge
#   end # module Bio
# 
# === Recommended method names for sub classes
# 
# In general, the method name should be in the singular form when it returns
# an Object (including the case when the Object is a String), and should be
# in the plural form when it returns an Array of such Objects.  Which form of
# the method name can be used depends on the database class.
# 
# For example, GenBank has several REFERENCE fields in one entry, so define
# Bio::GenBank#references and this method should return an Array of the
# Reference objects.  On the other hand, MEDLINE has only one REFERENCE per
# entry, so define the Bio::MEDLINE#reference method and this should
# return a Reference object.
# 
# The method names used in the sub classes should be taken from the following
# list if appropriate:
# 
# --- entry_id #=> String
# 
# The entry identifier.
# 
# --- definition #=> String
# 
# The description of the entry.
# 
# --- reference	#=> Bio::Reference
# --- references #=> Array of Bio::Reference
# 
# The reference field(s) of the entry.
# 
# --- dblink #=> String
# --- dblinks #=> Array of String
# 
# The link(s) to the other database entry.
# 
# --- naseq #=> Bio::Sequence::NA
# 
# The DNA/RNA sequence of the entry.
# 
# --- nalen #=> Integer
# 
# The length of the DNA/RNA sequence of the entry.
# 
# --- aaseq #=> Bio::Sequence::AA
# 
# The amino acid sequence of the entry.
# 
# --- aalen #=> Integer
# 
# The length of the amino acid sequence of the entry.
# 
# --- seq #=> Bio::Sequence::NA or Bio::Sequence::AA
# 
# Returns an appropriate sequence object.
# 
# --- position #=> String
# 
# The position of the sequence in the entry or in the genome (depends on
# the database).
# 
# --- locations #=> Bio::Locations
# 
# Returns Bio::Locations.new(position).
# 
# --- division #=> String
# 
# The sub division name of the database.
# 
# * Example:
#   * EST, VRL etc. for GenBank
#   * PATTERN, RULE etc. for PROSITE
# 
# --- date #=> String
# 
# The date of the entry.
# Should we use Date (by ParseDate) instead of String?
# 
# --- gene #=> String
# --- genes #=> Array of String
# 
# The name(s) of the gene.
# 
# --- organism #=> String
# 
# The name of the organism.
# 

require 'bio/sequence'
require 'bio/reference'
require 'bio/feature'

module Bio
  
class DB

  # Opens +filename+ through Bio::FlatFile.open, passing the receiving class
  # as the first argument followed by +filename+, the optional +mode+
  # arguments and the block.  Returns whatever Bio::FlatFile.open returns.
  def self.open(filename, *mode, &block)
    Bio::FlatFile.open(self, filename, *mode, &block)
  end

  # Returns an entry identifier as a String.  This method must be
  # implemented in every database class by overriding this method;
  # the default implementation raises NotImplementedError.
  def entry_id
    raise NotImplementedError
  end

  # Returns a list of the top level tags of the entry as an Array of String.
  def tags
    @orig.keys
  end

  # Returns true or false - whether the entry contains the field of the
  # given tag name.
  def exists?(tag)
    @orig.include?(tag)
  end

  # Returns an intact field of the tag as a String (the value stored in
  # the @orig hash, tag included).
  def get(tag)
    @orig[tag]
  end

  # Similar to the get method, however, fetch returns the content of the
  # field without its tag and with any extra white spaces stripped.
  #
  # The first +skip+ lines of the field are dropped before the tag columns
  # are removed.  An empty String is returned when the tag is absent or
  # when the field has no line left after skipping.
  def fetch(tag, skip = 0)
    parts = @orig[tag].to_s.split(/\n/, skip + 1)
    # Only use the remainder when the split really produced a piece beyond
    # the skipped lines; otherwise the "last" piece would be one of the
    # lines that was supposed to be skipped.
    field = parts.size > skip ? parts.last.to_s : ''
    # '^' matches at every line start, so the tag columns are removed from
    # each remaining line of the field.
    truncate(field.gsub(/^.{0,#{@tagsize}}/,''))
  end


  private

  # Returns a String in which successive white spaces are replaced by one
  # space and leading/trailing white spaces are stripped.  nil is treated
  # as an empty String.
  def truncate(str)
    str ||= String.new
    str.gsub(/\s+/, ' ').strip
  end

  # Returns a tag name of the field as a String, i.e. the first @tagsize
  # characters of +str+ with surrounding white spaces stripped.
  def tag_get(str)
    str ||= String.new
    str[0,@tagsize].strip
  end

  # Returns a String of the field without a tag name (the first @tagsize
  # characters are removed).
  #
  # A mutable +str+ is modified in place and returned, as before.  A frozen
  # +str+ (e.g. a string literal under frozen_string_literal) is duplicated
  # first instead of raising FrozenError.
  def tag_cut(str)
    str ||= String.new
    str = str.dup if str.frozen?
    str[0,@tagsize] = ''
    str
  end

  # Returns the content of the field as a String like the fetch method.
  # Furthermore, field_fetch stores the result in the @data hash; once a
  # value is cached for +tag+ it is returned regardless of +skip+.
  def field_fetch(tag, skip = 0)
    @data[tag] ||= fetch(tag, skip)
  end

  # Returns an Array containing each line of the field without a tag.
  # A line whose data starts right after the tag columns begins a new
  # element; a line with further indentation is appended to the previous
  # element (directly when that element ends with '-', otherwise separated
  # by one space).  lines_fetch also stores the result in the @data hash.
  #
  # Returns an empty Array when the tag is absent or the field has no data.
  def lines_fetch(tag)
    return @data[tag] if @data[tag]

    list = []
    get(tag).to_s.split(/\n/).each do |line|
      data = tag_cut(line)
      if data[/^\S/]                  # next sub field
        list << data
      else                            # continued sub field
        data.strip!
        if list.empty?
          # Nothing to continue yet (e.g. the tag line itself carries no
          # data): start the first element instead of calling a method
          # on nil.
          list << data unless data.empty?
        elsif list.last[/\-$/]        # folded
          list[-1] += data
        else
          list[-1] += " #{data}"     # rest of list
        end
      end
    end
    @data[tag] = list
  end

end # class DB


# Stores a NCBI style (GenBank, KEGG etc.) entry.
class NCBIDB < DB

  autoload :Common, 'bio/db/genbank/common'

  # Takes the entire entry as a String and the length of the tag field as
  # an Integer.  The entry is stripped and split roughly by entry2hash;
  # the result is kept in @orig, while @data starts out as an empty Hash
  # for the parsed values.
  def initialize(entry, tagsize)
    @data = {}
    @tagsize = tagsize    # must be set before entry2hash, which reads it
    @orig = entry2hash(entry.strip)
  end

  private

  # Splits an entry into an Array of Strings at the level of top tags:
  # a marker character is inserted before every line that starts with a
  # letter, '/' or '*', and the string is then split on that marker.
  def toptag2array(str)
    marker = "\001"
    marked = str.gsub(/\n([A-Za-z\/\*])/, "\n#{marker}\\1")
    marked.split(marker)
  end

  # Splits a field into an Array of Strings at the level of sub tags:
  # a marker character is inserted before every line that is indented by
  # 1 to @tagsize-1 white space characters, and the string is then split
  # on that marker.
  def subtag2array(str)
    marker = "\001"
    subtag_start = /\n(\s{1,#{@tagsize-1}}\S)/
    marked = str.gsub(subtag_start, "\n#{marker}\\1")
    marked.split(marker)
  end

  # Returns the contents of the entry as a Hash with the top level tags as
  # its keys.  Fields sharing the same tag are concatenated in order of
  # appearance; the Hash returns '' for tags that are absent.
  def entry2hash(entry)
    by_tag = Hash.new('')
    toptag2array(entry).each do |chunk|
      by_tag[tag_get(chunk)] += chunk
    end
    by_tag
  end

end # class NCBIDB


# Class for KEGG databases.  Inherits the NCBIDB class; nothing is added or
# overridden in this definition, so construction and the rough splitting of
# an entry are those of NCBIDB.
class KEGGDB < NCBIDB
end


# Stores an EMBL style (EMBL, TrEMBL, Swiss-Prot etc.) entry.
class EMBLDB < DB

  autoload :Common, 'bio/db/embl/common'

  # The entire entry is passed as a String.  The length of the tag field is
  # passed as an Integer.  Parses the entry roughly by the entry2hash method
  # and returns a database object.
  def initialize(entry, tagsize)
    @tagsize = tagsize
    # Hash of the original entry
    @orig = entry2hash(entry.strip)
    # Hash of the parsed entry
    @data = {}
  end

  private

  # Returns the contents of the entry as a Hash keyed by line tag.
  #
  # Lines tagged 'XX' are skipped, and every tag matching /^R./ is stored
  # under the single key 'R' (reference lines).  Lines sharing a key are
  # concatenated in order of appearance.
  #
  # Looking up a tag that is absent yields a new empty String but does NOT
  # add the tag to the Hash, so reading a missing field does not change
  # which keys the Hash reports afterwards.
  def entry2hash(entry)
    hash = {}
    entry.each_line do |line|
      tag = tag_get(line)
      next if tag == 'XX'
      tag = 'R' if tag =~ /^R./	# Reference lines
      (hash[tag] ||= String.new) << line
    end
    # Installed only after the entry is loaded: the default hands back a
    # fresh mutable String without storing it under the requested key.
    hash.default_proc = proc { |_hash, _tag| String.new }
    hash
  end

end # class EMBLDB

end # module Bio