File: flatfile.rb

package info (click to toggle)
ruby-bio 2.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,108 kB
  • sloc: ruby: 68,331; perl: 13; makefile: 11; sh: 1
file content (467 lines) | stat: -rw-r--r-- 13,629 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
#
# = bio/io/flatfile.rb - flatfile access wrapper class
#
#   Copyright (C) 2001-2006 Naohisa Goto <ng@bioruby.org>
#
# License:: The Ruby License
#
#  $Id:$
#
#
# Bio::FlatFile is a helper and wrapper class to read a biological data file.
# It acts like an IO object.
# It can automatically detect data format, and users do not need to tell
# the class what the data is.
#

module Bio

  # Bio::FlatFile is a helper and wrapper class to read a biological data file.
  # It acts like an IO object.
  # It can automatically detect data format, and users do not need to tell
  # the class what the data is.
  class FlatFile

    # Helper classes are registered with autoload, so each file is
    # required only when its constant is first referenced.
    autoload :AutoDetect,          'bio/io/flatfile/autodetection'
    autoload :Splitter,            'bio/io/flatfile/splitter'
    autoload :BufferedInputStream, 'bio/io/flatfile/buffer'

    # Enumerable's methods are driven by #each (an alias of #each_entry).
    include Enumerable

    #
    #   Bio::FlatFile.open(file, *arg)
    #   Bio::FlatFile.open(dbclass, file, *arg)
    #
    # Creates a new Bio::FlatFile object to read a file or a stream
    # which contains _dbclass_ data.
    #
    # _dbclass_ should be a class (or module) or nil.
    # e.g. Bio::GenBank, Bio::FastaFormat.
    #
    # If _file_ is a filename (which doesn't have gets method),
    # the method opens a local file named _file_
    # with <code>File.open(filename, *arg)</code>.
    #
    # When _dbclass_ is omitted or nil is given to _dbclass_,
    # the method tries to determine database class
    # (file format) automatically.
    # When it fails to determine, dbclass is set to nil
    # and FlatFile#next_entry would fail.
    # You can still set dbclass using FlatFile#dbclass= method.
    #
    # * Example 1
    #     Bio::FlatFile.open(Bio::GenBank, "genbank/gbest40.seq")
    # * Example 2
    #     Bio::FlatFile.open(nil, "embl/est_hum17.dat")
    # * Example 3
    #     Bio::FlatFile.open("genbank/gbest40.seq")
    #
    # * Example 4
    #     Bio::FlatFile.open(Bio::GenBank, $stdin)
    #
    # If it is called with a block, the block will be executed with
    # a new Bio::FlatFile object. If filename is given,
    # the file is automatically closed when leaving the block.
    #
    # * Example 5
    #     Bio::FlatFile.open(nil, 'test4.fst') do |ff|
    #         ff.each { |e| print e.definition, "\n" }
    #     end
    #
    # * Example 6
    #     Bio::FlatFile.open('test4.fst') do |ff|
    #         ff.each { |e| print e.definition, "\n" }
    #     end
    #
    # Compatibility Note:
    # <em>*arg</em> is completely passed to the <code>File.open</code>
    # and you cannot specify ":raw => true" or ":raw => false".
    #
    def self.open(*arg, &block)
      # Accepted call shapes:
      #   FlatFile.open(dbclass, file, mode, perm)
      #   FlatFile.open(file, mode, perm)
      raise ArgumentError, 'wrong number of arguments (0 for 1)' if arg.empty?

      # A leading class/module or nil names the database class and is
      # consumed; any other first argument is already the file itself.
      head = arg.first
      dbclass = (head.is_a?(Module) || head.nil?) ? arg.shift : nil
      raise ArgumentError, 'wrong number of arguments (1 for 2)' if arg.empty?

      file = arg.shift
      if file.respond_to?(:gets)
        # IO-like object: wrap it as is. It is not closed here.
        ff = self.new(dbclass, file)
        block_given? ? (yield ff) : ff
      else
        # Anything without a gets method is treated as a filename.
        _open_file(dbclass, file, *arg, &block)
      end
    end

    # Same as Bio::FlatFile.open(nil, filename_or_stream, mode, perm, options).
    #
    # * Example 1
    #    Bio::FlatFile.auto(ARGF)
    # * Example 2
    #    Bio::FlatFile.auto("embl/est_hum17.dat")
    # * Example 3
    #    Bio::FlatFile.auto(IO.popen("gzip -dc nc1101.flat.gz"))
    #
    def self.auto(*arg, &block)
      # nil in the dbclass slot makes FlatFile.open autodetect the format.
      autodetected = nil
      self.open(autodetected, *arg, &block)
    end

    # Same as FlatFile.auto(filename_or_stream, *arg).to_a
    #
    # (This method might be OBSOLETED in the future.)
    def self.to_a(*arg)
      self.auto(*arg) do |flatfile|
        # Without a detected format no entry can be read, so fail early.
        if flatfile.dbclass
          flatfile.to_a
        else
          raise 'cannot determine file format'
        end
      end
    end

    # Same as FlatFile.auto(filename, *arg),
    # except that it only accepts a filename and doesn't accept an IO object.
    # File format is automatically determined.
    #
    # It can accept a block.
    # If a block is given, it returns the block's return value.
    # Otherwise, it returns a new FlatFile object.
    #
    def self.open_file(filename, *arg, &block)
      # The block has to be forwarded explicitly: block_given?/yield inside
      # _open_file only see a block passed to _open_file itself. Without
      # this, a block given to open_file was silently ignored and an open
      # FlatFile was returned instead of the block's value.
      _open_file(nil, filename, *arg, &block)
    end

    # Same as FlatFile.open(dbclass, filename, *arg),
    # except that it only accepts a filename and doesn't accept an IO object.
    #
    # It can accept a block.
    # If a block is given, it returns the block's return value.
    # Otherwise, it returns a new FlatFile object.
    #
    def self._open_file(dbclass, filename, *arg)
      unless block_given?
        # No block: return a FlatFile wrapping the still-open stream.
        return self.new(dbclass, BufferedInputStream.open_file(filename, *arg))
      end
      # Block form: the stream is scoped by BufferedInputStream.open_file's
      # own block, and the caller's block value is passed back through it.
      BufferedInputStream.open_file(filename, *arg) do |stream|
        yield self.new(dbclass, stream)
      end
    end
    # _open_file is an implementation detail shared by open and open_file.
    private_class_method :_open_file

    # Opens URI specified as _uri_.
    # _uri_ must be a String or URI object.
    # *arg is passed to OpenURI.open_uri or URI#open.
    #
    # Like FlatFile#open, it can accept a block.
    #
    # Note that you MUST explicitly require 'open-uri'.
    # Because open-uri.rb modifies existing class,
    # it isn't required by default.
    # 
    def self.open_uri(uri, *arg)
      unless block_given?
        # No block: return a FlatFile wrapping the still-open stream.
        return self.new(nil, BufferedInputStream.open_uri(uri, *arg))
      end
      # The format is always autodetected here (dbclass is nil).
      BufferedInputStream.open_uri(uri, *arg) do |stream|
        yield self.new(nil, stream)
      end
    end

    # Executes the block for every entry in the stream.
    # Same as FlatFile.open(*arg) { |ff| ff.each { |entry| ... }}.
    # 
    # * Example
    #     Bio::FlatFile.foreach('test.fst') { |e| puts e.definition }
    #
    def self.foreach(*arg, &block)
      # Without a block the old code opened the file and then failed with
      # LocalJumpError on the first entry. Return an Enumerator instead;
      # nothing is opened until it is actually iterated.
      return to_enum(:foreach, *arg) unless block

      self.open(*arg) do |flatfileobj|
        flatfileobj.each(&block)
      end
    end

    # Same as FlatFile.open, except that 'stream' should be an opened
    # stream object (IO, File, ..., which has the 'gets' method).
    #
    # * Example 1
    #    Bio::FlatFile.new(Bio::GenBank, ARGF)
    # * Example 2
    #    Bio::FlatFile.new(Bio::GenBank, IO.popen("gzip -dc nc1101.flat.gz"))
    #
    # Compatibility Note:
    # Now, you cannot specify ":raw => true" or ":raw => false".
    # Below styles are DEPRECATED.
    #
    # * Example 3 (deprecated)
    #    # Bio::FlatFile.new(nil, $stdin, :raw=>true) # => ERROR
    #    # Please rewrite as below.
    #    ff = Bio::FlatFile.new(nil, $stdin)
    #    ff.raw = true
    # * Example 3 in old style (deprecated)
    #    # Bio::FlatFile.new(nil, $stdin, true) # => ERROR
    #    # Please rewrite as below.
    #    ff = Bio::FlatFile.new(nil, $stdin)
    #    ff.raw = true
    #
    def initialize(dbclass, stream)
      # 2nd arg: reuse an existing BufferedInputStream, wrap anything else.
      already_buffered = stream.kind_of?(BufferedInputStream)
      @stream = already_buffered ? stream : BufferedInputStream.for_io(stream)

      # 1st arg: explicit database class, otherwise format autodetection.
      # Both paths need @stream, so it has to be assigned first.
      dbclass ? (self.dbclass = dbclass) : autodetect

      # Leader skipping happens only before the first read by default.
      @skip_leader_mode = :firsttime
      @firsttime_flag = true

      # Entries are returned parsed (not raw) by default.
      self.raw = false
    end

    # Controls when next_entry asks the splitter to skip the leader of
    # the data before reading an entry.
    # :firsttime :: (DEFAULT) only at the head of the file (= the first
    #               read, and the first read after #rewind)
    # :everytime :: before every entry is read
    # nil :: never skip
    attr_accessor :skip_leader_mode

    # (DEPRECATED) IO object in the flatfile object.
    #
    # Emits a deprecation message through Kernel#warn on every call and
    # then returns the same value as #to_io.
    #
    # Compatibility Note: Bio::FlatFile#io is deprecated.
    # Please use Bio::FlatFile#to_io instead.
    def io
      warn "Bio::FlatFile#io is deprecated."
      @stream.to_io
    end

    # IO object in the flatfile object, as returned by the internal
    # buffered stream's to_io.
    #
    # Compatibility Note: this method replaces the deprecated
    # Bio::FlatFile#io.
    def to_io
      @stream.to_io
    end

    # Pathname, filename or URI (or nil), as reported by the internal
    # buffered stream's path method.
    def path
      @stream.path
    end

    # Exception class to be raised when data format hasn't been specified.
    class UnknownDataFormatError < IOError; end

    # Get next entry.
    def next_entry
      unless @dbclass
        raise UnknownDataFormatError, 'file format auto-detection failed?'
      end

      # Skip the leader on every read, or only on the first read
      # (including the first read after rewind), depending on the mode.
      mode = @skip_leader_mode
      if mode == :everytime || (mode == :firsttime && @firsttime_flag)
        @splitter.skip_leader
      end

      # Raw mode returns the entry text; otherwise a parsed object.
      record = raw ? @splitter.get_entry : @splitter.get_parsed_entry
      @firsttime_flag = false
      return nil unless record

      # Only parsed entries are remembered for the #entry reader.
      @entry = record unless raw
      record
    end
    # The entry most recently returned by next_entry in parsed (non-raw)
    # mode. It is not updated while raw mode is on.
    attr_reader :entry

    # Returns the last raw entry as a string.
    # Delegates to the splitter, which exists only once a database class
    # has been set (by autodetection or dbclass=).
    def entry_raw
      @splitter.entry
    end

    # A flag to write down entry start and end positions.
    # The value is read from the current splitter.
    def entry_pos_flag
      @splitter.entry_pos_flag
    end

    # Sets the flag to write down entry start and end positions.
    # The value is stored on the current splitter, not on this object.
    def entry_pos_flag=(x)
      @splitter.entry_pos_flag = x
    end

    # Start position of the last entry, as recorded by the splitter.
    def entry_start_pos
      @splitter.entry_start_pos
    end

    # (End position of the last entry) + 1, as recorded by the splitter.
    def entry_ended_pos
      @splitter.entry_ended_pos
    end

    # Iterates over each entry in the flatfile.
    #
    # * Example
    #    include Bio
    #    ff = FlatFile.open(GenBank, "genbank/gbhtg14.seq")
    #    ff.each_entry do |x|
    #      puts x.definition
    #    end
    def each_entry
      # The class includes Enumerable, so a blockless call should hand
      # back an Enumerator rather than fail with LocalJumpError after the
      # first entry has already been consumed.
      return to_enum(:each_entry) unless block_given?

      # next_entry returns nil once the stream is exhausted.
      while (record = next_entry)
        yield record
      end
    end
    alias :each :each_entry

    # Resets file pointer to the start of the flatfile.
    # (similar to IO#rewind)
    def rewind
      # Rewind through the splitter when there is one, otherwise rewind
      # the stream directly. The next read then counts as the first again.
      (@splitter || @stream).rewind.tap { @firsttime_flag = true }
    end

    # Closes input stream.
    # (similar to IO#close)
    # The call is passed to the internal buffered stream.
    def close
      @stream.close
    end

    # Returns current position of input stream.
    # If the input stream is not a normal file,
    # the result is not guaranteed.
    # It is similar to IO#pos.
    # Note that it may differ from the pos of the underlying IO object,
    # because reads go through FlatFile's own internal buffer
    # (the value comes from the buffered stream).
    def pos
      @stream.pos
    end

    # (Not recommended to use it.)
    # Sets position of input stream.
    # If the input stream is not a normal file,
    # the result is not guaranteed.
    # It is similar to IO#pos=.
    # Note that it will not be equal to io.pos=,
    # because FlatFile has its own internal buffer.
    def pos=(p)
      # Handed to the buffered stream, which keeps its own buffer.
      @stream.pos = p
    end

    # Returns true if input stream is end-of-file.
    # Otherwise, returns false.
    # (Similar to IO#eof?, but may differ from the underlying IO's eof?,
    # because FlatFile has its own internal buffer; the answer comes
    # from the buffered stream.)
    def eof?
      @stream.eof?
    end

    # If true is given, the next_entry method returns
    # an entry as text; if false is given, it returns a parsed object.
    def raw=(bool)
      # Normalize any truthy/falsy argument to a strict true/false.
      @raw = !!bool
    end

    # If true, raw mode: next_entry returns entries unparsed.
    # Always a strict true/false, because raw= normalizes its argument.
    attr_reader :raw

    # Similar to IO#gets.
    # Internal use only. Users should not call it directly.
    # All arguments are passed unchanged to the buffered stream's gets.
    def gets(*arg)
      @stream.gets(*arg)
    end

    # Sets database class. Please use it only if autodetection fails.
    def dbclass=(klass)
      # nil/false clears both the class and its splitter.
      unless klass
        @dbclass = nil
        @splitter = nil
        return
      end

      @dbclass = klass
      @splitter =
        begin
          # Preferred hook: the class builds its own splitter.
          @dbclass.flatfile_splitter(@dbclass, @stream)
        rescue NameError, NoMethodError
          # No usable hook (NoMethodError is itself a NameError): use the
          # class's FLATFILE_SPLITTER constant when it has one, otherwise
          # the default splitter.
          splitter_class =
            begin
              @dbclass::FLATFILE_SPLITTER
            rescue NameError
              Splitter::Default
            end
          splitter_class.new(klass, @stream)
        end
    end

    # Returns database class which is automatically detected,
    # given in FlatFile#initialize, or set later through dbclass=.
    # It is nil when no class has been determined.
    attr_reader :dbclass

    # Performs determination of database class (file format).
    # Pre-reads +lines+ lines for format determination (default 31 lines).
    # If fails, returns nil or false. Otherwise, returns database class.
    #
    # The method can be called anytime if you want (but not recommended).
    # This might be useful if the input file is a mixture of data in multiple formats.
    def autodetect(lines = 31, ad = AutoDetect.default)
      detected = ad.autodetect_flatfile(self, lines)
      if detected
        self.dbclass = detected
      elsif !self.dbclass
        # Nothing detected and nothing set before: assign nil explicitly
        # so the class and splitter are both cleared. A previously set
        # class is left untouched.
        self.dbclass = nil
      end
      detected
    end

    # Detects database class (== file format) of given file.
    # If fails to determine, returns nil.
    def self.autodetect_file(filename)
      # The file is opened only to look at its format, so close it again
      # before returning; previously the open FlatFile was just dropped.
      ff = self.open_file(filename)
      begin
        ff.dbclass
      ensure
        ff.close
      end
    end

    # Detects database class (== file format) of given input stream.
    # If fails to determine, returns nil.
    # Caution: the method reads some data from the input stream,
    # and the data will be lost.
    def self.autodetect_io(io)
      # A nil dbclass makes the constructor run autodetection on the stream.
      flatfile = self.new(nil, io)
      flatfile.dbclass
    end

    # This is OBSOLETED. Please use autodetect_io(io) instead.
    def self.autodetect_stream(io)
      # Kernel#warn terminates the message with a newline, so it no longer
      # runs into whatever is written to stderr next. As before, the
      # message is shown only when $VERBOSE is true.
      warn "Bio::FlatFile.autodetect_stream will be deprecated." if $VERBOSE
      self.autodetect_io(io)
    end

    # Detects database class (== file format) of given string.
    # If fails to determine, returns false or nil.
    # Uses the default detector (AutoDetect.default); no stream is involved.
    def self.autodetect(text)
      AutoDetect.default.autodetect(text)
    end

  end #class FlatFile

end #module Bio