File: format.rb

package info (click to toggle)
ruby-bio 2.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,108 kB
  • sloc: ruby: 68,331; perl: 13; makefile: 11; sh: 1
file content (383 lines) | stat: -rw-r--r-- 10,668 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#
# = bio/sequence/format.rb - various output formats for biological sequences
#
# Copyright::   Copyright (C) 2006-2008
#               Toshiaki Katayama <k@bioruby.org>,
#               Naohisa Goto <ng@bioruby.org>,
#               Ryan Raaum <ryan@raaum.org>,
#               Jan Aerts <jan.aerts@bbsrc.ac.uk>
# License::     The Ruby License
#

require 'erb'
require 'date'

module Bio

class Sequence

# = DESCRIPTION
# A Mixin[http://www.rubycentral.com/book/tut_modules.html]
# of methods used by Bio::Sequence#output to output sequences in 
# common bioinformatic formats.  These are not called in isolation.
#
# = USAGE
#   # Given a Bio::Sequence object,
#   puts s.output(:fasta)
#   puts s.output(:genbank)
#   puts s.output(:embl)
module Format

  # Repository of formatter classes that can be applied to any kind of
  # sequence (generic, or both nucleotide and protein).
  #
  # Every formatter is registered with autoload under the constant name
  # given below, paired with the feature file that defines it.
  module Formatter

    [
      # Raw format generator
      [ :Raw,            'bio/sequence/format_raw' ],

      # Fasta format generator
      [ :Fasta,          'bio/db/fasta/format_fasta' ],

      # NCBI-style Fasta format generator
      # (resembles the EMBOSS "ncbi" format)
      [ :Fasta_ncbi,     'bio/db/fasta/format_fasta' ],

      # FASTQ "fastq-sanger" format generator
      [ :Fastq,          'bio/db/fastq/format_fastq' ],
      # FASTQ "fastq-sanger" format generator
      [ :Fastq_sanger,   'bio/db/fastq/format_fastq' ],
      # FASTQ "fastq-solexa" format generator
      [ :Fastq_solexa,   'bio/db/fastq/format_fastq' ],
      # FASTQ "fastq-illumina" format generator
      [ :Fastq_illumina, 'bio/db/fastq/format_fastq' ],

      # FastaNumericFormat format generator
      [ :Fasta_numeric,  'bio/db/fasta/format_qual' ],

      # Qual format generator.
      # Its format is the same as Fasta_numeric, but it converts quality
      # scores or generates scores from error probabilities.
      [ :Qual,           'bio/db/fasta/format_qual' ]
    ].each do |const_name, feature|
      autoload(const_name, feature)
    end

  end #module Formatter

  # Repository of formatter classes that only apply to nucleotide
  # sequences.  Each formatter is registered with autoload.
  module NucFormatter

    [
      # GenBank format generator
      # Note that the name is 'Genbank' and NOT 'GenBank'
      [ :Genbank, 'bio/db/genbank/format_genbank' ],

      # EMBL format generator
      # Note that the name is 'Embl' and NOT 'EMBL'
      [ :Embl,    'bio/db/embl/format_embl' ]
    ].each do |const_name, feature|
      autoload(const_name, feature)
    end

  end #module NucFormatter

  # Repository of protein sequence formatter classes.
  #
  # The module is deliberately empty for now; it is still returned by
  # get_formatter_repositories for amino acid sequences and for sequences
  # whose molecule type is neither NA nor AA.
  module AminoFormatter
    # currently no formats available
  end #module AminoFormatter

  # Formatter base class.
  # Any formatter class should inherit this class.
  #
  # A subclass provides its own +output+ instance method, either by
  # defining it directly or by registering an ERB template with
  # erb_template.  The sequence and the options given to the constructor
  # are stored in @sequence and @options, and calls to methods the
  # formatter itself does not define are forwarded to @sequence.
  class FormatterBase

    class << self
      # Builds a formatter for the given sequence and returns its output.
      # ---
      # *Arguments*:
      # * (required) _sequence_: Bio::Sequence object
      # * (optional) _options_: a Hash object
      # *Returns*:: String object
      def output(sequence, options = {})
        formatter = new(sequence, options)
        formatter.output
      end

      private

      # Registers a new ERB template: _str_ is compiled into the
      # +output+ instance method of the receiving class.
      # Always returns true.
      def erb_template(str)
        ERB.new(str).def_method(self, 'output')
        true
      end
    end

    # Stores the sequence to be formatted and the formatting options.
    def initialize(sequence, options = {})
      @sequence = sequence
      @options = options
    end

    # Generates output data.  The base implementation only raises;
    # subclasses are expected to replace it.
    # ---
    # *Returns*:: String object
    def output
      raise NotImplementedError, 'should be implemented in subclass'
    end

    private

    # Any unknown methods are delegated to the sequence object.
    #
    # When the NoMethodError comes from the delegation itself (its first
    # two backtrace entries are exactly the __send__ call below and this
    # method), it is replaced by a NoMethodError that names the formatter
    # and whose backtrace starts at the caller.  Any other NoMethodError
    # is re-raised unchanged.
    #
    # The line number is derived from __LINE__, so the __send__ call must
    # stay exactly two lines above the __LINE__ expression.
    # NOTE(review): the comparison expects backtrace entries of the form
    # "file:line:in `name'"; confirm this still matches on the Ruby
    # versions in use, otherwise the original error is simply re-raised.
    def method_missing(sym, *args, &block) #:nodoc:
      @sequence.__send__(sym, *args, &block)
    rescue NoMethodError => err
      send_lineno = __LINE__ - 2
      here = __FILE__
      delegation_frames = %w[__send__ method_missing].collect do |label|
        "#{here}:#{send_lineno}:in \`#{label}\'"
      end
      if delegation_frames == err.backtrace[0, 2]
        caller_frames = err.backtrace[2..-1]
        err = err.class.new("undefined method \`#{sym.to_s}\' for #{self.inspect}")
        err.set_backtrace(caller_frames)
      end
      raise(err)
    end
  end #class FormatterBase

  # Using Bio::Sequence::Format, return a String with the Bio::Sequence
  # object formatted in the given style.
  #
  # The format name is capitalized and looked up as a constant in each
  # formatter repository module returned by get_formatter_repositories;
  # the first repository that defines it wins.  Use list_output_formats
  # to see the names available for the sequence.
  #
  #   s = Bio::Sequence.new('atgc')
  #   puts s.output(:fasta)                   #=> "> \natgc\n"
  #
  # The style argument is given as a Ruby
  # Symbol(http://www.ruby-doc.org/core/classes/Symbol.html)
  # ---
  # *Arguments*:
  # * (optional) _format_: format name, e.g. :fasta, :genbank, *or* :embl
  #   (default :fasta)
  # * (optional) _options_: a Hash object passed to the formatter class
  # *Returns*:: String object
  # *Raises*:: RuntimeError when no repository defines the format
  def output(format = :fasta, options = {})
    formatter_const = format.to_s.capitalize.intern

    formatter_class = nil
    get_formatter_repositories.each do |mod|
      begin
        # inherit = false restricts the lookup to the repository module
        # itself.  With the default inherited lookup a module also
        # searches Object, so names such as :string or :date would
        # resolve to ::String or ::Date and be treated as formatters.
        formatter_class = mod.const_get(formatter_const, false)
      rescue NameError
        # not defined in this repository (or not a valid constant name);
        # try the next one
      end
      break if formatter_class
    end
    unless formatter_class then
      raise "unknown format name #{format.inspect}"
    end

    formatter_class.output(self, options)
  end

  # Returns the names of the output formats available for the sequence:
  # the constants of every formatter repository module returned by
  # get_formatter_repositories, downcased and converted to Symbols.
  # ---
  # *Arguments*: none
  # *Returns*:: Array of Symbols
  def list_output_formats
    const_names = get_formatter_repositories.flat_map do |repository|
      repository.constants
    end
    const_names.map { |const_name| const_name.to_s.downcase.intern }
  end

  # Shortcut for output(:fasta, :header=>definition, :width=>width).
  # This method is intended to replace Bio::Sequence#to_fasta.
  #
  #   s = Bio::Sequence.new('atgc')
  #   puts s.output_fasta                   #=> "> \natgc\n"
  # ---
  # *Arguments*:
  # * (optional) _definition_: (String) definition line
  # * (optional) _width_: (Integer) width (default 70)
  # *Returns*:: String object
  def output_fasta(definition = nil, width = 70)
    fasta_options = { :header => definition, :width => width }
    output(:fasta, fasta_options)
  end

  private

  # Returns the formatter repository modules to search for this sequence,
  # in lookup order.  The type-specific repository comes first and the
  # generic Formatter last; when moltype is neither Bio::Sequence::NA nor
  # Bio::Sequence::AA both type-specific repositories are included.
  def get_formatter_repositories
    return [ NucFormatter, Formatter ] if self.moltype == Bio::Sequence::NA
    return [ AminoFormatter, Formatter ] if self.moltype == Bio::Sequence::AA

    [ NucFormatter, AminoFormatter, Formatter ]
  end

  #---

  # Not yet implemented :)
  # Remove the nodoc command after implementation!
  # ---
  # *Returns*:: String object
  #def format_gff #:nodoc:
  #  raise NotImplementedError
  #end

  #+++

# Formatting helper methods for INSD (NCBI, EMBL, DDBJ) feature table.
#
# Every method is private: the module is meant to be mixed into a
# formatter and used from inside it.
module INSDFeatureHelper
  private

  # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
  # case, it would be difficult to successfully call this method outside
  # its expected context).
  #
  # Output the Genbank feature format string of the sequence.
  # Used in Bio::Sequence#output.
  #
  # Lines start with 5 spaces, the feature key occupies a 16 character
  # column, and text is wrapped so that lines stay within 79 characters.
  # ---
  # *Arguments*:
  # * (required) _features_: enumerable of feature objects
  # *Returns*:: String object
  def format_features_genbank(features)
    prefix = ' ' * 5
    indent = prefix + ' ' * 16
    fwidth = 79 - indent.length

    format_features(features, prefix, indent, fwidth)
  end

  # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
  # case, it would be difficult to successfully call this method outside
  # its expected context).
  #
  # Output the EMBL feature format string of the sequence.
  # Used in Bio::Sequence#output.
  #
  # Lines start with 'FT   ', the feature key occupies a 16 character
  # column, and text is wrapped so that lines stay within 80 characters.
  # ---
  # *Arguments*:
  # * (required) _features_: enumerable of feature objects
  # *Returns*:: String object
  def format_features_embl(features)
    prefix = 'FT   '
    indent = prefix + ' ' * 16
    fwidth = 80 - indent.length

    format_features(features, prefix, indent, fwidth)
  end

  # Formats INSD features: every feature is formatted with
  # format_feature and the results are concatenated.
  def format_features(features, prefix, indent, width)
    result = []
    features.each do |feature|
      result.push format_feature(feature, prefix, indent, width)
    end
    return result.join('')
  end

  # Formats an INSD feature.
  #
  # The first line is _prefix_, the feature key left-justified in 16
  # columns, and the position.  Continuation lines of a wrapped position
  # and all qualifier lines start with _indent_.
  def format_feature(feature, prefix, indent, width)
    result = prefix + sprintf("%-16s", feature.feature)

    position = feature.position
    #position = feature.locations.to_s

    result << wrap_and_split_lines(position, width).join("\n" + indent)
    result << "\n"
    result << format_qualifiers(feature.qualifiers, indent, width)
    return result
  end

  # Formats qualifiers.
  #
  # * a qualifier whose value is +true+ is written without a value
  #   (<tt>/name</tt>);
  # * <tt>/translation</tt> is always quoted and folded at exactly _width_
  #   characters instead of being wrapped at spaces;
  # * any other value is quoted (with embedded double quotes doubled)
  #   when it contains a non-digit character or when the qualifier is
  #   <tt>/chromosome</tt>, and written as is otherwise.
  #
  # The qualifier objects are not modified.
  # ---
  # *Returns*:: String object, every line prefixed with _indent_
  def format_qualifiers(qualifiers, indent, width)
    qualifiers.collect do |qualifier|
      q = qualifier.qualifier
      value = qualifier.value

      # The flag test has to look at the raw value: once converted to a
      # String it can never be equal to true.
      if value == true
        lines = wrap_with_newline('/' + q, width)
      elsif q == 'translation'
        lines = fold("/#{q}=\"#{value}\"", width)
      else
        v = value.to_s
        if v[/\D/] or q == 'chromosome'
          # String#to_s returns the receiver itself, so a non-destructive
          # gsub is required to leave the qualifier's value untouched
          # (and to accept frozen values).
          v = '"' + v.gsub(/"/, '""') + '"'
        end
        lines = wrap_with_newline('/' + q + '=' + v, width)
      end

      lines.gsub(/^/, indent)
    end.join
  end

  # Cuts _str_ into pieces of at most _width_ characters and returns them
  # as a single String in which every piece is followed by a newline.
  def fold(str, width)
    str.gsub(Regexp.new("(.{1,#{width}})"), "\\1\n")
  end

  # Same as fold, but returns the pieces as an Array of Strings.
  def fold_and_split_lines(str, width)
    str.scan(Regexp.new(".{1,#{width}}"))
  end

  # Wraps _str_ so that no line is longer than _width_ characters and
  # returns the resulting lines as an Array of Strings.
  #
  # Existing line breaks are kept.  A long line is preferably broken at a
  # space or right after a comma or semicolon; when no such position
  # exists within _width_ characters, it is cut at exactly _width_
  # characters.  A _width_ smaller than 1 is treated as 1.
  def wrap_and_split_lines(str, width)
    # A non-positive width can never consume any text, which would make
    # the loop below spin forever on a non-empty line.
    width = 1 if width < 1

    result = []
    lefts = str.chomp.split(/(?:\r\n|\r|\n)/)
    lefts.each do |left|
      left.rstrip!
      while left and left.length > width
        line = nil
        # search backwards for the rightmost usable break position
        width.downto(1) do |i|
          if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)]  then
            line = left[0..(i-1)].sub(/ +\z/, '')
            left = left[i..-1].sub(/\A +/, '')
            break
          end
        end
        if line.nil? then
          # no break position found: hard cut
          line = left[0..(width-1)]
          left = left[width..-1]
        end
        result << line
        left = nil if  left.to_s.empty?
      end
      result << left if left
    end
    return result
  end

  # Wraps _str_ with wrap_and_split_lines and returns one String in which
  # every line, including the last, ends with a newline.  Returns an
  # empty String when there is nothing to output.
  def wrap_with_newline(str, width)
    result = wrap_and_split_lines(str, width)
    result_string = result.join("\n")
    result_string << "\n" unless result_string.empty?
    return result_string
  end

  # Wraps _str_ so that every line, including _prefix_, is at most
  # _width_ characters long, and puts _prefix_ in front of each line.
  # There is no trailing newline.  Returns an empty String when there is
  # nothing to output.
  def wrap(str, width = 80, prefix = '')
    actual_width = width - prefix.length
    result = wrap_and_split_lines(str, actual_width)
    result_string = result.join("\n#{prefix}")
    result_string = prefix + result_string unless result_string.empty?
    return result_string
  end

  #--
  # internal use only
  # Month abbreviations indexed by month number (index 0 is unused).
  MonthStr = [ nil, 
               'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
               'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
             ].collect { |x| x.freeze }.freeze
  #++

  # Formats a date from Date, DateTime, or Time object, or String.
  #
  # Objects that respond to year, month and day are written as
  # DD-MON-YYYY (e.g. 05-MAR-2008).  Anything else is converted to a
  # String and left-justified in 11 columns.
  def format_date(d)
    begin
      yy = d.year
      mm = d.month
      dd = d.day
    rescue NoMethodError, NameError, ArgumentError, TypeError
      return sprintf("%-11s", d)
    end
    sprintf("%02d-%-3s-%04d", dd, MonthStr[mm], yy)
  end

  # Null date: January 1st of year 0, as a Date object.
  def null_date
    Date.new(0, 1, 1)
  end

end #module INSDFeatureHelper

end #module Format

end #class Sequence

end #module Bio