File: format_qual.rb

package info (click to toggle)
ruby-bio 1.5.0-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 7,480 kB
  • ctags: 9,428
  • sloc: ruby: 74,117; xml: 3,383; makefile: 17; perl: 13; sh: 1
file content (199 lines) | stat: -rw-r--r-- 6,232 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#
# = bio/db/fasta/format_qual.rb - Qual format and FastaNumericFormat generater
#
# Copyright::   Copyright (C) 2009
#               Naohisa Goto <ng@bioruby.org>
# License::    The Ruby License
#

module Bio::Sequence::Format::Formatter

  # INTERNAL USE ONLY, YOU SHOULD NOT USE THIS CLASS.
  # Simple FastaNumeric format output class for Bio::Sequence.
  class Fasta_numeric < Bio::Sequence::Format::FormatterBase

    # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD.
    #
    # Creates a new FastaNumericFormat generater object from the sequence.
    #
    # It does not care whether the content of the quality score is
    # consistent with the sequence or not, e.g. it does not check
    # length of the quality score.
    #
    # ---
    # *Arguments*:
    # * _sequence_: Bio::Sequence object
    # * (optional) :header => _header_: (String) (default nil)
    # * (optional) :width => _width_: (Fixnum) (default 70)
    def initialize; end if false # dummy for RDoc

    # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD.
    #
    # Output the FASTA format string of the sequence.  
    #
    # Currently, this method is used in Bio::Sequence#output like so,
    #
    #   s = Bio::Sequence.new('atgc')
    #   s.quality_scores = [ 70, 80, 90, 100 ]
    #   puts s.output(:fasta_numeric)
    # ---
    # *Returns*:: String object
    def output
      header = @options[:header]
      width = @options.has_key?(:width) ? @options[:width] : 70
      seq = @sequence.seq.to_s
      entry_id = @sequence.entry_id || 
        "#{@sequence.primary_accession}.#{@sequence.sequence_version}"
      definition = @sequence.definition
      header ||= "#{entry_id} #{definition}"

      sc = fastanumeric_quality_scores(seq)
      if width then
        if width <= 0 then
          main = sc.join("\n")
        else
          len = 0
          main = sc.collect do |x|
            str = (len == 0) ? "#{x}" : " #{x}"
            len += str.size
            if len > width then
              len = "#{x}".size
              str = "\n#{x}"
            end
            str
          end.join('')
        end
      else
        main = sc.join(' ')
      end

      ">#{header}\n#{main}\n"
    end

    private

    def fastanumeric_quality_scores(seq)
      @sequence.quality_scores || []
    end

  end #class Fasta_numeric

  # INTERNAL USE ONLY, YOU SHOULD NOT USE THIS CLASS.
  # Simple Qual format (sequence quality) output class for Bio::Sequence.
  class Qual < Fasta_numeric

    # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD.
    #
    # Creates a new Qual format generater object from the sequence.
    #
    # The only difference from Fastanumeric is that Qual outputs
    # Phred score by default, and data conversion will be performed
    # if needed. Output score type can be changed by the
    # ":quality_score_type" option.
    #
    # If the sequence have no quality score type information
    # and no error probabilities, but the score exists,
    # the score is regarded as :phred (Phred score).
    #
    # ---
    # *Arguments*:
    # * _sequence_: Bio::Sequence object
    # * (optional) :header => _header_: (String) (default nil)
    # * (optional) :width => _width_: (Fixnum) (default 70)
    # * (optional) :quality_score_type => _type_: (Symbol) (default nil)
    # * (optional) :default_score => _score_: (Integer) default score for bases that have no valid quality scores or error probabilities (default 0)
    def initialize; end if false # dummy for RDoc

    private

    def fastanumeric_quality_scores(seq)
      qsc = qual_quality_scores(seq)
      if qsc.size > seq.length then
        qsc = qsc[0, seq.length]
      elsif qsc.size < seq.length then
        padding = @options[:default_score] || 0
        psize = seq.length - qsc.size
        qsc += Array.new(psize, padding)
      end
      qsc
    end

    def qual_quality_scores(seq)
      return [] if seq.length <= 0

      # get output quality score type
      fmt = @options[:quality_score_type]

      qsc = @sequence.quality_scores
      qsc_type = @sequence.quality_score_type

      # checks if no need to convert
      if qsc and qsc_type == fmt and
          qsc.size >= seq.length then
        return qsc
      end

      # default output quality score type is :phred
      fmt ||= :phred
      # If quality score type of the sequence is nil, implicitly
      # regarded as :phred.
      qsc_type ||= :phred

      # checks error_probabilities
      ep = @sequence.error_probabilities
      if ep and ep.size >= seq.length then
        case fmt
        when :phred
          return Bio::Sequence::QualityScore::Phred.p2q(ep[0, seq.length])
        when :solexa
          return Bio::Sequence::QualityScore::Solexa.p2q(ep[0, seq.length])
        end
      end

      # Checks if scores can be converted.
      if qsc and qsc.size >= seq.length then
        case [ qsc_type, fmt ]
        when [ :phred, :solexa ]
          return Bio::Sequence::QualityScore::Phred.convert_scores_to_solexa(qsc[0, seq.length])
        when [ :solexa, :phred ]
          return Bio::Sequence::QualityScore::Solexa.convert_scores_to_phred(qsc[0, seq.length])
        end
      end

      # checks quality scores type
      case qsc_type
      when :phred, :solexa
        #does nothing
      else
        qsc_type = nil
        qsc = nil
      end

      # collects piece of information
      qsc_cov = qsc ? qsc.size.quo(seq.length) : 0
      ep_cov = ep ? ep.size.quo(seq.length) : 0
      if qsc_cov > ep_cov then
        case [ qsc_type, fmt ]
        when [ :phred, :phred ], [ :solexa, :solexa ]
          return qsc
        when [ :phred, :solexa ]
          return Bio::Sequence::QualityScore::Phred.convert_scores_to_solexa(qsc)
        when [ :solexa, :phred ]
          return Bio::Sequence::QualityScore::Solexa.convert_scores_to_phred(qsc)
        end
      elsif ep_cov > qsc_cov then
        case fmt
        when :phred
          return Bio::Sequence::QualityScore::Phred.p2q(ep)
        when :solexa
          return Bio::Sequence::QualityScore::Solexa.p2q(ep)
        end
      end

      # if no information, returns empty array
      return []
    end
  end #class Qual

end #module Bio::Sequence::Format::Formatter