File: format_fastq.rb

package info (click to toggle)
ruby-bio 1.5.0-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 7,480 kB
  • ctags: 9,428
  • sloc: ruby: 74,117; xml: 3,383; makefile: 17; perl: 13; sh: 1
file content (174 lines) | stat: -rw-r--r-- 5,473 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#
# = bio/db/fasta/format_fastq.rb - FASTQ format generater
#
# Copyright::   Copyright (C) 2009
#               Naohisa Goto <ng@bioruby.org>
# License::    The Ruby License
#

require 'bio/db/fastq'

module Bio::Sequence::Format::Formatter

  # INTERNAL USE ONLY, YOU SHOULD NOT USE THIS CLASS.
  #
  # FASTQ format output class for Bio::Sequence.
  #
  # The default FASTQ format is fastq-sanger.
  class Fastq < Bio::Sequence::Format::FormatterBase

    # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD.
    #
    # Creates a new Fasta format generater object from the sequence.
    #
    # ---
    # *Arguments*:
    # * _sequence_: Bio::Sequence object
    # * (optional) :repeat_title => (true or false) if true, repeating title in the "+" line; if not true, "+" only (default false)
    # * (optional) :width => _width_: (Fixnum) width to wrap sequence and quality lines;  nil to prevent wrapping (default nil)
    # * (optional) :title => _title_: (String) completely replaces title line with the _title_ (default nil)
    # * (optional) :default_score => _score_: (Integer) default score for bases that have no valid quality scores or error probabilities; false or nil means the lowest score, true means the highest score (default nil)
    def initialize; end if false # dummy for RDoc

    # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD.
    #
    # Output the FASTQ format string of the sequence.  
    #
    # Currently, this method is used in Bio::Sequence#output like so,
    #
    #   s = Bio::Sequence.new('atgc')
    #   puts s.output(:fastq_sanger)
    # ---
    # *Returns*:: String object
    def output
      title = @options[:title]
      width = @options.has_key?(:width) ? @options[:width] : nil
      seq = @sequence.seq.to_s
      entry_id = @sequence.entry_id || 
        "#{@sequence.primary_accession}.#{@sequence.sequence_version}"
      definition = @sequence.definition
      unless title then
        title = definition.to_s
        unless title[0, entry_id.length] == entry_id and
            /\s/ =~ title[entry_id.length, 1].to_s then
          title = "#{entry_id} #{title}"
        end
      end
      title2 = @options[:repeat_title] ? title : ''
      qstr = fastq_quality_string(seq, @options[:default_score])

      "@#{title}\n" +
        if width then
          seq.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
        else
          seq + "\n"
        end +
        "+#{title2}\n" +
        if width then
          qstr.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
        else
          qstr + "\n"
        end
    end

    private
    def fastq_format_data
      Bio::Fastq::FormatData::FASTQ_SANGER.instance
    end

    def fastq_quality_string(seq, default_score)
      sc = fastq_quality_scores(seq)
      if sc.size < seq.length then
        if default_score == true then
          # when true, the highest score
          default_score = fastq_format_data.score_range.end
        else
          # when false or nil, the lowest score
          default_score ||= fastq_format_data.score_range.begin
        end
        sc = sc + ([ default_score ] * (seq.length - sc.size))
      end
      fastq_format_data.scores2str(sc)
    end

    def fastq_quality_scores(seq)
      return [] if seq.length <= 0
      fmt = fastq_format_data
      # checks quality_scores
      qsc = @sequence.quality_scores
      qsc_type = @sequence.quality_score_type
      if qsc and qsc_type and
          qsc_type == fmt.quality_score_type and
          qsc.size >= seq.length then
        return qsc
      end
      
      # checks error_probabilities
      ep = @sequence.error_probabilities
      if ep and ep.size >= seq.length then
        return fmt.p2q(ep[0, seq.length])
      end

      # If quality score type of the sequence is nil, regarded as :phred.
      qsc_type ||= :phred

      # checks if scores can be converted
      if qsc and qsc.size >= seq.length then
        case [ qsc_type, fmt.quality_score_type ]
        when [ :phred, :solexa ]
          return fmt.convert_scores_from_phred_to_solexa(qsc[0, seq.length])
        when [ :solexa, :phred ]
          return fmt.convert_scores_from_solexa_to_phred(qsc[0, seq.length])
        end
      end

      # checks quality scores type
      case qsc_type
      when :phred, :solexa
        #does nothing
      else
        qsc_type = nil
        qsc = nil
      end

      # collects piece of information
      qsc_cov = qsc ? qsc.size.quo(seq.length) : 0
      ep_cov = ep ? ep.size.quo(seq.length) : 0
      if qsc_cov > ep_cov then
        case [ qsc_type, fmt.quality_score_type ]
        when [ :phred, :phred ], [ :solexa, :solexa ]
          return qsc
        when [ :phred, :solexa ]
          return fmt.convert_scores_from_phred_to_solexa(qsc)
        when [ :solexa, :phred ]
          return fmt.convert_scores_from_solexa_to_phred(qsc)
        end
      elsif ep_cov > qsc_cov then
        return fmt.p2q(ep)
      end

      # if no information, returns empty array
      return []
    end
  end #class Fastq

  # class Fastq_sanger is the same as the Fastq class.
  Fastq_sanger = Fastq

  class Fastq_solexa < Fastq
    private
    def fastq_format_data
      Bio::Fastq::FormatData::FASTQ_SOLEXA.instance
    end
  end #class Fastq_solexa

  class Fastq_illumina < Fastq
    private
    def fastq_format_data
      Bio::Fastq::FormatData::FASTQ_ILLUMINA.instance
    end
  end #class Fastq_illumina

end #module Bio::Sequence::Format::Formatter