File: alignment.rb

package info (click to toggle)
ruby-bio 1.5.0-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 7,480 kB
  • ctags: 9,428
  • sloc: ruby: 74,117; xml: 3,383; makefile: 17; perl: 13; sh: 1
file content (129 lines) | stat: -rw-r--r-- 3,444 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#
# = bio/appl/phylip/alignment.rb - phylip multiple alignment format parser
#
# Copyright:: Copyright (C) 2006
#             GOTO Naohisa <ng@bioruby.org>
#
# License:: The Ruby License
#
#  $Id: alignment.rb,v 1.2 2007/04/05 23:35:40 trevor Exp $
#
# = About Bio::Phylip::PhylipFormat
#
# Please refer to the documentation of the Bio::Phylip::PhylipFormat class.
#

module Bio
  module Phylip

    # Parser for the PHYLIP multiple alignment format.
    #
    # Both layouts, interleaved and non-interleaved (sequential), are
    # accepted. The layout is guessed from the data (see #interleaved?).
    #
    # The input is parsed lazily: the header line is read in #initialize,
    # the sequence lines are only parsed on the first call to #alignment.
    #
    class PhylipFormat

      # Creates a new object from a string.
      #
      # ---
      # *Arguments*:
      # * (required) _str_: String holding the whole PHYLIP data. After
      #   stripping surrounding whitespace, the first line is taken as the
      #   header; its first two whitespace-separated fields are converted
      #   with to_i and stored as the number of sequences and the
      #   alignment length.
      def initialize(str)
        @data = str.strip.split(/(?:\r\n|\r|\n)/)
        @first_line = @data.shift
        @number_of_sequences, @alignment_length =
          @first_line.to_s.strip.split(/\s+/).map { |field| field.to_i }
      end

      # number of sequences (as declared in the header line)
      attr_reader :number_of_sequences

      # alignment length (as declared in the header line)
      attr_reader :alignment_length

      # Returns true if the alignment looks "interleaved", false otherwise.
      #
      # The decision is made once and cached. It only inspects the second
      # line after the header: when that line begins with a space it is
      # regarded as a continuation line of a non-interleaved record.
      # Because of this, the guess may be wrong when the alignment is very
      # short (e.g. every sequence fits on a single line).
      def interleaved?
        unless defined? @interleaved_flag
          @interleaved_flag = (/\A +/ !~ @data[1].to_s)
        end
        @interleaved_flag
      end

      # Gets the alignment. Returns a Bio::Alignment object.
      #
      # The data are parsed on the first call; the resulting object is
      # cached and returned as-is by later calls.
      def alignment
        unless defined? @alignment
          do_parse
          a = Bio::Alignment.new
          (0...@number_of_sequences).each do |i|
            a.add_seq(@sequences[i], @sequence_names[i])
          end
          @alignment = a
        end
        @alignment
      end

      private

      # Parses @data with the parser matching the detected layout.
      def do_parse
        if interleaved?
          do_parse_interleaved
        else
          do_parse_noninterleaved
        end
      end

      # Parses interleaved data into @sequence_names and @sequences.
      #
      # The first @number_of_sequences lines each carry a name followed by
      # the first chunk of residues. Every later non-blank line is appended
      # to the sequences in round-robin order; a blank line restarts the
      # round at the first sequence. @data is emptied afterwards.
      # Always returns true.
      def do_parse_interleaved
        # Take the name-bearing block off the front of the remaining lines.
        first_block = @data.slice!(0, @number_of_sequences)
        @sequence_names = Array.new(@number_of_sequences) { '' }
        # Placeholders; entries with a line in the first block are replaced.
        @sequences = Array.new(@number_of_sequences) do
          ' ' * @alignment_length
        end
        first_block.each_with_index do |line, i|
          name, residues = line.split(/ +/, 2)
          @sequence_names[i] = name.to_s
          # residues is nil for a name-only line; treat it as "no residues".
          @sequences[i].replace(residues.to_s.gsub(/\s+/, ''))
        end
        i = 0
        @data.each do |line|
          if line.strip.empty?
            i = 0
          else
            @sequences[i] << line.gsub(/\s+/, '')
            i = (i + 1) % @number_of_sequences
          end
        end
        @data.clear
        true
      end

      # Parses non-interleaved data into @sequence_names and @sequences.
      #
      # Blank lines are skipped. A line starts a new record when no record
      # is open yet, when the current sequence is already longer than
      # @alignment_length, or when the line does not begin with whitespace;
      # any other line is appended to the current sequence.
      # @data is emptied afterwards. Always returns true.
      def do_parse_noninterleaved
        @sequence_names = Array.new(@number_of_sequences) { '' }
        # Placeholders; each one is replaced when its record is read.
        @sequences = Array.new(@number_of_sequences) do
          ' ' * @alignment_length
        end
        curseq = nil
        i = 0
        @data.each do |line|
          next if line.strip.empty?
          # NOTE(review): ">" lets one extra whitespace-led line be appended
          # to a sequence that already has exactly @alignment_length
          # residues; ">=" may have been intended -- confirm before changing.
          if !curseq ||
              curseq.length > @alignment_length || /^\s/ !~ line
            name, residues = line.strip.split(/ +/, 2)
            @sequence_names[i] = name
            curseq = @sequences[i]
            # residues is nil for a name-only line; treat it as "no residues".
            curseq.replace(residues.to_s.gsub(/\s+/, ''))
            i += 1
          else
            curseq << line.gsub(/\s+/, '')
          end
        end
        @data.clear
        true
      end

    end #class PhylipFormat
  end #module Phylip
end #module Bio