File: sequence_range.rb

package info (click to toggle)
ruby-bio 2.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,108 kB
  • sloc: ruby: 68,331; perl: 13; makefile: 11; sh: 1
file content (256 lines) | stat: -rw-r--r-- 9,623 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#
# bio/util/restriction_enzyme/range/sequence_range.rb - A defined range over a nucleotide sequence
#
# Author::    Trevor Wennblom  <mailto:trevor@corevx.com>
# Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License::   The Ruby License
#

module Bio

require 'bio/util/restriction_enzyme' unless const_defined?(:RestrictionEnzyme)

class RestrictionEnzyme
class Range

  autoload :CutRange,                'bio/util/restriction_enzyme/range/cut_range'
  autoload :CutRanges,               'bio/util/restriction_enzyme/range/cut_ranges'
  autoload :HorizontalCutRange,      'bio/util/restriction_enzyme/range/horizontal_cut_range'
  autoload :VerticalCutRange,        'bio/util/restriction_enzyme/range/vertical_cut_range'

# A defined range over a nucleotide sequence.
#
# This class accomadates having cuts defined on a sequence and returning the
# fragments made by those cuts.
class SequenceRange

  autoload :Fragment,                'bio/util/restriction_enzyme/range/sequence_range/fragment'
  autoload :Fragments,               'bio/util/restriction_enzyme/range/sequence_range/fragments'
  autoload :CalculatedCuts,          'bio/util/restriction_enzyme/range/sequence_range/calculated_cuts'

  # Left-most index of primary strand
  attr_reader :p_left
  
  # Right-most index of primary strand
  attr_reader :p_right
  
  # Left-most index of complementary strand
  attr_reader :c_left
  
  # Right-most index of complementary strand
  attr_reader :c_right

  # Left-most index of DNA sequence
  attr_reader :left
  
  # Right-most index of DNA sequence
  attr_reader :right
  
  # Size of DNA sequence
  attr_reader :size
  
  # CutRanges in this SequenceRange
  attr_reader :cut_ranges

  def initialize( p_left = nil, p_right = nil, c_left = nil, c_right = nil )
    raise ArgumentError if p_left == nil and c_left == nil
    raise ArgumentError if p_right == nil and c_right == nil
    (raise ArgumentError unless p_left <= p_right) unless p_left == nil or p_right == nil
    (raise ArgumentError unless c_left <= c_right) unless c_left == nil or c_right == nil

    @p_left, @p_right, @c_left, @c_right = p_left, p_right, c_left, c_right
    @left = [p_left, c_left].compact.sort.first
    @right = [p_right, c_right].compact.sort.last
    @size = (@right - @left) + 1 unless @left == nil or @right == nil
    @cut_ranges = CutRanges.new
    @__fragments_current = false
  end


  # If the first object is HorizontalCutRange or VerticalCutRange, that is
  # added to the SequenceRange.  Otherwise this method
  # builds a VerticalCutRange object and adds it to the SequenceRange.
  # 
  # Note:
  # Cut occurs immediately after the index supplied.
  # For example, a cut at '0' would mean a cut occurs between bases 0 and 1.
  #
  # ---
  # *Arguments*
  # * +p_cut_left+: (_optional_) Left-most cut on the primary strand *or* a CutRange object.  +nil+ to skip
  # * +p_cut_right+: (_optional_) Right-most cut on the primary strand.  +nil+ to skip
  # * +c_cut_left+: (_optional_) Left-most cut on the complementary strand.  +nil+ to skip
  # * +c_cut_right+: (_optional_) Right-most cut on the complementary strand.  +nil+ to skip
  # *Returns*:: nothing
  def add_cut_range( p_cut_left=nil, p_cut_right=nil, c_cut_left=nil, c_cut_right=nil )
    @__fragments_current = false
    if p_cut_left.kind_of? CutRange # shortcut
      @cut_ranges << p_cut_left
    else
      [p_cut_left, p_cut_right, c_cut_left, c_cut_right].each { |n| (raise IndexError unless n >= @left and n <= @right) unless n == nil }
      @cut_ranges << VerticalCutRange.new( p_cut_left, p_cut_right, c_cut_left, c_cut_right )
    end
  end

  # Add a series of CutRange objects (HorizontalCutRange or VerticalCutRange).
  #
  # ---
  # *Arguments*
  # * +cut_ranges+: A series of CutRange objects
  # *Returns*:: nothing
  def add_cut_ranges(*cut_ranges)
    cut_ranges.flatten.each do |cut_range|
      raise TypeError, "Not of type CutRange" unless cut_range.kind_of? CutRange
      self.add_cut_range( cut_range )
    end
  end

  # Builds a HorizontalCutRange object and adds it to the SequenceRange.
  #
  # ---
  # *Arguments*
  # * +left+: Left-most cut
  # * +right+: (_optional_) Right side - by default this equals the left side, default is recommended.
  # *Returns*:: nothing
  def add_horizontal_cut_range( left, right=left )
    @__fragments_current = false
    @cut_ranges << HorizontalCutRange.new( left, right )
  end
  
  # A Bio::RestrictionEnzyme::Range::SequenceRange::Bin holds an +Array+ of 
  # indexes for the primary and complement strands (+p+ and +c+ accessors).
  # 
  # Example hash with Bin values:
  #   {0=>#<struct Bio::RestrictionEnzyme::Range::SequenceRange::Bin c=[0, 1], p=[0]>,
  #    2=>#<struct Bio::RestrictionEnzyme::Range::SequenceRange::Bin c=[], p=[1, 2]>,
  #    3=>#<struct Bio::RestrictionEnzyme::Range::SequenceRange::Bin c=[2, 3], p=[]>,
  #    4=>#<struct Bio::RestrictionEnzyme::Range::SequenceRange::Bin c=[4, 5], p=[3, 4, 5]>}
  #
  # Note that the bin cannot be easily stored as a range since there may be
  # nucleotides excised in the middle of a range.
  #
  # TODO: Perhaps store the bins as one-or-many ranges since missing
  # nucleotides due to enzyme cutting is a special case.
  Bin = Struct.new(:c, :p)

  # Calculates the fragments over this sequence range as defined after using
  # the methods add_cut_range, add_cut_ranges, and/or add_horizontal_cut_range
  #
  # Example return value:
  #   [#<Bio::RestrictionEnzyme::Range::SequenceRange::Fragment:0x277bdc
  #     @complement_bin=[0, 1],
  #     @primary_bin=[0]>,
  #    #<Bio::RestrictionEnzyme::Range::SequenceRange::Fragment:0x277bc8
  #     @complement_bin=[],
  #     @primary_bin=[1, 2]>,
  #    #<Bio::RestrictionEnzyme::Range::SequenceRange::Fragment:0x277bb4
  #     @complement_bin=[2, 3],
  #     @primary_bin=[]>,
  #    #<Bio::RestrictionEnzyme::Range::SequenceRange::Fragment:0x277ba0
  #     @complement_bin=[4, 5],
  #     @primary_bin=[3, 4, 5]>]
  #
  # ---
  # *Arguments*
  # * _none_
  # *Returns*:: Bio::RestrictionEnzyme::Range::SequenceRange::Fragments
  def fragments
    return @__fragments if @__fragments_current == true
    @__fragments_current = true
    
    num_txt = '0123456789'
    num_txt_repeat = (num_txt * ( @size.div(num_txt.size) + 1))[0..@size-1]
    fragments = Fragments.new(num_txt_repeat, num_txt_repeat)

    cc = Bio::RestrictionEnzyme::Range::SequenceRange::CalculatedCuts.new(@size)
    cc.add_cuts_from_cut_ranges(@cut_ranges)
    cc.remove_incomplete_cuts
    
    create_bins(cc).sort.each { |k, bin| fragments << Fragment.new( bin.p, bin.c ) }
    @__fragments = fragments
    return fragments
  end
  
  #########
  protected
  #########
  
  # Example:
  #   cc = Bio::RestrictionEnzyme::Range::SequenceRange::CalculatedCuts.new(@size)
  #   cc.add_cuts_from_cut_ranges(@cut_ranges)
  #   cc.remove_incomplete_cuts
  #   bins = create_bins(cc)
  # 
  # Example return value:
  #   {0=>#<struct Bio::RestrictionEnzyme::Range::SequenceRange::Bin c=[0, 1], p=[0]>,
  #    2=>#<struct Bio::RestrictionEnzyme::Range::SequenceRange::Bin c=[], p=[1, 2]>,
  #    3=>#<struct Bio::RestrictionEnzyme::Range::SequenceRange::Bin c=[2, 3], p=[]>,
  #    4=>#<struct Bio::RestrictionEnzyme::Range::SequenceRange::Bin c=[4, 5], p=[3, 4, 5]>}
  #
  # ---
  # *Arguments*
  # * +cc+: Bio::RestrictionEnzyme::Range::SequenceRange::CalculatedCuts
  # *Returns*:: +Hash+ Keys are unique, values are Bio::RestrictionEnzyme::Range::SequenceRange::Bin objects filled with indexes of the sequence locations they represent.
  def create_bins(cc)
    p_cut = cc.vc_primary_as_original_class
    c_cut = cc.vc_complement_as_original_class
    h_cut = cc.hc_between_strands_as_original_class
    
    if (defined? @circular) && @circular
      # NOTE
      # if it's circular we should start at the beginning of a cut for orientation
      # scan for it, hack off the first set of hcuts and move them to the back
  
      unique_id = 0
    else
      p_cut.unshift(-1) unless p_cut.include?(-1)
      c_cut.unshift(-1) unless c_cut.include?(-1)
      unique_id = -1
    end

    p_bin_id = c_bin_id = unique_id
    bins = {}
    setup_new_bin(bins, unique_id)

    -1.upto(@size-1) do |idx| # NOTE - circular, for the future - should '-1' be replace with 'unique_id'?
      
      # if bin_ids are out of sync but the strands are attached
      if (p_bin_id != c_bin_id) and !h_cut.include?(idx)
        min_id, max_id = [p_bin_id, c_bin_id].sort
        bins.delete(max_id)
        p_bin_id = c_bin_id = min_id
      end

      bins[ p_bin_id ].p << idx
      bins[ c_bin_id ].c << idx
      
      if p_cut.include? idx
        p_bin_id = (unique_id += 1)
        setup_new_bin(bins, p_bin_id)
      end

      if c_cut.include? idx             # repetition
        c_bin_id = (unique_id += 1)     # repetition
        setup_new_bin(bins, c_bin_id)   # repetition
      end                               # repetition
       
    end
  
    # Bin "-1" is an easy way to indicate the start of a strand just in case
    # there is a horizontal cut at position 0
    bins.delete(-1) unless ((defined? @circular) && @circular)
    bins
  end
  
  # Modifies bins in place by creating a new element with key bin_id and
  # initializing the bin.
  def setup_new_bin(bins, bin_id)
    bins[ bin_id ] = Bin.new
    bins[ bin_id ].p = DenseIntArray[] #could be replaced by SortedNumArray[]
    bins[ bin_id ].c = DenseIntArray[] #could be replaced by SortedNumArray[] 
  end
  
end # SequenceRange
end # Range
end # RestrictionEnzyme
end # Bio