File: double_stranded.rb

package info (click to toggle)
ruby-bio 2.0.6-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 7,108 kB
sloc: ruby: 68,331; perl: 13; makefile: 11; sh: 1
file content (320 lines) | stat: -rw-r--r-- 10,843 bytes
parent folder | download | duplicates (7)
#
# bio/util/restriction_enzyme/double_stranded.rb - DoubleStranded restriction enzyme sequence
#
# Author::    Trevor Wennblom  <mailto:trevor@corevx.com>
# Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License::   The Ruby License
#

module Bio

require 'bio/util/restriction_enzyme' unless const_defined?(:RestrictionEnzyme)

class RestrictionEnzyme

# A pair of SingleStrand and SingleStrandComplement objects with methods to
# add utility to their relation.
# 
# = Notes
# * This is created by Bio::RestrictionEnzyme.new for convenience.
# * The two strands accessible are +primary+ and +complement+.
# * SingleStrand methods may be used on DoubleStranded and they will be passed to +primary+.
# 
# 
# FIXME needs better docs
class DoubleStranded

  # Nested helper classes are registered with autoload, so each file is only
  # required the first time its constant is referenced.
  autoload :AlignedStrands,  'bio/util/restriction_enzyme/double_stranded/aligned_strands'
  autoload :CutLocations,    'bio/util/restriction_enzyme/double_stranded/cut_locations'
  autoload :CutLocationPair, 'bio/util/restriction_enzyme/double_stranded/cut_location_pair'
  autoload :CutLocationsInEnzymeNotation,    'bio/util/restriction_enzyme/double_stranded/cut_locations_in_enzyme_notation'
  autoload :CutLocationPairInEnzymeNotation, 'bio/util/restriction_enzyme/double_stranded/cut_location_pair_in_enzyme_notation'

  # CutSymbol and StringFormatting are both included and extended, which makes
  # their methods callable on instances as well as on the class itself.
  include CutSymbol
  extend CutSymbol
  include StringFormatting
  extend StringFormatting

  # The primary strand (a SingleStrand, assigned in create_primary_and_complement)
  attr_reader :primary

  # The complement strand (a SingleStrandComplement, assigned in
  # create_primary_and_complement)
  attr_reader :complement

  # Cut locations in 0-based index format, DoubleStranded::CutLocations object
  # (derived from +cut_locations_in_enzyme_notation+ via +to_array_index+)
  attr_reader :cut_locations

  # Cut locations in enzyme index notation, DoubleStranded::CutLocationsInEnzymeNotation object
  attr_reader :cut_locations_in_enzyme_notation

  # [+erp+] One of three possible parameters:  The name of an enzyme, a REBASE::EnzymeEntry object, or a nucleotide pattern with a cut mark.
  # [+raw_cut_pairs+] The cut locations in enzyme index notation.
  #
  # Enzyme index notation:: 1.._n_, value before 1 is -1
  #
  # Examples of the allowable cut locations for +raw_cut_pairs+ follow.  'p' and
  # 'c' refer to a cut location on the 'p'rimary and 'c'omplement strands.
  #
  #   1, [3,2], [20,22], 57
  #   p, [p,c], [p, c],  p
  #
  # Which is the same as:
  #
  #   1, (3..2), (20..22), 57
  #   p, (p..c), (p..c),   p
  #
  # Examples of partial cuts:
  #   1, [nil,2], [20,nil], 57
  #   p, [p,  c], [p, c],   p
  #
  def initialize(erp, *raw_cut_pairs)
    # Dispatches on the kind of +erp+ to one of the protected initialize_with_*
    # helpers.
    #
    # Raises ArgumentError when +erp+ is nil, is of an unsupported class, or is
    # a Bio::REBASE::EnzymeEntry accompanied by explicit cut locations.
    # Raises IndexError when +erp+ looks like an enzyme name but no entry can
    # be found for it.

    # 'erp' : 'E'nzyme / 'R'ebase / 'P'attern
    k = erp.class

    if k == Bio::REBASE::EnzymeEntry
      # Passed a Bio::REBASE::EnzymeEntry object.  The entry supplies its own
      # cut locations, so additional ones are rejected.

      unless raw_cut_pairs.empty?
        err = "A Bio::REBASE::EnzymeEntry object was passed, however the cut locations contained values.  Ambiguous or redundant.\n"
        err += "inspect = #{raw_cut_pairs.inspect}"
        raise ArgumentError, err
      end
      initialize_with_rebase( erp )

    elsif erp.kind_of? String
      # Passed something that could be an enzyme pattern or an enzyme name

      # Decide if this String is an enzyme name or a pattern
      if Bio::RestrictionEnzyme.enzyme_name?( erp )
        # FIXME we added this to rebase...
        rebase = Bio::RestrictionEnzyme.rebase

        # Check if it's a known name (exact match)
        known_enzyme = rebase[ erp ] ? true : false

        # Try harder to find the enzyme: case-insensitive match on the whole
        # name.  The name is escaped so that characters such as '.' (as in
        # 'Nt.AlwI') are matched literally, and \A / \z anchor the entire
        # string rather than a single line.
        unless known_enzyme
          re = /\A#{Regexp.escape(erp)}\z/i
          rebase.each do |name, _entry|
            next unless name =~ re
            known_enzyme = true
            erp = name
            break
          end
        end

        if known_enzyme
          initialize_with_rebase( rebase[erp] )
        else
          raise IndexError, "No entry found for enzyme named '#{erp}'"
        end

      else
        # Not an enzyme name, so a pattern is assumed
        if erp =~ re_cut_symbol
          # The cut marks are embedded in the pattern itself
          initialize_with_pattern_and_cut_symbols( erp )
        else
          # Plain pattern; the cut locations come from the extra arguments
          initialize_with_pattern_and_cut_locations( erp, raw_cut_pairs )
        end
      end

    elsif k == NilClass
      err = "Passed a nil value.  Perhaps you tried to pass a Bio::REBASE::EnzymeEntry that does not exist?\n"
      err += "inspect = #{erp.inspect}"
      raise ArgumentError, err
    else
      err = "I don't know what to do with class #{k} for erp.\n"
      err += "inspect = #{erp.inspect}"
      raise ArgumentError, err
    end

  end

  # See AlignedStrands.align
  def aligned_strands
    # Hands the raw patterns of both strands to AlignedStrands.align and
    # returns whatever it produces.
    primary_pattern = @primary.pattern
    complement_pattern = @complement.pattern
    AlignedStrands.align(primary_pattern, complement_pattern)
  end

  # See AlignedStrands.align_with_cuts
  def aligned_strands_with_cuts
    # Same as aligned_strands, but the cut locations of each strand are passed
    # along so AlignedStrands.align_with_cuts can take them into account.
    patterns = [@primary.pattern, @complement.pattern]
    cuts = [@primary.cut_locations, @complement.cut_locations]
    AlignedStrands.align_with_cuts(*patterns, *cuts)
  end

  # Returns +true+ if the cut pattern creates blunt fragments.
  # (opposite of sticky)
  def blunt?
    # Split each aligned strand at the cut symbol and reduce every piece to
    # its length.  The result is +true+ only when both strands yield the same
    # list of segment lengths.
    aligned = aligned_strands_with_cuts
    primary_lengths, complement_lengths = [aligned.primary, aligned.complement].map do |strand|
      strand.split( cut_symbol ).map { |segment| segment.length }
    end
    primary_lengths == complement_lengths
  end

  # Returns +true+ if the cut pattern creates sticky fragments.
  # (opposite of blunt)
  def sticky?
    # Defined purely as the negation of blunt?, so exactly one of the two
    # predicates is true for any given enzyme.
    !blunt?
  end
  
  # Takes a numerical offset into the sequence and returns an EnzymeAction
  # describing where this enzyme would act at that offset.
  #
  # +offset+:: Numerical offset of where the enzyme action occurs on the sequence
  def create_action_at( offset )
    # The width of the action is the size of the fully aligned primary strand,
    # i.e. the pattern plus any padding needed to make a match on both the
    # primary and complement strand.
    #
    # For example -
    # Note how EcoRII needs extra padding on the beginning and ending of the
    # sequence 'ccagg' to make the match since the cut must occur between
    # two nucleotides and can not occur on the very end of the sequence.
    #
    #   EcoRII:
    #     :blunt: "0"
    #     :c2: "5"
    #     :c4: "0"
    #     :c1: "-1"
    #     :pattern: CCWGG
    #     :len: "5"
    #     :name: EcoRII
    #     :c3: "0"
    #     :ncuts: "2"
    #
    #        -1 1 2 3 4 5
    #   5' - n^c c w g g n - 3'
    #   3' - n g g w c c^n - 5'
    #
    #   (w == [at])
    width = aligned_strands.primary.size
    first_index = offset
    last_index = offset + width - 1

    # The same left/right bounds are used for both argument pairs.
    action = EnzymeAction.new( first_index, last_index, first_index, last_index )

    # Each element is a DoubleStranded::CutLocationPair.
    # NOTE(review): a pair with a nil member (partial cut) would raise at the
    # comparison below - confirm whether partial cuts can reach this method.
    @cut_locations.each do |pair|
      primary_cut = pair.primary
      complement_cut = pair.complement

      if complement_cut >= primary_cut
        action.add_cut_range(offset + primary_cut, nil, nil, offset + complement_cut)
      else
        action.add_cut_range(nil, offset + primary_cut, offset + complement_cut, nil)
      end
    end

    action
  end
  
  # An EnzymeAction is a way of representing a potential effect that a
  # RestrictionEnzyme may have on a nucleotide sequence, an 'action'.
  #
  # Multiple cuts in multiple locations on a sequence may occur in one
  # 'action' if it is done by a single enzyme.
  #
  # An EnzymeAction is a series of locations that represents where the restriction
  # enzyme will bind on the sequence, as well as what ranges are cut on the
  # sequence itself.  The complexity is due to the fact that our virtual
  # restriction enzyme may create multiple segments from its cutting action, 
  # on which another restriction enzyme may operate upon.
  #
  # For example, the DNA sequence:
  # 
  #   5' - G A A T A A A C G A - 3'
  #   3' - C T T A T T T G C T - 5'
  #
  # When mixed with the restriction enzyme with the following cut pattern:
  #
  #   5' -   A|A T A A A C|G   - 3'
  #           +-+         +  
  #   3' -   T T|A T T T G|C   - 5'
  #
  # And also mixed with the restriction enzyme of the following cut pattern:
  #
  #   5' -         A A|A C     - 3'
  #                 +-+  
  #   3' -         T|T T G     - 5'
  #
  # Would result in a DNA sequence with these cuts:
  # 
  #   5' - G A|A T A A|A C|G A - 3'
  #           +-+   +-+   +
  #   3' - C T T|A T|T T G|C T - 5'
  #
  # Or these separate "free-floating" sequences:
  #
  #   5' - G A   - 3'
  #   3' - C T T - 5'
  #
  #   5' - A T A A - 3'
  #   3' -   A T   - 5'
  #
  #   5' -   A C - 3'
  #   3' - T T G - 5'
  #  
  #   5' - G A - 3'
  #   3' - C T - 5'
  #
  # This would be represented by two EnzymeActions - one for each
  # RestrictionEnzyme.
  # 
  # This is, however, subject to competition.  If the second enzyme reaches
  # the target first, then the first enzyme will not be able to find the
  # appropriate bind site.
  # 
  # FIXME complete these docs
  #
  # To initialize an EnzymeAction you must first instantiate it with the
  # beginning and ending locations of where it will operate on a nucleotide
  # sequence.
  #
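  # Next, the ranges of cuts are added to it, one call to +add_cut_range+ per
  # cut location pair (see DoubleStranded#create_action_at).
  #
  # An EnzymeAction defines a single enzyme action, in this case being a range
  # that correlates to the DNA sequence that may contain its own internal cuts.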
  class EnzymeAction < Bio::RestrictionEnzyme::Range::SequenceRange
    # Empty body: nothing is added or overridden here, so every method used on
    # an EnzymeAction (e.g. add_cut_range) is inherited from SequenceRange.
  end

  #########
  protected
  #########

  def initialize_with_pattern_and_cut_symbols( s )
    # +s+ is a pattern that still contains cut symbols (and possibly padding).
    # The primary cuts are read from the pattern with only the padding
    # removed; the sequence itself is the pattern with both cut symbols and
    # padding removed.
    primary_cuts = SingleStrand::CutLocationsInEnzymeNotation.new( strip_padding(s) )
    sequence = Bio::Sequence::NA.new( strip_cuts_and_padding(s) )
    seq_length = sequence.length

    # Mirror every primary cut onto the complement strand, still in enzyme
    # notation.  0 is not a valid enzyme index, so a mirrored value that would
    # land on or below it (the cut lies outside 1...seq_length) is shifted
    # down by one more.
    complement_cuts = primary_cuts.map do |cut|
      mirrored = seq_length - cut
      within_sequence = cut >= 1 && cut < seq_length
      within_sequence ? mirrored : mirrored - 1
    end

    create_cut_locations( primary_cuts.zip(complement_cuts) )
    create_primary_and_complement( sequence, primary_cuts, complement_cuts )
  end

  def initialize_with_pattern_and_cut_locations( s, raw_cl )
    # The cut locations are built first because the strands are created from
    # the enzyme-notation object that create_cut_locations stores.
    create_cut_locations(raw_cl)

    sequence = Bio::Sequence::NA.new(s)
    notation = @cut_locations_in_enzyme_notation
    create_primary_and_complement( sequence, notation.primary, notation.complement )
  end

  def create_primary_and_complement(primary_seq, p_cuts, c_cuts)
    # Builds both strand objects from one sequence: the primary strand uses
    # the sequence as given, the complement strand uses its forward complement.
    @primary = SingleStrand.new( primary_seq, p_cuts )

    complement_seq = primary_seq.forward_complement
    @complement = SingleStrandComplement.new( complement_seq, c_cuts )
  end

  def create_cut_locations(raw_cl)
    # Wrap each raw entry in a CutLocationPairInEnzymeNotation, collect them
    # into the enzyme-notation container, then derive the array-index form
    # from that container.
    pairs = raw_cl.map { |raw_pair| CutLocationPairInEnzymeNotation.new(raw_pair) }
    @cut_locations_in_enzyme_notation = CutLocationsInEnzymeNotation.new(*pairs)
    @cut_locations = @cut_locations_in_enzyme_notation.to_array_index
  end

  def initialize_with_rebase( e )
    # Initializes from a REBASE entry +e+, reading its pattern and its two
    # primary / two complementary strand cut values.
    #
    # Raises IndexError when the number of cuts left on the primary strand
    # differs from the number left on the complement strand, since the two
    # lists could not be paired up.
    p_cl = [e.primary_strand_cut1, e.primary_strand_cut2]
    c_cl = [e.complementary_strand_cut1, e.complementary_strand_cut2]

    # If there's no cut in REBASE it's represented as a 0.
    # 0 is an invalid index, it just means no cut.
    p_cl.delete(0)
    c_cl.delete(0)

    unless p_cl.size == c_cl.size
      err = "Primary and complement strand cut locations are not available in equal numbers.\n"
      err += "primary = #{p_cl.inspect}, complement = #{c_cl.inspect}"
      raise IndexError, err
    end

    initialize_with_pattern_and_cut_locations( e.pattern, p_cl.zip(c_cl) )
  end

end # DoubleStranded
end # RestrictionEnzyme
end # Bio