#
# = bio/sequence/na.rb - nucleic acid sequence class
#
# Copyright::   Copyright (C) 2006
#               Toshiaki Katayama <k@bioruby.org>,
#               Ryan Raaum <ryan@raaum.org>
# License::     The Ruby License
#

module Bio

  autoload :NucleicAcid, 'bio/data/na' unless const_defined?(:NucleicAcid)
  autoload :CodonTable,  'bio/data/codontable' unless const_defined?(:CodonTable)

  require 'bio/sequence' unless const_defined?(:Sequence)

class Sequence


# = DESCRIPTION
# Bio::Sequence::NA represents a bare Nucleic Acid sequence in bioruby.
#
# = USAGE
#   # Create a Nucleic Acid sequence.
#   dna = Bio::Sequence.auto('atgcatgcATGCATGCAAAA')
#   rna = Bio::Sequence.auto('augcaugcaugcaugcaaaa')
#
#   # What are the names of all the bases?
#   puts dna.names
#   puts rna.names
#
#   # What is the GC percentage?
#   puts dna.gc_percent
#   puts rna.gc_percent
#
#   # What is the molecular weight?
#   puts dna.molecular_weight
#   puts rna.molecular_weight
#
#   # What is the reverse complement?
#   puts dna.reverse_complement
#   puts dna.complement
#
#   # Is this sequence DNA or RNA?
#   puts dna.rna?
#
#   # Translate my sequence (see method docs for many options)
#   puts dna.translate
#   puts rna.translate
class NA < String

  include Bio::Sequence::Common

  # Generate an nucleic acid sequence object from a string.
  #
  #   s = Bio::Sequence::NA.new("aagcttggaccgttgaagt")
  #
  # or maybe (if you have an nucleic acid sequence in a file)
  #
  #   s = Bio::Sequence:NA.new(File.open('dna.txt').read)
  #
  # Nucleic Acid sequences are *always* all lowercase in bioruby
  #
  #   s = Bio::Sequence::NA.new("AAGcTtGG")
  #   puts s                                  #=> "aagcttgg"
  #
  # Whitespace is stripped from the sequence
  #
  #   seq = Bio::Sequence::NA.new("atg\nggg\ttt\r  gc")
  #   puts s                                  #=> "atggggttgc"
  # ---
  # *Arguments*:
  # * (required) _str_: String
  # *Returns*:: Bio::Sequence::NA object
  def initialize(str)
    super
    self.downcase!
    self.tr!(" \t\n\r",'')
  end

  # Alias of Bio::Sequence::Common splice method, documented there.
  def splicing(position) #:nodoc:
    mRNA = super
    if mRNA.rna?
      mRNA.tr!('t', 'u')
    else
      mRNA.tr!('u', 't')
    end
    mRNA
  end

  # Returns a new complementary sequence object (without reversing).
  # The original sequence object is not modified.
  #
  #   s = Bio::Sequence::NA.new('atgc')
  #   puts s.forward_complement               #=> 'tacg'
  #   puts s                                  #=> 'atgc'
  # ---
  # *Returns*:: new Bio::Sequence::NA object
  def forward_complement
    s = self.class.new(self)
    s.forward_complement!
    s
  end

  # Converts the current sequence into its complement (without reversing).
  # The original sequence object is modified.
  #
  #   seq = Bio::Sequence::NA.new('atgc')
  #   puts s.forward_complement!              #=> 'tacg'
  #   puts s                                  #=> 'tacg'
  # ---
  # *Returns*:: current Bio::Sequence::NA object (modified)
  def forward_complement!
    if self.rna?
      self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
    else
      self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
    end
    self
  end

  # Returns a new sequence object with the reverse complement 
  # sequence to the original.  The original sequence is not modified.
  #
  #   s = Bio::Sequence::NA.new('atgc')
  #   puts s.reverse_complement               #=> 'gcat'
  #   puts s                                  #=> 'atgc'
  # ---
  # *Returns*:: new Bio::Sequence::NA object
  def reverse_complement
    s = self.class.new(self)
    s.reverse_complement!
    s
  end

  # Converts the original sequence into its reverse complement.  
  # The original sequence is modified.
  #
  #   s = Bio::Sequence::NA.new('atgc')
  #   puts s.reverse_complement               #=> 'gcat'
  #   puts s                                  #=> 'gcat'
  # ---
  # *Returns*:: current Bio::Sequence::NA object (modified)
  def reverse_complement!
    self.reverse!
    self.forward_complement!
  end

  # Alias for Bio::Sequence::NA#reverse_complement
  alias complement reverse_complement
  
  # Alias for Bio::Sequence::NA#reverse_complement!
  alias complement! reverse_complement!


  # Translate into an amino acid sequence.
  #   
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.translate                        #=> "MA*"
  #
  # By default, translate starts in reading frame position 1, but you
  # can start in either 2 or 3 as well,
  #
  #   puts s.translate(2)                     #=> "WR"
  #   puts s.translate(3)                     #=> "GV"
  #
  # You may also translate the reverse complement in one step by using frame
  # values of -1, -2, and -3 (or 4, 5, and 6)
  #
  #   puts s.translate(-1)                    #=> "SRH"
  #   puts s.translate(4)                     #=> "SRH"
  #   puts s.reverse_complement.translate(1)  #=> "SRH"
  #
  # The default codon table in the translate function is the Standard
  # Eukaryotic codon table.  The translate function takes either a 
  # number or a Bio::CodonTable object for its table argument. 
  # The available tables are 
  # (NCBI[http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=t]):
  #
  #   1. "Standard (Eukaryote)"
  #   2. "Vertebrate Mitochondrial"
  #   3. "Yeast Mitochondorial"
  #   4. "Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma"
  #   5. "Invertebrate Mitochondrial"
  #   6. "Ciliate Macronuclear and Dasycladacean"
  #   9. "Echinoderm Mitochondrial"
  #   10. "Euplotid Nuclear"
  #   11. "Bacteria"
  #   12. "Alternative Yeast Nuclear"
  #   13. "Ascidian Mitochondrial"
  #   14. "Flatworm Mitochondrial"
  #   15. "Blepharisma Macronuclear"
  #   16. "Chlorophycean Mitochondrial"
  #   21. "Trematode Mitochondrial"
  #   22. "Scenedesmus obliquus mitochondrial"
  #   23. "Thraustochytrium Mitochondrial"
  #
  # If you are using anything other than the default table, you must specify 
  # frame in the translate method call,
  #
  #   puts s.translate                #=> "MA*"  (using defaults)
  #   puts s.translate(1,1)           #=> "MA*"  (same as above, but explicit)
  #   puts s.translate(1,2)           #=> "MAW"  (different codon table)
  #
  # and using a Bio::CodonTable instance in the translate method call,
  #
  #   mt_table = Bio::CodonTable[2]
  #   puts s.translate(1, mt_table)           #=> "MAW"
  #
  # By default, any invalid or unknown codons (as could happen if the 
  # sequence contains ambiguities) will be represented by 'X' in the 
  # translated sequence. 
  # You may change this to any character of your choice.
  #
  #   s = Bio::Sequence::NA.new('atgcNNtga')
  #   puts s.translate                        #=> "MX*"
  #   puts s.translate(1,1,'9')               #=> "M9*"
  #
  # The translate method considers gaps to be unknown characters and treats 
  # them as such (i.e. does not collapse sequences prior to translation), so
  #
  #   s = Bio::Sequence::NA.new('atgc--tga')
  #   puts s.translate                        #=> "MX*"
  # ---
  # *Arguments*:
  # * (optional) _frame_:  one of 1,2,3,4,5,6,-1,-2,-3 (default 1)
  # * (optional) _table_: Fixnum in range 1,23 or Bio::CodonTable object
  #   (default 1)
  # * (optional) _unknown_: Character (default 'X')
  # *Returns*:: Bio::Sequence::AA object
  def translate(frame = 1, table = 1, unknown = 'X')
    if table.is_a?(Bio::CodonTable)
      ct = table
    else
      ct = Bio::CodonTable[table]
    end
    naseq = self.dna
    case frame
    when 1, 2, 3
      from = frame - 1
    when 4, 5, 6
      from = frame - 4
      naseq.complement!
    when -1, -2, -3
      from = -1 - frame
      naseq.complement!
    else
      from = 0
    end
    nalen = naseq.length - from
    nalen -= nalen % 3
    aaseq = naseq[from, nalen].gsub(/.{3}/) {|codon| ct[codon] or unknown}
    return Bio::Sequence::AA.new(aaseq)
  end

  # Returns counts of each codon in the sequence in a hash.
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.codon_usage                #=> {"gcg"=>1, "tga"=>1, "atg"=>1}
  #
  # This method does not validate codons!  Any three letter group is a 'codon'. So,
  #
  #   s = Bio::Sequence::NA.new('atggNNtga')
  #   puts s.codon_usage                #=> {"tga"=>1, "gnn"=>1, "atg"=>1}
  #
  #   seq = Bio::Sequence::NA.new('atgg--tga')
  #   puts s.codon_usage                #=> {"tga"=>1, "g--"=>1, "atg"=>1}
  #
  # Also, there is no option to work in any frame other than the first.
  # ---
  # *Returns*:: Hash object
  def codon_usage
    hash = Hash.new(0)
    self.window_search(3, 3) do |codon|
      hash[codon] += 1
    end
    return hash
  end

  # Calculate the ratio of GC / ATGC bases as a percentage rounded to 
  # the nearest whole number. U is regarded as T.
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.gc_percent                       #=> 55
  #
  # Note that this method only returns an integer value.
  # When more digits after decimal points are needed,
  # use gc_content and sprintf like below:
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts sprintf("%3.2f", s.gc_content * 100)  #=> "55.56"
  # 
  # ---
  # *Returns*:: Fixnum
  def gc_percent
    count = self.composition
    at = count['a'] + count['t'] + count['u']
    gc = count['g'] + count['c']
    return 0 if at + gc == 0
    gc = 100 * gc / (at + gc)
    return gc
  end

  # Calculate the ratio of GC / ATGC bases. U is regarded as T.
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.gc_content                       #=> (5/9)
  #   puts s.gc_content.to_f                  #=> 0.5555555555555556
  #
  # In older Ruby versions, Float is always returned.
  #   
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.gc_content                       #=> 0.555555555555556
  #
  # Note that "u" is regarded as "t".
  # If there are no ATGC bases in the sequence, 0.0 is returned.
  #
  # ---
  # *Returns*:: Rational or Float
  def gc_content
    count = self.composition
    at = count['a'] + count['t'] + count['u']
    gc = count['g'] + count['c']
    total = at + gc
    return 0.0 if total == 0
    return gc.quo(total)
  end

  # Calculate the ratio of AT / ATGC bases. U is regarded as T.
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.at_content                       #=> 4/9
  #   puts s.at_content.to_f                  #=> 0.444444444444444
  #
  # In older Ruby versions, Float is always returned.
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.at_content                       #=> 0.444444444444444
  #
  # Note that "u" is regarded as "t".
  # If there are no ATGC bases in the sequence, 0.0 is returned.
  #
  # ---
  # *Returns*:: Rational or Float
  def at_content
    count = self.composition
    at = count['a'] + count['t'] + count['u']
    gc = count['g'] + count['c']
    total = at + gc
    return 0.0 if total == 0
    return at.quo(total)
  end

  # Calculate the ratio of (G - C) / (G + C) bases.
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.gc_skew                          #=> 3/5
  #   puts s.gc_skew.to_f                     #=> 0.6
  #
  # In older Ruby versions, Float is always returned.
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.gc_skew                          #=> 0.6
  #
  # If there are no GC bases in the sequence, 0.0 is returned.
  #
  # ---
  # *Returns*:: Rational or Float
  def gc_skew
    count = self.composition
    g = count['g']
    c = count['c']
    gc = g + c
    return 0.0 if gc == 0
    return (g - c).quo(gc)
  end

  # Calculate the ratio of (A - T) / (A + T) bases. U is regarded as T.
  #
  #   s = Bio::Sequence::NA.new('atgttgttgttc')
  #   puts s.at_skew                          #=> (-3/4)
  #   puts s.at_skew.to_f                     #=> -0.75
  #
  # In older Ruby versions, Float is always returned.
  #
  #   s = Bio::Sequence::NA.new('atgttgttgttc')
  #   puts s.at_skew                          #=> -0.75
  #
  # Note that "u" is regarded as "t".
  # If there are no AT bases in the sequence, 0.0 is returned.
  #
  # ---
  # *Returns*:: Rational or Float
  def at_skew
    count = self.composition
    a = count['a']
    t = count['t'] + count['u']
    at = a + t
    return 0.0 if at == 0
    return (a - t).quo(at)
  end

  # Returns an alphabetically sorted array of any non-standard bases 
  # (other than 'atgcu').
  #
  #   s = Bio::Sequence::NA.new('atgStgQccR')
  #   puts s.illegal_bases                    #=> ["q", "r", "s"]
  # ---
  # *Returns*:: Array object
  def illegal_bases
    self.scan(/[^atgcu]/).sort.uniq
  end

  # Estimate molecular weight (using the values from BioPerl's 
  # SeqStats.pm[http://doc.bioperl.org/releases/bioperl-1.0.1/Bio/Tools/SeqStats.html] module).
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.molecular_weight                 #=> 2841.00708
  #
  # RNA and DNA do not have the same molecular weights,
  #
  #   s = Bio::Sequence::NA.new('auggcguga')
  #   puts s.molecular_weight                 #=> 2956.94708
  # ---
  # *Returns*:: Float object
  def molecular_weight
    if self.rna?
      Bio::NucleicAcid.weight(self, true)
    else
      Bio::NucleicAcid.weight(self)
    end
  end

  # Create a ruby regular expression instance 
  # (Regexp)[http://corelib.rubyonrails.org/classes/Regexp.html]  
  #
  #   s = Bio::Sequence::NA.new('atggcgtga')
  #   puts s.to_re                            #=> /atggcgtga/
  # ---
  # *Returns*:: Regexp object
  def to_re
    if self.rna?
      Bio::NucleicAcid.to_re(self.dna, true)
    else
      Bio::NucleicAcid.to_re(self)
    end
  end

  # Generate the list of the names of each nucleotide along with the
  # sequence (full name).  Names used in bioruby are found in the
  # Bio::AminoAcid::NAMES hash.
  #
  #   s = Bio::Sequence::NA.new('atg')
  #   puts s.names                    #=> ["Adenine", "Thymine", "Guanine"]
  # ---
  # *Returns*:: Array object
  def names
    array = []
    self.each_byte do |x|
      array.push(Bio::NucleicAcid.names[x.chr.upcase])
    end
    return array
  end

  # Returns a new sequence object with any 'u' bases changed to 't'.
  # The original sequence is not modified.
  #
  #   s = Bio::Sequence::NA.new('augc')
  #   puts s.dna                              #=> 'atgc'
  #   puts s                                  #=> 'augc'
  # ---
  # *Returns*:: new Bio::Sequence::NA object
  def dna
    self.tr('u', 't')
  end

  # Changes any 'u' bases in the original sequence to 't'.
  # The original sequence is modified.
  #
  #   s = Bio::Sequence::NA.new('augc')
  #   puts s.dna!                             #=> 'atgc'
  #   puts s                                  #=> 'atgc'
  # ---
  # *Returns*:: current Bio::Sequence::NA object (modified)
  def dna!
    self.tr!('u', 't')
  end

  # Returns a new sequence object with any 't' bases changed to 'u'.
  # The original sequence is not modified.
  #
  #   s = Bio::Sequence::NA.new('atgc')
  #   puts s.dna                              #=> 'augc'  
  #   puts s                                  #=> 'atgc'
  # ---
  # *Returns*:: new Bio::Sequence::NA object
  def rna
    self.tr('t', 'u')
  end

  # Changes any 't' bases in the original sequence to 'u'.
  # The original sequence is modified.
  #
  #   s = Bio::Sequence::NA.new('atgc')
  #   puts s.dna!                             #=> 'augc'
  #   puts s                                  #=> 'augc'
  # ---
  # *Returns*:: current Bio::Sequence::NA object (modified)
  def rna!
    self.tr!('t', 'u')
  end

  def rna?
    self.index('u')
  end
  protected :rna?

  # Example:
  #
  #   seq = Bio::Sequence::NA.new('gaattc')
  #   cuts = seq.cut_with_enzyme('EcoRI')
  #
  # _or_
  #
  #   seq = Bio::Sequence::NA.new('gaattc')
  #   cuts = seq.cut_with_enzyme('g^aattc')
  # ---
  # See Bio::RestrictionEnzyme::Analysis.cut
  def cut_with_enzyme(*args)
    Bio::RestrictionEnzyme::Analysis.cut(self, *args)
  end
  alias cut_with_enzymes cut_with_enzyme

end # NA

end # Sequence

end # Bio