1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
|
#
# = bio/appl/gcg/seq.rb - GCG sequence file format class (.seq/.pep file)
#
# Copyright:: Copyright (C) 2003, 2006
# Naohisa Goto <ng@bioruby.org>
# License:: The Ruby License
#
# $Id: seq.rb,v 1.3 2007/04/05 23:35:39 trevor Exp $
#
# = About Bio::GCG::Seq
#
# Please refer document of Bio::GCG::Seq.
#
module Bio
module GCG
#
# = Bio::GCG::Seq
#
# This is GCG sequence file format (.seq or .pep) parser class.
#
# = References
#
# * Information about GCG Wisconsin Package(R)
# http://www.accelrys.com/products/gcg_wisconsin_package .
# * EMBOSS sequence formats
# http://www.hgmp.mrc.ac.uk/Software/EMBOSS/Themes/SequenceFormats.html
# * BioPerl document
# http://docs.bioperl.org/releases/bioperl-1.2.3/Bio/SeqIO/gcg.html
class Seq #< DB
# delimiter used by Bio::FlatFile
DELIMITER = RS = nil
# Creates new instance of this class.
# str must be a GCG seq formatted string.
def initialize(str)
@heading = str[/.*/] # '!!NA_SEQUENCE 1.0' or like this
str = str.sub(/.*/, '')
str.sub!(/.*\.\.$/m, '')
@definition = $&.to_s.sub(/^.*\.\.$/, '').to_s
desc = $&.to_s
if m = /(.+)\s+Length\:\s+(\d+)\s+(.+)\s+Type\:\s+(\w)\s+Check\:\s+(\d+)/.match(desc) then
@entry_id = m[1].to_s.strip
@length = (m[2] ? m[2].to_i : nil)
@date = m[3].to_s.strip
@seq_type = m[4]
@checksum = (m[5] ? m[5].to_i : nil)
end
@data = str
@seq = nil
@definition.strip!
end
# ID field.
attr_reader :entry_id
# Description field.
attr_reader :definition
# "Length:" field.
# Note that sometimes this might differ from real sequence length.
attr_reader :length
# Date field of this entry.
attr_reader :date
# "Type:" field, which indicates sequence type.
# "N" means nucleic acid sequence, "P" means protein sequence.
attr_reader :seq_type
# "Check:" field, which indicates checksum of current sequence.
attr_reader :checksum
# heading
# ('!!NA_SEQUENCE 1.0' or whatever like this)
attr_reader :heading
#---
## data (internally used, will be obsoleted)
#attr_reader :data
#+++
# Sequence data.
# The class of the sequence is Bio::Sequence::NA, Bio::Sequence::AA
# or Bio::Sequence::Generic, according to the sequence type.
def seq
unless @seq then
case @seq_type
when 'N', 'n'
k = Bio::Sequence::NA
when 'P', 'p'
k = Bio::Sequence::AA
else
k = Bio::Sequence
end
@seq = k.new(@data.tr('^-a-zA-Z.~', ''))
end
@seq
end
# If you know the sequence is AA, use this method.
# Returns a Bio::Sequence::AA object.
#
# If you call naseq for protein sequence,
# or aaseq for nucleic sequence, RuntimeError will be raised.
def aaseq
if seq.is_a?(Bio::Sequence::AA) then
@seq
else
raise 'seq_type != \'P\''
end
end
# If you know the sequence is NA, use this method.
# Returens a Bio::Sequence::NA object.
#
# If you call naseq for protein sequence,
# or aaseq for nucleic sequence, RuntimeError will be raised.
def naseq
if seq.is_a?(Bio::Sequence::NA) then
@seq
else
raise 'seq_type != \'N\''
end
end
# Validates checksum.
# If validation succeeds, returns true.
# Otherwise, returns false.
def validate_checksum
checksum == self.class.calc_checksum(seq)
end
#---
# class methods
#+++
# Calculates checksum from given string.
def self.calc_checksum(str)
# Reference: Bio::SeqIO::gcg of BioPerl-1.2.3
idx = 0
sum = 0
str.upcase.tr('^A-Z.~', '').each_byte do |c|
idx += 1
sum += idx * c
idx = 0 if idx >= 57
end
(sum % 10000)
end
# Creates a new GCG sequence format text.
# Parameters can be omitted.
#
# Examples:
# Bio::GCG::Seq.to_gcg(:definition=>'H.sapiens DNA',
# :seq_type=>'N', :entry_id=>'gi-1234567',
# :seq=>seq, :date=>date)
#
def self.to_gcg(hash)
seq = hash[:seq]
if seq.is_a?(Bio::Sequence::NA) then
seq_type = 'N'
elsif seq.is_a?(Bio::Sequence::AA) then
seq_type = 'P'
else
seq_type = (hash[:seq_type] or 'P')
end
if seq_type == 'N' then
head = '!!NA_SEQUENCE 1.0'
else
head = '!!AA_SEQUENCE 1.0'
end
date = (hash[:date] or Time.now.strftime('%B %d, %Y %H:%M'))
entry_id = hash[:entry_id].to_s.strip
len = seq.length
checksum = self.calc_checksum(seq)
definition = hash[:definition].to_s.strip
seq = seq.upcase.gsub(/.{1,50}/, "\\0\n")
seq.gsub!(/.{10}/, "\\0 ")
w = len.to_s.size + 1
i = 1
seq.gsub!(/^/) { |x| s = sprintf("\n%*d ", w, i); i += 50; s }
[ head, "\n", definition, "\n\n",
"#{entry_id} Length: #{len} #{date} " \
"Type: #{seq_type} Check: #{checksum} ..\n",
seq, "\n" ].join('')
end
end #class Seq
end #module GCG
end #module Bio
|