1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
|
#
# = bio/db/nbrf.rb - NBRF/PIR format sequence data class
#
# Copyright:: Copyright (C) 2001-2003,2006 Naohisa Goto <ng@bioruby.org>
# Copyright (C) 2001-2002 Toshiaki Katayama <k@bioruby.org>
# License:: The Ruby License
#
# $Id: nbrf.rb,v 1.10 2007/04/05 23:35:40 trevor Exp $
#
# Sequence data class for NBRF/PIR flatfile format.
#
# = References
#
# * http://pir.georgetown.edu/pirwww/otherinfo/doc/techbulletin.html
# * http://www.sander.embl-ebi.ac.uk/Services/webin/help/webin-align/align_format_help.html#pir
# * http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html
#
require 'bio/db'
require 'bio/sequence'
module Bio
# Sequence data class for NBRF/PIR flatfile format.
class NBRF < DB
#--
# based on Bio::FastaFormat class
#++
# Delimiter of each entry. Bio::FlatFile uses it.
# The delimiter ends with the '>' character that begins the next entry.
DELIMITER = RS = "\n>"
# (Integer) excess read size included in DELIMITER:
# the single trailing '>' character.
DELIMITER_OVERRUN = 1 # '>'
#--
# Note: DELIMITER is changed due to the change of Bio::FlatFile.
# DELIMITER = RS = "*\n"
#++
# Creates a new NBRF object from the text of one NBRF/PIR entry.
#
# The first line (">TT;ID") supplies the sequence type and the entry ID,
# the second line the definition, and the remaining lines up to the
# terminating '*' the sequence data.
# If the argument contains more than one entry, only the first entry is
# used; the text of the following entries is kept in #entry_overrun.
#
# ---
# *Arguments*:
# * (required) _str_: String containing the entry
# *Returns*:: the new object
def initialize(str)
  str = str.sub(/\A[\r\n]+/, '') # remove first void lines
  line1, line2, rest = str.split(/^/, 3)

  # A line starting with '>' after the first two lines begins the next
  # entry; everything from there on is set aside (nil when absent).
  # Only non-destructive calls are used here: rest is nil for an entry
  # without sequence lines, and nil.to_s is a frozen String on
  # Ruby >= 2.7, so an in-place sub! would raise FrozenError.
  body, marker, following = rest.to_s.partition(/^>/)
  @entry_overrun = marker.empty? ? nil : marker + following
  @data = body.sub(/\*\s*\z/, '') # remove last '*' and trailing whitespace

  @definition = line2.to_s.chomp

  # seq_type and entry_id are left unset when the first line is malformed.
  if (header = /^>?([A-Za-z0-9]{2})\;(.*)/.match(line1.to_s))
    @seq_type = header[1]
    @entry_id = header[2]
  end
end
# Two-character sequence type code taken from the first line of the
# entry (nil when that line could not be parsed).
#   P1 (protein), F1 (protein fragment)
#   DL (DNA linear), DC (DNA circular)
#   RL (RNA linear), RC (RNA circular)
#   N3 (tRNA), N1 (other functional RNA)
attr_accessor :seq_type
# Entry ID: the text following ';' on the first line of the entry.
attr_accessor :entry_id
# accession is an alias for the entry_id reader.
alias accession entry_id
# Description line of the NBRF/PIR formatted data
# (the second line of the entry, without its line terminator).
attr_accessor :definition
# Raw sequence text of the entry: the lines after the description line,
# with the terminating '*' removed. Whitespace and digits are still
# present here; they are stripped when the seq method builds the sequence.
attr_accessor :data
# Piece of the next entry that was found after this entry's text
# (nil when there was none). Bio::FlatFile uses it.
attr_reader :entry_overrun
# Rebuilds the entry as NBRF/PIR formatted text (also available as to_s).
#
# The output has three parts: a ">TT;ID" header line (type 'XX' is used
# when no sequence type is set), the definition line, and the raw data
# followed by the terminating '*'.
# The generated text is also stored in @entry.
def entry
  type_code = @seq_type || 'XX'
  header = ">#{type_code};#{@entry_id}\n"
  title = "#{definition}\n"
  body = "#{@data}*\n"
  @entry = header + title + body
end
alias to_s entry
# Picks the sequence class matching the entry's sequence type code:
# Bio::Sequence::AA for protein codes (P1, F1),
# Bio::Sequence::NA for nucleic codes (DL, DC, RL, RC, N1, N3),
# and plain Bio::Sequence for anything else (including no type at all).
def seq_class
  type_code = @seq_type
  protein_codes = /[PF]1/
  nucleic_codes = [/[DR][LC]/, /N[13]/]

  return Sequence::AA if protein_codes === type_code
  return Sequence::NA if nucleic_codes.any? { |pattern| pattern === type_code }

  Sequence
end
# Returns the sequence as an instance of the class chosen by seq_class
# (Bio::Sequence::NA, Bio::Sequence::AA or Bio::Sequence).
#
# The object is built on the first call from the raw data with spaces,
# tabs, line breaks and digits removed, and the same object is returned
# on every later call.
def seq
  return @seq if defined?(@seq)

  klass = seq_class
  residues = @data.delete(" \t\r\n0-9") # lazy clean up
  @seq = klass.new(residues)
end
# Returns the length of the sequence returned by the seq method
# (i.e. after whitespace and digits have been removed from the raw data).
def length
seq.length
end
# Returns the nucleic acid sequence.
#
# A Bio::Sequence::NA is returned unchanged, and any other non-protein
# sequence object is wrapped in a new Bio::Sequence::NA.
# A RuntimeError is raised when the entry holds a protein sequence.
# Use this method only if you know whether the sequence is NA or AA.
def naseq
  sequence = seq
  case sequence
  when Bio::Sequence::AA
    raise 'not nucleic but protein sequence'
  when Bio::Sequence::NA
    sequence
  else
    Bio::Sequence::NA.new(sequence)
  end
end
# Returns the length of the nucleic acid sequence.
# Delegates to naseq, so a RuntimeError is raised when the entry
# holds a protein sequence.
# Use this method only if you know whether the sequence is NA or AA.
def nalen
naseq.length
end
# Returns the protein (amino acid) sequence.
#
# A Bio::Sequence::AA is returned unchanged, and any other non-nucleic
# sequence object is wrapped in a new Bio::Sequence::AA.
# A RuntimeError is raised when the entry holds a nucleic acid sequence.
# Use this method only if you know whether the sequence is NA or AA.
def aaseq
  sequence = seq
  if sequence.is_a?(Bio::Sequence::NA)
    # The entry is nucleic, so it cannot be handed out as a protein.
    raise 'not protein but nucleic sequence'
  elsif sequence.is_a?(Bio::Sequence::AA)
    sequence
  else
    Bio::Sequence::AA.new(sequence)
  end
end
# Returns the length of the protein (amino acid) sequence.
# Delegates to aaseq, so a RuntimeError is raised when the entry
# holds a nucleic acid sequence.
# Use this method only if you know whether the sequence is NA or AA.
def aalen
aaseq.length
end
#--
#class method
#++
# Creates a NBRF/PIR formatted text.
#
# _hash_ may contain the following keys; every one of them, and the
# hash itself, can be omitted:
# * <tt>:seq_type</tt>:: sequence type code. When missing it is guessed
#   from <tt>:seq</tt>: 'P1' for Bio::Sequence::AA; for Bio::Sequence::NA
#   'RL' if the sequence contains 'u' (either case), otherwise 'DL';
#   'XX' for anything else.
# * <tt>:seq</tt>:: the sequence (converted with to_s)
# * <tt>:entry_id</tt>:: entry ID written after ';' on the first line
# * <tt>:definition</tt>:: text of the second line
# * <tt>:width</tt>:: maximum number of characters per sequence line
#   (default 70). Pass nil or false to write the sequence on one line.
#
# *Returns*:: String
def self.to_nbrf(hash = {})
  seq = hash[:seq]
  seq_type = hash[:seq_type]
  unless seq_type
    seq_type =
      if seq.is_a?(Bio::Sequence::AA)
        'P1'
      elsif seq.is_a?(Bio::Sequence::NA)
        /u/i =~ seq ? 'RL' : 'DL'
      else
        'XX'
      end
  end

  # An explicit :width => nil must disable wrapping, so the default only
  # applies when the key is absent.
  width = hash.fetch(:width, 70)

  body = "#{seq}*" # the terminating '*' takes part in line wrapping
  if width
    body = body.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
  else
    body << "\n"
  end

  ">#{seq_type};#{hash[:entry_id]}\n#{hash[:definition]}\n#{body}"
end
end #class NBRF
end #module Bio
|