1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
|
#
# = bio/db/fasta.rb - FASTA format class
#
# Copyright:: Copyright (C) 2001, 2002
# Naohisa Goto <ng@bioruby.org>,
# Toshiaki Katayama <k@bioruby.org>
# License:: The Ruby License
#
# $Id:$
#
# == Description
#
# FASTA format class.
#
# == Examples
#
# See documents of Bio::FastaFormat class.
#
# == References
#
# * FASTA format (WikiPedia)
# http://en.wikipedia.org/wiki/FASTA_format
#
# * Fasta format description (NCBI)
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
#
require 'bio/db'
require 'bio/sequence'
require 'bio/sequence/dblink'
require 'bio/db/fasta/defline'
module Bio
# Treats a FASTA formatted entry, such as:
#
# >id and/or some comments <== definition line
# ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines
# ATGCATGCATGCATGCATGCATGCATGCATGCATGC
# ATGCATGCATGC
#
# The leading '>' can be omitted, and a trailing '>' (left over from the
# following entry) will be removed automatically.
#
# === Examples
#
# fasta_string = <<END_OF_STRING
# >gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI
# VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ
# NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP
# IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP
# INRISARRAAIHPYFQES
# END_OF_STRING
#
# f = Bio::FastaFormat.new(fasta_string)
#
# f.entry #=> ">gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]\n"+
# # "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\n"+
# # "VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\n"+
# # "NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\n"+
# # "IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\n"+
# # "INRISARRAAIHPYFQES\n"
#
# ==== Methods related to the name of the sequence
#
# A larger range of methods for dealing with Fasta definition lines can be found in FastaDefline, accessed through the FastaFormat#identifiers method.
#
# f.entry_id #=> "gi|398365175"
# f.first_name #=> "gi|398365175|ref|NP_009718.3|"
# f.definition #=> "gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]"
# f.identifiers #=> Bio::FastaDefline instance
# f.accession #=> "NP_009718"
# f.accessions #=> ["NP_009718"]
# f.acc_version #=> "NP_009718.3"
# f.comment #=> nil
#
# ==== Methods related to the actual sequence
#
# f.seq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES"
# f.data #=> "\nMSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\nVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\nNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\nIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\nINRISARRAAIHPYFQES\n"
# f.length #=> 298
# f.aaseq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES"
# f.aaseq.composition #=> {"M"=>5, "S"=>15, "G"=>21, "E"=>16, "L"=>36, "A"=>17, "N"=>8, "Y"=>13, "K"=>22, "R"=>20, "V"=>18, "T"=>7, "D"=>23, "P"=>17, "Q"=>10, "I"=>23, "H"=>7, "F"=>12, "C"=>4, "W"=>4}
# f.aalen #=> 298
#
#
# === A less structured fasta entry
#
# f.entry #=> ">abc 123 456\nASDF\n"
#
# f.entry_id #=> "abc"
# f.first_name #=> "abc"
# f.definition #=> "abc 123 456"
# f.comment #=> nil
# f.accession #=> nil
# f.accessions #=> []
# f.acc_version #=> nil
#
# f.seq #=> "ASDF"
# f.data #=> "\nASDF\n"
# f.length #=> 4
# f.aaseq #=> "ASDF"
# f.aaseq.composition #=> {"A"=>1, "S"=>1, "D"=>1, "F"=>1}
# f.aalen #=> 4
#
#
# === References
#
# * FASTA format (WikiPedia)
# http://en.wikipedia.org/wiki/FASTA_format
#
class FastaFormat < DB
# Entry delimiter in flatfile text: a line break followed by '>'.
# RS is bound to the very same String.
DELIMITER = RS = "\n>"
# (Integer) excess read size included in DELIMITER -- presumably the one
# '>' character that already belongs to the following entry.
DELIMITER_OVERRUN = 1 # '>'
# The definition (comment) line of the FASTA formatted data. #initialize
# stores it without the leading '>' and without surrounding whitespace.
attr_accessor :definition
# The sequence lines as raw text.
attr_accessor :data
# The text that #initialize cut off from the end of its input because it
# began with a further '>' line, or nil when nothing was cut off.
attr_reader :entry_overrun
# Builds one entry from a FASTA formatted string.
#
# The first line of +str+, minus a leading '>' and surrounding
# whitespace, becomes the definition; the remaining text becomes the
# sequence data. If the remaining text contains a line starting with '>'
# (the beginning of another entry), everything from that line to the end
# is cut off and remembered as the entry overrun, so only the first entry
# is used.
def initialize(str)
  header = str[/.*/] # 1st line; never nil, possibly empty
  @definition = header.sub(/^>/, '').strip
  body = str.sub(/.*/, '') # the rest, beginning at the first line break
  # slice! deletes the trailing entries from body and hands them back
  # (nil when there is nothing to delete).
  @entry_overrun = body.slice!(/^>.*/m)
  @data = body
end
# Returns the stored one entry as a FASTA format. (same as to_s)
def entry
@entry = ">#{@definition}\n#{@data.strip}\n"
end
alias to_s entry
# Runs a FASTA/BLAST search for this entry by passing its FASTA text
# (see #entry) to the +query+ method of the given factory object, e.g. a
# Bio::Fasta or a Bio::Blast factory. Returns whatever the factory
# returns. Also available as #fasta and #blast.
#
#   #!/usr/bin/env ruby
#   require 'bio'
#
#   factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
#   flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
#   flatfile.each do |entry|
#     p entry.definition
#     result = entry.fasta(factory)
#     result.each do |hit|
#       print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
#       p hit.lap_at
#     end
#   end
#
def query(factory)
  fasta_text = entry
  factory.query(fasta_text)
end
alias fasta query
alias blast query
# Returns the sequence lines joined into a single Sequence::Generic.
#
# Spaces, tabs, CR/LF and digits are removed (a deliberately lazy clean
# up). The result is computed once and cached in @seq.
#
# When the data starts (after optional whitespace) with a line beginning
# with '#', every '#' line is treated as a comment: it is left out of the
# sequence and collected in @comment, a Hash that maps the number of
# sequence characters preceding the comment to the comment text (without
# the '#' and one optional following space). Comment lines sharing the
# same position are joined with "\n". Otherwise '#' gets no special
# treatment and @comment is left untouched.
def seq
  return @seq if defined?(@seq)

  if /\A\s*^\#/ =~ @data
    offset = 0      # cleaned-up residues seen so far
    notes = {}      # offset => comment text
    residues = []
    # The capture group keeps the comment lines in the split result.
    @data.split(/(^\#.*$)/).each do |piece|
      if /^# ?(.*)$/ =~ piece
        text = $1
        if notes[offset]
          notes[offset] << "\n" << text
        else
          notes[offset] = text
        end
      else
        piece.tr!(" \t\r\n0-9", '') # lazy clean up
        offset += piece.length
        residues << piece
      end
    end
    @comment = notes
    @seq = Bio::Sequence::Generic.new(residues.join(''))
  else
    @seq = Sequence::Generic.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up
  end
  @seq
end
# Returns the comments found among the sequence lines, i.e. whatever
# #seq stored in @comment (nil when #seq did not store anything).
def comment
  seq # make sure the sequence text has been parsed first
  @comment
end
# Returns the length of the cleaned-up sequence returned by #seq.
def length
  residues = seq
  residues.length
end
# Returns the sequence as a nucleic acid sequence: a new Sequence::NA
# object built from #seq on every call.
def naseq
  na_class = Sequence::NA
  na_class.new(seq)
end
# Returns the length of the nucleic acid sequence returned by #naseq.
def nalen
  naseq.length
end
# Returns the sequence as an amino acid sequence: a new Sequence::AA
# object built from #seq on every call.
def aaseq
  aa_class = Sequence::AA
  aa_class.new(seq)
end
# Returns the length of the amino acid sequence returned by #aaseq.
def aalen
  aaseq.length
end
# Returns the sequence as a Bio::Sequence object, created by
# Bio::Sequence.adapter with this object and the FastaFormat adapter
# module. Also available as #to_seq.
#
# Note: If you modify the returned Bio::Sequence object, the sequence or
# definition in this FastaFormat object might change as well (but will
# not always do so), because data is shared for efficiency.
#
def to_biosequence
  adapter_module = Bio::Sequence::Adapter::FastaFormat
  Bio::Sequence.adapter(self, adapter_module)
end
alias to_seq to_biosequence
# Parses the FASTA defline and extracts the IDs.
# IDs are NSIDs (NCBI standard FASTA sequence identifiers)
# or ":"-separated IDs.
# Returns a Bio::FastaDefline instance. It is created from the definition
# line on the first call and the same instance is returned afterwards.
def identifiers
  return @ids if defined?(@ids)

  @ids = FastaDefline.new(@definition)
end
# Parses the FASTA defline (via #identifiers) and returns a possibly
# unique identifier, as reported by the defline object's +entry_id+.
# It returns a string.
def entry_id
  ids = identifiers
  ids.entry_id
end
# Parses the FASTA defline (via #identifiers) and returns the GI reported
# by the defline object.
# The same note applies to the locus/accession/accession-with-version
# readers: if an entry has more than one such ID, only the first ID is
# shown.
# It returns a string or nil.
def gi
  ids = identifiers
  ids.gi
end
# Returns an accession number, as reported by the defline object from
# #identifiers.
def accession
  ids = identifiers
  ids.accession
end
# Parses the FASTA defline (via #identifiers) and returns the accession
# numbers reported by the defline object.
# It returns an array of strings.
def accessions
  ids = identifiers
  ids.accessions
end
# Returns the accession number with version, as reported by the defline
# object from #identifiers.
def acc_version
  ids = identifiers
  ids.acc_version
end
# Returns the locus, as reported by the defline object from #identifiers.
def locus
  ids = identifiers
  ids.locus
end
# Returns the first name (word) of the definition line - everything
# before the first whitespace character. When the definition line
# contains no whitespace, the definition string itself is returned.
#
#   >abc def #=> 'abc'
#   >gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c] #=> 'gi|398365175|ref|NP_009718.3|'
#   >abc #=> 'abc'
def first_name
  # Read the definition once so that the whitespace position and the
  # slice always refer to the same string.
  name = @definition
  cut = name.index(/\s/)
  cut ? name[0...cut] : name
end
end #class FastaFormat
end #module Bio
|