1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404
|
#
# bio/db/soft.rb - Interface for SOFT formatted files
#
# Author:: Trevor Wennblom <mailto:trevor@corevx.com>
# Copyright:: Copyright (c) 2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License:: The Ruby License
#
# $Id:$
#
module Bio
#
# bio/db/soft.rb - Interface for SOFT formatted files
#
# Author:: Trevor Wennblom <mailto:trevor@corevx.com>
# Copyright:: Copyright (c) 2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License:: The Ruby License
#
#
# = Description
#
# "SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based,
# ASCII text format that incorporates experimental data and metadata."
# -- <em>GEO, National Center for Biotechnology Information</em>
#
# The Bio::SOFT module reads SOFT Series or Platform formatted files that
# contain information
# describing one database, one series, one platform, and many samples (GEO
# accessions). The data from the file can then be viewed with Ruby methods.
#
# Bio::SOFT also supports the reading of SOFT DataSet files which contain
# one database, one dataset, and many subsets.
#
# Format specification is located here:
# * http://www.ncbi.nlm.nih.gov/projects/geo/info/soft2.html#SOFTformat
#
# SOFT data files may be directly downloaded here:
# * ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT
#
# NCBI's Gene Expression Omnibus (GEO) is here:
# * http://www.ncbi.nlm.nih.gov/geo
#
# = Usage
#
# If an attribute has more than one value then the values are stored in an
# Array of String objects. Otherwise the attribute is stored as a String.
#
# The platform and each sample may contain a table of data. A dataset from a
# DataSet file may also contain a table.
#
# Attributes are dynamically created based on the data in the file.
# Predefined keys have not been created in advance due to the variability of
# SOFT files in-the-wild.
#
# Keys are generally stored as Symbols. Keys for samples and table headings
# may alternatively be accessed with Strings.
# The names of samples (geo accessions) are case sensitive. Table headers
# are case insensitive.
#
# require 'bio'
#
# lines = IO.readlines('GSE3457_family.soft')
# soft = Bio::SOFT.new(lines)
#
# soft.platform[:geo_accession] # => "GPL2092"
# soft.platform[:organism] # => "Populus"
# soft.platform[:contributor] # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"]
# soft.platform[:data_row_count] # => "240"
# soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country]
# soft.platform[:"contact_zip/postal_code"] # => "97331"
# soft.platform[:table].header # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"]
# soft.platform[:table].header_description # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"}
# soft.platform[:table].rows.size # => 240
# soft.platform[:table].rows[5] # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"]
# soft.platform[:table].rows[5][4] # => "P. tremula x P. tremuloides"
# soft.platform[:table].rows[5][:organism] # => "P. tremula x P. tremuloides"
# soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides"
#
# soft.series[:geo_accession] # => "GSE3457"
# soft.series[:contributor] # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"]
# soft.series[:platform_id] # => "GPL2092"
# soft.series[:sample_id].size # => 74
# soft.series[:sample_id][0..4] # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"]
#
# soft.database[:name] # => "Gene Expression Omnibus (GEO)"
# soft.database[:ref] # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
# soft.database[:institute] # => "NCBI NLM NIH"
#
# soft.samples.size # => 74
# soft.samples[:GSM77600][:series_id] # => "GSE3457"
# soft.samples['GSM77600'][:series_id] # => "GSE3457"
# soft.samples[:GSM77600][:platform_id] # => "GPL2092"
# soft.samples[:GSM77600][:type] # => "RNA"
# soft.samples[:GSM77600][:title] # => "jst2b2"
# soft.samples[:GSM77600][:table].header # => ["ID_REF", "VALUE"]
# soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"}
# soft.samples[:GSM77600][:table].rows.size # => 217
# soft.samples[:GSM77600][:table].rows[5] # => ["A039P68U", "8.19"]
# soft.samples[:GSM77600][:table].rows[5][0] # => "A039P68U"
# soft.samples[:GSM77600][:table].rows[5][:id_ref] # => "A039P68U"
# soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U"
#
#
# lines = IO.readlines('GDS100.soft')
# soft = Bio::SOFT.new(lines)
#
# soft.database[:name] # => "Gene Expression Omnibus (GEO)"
# soft.database[:ref] # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
# soft.database[:institute] # => "NCBI NLM NIH"
#
# soft.subsets.size # => 8
# soft.subsets.keys # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"]
# soft.subsets[:GDS100_7] # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"}
# soft.subsets['GDS100_7'][:sample_id] # => "GSM548,GSM543"
# soft.subsets[:GDS100_7][:sample_id] # => "GSM548,GSM543"
# soft.subsets[:GDS100_7][:dataset_id] # => "GDS100"
#
# soft.dataset[:order] # => "none"
# soft.dataset[:sample_organism] # => "Escherichia coli"
# soft.dataset[:table].header # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"]
# soft.dataset[:table].rows.size # => 5764
# soft.dataset[:table].rows[5] # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"]
# soft.dataset[:table].rows[5][4] # => "0.242"
# soft.dataset[:table].rows[5][:gsm549] # => "0.097"
# soft.dataset[:table].rows[5][:GSM549] # => "0.097"
# soft.dataset[:table].rows[5]['GSM549'] # => "0.097"
#
class SOFT
# Entity filled from the "^DATABASE" section of the input.
attr_accessor :database
# Entities filled from the "^SERIES" and "^PLATFORM" sections, and the
# collection that receives one entry per "^SAMPLE" section.
attr_accessor :series, :platform, :samples
# Entity filled from the "^DATASET" section, and the collection that
# receives one entry per "^SUBSET" section.
attr_accessor :dataset, :subsets
# First character of a line that starts a new entity, e.g. "^SAMPLE = ...".
LINE_TYPE_ENTITY_INDICATOR = '^'
# First character of a line that carries an attribute of the current entity.
LINE_TYPE_ENTITY_ATTRIBUTE = '!'
# First character of a line that describes one table column.
LINE_TYPE_TABLE_HEADER = '#'
# data table row defined by absence of line type character
# Separator between the columns of a table header row or data row.
TABLE_COLUMN_DELIMITER = "\t"
# Constructor
#
# Creates one empty container per kind of SOFT entity and then hands
# +lines+ to #process, which fills the containers.
#
# ---
# *Arguments*
# * +lines+: (_required_) contents of SOFT formatted file; the signature
#   defaults it to +nil+, and the value is passed to #process unchanged
# *Returns*:: Bio::SOFT
def initialize(lines=nil)
  # Containers used by Series / Platform files
  @database, @series, @platform, @samples =
    Database.new, Series.new, Platform.new, Samples.new
  # Containers used by DataSet files
  @dataset, @subsets = Dataset.new, Subsets.new

  process(lines)
end
# Classes for Platform and Series files

# Hash whose #[] accepts a Symbol as well as a String: a Symbol is
# converted with to_s before the lookup, every other key is used as given.
class Samples < Hash #:nodoc:
  def [](x)
    super( x.kind_of?( Symbol ) ? x.to_s : x )
  end
end

# Common base class of every single-entity container; a plain Hash.
class Entity < Hash #:nodoc:
end

class Sample < Entity #:nodoc:
end

class Platform < Entity #:nodoc:
end

class Series < Entity #:nodoc:
end

# Classes for DataSet files

# Same Symbol-or-String lookup as Samples.
class Subsets < Samples #:nodoc:
end

class Subset < Entity #:nodoc:
end

class Dataset < Entity #:nodoc:
end

# Classes important for all types

class Database < Entity #:nodoc:
end
# A data table: an ordered header, a description per column and the rows.
#
# * +header+ is a Table::Header (Array of column names in file order)
# * +header_description+ is a Hash that callers fill with
#   column name => description
# * +rows+ is an Array of Table::Row
class Table #:nodoc:
  attr_accessor :header
  attr_accessor :header_description
  attr_accessor :rows

  # Array of column names that also remembers where each column is.
  class Header < Array #:nodoc:
    # @column_index contains column name => numerical index of column
    # (names are stored as downcased Symbols, see Table#add_header)
    attr_accessor :column_index

    def initialize
      super
      @column_index = {}
    end
  end

  # One table row. Cells can be read by position or by column name.
  class Row < Array #:nodoc:
    # The Table::Header used to translate column names into positions.
    attr_accessor :header_object

    # * +n+: Array holding the cell values of this row
    # * +header_object+: Table::Header of the owning table
    def initialize( n, header_object=nil )
      @header_object = header_object
      super(n)
    end

    # Returns the cell (or cells) selected by +x+.
    #
    # * Integer or Range: plain Array indexing
    # * anything else: treated as a column name; it is converted with
    #   to_s.downcase.to_sym, so Strings and Symbols in any letter case work
    #
    # Raises IndexError when the column name is unknown and NoMethodError
    # when the row has no header object to resolve the name with.
    def [](x)
      # Integer replaces the former Fixnum test: Fixnum was merged into
      # Integer in Ruby 2.4 and the constant no longer exists in Ruby 3.2+.
      return super(x) if x.kind_of?( Integer ) || x.kind_of?( Range )

      begin
        name = x.to_s.downcase.to_sym
        position = @header_object.column_index[name]
        unless position.kind_of?( Integer )
          raise IndexError, "#{name.inspect} is not a valid index. Contents of @header_object.column_index: #{@header_object.column_index.inspect}"
        end
        super( position )
      rescue NoMethodError
        # Reached when @header_object is nil; say so before re-raising.
        $stderr.puts "Table::Row @header_object undefined!" unless @header_object
        raise
      end
    end
  end

  def initialize()
    @header_description = {}
    @header = Header.new
    @rows = []
  end

  # Parses +line+ as the header row and indexes every column name.
  # Raises RuntimeError when a header has already been defined.
  def add_header( line )
    raise "Can only define one header" unless @header.empty?
    # concat keeps the Header instance (and its column_index); assigning
    # the parsed Array directly would clobber it into a plain Array
    @header.concat( parse_row( line ) )
    @header.each_with_index do |key, i|
      @header.column_index[key.downcase.to_sym] = i
    end
  end

  # Parses +line+ as a data row and appends it to +rows+.
  def add_row( line )
    @rows << Row.new( parse_row( line ), @header )
  end

  # The first line received becomes the header, every later line a row.
  def add_header_or_row( line )
    @header.empty? ? add_header( line ) : add_row( line )
  end

  protected

  # Splits +line+ into cells on TABLE_COLUMN_DELIMITER.
  def parse_row( line )
    line.split( TABLE_COLUMN_DELIMITER )
  end
end
#########
protected
#########
# Parses SOFT formatted input and stores the result in @database,
# @dataset, @platform, @series, @samples and @subsets.
#
# Blank lines are skipped. Every other line is dispatched on its first
# character:
# * LINE_TYPE_ENTITY_INDICATOR starts an entity; the lines that follow are
#   stored in it until the next indicator line
# * LINE_TYPE_ENTITY_ATTRIBUTE adds an attribute to the current entity; a
#   key seen more than once has its values collected in an Array
# * LINE_TYPE_TABLE_HEADER stores a column description and opens the table
# * any other line is the header row or a data row of the current table
#
# ---
# *Arguments*
# * +lines+: +nil+ (nothing is parsed), a String holding the whole input,
#   or an object whose each_with_index yields one line at a time (for
#   example the Array returned by IO.readlines). The lines are not modified.
# *Returns*:: not meaningful
#
# Problems in the input are reported through custom_raise, which includes
# the 1-based line number in the message.
def process(lines)
  # Bio::SOFT.new without an argument passes nil; leave everything empty.
  return if lines.nil?
  # A String is not Enumerable, so walk it line by line.
  lines = lines.each_line if lines.kind_of?( String )

  current_indicator = nil
  current_class_accessor = nil
  in_table = false

  lines.each_with_index do |raw_line, line_number|
    # Strip a copy: the caller's strings stay untouched and may be frozen.
    line = raw_line.to_s.strip
    next if line.empty?

    case line[0].chr
    when LINE_TYPE_ENTITY_INDICATOR
      current_indicator, value = split_label_value_in( line[1..-1] )
      case current_indicator
      when 'DATABASE'
        current_class_accessor = @database
      when 'DATASET'
        current_class_accessor = @dataset
      when 'PLATFORM'
        current_class_accessor = @platform
      when 'SERIES'
        current_class_accessor = @series
      when 'SAMPLE'
        # one Sample per indicator line, stored under its name
        current_class_accessor = @samples[value] = Sample.new
      when 'SUBSET'
        current_class_accessor = @subsets[value] = Subset.new
      else
        custom_raise( line_number, error_msg(40, line) )
      end

    when LINE_TYPE_ENTITY_ATTRIBUTE
      if current_indicator.nil?
        custom_raise( line_number, error_msg(30) )
      end

      # Handle lines such as '!platform_table_begin' and '!platform_table_end'
      if in_table
        next if line =~ %r{table_begin}
        if line =~ %r{table_end}
          in_table = false
          next
        end
      end

      # shift_key=true drops the entity prefix, e.g. "!Sample_title" -> "title"
      key, value = split_label_value_in( line, true )
      key_s = key.to_sym
      if current_class_accessor.include?( key_s )
        # second occurrence of a key: promote the stored value to an Array
        if current_class_accessor[ key_s ].class != Array
          current_class_accessor[ key_s ] = [ current_class_accessor[ key_s ] ]
        end
        current_class_accessor[ key_s ] << value
      else
        current_class_accessor[ key_s ] = value
      end

    when LINE_TYPE_TABLE_HEADER
      unless ['SAMPLE', 'PLATFORM', 'DATASET'].include?( current_indicator )
        custom_raise( line_number, error_msg(20, current_indicator.inspect) )
      end

      in_table = true # may be redundant, computationally not worth checking

      # We only expect one table per platform, sample or dataset
      current_class_accessor[:table] ||= Table.new
      key, value = split_label_value_in( line )
      # key[1..-1] -- Remove first character which is the LINE_TYPE_TABLE_HEADER
      current_class_accessor[:table].header_description[ key[1..-1] ] = value

    else
      # Type: No line type - should be a row in a table.
      if current_indicator.nil? or !in_table
        custom_raise( line_number, error_msg(10) )
      end
      current_class_accessor[:table].add_header_or_row( line )
    end
  end
end
# Builds the text for one of the parser's error conditions.
#
# ---
# *Arguments*
# * +i+: (_required_) error code
#   * 10 - table row found outside of a table
#   * 20 - table found inside an entity that cannot hold one
#   * 30 - attribute line found before any entity indicator line
#   * 40 - unknown entity indicator
# * +extra_info+: (_optional_) detail added to the message; used by code 20
#   (the entity the table was found in) and code 40 (the offending line)
# *Returns*:: String
#
# Raises IndexError when +i+ is not one of the codes above.
def error_msg( i, extra_info=nil )
  case i
  when 10
    x = ["Lines without line-type characters are rows in a table, but",
         "a line containing an entity indicator such as",
         "\"#{LINE_TYPE_ENTITY_INDICATOR}SAMPLE\",",
         "\"#{LINE_TYPE_ENTITY_INDICATOR}PLATFORM\",",
         "or \"#{LINE_TYPE_ENTITY_INDICATOR}DATASET\" has not been",
         "previously encountered or it does not appear that this line is",
         "in a table."]
  when 20
    # tables are allowed inside samples, platforms and datasets
    x = ["Tables are only allowed inside SAMPLE, PLATFORM, and DATASET.",
         "Current table information found inside #{extra_info}."]
  when 30
    x = ["Entity attribute line (\"#{LINE_TYPE_ENTITY_ATTRIBUTE}\")",
         "found before entity indicator line (\"#{LINE_TYPE_ENTITY_INDICATOR}\")"]
  when 40
    x = ["Unknown entity indicator. Must be DATABASE, SAMPLE, PLATFORM,",
         "SERIES, DATASET, or SUBSET."]
    # show the line that could not be understood when the caller supplies it
    x << "Offending line: #{extra_info.inspect}." unless extra_info.nil?
  else
    raise IndexError, "Unknown error message requested."
  end
  x.join(" ")
end
# Raises a RuntimeError whose message is the 1-based input line number
# followed by a tab and +msg+.
#
# ---
# *Arguments*
# * +line_number_with_0_based_indexing+: (_required_) index of the line
#   being processed, counted from 0
# * +msg+: (_required_) description of the problem
def custom_raise( line_number_with_0_based_indexing, msg )
  one_based = line_number_with_0_based_indexing + 1
  location = "Error processing input line: #{one_based}"
  raise RuntimeError, [location, msg].join("\t")
end
# Splits a "label = value" line at its first "=" sign. Whitespace around
# that "=" belongs to neither part.
#
# ---
# *Arguments*
# * +line+: (_required_) the text to split
# * +shift_key+: (_optional_) when true, everything up to and including
#   the first underscore is removed from the label, e.g.
#   "!Sample_geo_accession" becomes "geo_accession"
# *Returns*:: Array of two Strings, [label, value]
#
# Raises RuntimeError when +line+ has no "=" sign, or when +shift_key+ is
# true and the label has no underscore.
def split_label_value_in( line, shift_key=false )
  separator = %r{\s*=\s*}.match( line )
  key, value = separator.pre_match, separator.post_match if separator

  if shift_key and key
    # text after the first underscore, or nil when there is none
    key = key[%r{_(.*)}m, 1]
  end

  if key.nil? or value.nil?
    raise "Unable to split #{line.inspect} into a label and a value"
  end
  [key, value]
end
end # SOFT
end # Bio
|