1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
|
#
# bio/util/contingency_table.rb - Statistical contingency table analysis for aligned sequences
#
# Author:: Trevor Wennblom <mailto:trevor@corevx.com>
# Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License:: The Ruby License
#
# $Id:$
#
module Bio
#
# bio/util/contingency_table.rb - Statistical contingency table analysis for aligned sequences
#
# Author:: Trevor Wennblom <mailto:trevor@corevx.com>
# Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License:: The Ruby License
#
# = Description
#
# The Bio::ContingencyTable class provides basic statistical contingency table
# analysis for two positions within aligned sequences.
#
# When ContingencyTable is instantiated the set of characters in the
# aligned sequences may be passed to it as an array. This is
# important since it uses these characters to create the table's rows
# and columns. If this array is not passed it will use it's default
# of an amino acid and nucleotide alphabet in lowercase along with the
# clustal spacer '-'.
#
# To get data from the table the most used functions will be
# chi_square and contingency_coefficient:
#
# ctable = Bio::ContingencyTable.new()
# ctable['a']['t'] += 1
# # .. put more values into the table
# puts ctable.chi_square
# puts ctable.contingency_coefficient # between 0.0 and 1.0
#
# The contingency_coefficient represents the degree of correlation of
# change between two sequence positions in a multiple-sequence
# alignment. 0.0 indicates no correlation, 1.0 is the maximum
# correlation.
#
#
# = Further Reading
#
# * http://en.wikipedia.org/wiki/Contingency_table
# * http://www.physics.csbsju.edu/stats/exact.details.html
# * Numerical Recipes in C by Press, Flannery, Teukolsky, and Vetterling
#
# = Usage
#
# What follows is an example of ContingencyTable in typical usage
# analyzing results from a clustal alignment.
#
# require 'bio'
#
# seqs = {}
# max_length = 0
# Bio::ClustalW::Report.new( IO.read('sample.aln') ).to_a.each do |entry|
# data = entry.data.strip
# seqs[entry.definition] = data.downcase
# max_length = data.size if max_length == 0
# raise "Aligned sequences must be the same length!" unless data.size == max_length
# end
#
# VERBOSE = true
# puts "i\tj\tchi_square\tcontingency_coefficient" if VERBOSE
# correlations = {}
#
# 0.upto(max_length - 1) do |i|
# (i+1).upto(max_length - 1) do |j|
# ctable = Bio::ContingencyTable.new()
# seqs.each_value { |seq| ctable.table[ seq[i].chr ][ seq[j].chr ] += 1 }
#
# chi_square = ctable.chi_square
# contingency_coefficient = ctable.contingency_coefficient
# puts [(i+1), (j+1), chi_square, contingency_coefficient].join("\t") if VERBOSE
#
# correlations["#{i+1},#{j+1}"] = contingency_coefficient
# correlations["#{j+1},#{i+1}"] = contingency_coefficient # Both ways are accurate
# end
# end
#
# require 'yaml'
# File.new('results.yml', 'a+') { |f| f.puts correlations.to_yaml }
#
#
# = Tutorial
#
# ContingencyTable returns the statistical significance of change
# between two positions in an alignment. If you would like to see how
# every possible combination of positions in your alignment compares
# to one another you must set this up yourself. Hopefully the
# provided examples will help you get started without too much
# trouble.
#
# def lite_example(sequences, max_length, characters)
#
# %w{i j chi_square contingency_coefficient}.each { |x| print x.ljust(12) }
# puts
#
# 0.upto(max_length - 1) do |i|
# (i+1).upto(max_length - 1) do |j|
# ctable = Bio::ContingencyTable.new( characters )
# sequences.each do |seq|
# i_char = seq[i].chr
# j_char = seq[j].chr
# ctable.table[i_char][j_char] += 1
# end
# chi_square = ctable.chi_square
# contingency_coefficient = ctable.contingency_coefficient
# [(i+1), (j+1), chi_square, contingency_coefficient].each { |x| print x.to_s.ljust(12) }
# puts
# end
# end
#
# end
#
# allowed_letters = Array.new
# allowed_letters = 'abcdefghijk'.split('')
#
# seqs = Array.new
# seqs << 'abcde'
# seqs << 'abcde'
# seqs << 'aacje'
# seqs << 'aacae'
#
# length_of_every_sequence = seqs[0].size # 5 letters long
#
# lite_example(seqs, length_of_every_sequence, allowed_letters)
#
#
# Producing the following results:
#
# i j chi_square contingency_coefficient
# 1 2 0.0 0.0
# 1 3 0.0 0.0
# 1 4 0.0 0.0
# 1 5 0.0 0.0
# 2 3 0.0 0.0
# 2 4 4.0 0.707106781186548
# 2 5 0.0 0.0
# 3 4 0.0 0.0
# 3 5 0.0 0.0
# 4 5 0.0 0.0
#
# The position i=2 and j=4 has a high contingency coefficient
# indicating that the changes at these positions are related. Note
# that i and j are arbitrary, this could be represented as i=4 and j=2
# since they both refer to position two and position four in the
# alignment. Here are some more examples:
#
# seqs = Array.new
# seqs << 'abcde'
# seqs << 'abcde'
# seqs << 'aacje'
# seqs << 'aacae'
# seqs << 'akcfe'
# seqs << 'akcfe'
#
# length_of_every_sequence = seqs[0].size # 5 letters long
#
# lite_example(seqs, length_of_every_sequence, allowed_letters)
#
#
# Results:
#
# i j chi_square contingency_coefficient
# 1 2 0.0 0.0
# 1 3 0.0 0.0
# 1 4 0.0 0.0
# 1 5 0.0 0.0
# 2 3 0.0 0.0
# 2 4 12.0 0.816496580927726
# 2 5 0.0 0.0
# 3 4 0.0 0.0
# 3 5 0.0 0.0
# 4 5 0.0 0.0
#
# Here we can see that the strength of the correlation of change has
# increased when more data is added with correlated changes at the
# same positions.
#
# seqs = Array.new
# seqs << 'abcde'
# seqs << 'abcde'
# seqs << 'kacje' # changed first letter
# seqs << 'aacae'
# seqs << 'akcfa' # changed last letter
# seqs << 'akcfe'
#
# length_of_every_sequence = seqs[0].size # 5 letters long
#
# lite_example(seqs, length_of_every_sequence, allowed_letters)
#
#
# Results:
#
# i j chi_square contingency_coefficient
# 1 2 2.4 0.534522483824849
# 1 3 0.0 0.0
# 1 4 6.0 0.707106781186548
# 1 5 0.24 0.196116135138184
# 2 3 0.0 0.0
# 2 4 12.0 0.816496580927726
# 2 5 2.4 0.534522483824849
# 3 4 0.0 0.0
# 3 5 0.0 0.0
# 4 5 2.4 0.534522483824849
#
# With random changes it becomes more difficult to identify correlated
# changes, yet positions two and four still have the highest
# correlation as indicated by the contingency coefficient. The best
# way to improve the accuracy of your results, as is often the case
# with statistics, is to increase the sample size.
#
#
# = A Note on Efficiency
#
# ContingencyTable is slow. It involves many calculations for even a
# seemingly small five-string data set. Even worse, it's very
# dependent on matrix traversal, and this is done with two dimensional
# hashes which dashes any hope of decent speed.
#
# Finally, half of the matrix is redundant and positions could be
# summed with their companion position to reduce calculations. For
# example the positions (5,2) and (2,5) could both have their values
# added together and just stored in (2,5) while (5,2) could be an
# illegal position. Also, positions (1,1), (2,2), (3,3), etc. will
# never be used.
#
# The purpose of this package is flexibility and education. The code
# is short and to the point in aims of achieving that purpose. If the
# BioRuby project moves towards C extensions in the future a
# professional caliber version will likely be created.
#
class ContingencyTable
# Since we're making this math-notation friendly here is the layout of @table:
# * @table[row][column]
# * @table[i][j]
# * @table[y][x]
attr_accessor :table
attr_reader :characters
# Create a ContingencyTable that has characters_in_sequence.size rows and
# characters_in_sequence.size columns for each row
#
# ---
# *Arguments*
# * +characters_in_sequences+: (_optional_) The allowable characters that will be present in the aligned sequences.
# *Returns*:: +ContingencyTable+ object to be filled with values and calculated upon
def initialize(characters_in_sequences = nil)
@characters = ( characters_in_sequences or %w{a c d e f g h i k l m n p q r s t v w y - x u} )
tmp = Hash[*@characters.collect { |v| [v, 0] }.flatten]
@table = Hash[*@characters.collect { |v| [v, tmp.dup] }.flatten]
end
# Report the sum of all values in a given row
#
# ---
# *Arguments*
# * +i+: Row to sum
# *Returns*:: +Integer+ sum of row
def row_sum(i)
total = 0
@table[i].each { |k, v| total += v }
total
end
# Report the sum of all values in a given column
#
# ---
# *Arguments*
# * +j+: Column to sum
# *Returns*:: +Integer+ sum of column
def column_sum(j)
total = 0
@table.each { |row_key, column| total += column[j] }
total
end
# Report the sum of all values in all columns.
#
# * This is the same thing as asking for the sum of all values in the table.
#
# ---
# *Arguments*
# * _none_
# *Returns*:: +Integer+ sum of all columns
def column_sum_all
total = 0
@characters.each { |j| total += column_sum(j) }
total
end
# Report the sum of all values in all rows.
#
# * This is the same thing as asking for the sum of all values in the table.
#
# ---
# *Arguments*
# * _none_
# *Returns*:: +Integer+ sum of all rows
def row_sum_all
total = 0
@characters.each { |i| total += row_sum(i) }
total
end
alias table_sum_all row_sum_all
# Calculate _e_, the _expected_ value.
#
# ---
# *Arguments*
# * +i+: row
# * +j+: column
# *Returns*:: +Float+ e(sub:ij) = (r(sub:i)/N) * (c(sub:j))
def expected(i, j)
(row_sum(i).to_f / table_sum_all) * column_sum(j)
end
# Report the chi square of the entire table
#
# ---
# *Arguments*
# * _none_
# *Returns*:: +Float+ chi square value
def chi_square
total = 0
@characters.each do |i| # Loop through every row in the ContingencyTable
@characters.each do |j| # Loop through every column in the ContingencyTable
total += chi_square_element(i, j)
end
end
total
end
# Report the chi-square relation of two elements in the table
#
# ---
# *Arguments*
# * +i+: row
# * +j+: column
# *Returns*:: +Float+ chi-square of an intersection
def chi_square_element(i, j)
eij = expected(i, j)
return 0 if eij == 0
( @table[i][j] - eij )**2 / eij
end
# Report the contingency coefficient of the table
#
# ---
# *Arguments*
# * _none_
# *Returns*:: +Float+ contingency_coefficient of the table
def contingency_coefficient
c_s = chi_square
Math.sqrt(c_s / (table_sum_all + c_s) )
end
end # ContingencyTable
end # Bio
|