File: contingency_table.rb

package info (click to toggle)
ruby-bio 1.5.0-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 7,480 kB
  • ctags: 9,428
  • sloc: ruby: 74,117; xml: 3,383; makefile: 17; perl: 13; sh: 1
file content (368 lines) | stat: -rw-r--r-- 11,738 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
#
# bio/util/contingency_table.rb - Statistical contingency table analysis for aligned sequences
#
# Author::    Trevor Wennblom  <mailto:trevor@corevx.com>
# Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License::   The Ruby License
#
#  $Id:$
#

module Bio

#
# bio/util/contingency_table.rb - Statistical contingency table analysis for aligned sequences
#
# Author::    Trevor Wennblom  <mailto:trevor@corevx.com>
# Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
# License::   The Ruby License
#
# = Description
# 
# The Bio::ContingencyTable class provides basic statistical contingency table
# analysis for two positions within aligned sequences.
# 
# When ContingencyTable is instantiated the set of characters in the
# aligned sequences may be passed to it as an array.  This is
# important since it uses these characters to create the table's rows
# and columns.  If this array is not passed it will use it's default
# of an amino acid and nucleotide alphabet in lowercase along with the
# clustal spacer '-'.
# 
# To get data from the table the most used functions will be
# chi_square and contingency_coefficient:
# 
#   ctable = Bio::ContingencyTable.new()
#   ctable['a']['t'] += 1
#   # .. put more values into the table
#   puts ctable.chi_square
#   puts ctable.contingency_coefficient  # between 0.0 and 1.0
# 
# The contingency_coefficient represents the degree of correlation of
# change between two sequence positions in a multiple-sequence
# alignment.  0.0 indicates no correlation, 1.0 is the maximum
# correlation.
# 
# 
# = Further Reading
# 
# * http://en.wikipedia.org/wiki/Contingency_table
# * http://www.physics.csbsju.edu/stats/exact.details.html
# * Numerical Recipes in C by Press, Flannery, Teukolsky, and Vetterling
#  
# = Usage
# 
# What follows is an example of ContingencyTable in typical usage
# analyzing results from a clustal alignment.
# 
#   require 'bio'
# 
#   seqs = {}
#   max_length = 0
#   Bio::ClustalW::Report.new( IO.read('sample.aln') ).to_a.each do |entry|
#     data = entry.data.strip
#     seqs[entry.definition] = data.downcase
#     max_length = data.size if max_length == 0
#     raise "Aligned sequences must be the same length!" unless data.size == max_length
#   end
# 
#   VERBOSE = true
#   puts "i\tj\tchi_square\tcontingency_coefficient" if VERBOSE
#   correlations = {}
# 
#   0.upto(max_length - 1) do |i|
#     (i+1).upto(max_length - 1) do |j|
#       ctable = Bio::ContingencyTable.new()
#       seqs.each_value { |seq| ctable.table[ seq[i].chr ][ seq[j].chr ] += 1 }
# 
#       chi_square = ctable.chi_square
#       contingency_coefficient = ctable.contingency_coefficient
#       puts [(i+1), (j+1), chi_square, contingency_coefficient].join("\t") if VERBOSE
# 
#       correlations["#{i+1},#{j+1}"] = contingency_coefficient
#       correlations["#{j+1},#{i+1}"] = contingency_coefficient  # Both ways are accurate
#     end
#   end
# 
#   require 'yaml'
#   File.new('results.yml', 'a+') { |f| f.puts correlations.to_yaml }
# 
# 
# = Tutorial
#
# ContingencyTable returns the statistical significance of change
# between two positions in an alignment.  If you would like to see how
# every possible combination of positions in your alignment compares
# to one another you must set this up yourself.  Hopefully the
# provided examples will help you get started without too much
# trouble.
# 
#   def lite_example(sequences, max_length, characters)
# 
#     %w{i j chi_square contingency_coefficient}.each { |x| print x.ljust(12) }
#     puts
# 
#     0.upto(max_length - 1) do |i|
#       (i+1).upto(max_length - 1) do |j|
#         ctable = Bio::ContingencyTable.new( characters )
#         sequences.each do |seq|
#           i_char = seq[i].chr
#           j_char = seq[j].chr
#           ctable.table[i_char][j_char] += 1
#         end
#         chi_square = ctable.chi_square
#         contingency_coefficient = ctable.contingency_coefficient
#         [(i+1), (j+1), chi_square, contingency_coefficient].each { |x| print x.to_s.ljust(12) }
#         puts
#       end
#     end
# 
#   end
# 
#   allowed_letters = Array.new
#   allowed_letters = 'abcdefghijk'.split('')
# 
#   seqs = Array.new
#   seqs << 'abcde'
#   seqs << 'abcde'
#   seqs << 'aacje'
#   seqs << 'aacae'
# 
#   length_of_every_sequence = seqs[0].size  # 5 letters long
# 
#   lite_example(seqs, length_of_every_sequence, allowed_letters)
# 
# 
# Producing the following results:
# 
#   i           j           chi_square  contingency_coefficient
#   1           2           0.0         0.0
#   1           3           0.0         0.0
#   1           4           0.0         0.0
#   1           5           0.0         0.0
#   2           3           0.0         0.0
#   2           4           4.0         0.707106781186548
#   2           5           0.0         0.0
#   3           4           0.0         0.0
#   3           5           0.0         0.0
#   4           5           0.0         0.0
# 
# The position i=2 and j=4 has a high contingency coefficient
# indicating that the changes at these positions are related.  Note
# that i and j are arbitrary, this could be represented as i=4 and j=2
# since they both refer to position two and position four in the
# alignment.  Here are some more examples:
# 
#   seqs = Array.new
#   seqs << 'abcde'
#   seqs << 'abcde'
#   seqs << 'aacje'
#   seqs << 'aacae'
#   seqs << 'akcfe'
#   seqs << 'akcfe'
# 
#   length_of_every_sequence = seqs[0].size  # 5 letters long
# 
#   lite_example(seqs, length_of_every_sequence, allowed_letters)
# 
# 
# Results:
# 
#   i           j           chi_square  contingency_coefficient
#   1           2           0.0         0.0
#   1           3           0.0         0.0
#   1           4           0.0         0.0
#   1           5           0.0         0.0
#   2           3           0.0         0.0
#   2           4           12.0        0.816496580927726
#   2           5           0.0         0.0
#   3           4           0.0         0.0
#   3           5           0.0         0.0
#   4           5           0.0         0.0
# 
# Here we can see that the strength of the correlation of change has
# increased when more data is added with correlated changes at the
# same positions.
# 
#   seqs = Array.new
#   seqs << 'abcde'
#   seqs << 'abcde'
#   seqs << 'kacje'  # changed first letter
#   seqs << 'aacae'
#   seqs << 'akcfa'  # changed last letter
#   seqs << 'akcfe'
# 
#   length_of_every_sequence = seqs[0].size  # 5 letters long
# 
#   lite_example(seqs, length_of_every_sequence, allowed_letters)
# 
# 
# Results:
# 
#   i           j           chi_square  contingency_coefficient
#   1           2           2.4         0.534522483824849
#   1           3           0.0         0.0
#   1           4           6.0         0.707106781186548
#   1           5           0.24        0.196116135138184
#   2           3           0.0         0.0
#   2           4           12.0        0.816496580927726
#   2           5           2.4         0.534522483824849
#   3           4           0.0         0.0
#   3           5           0.0         0.0
#   4           5           2.4         0.534522483824849
# 
# With random changes it becomes more difficult to identify correlated
# changes, yet positions two and four still have the highest
# correlation as indicated by the contingency coefficient.  The best
# way to improve the accuracy of your results, as is often the case
# with statistics, is to increase the sample size.
# 
# 
# = A Note on Efficiency
# 
# ContingencyTable is slow.  It involves many calculations for even a
# seemingly small five-string data set.  Even worse, it's very
# dependent on matrix traversal, and this is done with two dimensional
# hashes which dashes any hope of decent speed.
# 
# Finally, half of the matrix is redundant and positions could be
# summed with their companion position to reduce calculations.  For
# example the positions (5,2) and (2,5) could both have their values
# added together and just stored in (2,5) while (5,2) could be an
# illegal position.  Also, positions (1,1), (2,2), (3,3), etc.  will
# never be used.
# 
# The purpose of this package is flexibility and education.  The code
# is short and to the point in aims of achieving that purpose.  If the
# BioRuby project moves towards C extensions in the future a
# professional caliber version will likely be created.
#
  
class ContingencyTable
  # Since we're making this math-notation friendly here is the layout of @table:
  # * @table[row][column]
  # * @table[i][j]
  # * @table[y][x]
  attr_accessor :table
  attr_reader :characters

  # Create a ContingencyTable that has characters_in_sequence.size rows and
  # characters_in_sequence.size columns for each row
  #
  # ---
  # *Arguments*
  # * +characters_in_sequences+: (_optional_) The allowable characters that will be present in the aligned sequences.
  # *Returns*:: +ContingencyTable+ object to be filled with values and calculated upon
  def initialize(characters_in_sequences = nil)
    @characters = ( characters_in_sequences or %w{a c d e f g h i k l m n p q r s t v w y - x u} )
    tmp = Hash[*@characters.collect { |v| [v, 0] }.flatten]
    @table = Hash[*@characters.collect { |v| [v, tmp.dup] }.flatten]
  end
  
  # Report the sum of all values in a given row
  #
  # ---
  # *Arguments*
  # * +i+: Row to sum
  # *Returns*:: +Integer+ sum of row
  def row_sum(i)
    total = 0
    @table[i].each { |k, v| total += v }
    total
  end

  # Report the sum of all values in a given column
  #
  # ---
  # *Arguments*
  # * +j+: Column to sum
  # *Returns*:: +Integer+ sum of column
  def column_sum(j)
    total = 0
    @table.each { |row_key, column| total += column[j] }
    total
  end

  # Report the sum of all values in all columns.
  #
  # * This is the same thing as asking for the sum of all values in the table.
  #
  # ---
  # *Arguments*
  # * _none_
  # *Returns*:: +Integer+ sum of all columns
  def column_sum_all
    total = 0
    @characters.each { |j| total += column_sum(j) }
    total
  end

  # Report the sum of all values in all rows.
  #
  # * This is the same thing as asking for the sum of all values in the table.
  #
  # ---
  # *Arguments*
  # * _none_
  # *Returns*:: +Integer+ sum of all rows
  def row_sum_all
    total = 0
    @characters.each { |i| total += row_sum(i) }
    total
  end
  alias table_sum_all row_sum_all

  # Calculate _e_, the _expected_ value.
  #
  # ---
  # *Arguments*
  # * +i+: row
  # * +j+: column
  # *Returns*:: +Float+ e(sub:ij) = (r(sub:i)/N) * (c(sub:j))
  def expected(i, j)
    (row_sum(i).to_f / table_sum_all) * column_sum(j)
  end

  # Report the chi square of the entire table
  #
  # ---
  # *Arguments*
  # * _none_
  # *Returns*:: +Float+ chi square value
  def chi_square
    total = 0
    @characters.each do |i|    # Loop through every row in the ContingencyTable
      @characters.each do |j|  # Loop through every column in the ContingencyTable
        total += chi_square_element(i, j)
      end
    end
    total
  end

  # Report the chi-square relation of two elements in the table
  #
  # ---
  # *Arguments*
  # * +i+: row
  # * +j+: column
  # *Returns*:: +Float+ chi-square of an intersection
  def chi_square_element(i, j)
    eij = expected(i, j)
    return 0 if eij == 0
    ( @table[i][j] - eij )**2 / eij
  end

  # Report the contingency coefficient of the table
  #
  # ---
  # *Arguments*
  # * _none_
  # *Returns*:: +Float+ contingency_coefficient of the table
  def contingency_coefficient
    c_s = chi_square
    Math.sqrt(c_s / (table_sum_all + c_s) )
  end

end # ContingencyTable
end # Bio