1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
|
module Statistics
module StatisticalTest
class WilcoxonRankSumTest
def rank(elements)
ranked_elements = {}
elements.sort.each_with_index do |element, index|
if ranked_elements.fetch(element, false)
# This allow us to solve the ties easily when performing the rank summation per group
ranked_elements[element][:counter] += 1
ranked_elements[element][:rank] += (index + 1)
else
ranked_elements[element] = { counter: 1, rank: (index + 1) }
end
end
# ranked_elements = [{ x => { counter: 1, rank: y } ]
ranked_elements
end
# Steps to perform the calculation are based on http://www.mit.edu/~6.s085/notes/lecture5.pdf
def perform(alpha, tails, group_one, group_two)
# Size for each group
n1, n2 = group_one.size, group_two.size
# Rank all data
total_ranks = rank(group_one + group_two)
# sum rankings per group
r1 = ranked_sum_for(total_ranks, group_one)
r2 = ranked_sum_for(total_ranks, group_two)
# calculate U statistic
u1 = (n1 * (n1 + 1)/2.0) - r1
u2 = (n2 * (n2 + 1)/2.0 ) - r2
u_statistic = [u1.abs, u2.abs].min
median_u = (n1 * n2)/2.0
ties = total_ranks.values.select { |element| element[:counter] > 1 }
std_u = if ties.size > 0
corrected_sigma(ties, n1, n2)
else
Math.sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
end
z = (u_statistic - median_u)/std_u
# Most literature are not very specific about the normal distribution to be used.
# We ran multiple tests with a Normal(median_u, std_u) and Normal(0, 1) and we found
# the latter to be more aligned with the results.
probability = Distribution::StandardNormal.new.cumulative_function(z.abs)
p_value = 1 - probability
p_value *= 2 if tails == :two_tail
{ probability: probability,
u: u_statistic,
z: z,
p_value: p_value,
alpha: alpha,
null: alpha < p_value,
alternative: p_value <= alpha,
confidence_level: 1 - alpha }
end
# Formula extracted from http://www.statstutor.ac.uk/resources/uploaded/mannwhitney.pdf
private def corrected_sigma(ties, total_group_one, total_group_two)
n = total_group_one + total_group_two
rank_sum = ties.reduce(0) do |memo, t|
memo += ((t[:counter] ** 3) - t[:counter])/12.0
end
left = (total_group_one * total_group_two)/(n * (n - 1)).to_f
right = (((n ** 3) - n)/12.0) - rank_sum
Math.sqrt(left * right)
end
private def ranked_sum_for(total, group)
# sum rankings per group
group.reduce(0) do |memo, element|
rank_of_element = total[element][:rank] / total[element][:counter].to_f
memo += rank_of_element
end
end
end
# Both test are the same. To keep the selected name, we just alias the class
# with the implementation.
MannWhitneyU = WilcoxonRankSumTest
end
end
|