File: mini_histogram.rb

package info (click to toggle)
ruby-mini-histogram 0.1.3-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 132 kB
  • sloc: ruby: 139; sh: 4; makefile: 4
file content (228 lines) | stat: -rw-r--r-- 5,663 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
require "mini_histogram/version"

# A class for building histogram info
#
# Given an array, this class calculates the "edges" of a histogram
# these edges mark the boundries for "bins"
#
#   array = [1,1,1, 5, 5, 5, 5, 10, 10, 10]
#   histogram = MiniHistogram.new(array)
#   puts histogram.edges
#   # => [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0]
#
# It also finds the weights (aka count of values) that would go in each bin:
#
#   puts histogram.weights
#   # => [3, 0, 4, 0, 0, 3]
#
# This means that the `array` here had three items between 0.0 and 2.0.
#
class MiniHistogram
  class Error < StandardError; end
  attr_reader :array, :left_p, :max

  def initialize(array, left_p: true, edges: nil)
    @array = array
    @left_p = left_p
    @edges = edges
    @weights = nil

    @min, @max = array.minmax
  end

  def edges_min
    edges.min
  end

  def edges_max
    edges.max
  end

  def histogram(*_)
    self
  end

  def closed
    @left_p ? :left : :right
  end

  # Sets the edge value to something new,
  # also clears any previously calculated values
  def update_values(edges:, max: )
    @edges = edges
    @max = max
    @weights = nil # clear memoized value
  end

  def bin_size
    return 0 if edges.length <= 1

    edges[1] - edges[0]
  end

  # Weird name, right? There are multiple ways to
  # calculate the number of "bins" a histogram should have, one
  # of the most common is the "sturges" method
  #
  # Here are some alternatives from numpy:
  # https://github.com/numpy/numpy/blob/d9b1e32cb8ef90d6b4a47853241db2a28146a57d/numpy/lib/histograms.py#L489-L521
  def sturges
    len = array.length
    return 1.0 if len == 0

    # return (long)(ceil(Math.log2(n)) + 1);
    return Math.log2(len).ceil + 1
  end

  # Given an array of edges and an array we want to generate a histogram from
  # return the counts for each "bin"
  #
  # Example:
  #
  #   a = [1,1,1, 5, 5, 5, 5, 10, 10, 10]
  #   edges = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0]
  #
  #   MiniHistogram.new(a).weights
  #   # => [3, 0, 4, 0, 0, 3]
  #
  #   This means that the `a` array has 3 values between 0.0 and 2.0
  #   4 values between 4.0 and 6.0 and three values between 10.0 and 12.0
  def weights
    return @weights if @weights
    return @weights = [] if array.empty?

    lo = edges.first
    step = edges[1] - edges[0]

    max_index = ((@max  - lo) / step).floor
    @weights = Array.new(max_index + 1, 0)

    array.each do |x|
      index = ((x - lo) / step).floor
      @weights[index] += 1
    end

    return @weights
  end

  # Finds the "edges" of a given histogram that will mark the boundries
  # for the histogram's "bins"
  #
  # Example:
  #
  #  a = [1,1,1, 5, 5, 5, 5, 10, 10, 10]
  #  MiniHistogram.new(a).edges
  #  # => [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0]
  #
  #  There are multiple ways to find edges, this was taken from
  #  https://github.com/mrkn/enumerable-statistics/issues/24
  #
  #  Another good set of implementations is in numpy
  #  https://github.com/numpy/numpy/blob/d9b1e32cb8ef90d6b4a47853241db2a28146a57d/numpy/lib/histograms.py#L222
  def edges
    return @edges if @edges

    return @edges = [0.0] if array.empty?

    lo = @min
    hi = @max

    nbins = sturges.to_f

    if hi == lo
      start = lo
      step = 1.0
      divisor = 1.0
      len = 1
    else
      bw = (hi - lo) / nbins
      lbw = Math.log10(bw)
      if lbw >= 0
        step = 10 ** lbw.floor * 1.0
        r = bw/step

        if r <= 1.1
          # do nothing
        elsif r <= 2.2
          step *= 2.0
        elsif r <= 5.5
          step *= 5.0
        else
          step *= 10
        end
        divisor = 1.0
        start = step * (lo/step).floor
        len = ((hi - start)/step).ceil
      else
        divisor = 10 ** - lbw.floor
        r = bw * divisor
        if r <= 1.1
          # do nothing
        elsif r <= 2.2
          divisor /= 2.0
        elsif r <= 5.5
          divisor /= 5.0
        else
          divisor /= 10.0
        end
        step = 1.0
        start = (lo * divisor).floor
        len = (hi * divisor - start).ceil
      end
    end

    if left_p
      while (lo < start/divisor)
        start -= step
      end

      while (start + (len - 1)*step)/divisor <= hi
        len += 1
      end
    else
      while lo <= start/divisor
        start -= step
      end
      while (start + (len - 1)*step)/divisor < hi
        len += 1
      end
    end

    @edges = []
    len.times.each do
      @edges << start/divisor
      start += step
    end

    return @edges
  end
  alias :edge :edges

  # Given an array of Histograms this function calcualtes
  # an average edge size along with the minimum and maximum
  # edge values. It then updates the edge value on all inputs
  #
  # The main pourpose of this method is to be able to chart multiple
  # distributions against a similar axis
  #
  # See for more context: https://github.com/schneems/derailed_benchmarks/pull/169
  def self.set_average_edges!(*array_of_histograms)
    array_of_histograms.each { |x| raise "Input expected to be a histogram but is #{x.inspect}" unless x.is_a?(MiniHistogram) }
    steps = array_of_histograms.map(&:bin_size)
    avg_step_size = steps.inject(&:+).to_f / steps.length

    max_value = array_of_histograms.map(&:max).max

    max_edge = array_of_histograms.map(&:edges_max).max
    min_edge = array_of_histograms.map(&:edges_min).min

    average_edges = [min_edge]
    while average_edges.last < max_edge
      average_edges << average_edges.last + avg_step_size
    end

    array_of_histograms.each {|h| h.update_values(edges: average_edges, max: max_value) }

    return array_of_histograms
  end
end