File: aggregate.rb

package info (click to toggle)
ruby-aggregate 0.2.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 136 kB
  • sloc: ruby: 275; makefile: 2
file content (288 lines) | stat: -rw-r--r-- 7,291 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# Implements aggregate statistics and maintains
# configurable histogram for a set of given samples. Convenient for tracking
# high throughput data.
class Aggregate
  #The current number of samples
  attr_reader :count

  #The maximum sample value
  attr_reader :max

  #The minimum samples value
  attr_reader :min

  #The sum of all samples
  attr_reader :sum

  #The number of samples falling below the lowest valued histogram bucket
  attr_reader :outliers_low

  #The number of samples falling above the highest valued histogram bucket
  attr_reader :outliers_high

  # The number of buckets in the binary logarithmic histogram (low => 2**0, high => 2**@@LOG_BUCKETS)
  @@LOG_BUCKETS = 128

  # Create a new Aggregate that maintains a binary logarithmic histogram
  # by default. Specifying values for low, high, and width configures
  # the aggregate to maintain a linear histogram with (high - low)/width buckets
  def initialize(low=nil, high=nil, width=nil)
    @count = 0
    @sum = 0.0
    @sum2 = 0.0
    @outliers_low = 0
    @outliers_high = 0

    # If the user asks we maintain a linear histogram where
    # values in the range [low, high) are bucketed in multiples
    # of width
    if (nil != low && nil != high && nil != width)

      #Validate linear specification
      if high <= low
	raise ArgumentError, "High bucket must be > Low bucket"
      end

      if high - low < width
        raise ArgumentError, "Histogram width must be <= histogram range"
      end

      if 0 != (high - low).modulo(width)
	raise ArgumentError, "Histogram range (high - low) must be a multiple of width"
      end

      @low = low
      @high = high
      @width = width
    else
      @low = 1
      @width = nil
      @high = to_bucket(@@LOG_BUCKETS - 1)
    end

    #Initialize all buckets to 0
    @buckets = Array.new(bucket_count, 0)
  end

  # Include a sample in the aggregate
  def << data

    # Update min/max
    if 0 == @count
      @min = data
      @max = data
    else
      @max = data if data > @max
      @min = data if data < @min
    end

    # Update the running info
    @count += 1
    @sum += data
    @sum2 += (data * data)

    # Update the bucket
    @buckets[to_index(data)] += 1 unless outlier?(data)
  end

  #The current average of all samples
  def mean
    @sum / @count
  end

  #Calculate the standard deviation
  def std_dev
    Math.sqrt((@sum2.to_f - ((@sum.to_f * @sum.to_f)/@count.to_f)) / (@count.to_f - 1))
  end

  # Combine two aggregates
  #def +(b)
  #  a = self
  #  c = Aggregate.new

  #  c.count = a.count + b.count
  #end

  #Generate a pretty-printed ASCII representation of the histogram
  def to_s(columns=nil)

    #default to an 80 column terminal, don't support < 80 for now
    if nil == columns
      columns = 80
    else
      raise ArgumentError if columns < 80
    end

    #Find the largest bucket and create an array of the rows we intend to print
    disp_buckets = Array.new
    max_count = 0
    total = 0
    @buckets.each_with_index do |count, idx|
      next if 0 == count
      max_count = [max_count, count].max
      disp_buckets << [idx, to_bucket(idx), count]
      total += count
    end

    #XXX: Better to print just header --> footer
    return "Empty histogram" if 0 == disp_buckets.length

    #Figure out how wide the value and count columns need to be based on their
    #largest respective numbers
    value_str = "value"
    count_str = "count"
    total_str = "Total"
    value_width = [disp_buckets.last[1].to_s.length, value_str.length].max
    value_width = [value_width, total_str.length].max
    count_width = [total.to_s.length, count_str.length].max
    max_bar_width  = columns - (value_width + " |".length + "| ".length + count_width)

    #Determine the value of a '@'
    weight = [max_count.to_f/max_bar_width.to_f, 1.0].max

    #format the header
    histogram = sprintf("%#{value_width}s |", value_str)
    max_bar_width.times { histogram << "-"}
    histogram << sprintf("| %#{count_width}s\n", count_str)

    # We denote empty buckets with a '~'
    def skip_row(value_width)
      sprintf("%#{value_width}s ~\n", " ")
    end

    #Loop through each bucket to be displayed and output the correct number
    prev_index = disp_buckets[0][0] - 1

    disp_buckets.each do |x|
      #Denote skipped empty buckets with a ~
      histogram << skip_row(value_width) unless prev_index == x[0] - 1
      prev_index = x[0]

      #Add the value
      row = sprintf("%#{value_width}d |", x[1])

      #Add the bar
      bar_size = (x[2]/weight).to_i
      bar_size.times { row += "@"}
      (max_bar_width - bar_size).times { row += " " }

      #Add the count
      row << sprintf("| %#{count_width}d\n", x[2])

      #Append the finished row onto the histogram
      histogram << row
    end

    #End the table
    histogram << skip_row(value_width) if disp_buckets.last[0] != bucket_count-1
    histogram << sprintf("%#{value_width}s", "Total")
    histogram << " |"
    max_bar_width.times {histogram << "-"}
    histogram << "| "
    histogram << sprintf("%#{count_width}d\n", total)
  end

  #Iterate through each bucket in the histogram regardless of
  #its contents
  def each
    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count)
    end
  end

  #Iterate through only the buckets in the histogram that contain
  #samples
  def each_nonzero
    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count) if count != 0
    end
  end

  private

  def linear?
    nil != @width
  end

  def outlier?(data)

    if data < @low
      @outliers_low += 1
    elsif data >= @high
      @outliers_high += 1
    else
      return false
    end
  end

  def bucket_count
    if linear?
      return (@high-@low)/@width
    else
      return @@LOG_BUCKETS
    end
  end

  def to_bucket(index)
    if linear?
      return @low + (index * @width)
    else
      return 2**(index)
    end
  end

  def right_bucket?(index, data)

    # check invariant
    raise unless linear?

    bucket = to_bucket(index)

    #It's the right bucket if data falls between bucket and next bucket
    bucket <= data && data < bucket + @width
  end

=begin
  def find_bucket(lower, upper, target)
    #Classic binary search
    return upper if right_bucket?(upper, target)

    # Cut the search range in half
    middle = (upper/2).to_i

    # Determine which half contains our value and recurse
    if (to_bucket(middle) >= target)
      return find_bucket(lower, middle, target)
    else
      return find_bucket(middle, upper, target)
    end
  end
=end

  # A data point is added to the bucket[n] where the data point
  # is less than the value represented by bucket[n], but greater
  # than the value represented by bucket[n+1]
  def to_index(data)

    # basic case is simple
    return log2(data).to_i if !linear?

    # Search for the right bucket in the linear case
    @buckets.each_with_index do |count, idx|
      return idx if right_bucket?(idx, data)
    end
    #find_bucket(0, bucket_count-1, data)

    #Should not get here
    raise "#{data}"
  end

  # log2(x) returns j, | i = j-1 and 2**i <= data < 2**j
  @@LOG2_DIVEDEND = Math.log(2)
  def log2( x )
   Math.log(x) / @@LOG2_DIVEDEND
  end

end

require_relative 'aggregate/version'