File: equi_height.cc

/* Copyright (c) 2016, 2025, Oracle and/or its affiliates.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is designed to work with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation.  The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have either included with
   the program or referenced in the documentation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */

/**
  @file sql/histograms/equi_height.cc
  Equi-height histogram (implementation).
*/

#include "sql/histograms/equi_height.h"

#include <stdlib.h>
#include <algorithm>  // std::is_sorted
#include <cmath>      // std::lround
#include <iterator>
#include <new>

#include "my_base.h"  // ha_rows
#include "my_dbug.h"
#include "my_inttypes.h"
#include "mysql_time.h"
#include "sql-common/json_dom.h"  // Json_*
#include "sql/histograms/equi_height_bucket.h"
#include "sql/histograms/value_map.h"  // Value_map
#include "sql/mem_root_allocator.h"
#include "sql_string.h"
#include "template_utils.h"

class my_decimal;
struct MEM_ROOT;

namespace histograms {

// Private constructor
template <class T>
Equi_height<T>::Equi_height(MEM_ROOT *mem_root, const std::string &db_name,
                            const std::string &tbl_name,
                            const std::string &col_name,
                            Value_map_type data_type, bool *error)
    : Histogram(mem_root, db_name, tbl_name, col_name,
                enum_histogram_type::EQUI_HEIGHT, data_type, error),
      m_buckets(mem_root) {}

// Public factory method
template <class T>
Equi_height<T> *Equi_height<T>::create(MEM_ROOT *mem_root,
                                       const std::string &db_name,
                                       const std::string &tbl_name,
                                       const std::string &col_name,
                                       Value_map_type data_type) {
  bool error = false;
  Equi_height<T> *equi_height = new (mem_root)
      Equi_height<T>(mem_root, db_name, tbl_name, col_name, data_type, &error);
  if (error) return nullptr;
  return equi_height;
}
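
/*
  Illustrative usage sketch (not from the original source; the call site and
  the literal names below are assumptions):

    MEM_ROOT mem_root(PSI_NOT_INSTRUMENTED, 256);
    Equi_height<longlong> *histogram = Equi_height<longlong>::create(
        &mem_root, "db", "tbl", "col", Value_map_type::INT);
    if (histogram == nullptr) return true;  // Allocation failed.

  create() returns nullptr when the constructor reports an allocation error,
  so callers must check the result before use.
*/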

template <class T>
Equi_height<T>::Equi_height(MEM_ROOT *mem_root, const Equi_height<T> &other,
                            bool *error)
    : Histogram(mem_root, other, error), m_buckets(mem_root) {
  if (m_buckets.reserve(other.m_buckets.size())) {
    *error = true;
    return;
  }
  for (const auto &bucket : other.m_buckets) m_buckets.push_back(bucket);
}

template <>
Equi_height<String>::Equi_height(MEM_ROOT *mem_root,
                                 const Equi_height<String> &other, bool *error)
    : Histogram(mem_root, other, error), m_buckets(mem_root) {
  /*
    Copy bucket contents. We need to make duplicates of String data, since they
    are allocated on a MEM_ROOT that most likely will be freed way too early.
  */
  if (m_buckets.reserve(other.m_buckets.size())) {
    *error = true;
    return;
  }
  for (const auto &pair : other.m_buckets) {
    char *lower_string_data = pair.get_lower_inclusive().dup(mem_root);
    char *upper_string_data = pair.get_upper_inclusive().dup(mem_root);
    if (lower_string_data == nullptr || upper_string_data == nullptr) {
      assert(false); /* purecov: deadcode */
      *error = true;
      return;
    }

    String lower_string_dup(lower_string_data,
                            pair.get_lower_inclusive().length(),
                            pair.get_lower_inclusive().charset());
    String upper_string_dup(upper_string_data,
                            pair.get_upper_inclusive().length(),
                            pair.get_upper_inclusive().charset());
    equi_height::Bucket<String> bucket_dup(lower_string_dup, upper_string_dup,
                                           pair.get_cumulative_frequency(),
                                           pair.get_num_distinct());
    m_buckets.push_back(bucket_dup);
  }
}

/*
  Returns true if the greedy equi-height histogram construction algorithm can
  successfully fit the provided value_map into a histogram with at most
  max_buckets buckets, each of size at most max_bucket_values. This function
  does not actually build a histogram; it is used as a step in finding the
  right bucket size.
*/
template <class T>
static bool FitsIntoBuckets(const Value_map<T> &value_map,
                            ha_rows max_bucket_values, size_t max_buckets) {
  assert(value_map.size() > 0);
  size_t used_buckets = 1;
  ha_rows current_bucket_values = 0;

  for (const auto &[value, count] : value_map) {
    assert(count > 0);
    /*
      If the current bucket is not empty and adding the values causes it to
      exceed its max size, add the values to a new bucket instead.
      Note that we allow the size of singleton buckets (buckets with only one
      distinct value) to exceed max_bucket_values.
    */
    if (current_bucket_values > 0 &&
        current_bucket_values + count > max_bucket_values) {
      ++used_buckets;
      current_bucket_values = 0;
    }
    current_bucket_values += count;

    // Terminate early if we have used too many buckets.
    if (used_buckets > max_buckets) return false;
  }
  return true;
}
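
/*
  Worked example (illustrative, hypothetical counts): given a value_map with
  the pairs [v1, 4], [v2, 1], [v3, 3], max_bucket_values = 4 and
  max_buckets = 2, the pass above places v1 (4 values) in the first bucket,
  starts a second bucket for v2 since 4 + 1 > 4, and then adds v3 to it since
  1 + 3 <= 4. Two buckets suffice, so the function returns true. With
  max_buckets = 1 the same input returns false.
*/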

/*
  Performs a binary search to find the smallest possible bucket size that will
  allow us to greedily construct a histogram with at most max_buckets buckets.

  See the comment above build_histogram() for a description of the algorithm.

  Important properties of the greedy construction algorithm:

  Let M denote the total number of values in the value_map and assume for
  simplicity that max_buckets is an even number. Fractions are rounded up to the
  nearest integer. Buckets are composite if they contain more than one distinct
  value.

  Property (1)
  The histogram fits into N buckets with a composite size of at most K = 2M/N.

  Proof sketch (1)
  Consider the first pair of buckets. If the first bucket contains K - c values,
  then the second bucket is guaranteed to contain at least c values, otherwise
  the greedy construction algorithm would have placed the additional c values in
  the first bucket as well. Thus, every pair of buckets together contains at
  least K = 2M/N values, and there are N/2 successive pairs of buckets. Therefore,
  the first N buckets contain at least (N/2) * (2M/N) = M values and the
  histogram fits into N buckets.

  Property (2)
  Increasing the maximum allowed composite bucket size can never result in a
  histogram with more buckets. I.e., the number of buckets is non-increasing
  in the max composite bucket size.

  The first property ensures that we have a reasonable upper bound when searching
  for the bucket size. The second property ensures that we can reason about
  ranges of bucket sizes when performing our search. For example, if we cannot
  fit a histogram using a bucket size of K, then it will not work with a bucket
  size of K' < K either.
*/
template <class T>
static ha_rows FindBucketMaxValues(const Value_map<T> &value_map,
                                   size_t max_buckets) {
  ha_rows total_values = 0;
  for (const auto &[value, count] : value_map) total_values += count;
  if (max_buckets == 1) return total_values;

  // Conservative upper bound to avoid dealing with rounding and odd max_buckets
  ha_rows upper_bucket_values = 2 * total_values / (max_buckets - 1) + 1;
  assert(FitsIntoBuckets(value_map, upper_bucket_values, max_buckets));
  ha_rows lower_bucket_values = 0;

  const int max_search_steps = 10;
  int search_step = 0;
  while (upper_bucket_values > lower_bucket_values + 1 &&
         search_step < max_search_steps) {
    ha_rows bucket_values = (upper_bucket_values + lower_bucket_values) / 2;
    if (FitsIntoBuckets(value_map, bucket_values, max_buckets)) {
      upper_bucket_values = bucket_values;
    } else {
      lower_bucket_values = bucket_values;
    }
    ++search_step;
  }

  return upper_bucket_values;
}
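
/*
  Worked example (illustrative): with total_values = 100 and max_buckets = 10
  the initial search interval is (0, 2 * 100 / 9 + 1] = (0, 23]. Each
  iteration tests the midpoint with FitsIntoBuckets() and halves the
  interval. The loop returns the smallest feasible size it has bracketed;
  the cap of max_search_steps = 10 iterations may make it return a feasible
  but slightly larger-than-minimal size.
*/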

/*
  Returns an estimate of the number of distinct values in a histogram bucket
  when the histogram is based on sampling.

  We use the Guaranteed Error Estimator (GEE) from [1]. Let s denote the
  sampling rate, d the number of distinct values in the sample, and u the number
  of distinct values that appear only once in the sample. Then,

                        GEE = sqrt(1/s)*u + d - u.

  The intuition behind the GEE estimator is that we can divide the dataset into
  "high frequency" and "low frequency" values. High frequency values are those
  d - u values that appear at least twice in the sample. The contribution to the
  estimated number of distinct values from the high frequency values will not
  increase, even if increase the sample size. The low frequency values are the
  u values that appeared only once in the sample. The final contribution of the
  low frequency values can be between u and (1/s)*u. In order to minimize the
  worst-case relative error, we use the geometric mean of these two values.

  Important note:

  This estimator was designed for uniform random sampling. We currently use
  page-level sampling for histograms. This can cause us to underestimate the
  number of distinct values by nearly a factor of 1/s in the worst case. The
  reason is that we only scale up the number of singleton values.
  With page-level sampling we can have pairs of distinct values occurring
  together, so that we will have u=0 in the formula above.

  For now, we opt to keep the formula as it is, since we would rather
  underestimate than overestimate the number of distinct values. Potential
  solutions:

  1) Use a custom estimator for page-level sampling [3]. This requires changes
     to InnoDB's sampling interface to support counting the number of pages
     that a value appears in.

  2) Use the simpler estimate of sqrt(1/s)*d, the geometric mean between the
     lower bound of d and the upper bound of d/s. This has the downside of
     overestimating the number of distinct values by sqrt(1/s) in cases where
     the table only contains heavy hitters.

  3) Simulate uniform random sampling on top of the page-level sampling.
     Postgres does this, but it requires sampling as many pages as the target
     number of rows.

  Further considerations:

  It turns out that estimating the number of distinct values is a difficult
  problem. In [1] it is shown that for any estimator based on random sampling
  with a sampling rate of s there exists a data set such that with probability p
  the estimator is off by a factor of at least ((1/s) * ln(1/p))^0.5. For a
  sampling rate of s = 0.01 and an error probability of 1/e this means the
  estimate could be off by a factor of 10 about 1/3 of the time.

  We are currently using the distinct values estimates for providing selectivity
  estimates for equality predicates. The selectivity of a value in a composite
  bucket is estimated to be the total selectivity of the bucket divided by the
  number of distinct values in the bucket. So a larger distinct values estimate
  leads to lower selectivity estimates. In the future we might also use
  histograms to estimate the size of joins. In both cases it seems better to
  overestimate rather than underestimate the selectivity.

  The GEE estimator is designed to minimize the ratio between the estimate and
  actual value. The estimator is simple and relatively conservative in that it
  only scales u by sqrt(1/s) rather than 1/s, so it seems suitable for our use.
  In [1] it is furthermore shown that it performs relatively well on real data.

  If we require more accurate estimates we could consider upgrading to the more
  advanced estimators proposed in [1] or [2]. Since estimating distinct values
  by sampling is inherently prone to large errors [1], we could also consider
  streaming/sketching techniques such as HyperLogLog or Count-Min if we need
  more accuracy. These would require updating a sketch on every table update.

  References:

  [1] Charikar, Moses, et al. "Towards estimation error guarantees for distinct
  values." Proceedings of the nineteenth ACM SIGMOD-SIGACT-SIGART symposium on
  Principles of database systems. 2000.

  [2] Haas, Peter J., et al. "Sampling-based estimation of the number of
  distinct values of an attribute." VLDB. Vol. 95. 1995.

  [3] Chaudhuri, Surajit, Gautam Das, and Utkarsh Srivastava. "Effective use of
  block-level sampling in statistics estimation." Proceedings of the 2004 ACM
  SIGMOD international conference on Management of data. 2004.

*/
static ha_rows EstimateDistinctValues(double sampling_rate,
                                      ha_rows bucket_distinct_values,
                                      ha_rows bucket_unary_values) {
  // Singleton buckets can only contain one distinct value.
  if (bucket_distinct_values == 1) return 1;

  // GEE estimate for non-singleton buckets.
  assert(sampling_rate > 0.0);
  assert(bucket_distinct_values >= bucket_unary_values);
  return static_cast<ha_rows>(
             std::round(std::sqrt(1.0 / sampling_rate) * bucket_unary_values)) +
         bucket_distinct_values - bucket_unary_values;
}
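
/*
  Worked example (illustrative): with a sampling rate s = 0.01 and a bucket
  containing d = 100 distinct values of which u = 40 appear exactly once in
  the sample, the estimate is sqrt(1 / 0.01) * 40 + (100 - 40) =
  10 * 40 + 60 = 460 distinct values.
*/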

/*
  Greedy equi-height histogram construction algorithm:

  Inputs: An ordered collection of [value, count] pairs and a maximum bucket
  size.

  Create an empty bucket. Proceeding in the order of the collection, insert
  values into the bucket while keeping track of its size.

  If the insertion of a value into a non-empty bucket causes the bucket to
  exceed the maximum size, create a new empty bucket and continue.

  ---

  Guarantees:

  A selectivity estimation error of at most ~2/#buckets (i.e., at most
  ~2 * #values / #buckets rows), often less. Values with a relative frequency
  exceeding this threshold are guaranteed to be placed in singleton buckets.

  Longer description:

  The build_histogram() method takes as input the target number of buckets and
  calls FindBucketMaxValues() to search for the smallest maximum bucket size
  that will cause the histogram to fit into the target number of buckets.
  See the comments on FindBucketMaxValues() for more details.

  If we disregard sampling error then the remaining error in selectivity
  estimation stems entirely from buckets that contain more than one distinct
  value (composite buckets). To see this, consider estimating the selectivity
  for e.g. "WHERE x < 5". If the value 5 lies inside a composite bucket, the
  selectivity estimation error can be almost as large as the size of the bucket.

  By constructing histograms with the smallest possible composite bucket size
  we minimize the worst case selectivity estimation error. Our algorithm is
  guaranteed to produce a histogram with a maximum composite bucket size of at
  most 2 * #values / #buckets in the worst case. In general it will adapt to the
  data distribution to minimize the size of composite buckets. This property is
  particularly beneficial for distributions that are concentrated on a few
  highly frequent values. The heavy values can be placed in singleton buckets
  and the algorithm will attempt to spread the remaining values evenly across
  the remaining buckets, leading to a lower composite bucket size.

  Note on terminology:

  The term "value" primarily refers to an entry/cell in a column. "value" is
  also used to refer to the actual value of an entry, causing some confusion.
  We try to use the term "distinct value" to refer to the value of an entry.
  The Value_map is an ordered collection of [distinct value, value count] pairs.
  For example, a Value_map<String> could contain the pairs ["a", 1], ["b", 2] to
  represent one "a" value and two "b" values.
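
  Worked example (illustrative): suppose the Value_map contains the pairs
  ["a", 5], ["b", 1], ["c", 1], ["d", 1] and num_buckets = 2.
  FindBucketMaxValues() returns a maximum composite bucket size of 3 (a limit
  of 2 would require three buckets, while a limit of 3 fits in two). The
  greedy pass then produces the singleton bucket ["a", "a"] with 5 values and
  the composite bucket ["b", "d"] with 3 values, so the heavy hitter "a" does
  not inflate the estimation error of the composite bucket.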
*/
template <class T>
bool Equi_height<T>::build_histogram(const Value_map<T> &value_map,
                                     size_t num_buckets) {
  assert(num_buckets > 0);
  if (num_buckets < 1) return true; /* purecov: inspected */

  // Set the number of buckets that was specified/requested by the user.
  m_num_buckets_specified = num_buckets;

  // Clear any existing data.
  m_buckets.clear();
  m_null_values_fraction = INVALID_NULL_VALUES_FRACTION;
  m_sampling_rate = value_map.get_sampling_rate();

  // Set the character set for the histogram contents.
  m_charset = value_map.get_character_set();

  // Get total count of non-null values.
  ha_rows num_non_null_values = 0;
  for (const auto &[value, count] : value_map) num_non_null_values += count;

  // No non-null values, nothing to do.
  if (num_non_null_values == 0) {
    if (value_map.get_num_null_values() > 0)
      m_null_values_fraction = 1.0;
    else
      m_null_values_fraction = 0.0;

    return false;
  }

  // Set the fraction of NULL values.
  const ha_rows total_values =
      value_map.get_num_null_values() + num_non_null_values;

  m_null_values_fraction =
      value_map.get_num_null_values() / static_cast<double>(total_values);

  /*
    Ensure that the capacity is at least num_buckets in order to avoid the
    overhead of additional allocations when inserting buckets.
  */
  if (m_buckets.reserve(num_buckets)) return true;

  const ha_rows bucket_max_values = FindBucketMaxValues(value_map, num_buckets);
  ha_rows cumulative_values = 0;
  ha_rows bucket_values = 0;
  ha_rows bucket_distinct_values = 0;
  ha_rows bucket_unary_values = 0;  // Number of values with a count of one.
  size_t distinct_values_remaining = value_map.size();

  auto freq_it = value_map.begin();
  const T *bucket_lower_value = &freq_it->first;

  for (; freq_it != value_map.end(); ++freq_it) {
    // Add the current distinct value to the current bucket.
    cumulative_values += freq_it->second;
    bucket_values += freq_it->second;
    ++bucket_distinct_values;
    if (freq_it->second == 1) ++bucket_unary_values;
    --distinct_values_remaining;

    /*
      Continue adding the next distinct value to the bucket if:
      (1) We have not reached the last distinct value in the value_map.
      (2) There are more remaining distinct values than empty buckets.
      (3) Adding the value does not cause the bucket to exceed its max size.
    */
    auto next = std::next(freq_it);
    size_t empty_buckets_remaining = num_buckets - m_buckets.size() - 1;
    if (next != value_map.end() &&
        distinct_values_remaining > empty_buckets_remaining &&
        bucket_values + next->second <= bucket_max_values) {
      continue;
    }

    // Finalize the current bucket and add it to our collection of buckets.
    double cumulative_frequency =
        cumulative_values / static_cast<double>(total_values);
    ha_rows bucket_distinct_values_estimate =
        EstimateDistinctValues(value_map.get_sampling_rate(),
                               bucket_distinct_values, bucket_unary_values);

    equi_height::Bucket<T> bucket(*bucket_lower_value, freq_it->first,
                                  cumulative_frequency,
                                  bucket_distinct_values_estimate);

    /*
      In case the histogram construction algorithm unintentionally inserts more
      buckets than we have reserved space for and triggers a reallocation that
      fails, push_back() returns true.
    */
    assert(m_buckets.capacity() > m_buckets.size());
    if (m_buckets.push_back(bucket)) return true;

    /*
      In debug, check that the lower value actually is less than or equal to
      the upper value.
    */
    assert(!Histogram_comparator()(bucket.get_upper_inclusive(),
                                   bucket.get_lower_inclusive()));

    bucket_unary_values = 0;
    bucket_values = 0;
    bucket_distinct_values = 0;
    if (next != value_map.end()) bucket_lower_value = &next->first;
  }

  assert(m_buckets.size() <= num_buckets);
  assert(std::is_sorted(m_buckets.begin(), m_buckets.end(),
                        Histogram_comparator()));
  return false;
}

template <class T>
bool Equi_height<T>::histogram_to_json(Json_object *json_object) const {
  /*
    Call the base class implementation first. This will add the properties that
    are common among different histogram types, such as "last-updated" and
    "histogram-type".
  */
  if (Histogram::histogram_to_json(json_object))
    return true; /* purecov: inspected */

  // Add the equi-height buckets.
  Json_array buckets;
  for (const auto &bucket : m_buckets) {
    Json_array json_bucket;
    if (bucket.bucket_to_json(&json_bucket))
      return true; /* purecov: inspected */
    if (buckets.append_clone(&json_bucket))
      return true; /* purecov: inspected */
  }

  if (json_object->add_clone(buckets_str(), &buckets))
    return true; /* purecov: inspected */

  if (histogram_data_type_to_json(json_object))
    return true; /* purecov: inspected */
  return false;
}

template <class T>
std::string Equi_height<T>::histogram_type_to_str() const {
  return equi_height_str();
}

template <class T>
bool Equi_height<T>::json_to_histogram(const Json_object &json_object,
                                       Error_context *context) {
  if (Histogram::json_to_histogram(json_object, context)) return true;

  // If the histogram is internally persisted, it has already been validated
  // and should never have errors, so assert whenever an error is encountered.
  // If it is not already validated, it is a user-defined histogram and it may
  // have errors, which should be detected and reported.
  bool already_validated [[maybe_unused]] = context->binary();

  const Json_dom *buckets_dom = json_object.get(buckets_str());
  assert(!already_validated ||
         (buckets_dom && buckets_dom->json_type() == enum_json_type::J_ARRAY));
  if (buckets_dom == nullptr) {
    context->report_missing_attribute(buckets_str());
    return true;
  }
  if (buckets_dom->json_type() != enum_json_type::J_ARRAY) {
    context->report_node(buckets_dom, Message::JSON_WRONG_ATTRIBUTE_TYPE);
    return true;
  }

  const Json_array *buckets = down_cast<const Json_array *>(buckets_dom);
  if (m_buckets.reserve(buckets->size())) return true;
  for (size_t i = 0; i < buckets->size(); ++i) {
    const Json_dom *bucket_dom = (*buckets)[i];
    assert(!already_validated ||
           bucket_dom->json_type() == enum_json_type::J_ARRAY);
    if (bucket_dom->json_type() != enum_json_type::J_ARRAY) {
      context->report_node(bucket_dom, Message::JSON_WRONG_ATTRIBUTE_TYPE);
      return true;
    }

    const Json_array *bucket = down_cast<const Json_array *>(bucket_dom);
    // Only the first four items are defined, others are simply ignored.
    assert(!already_validated || (bucket->size() == 4));
    if (bucket->size() < 4) {
      context->report_node(bucket_dom, Message::JSON_WRONG_BUCKET_TYPE_4);
      return true;
    }
    if (add_bucket_from_json(bucket, context)) return true;
  }
  assert(std::is_sorted(m_buckets.begin(), m_buckets.end(),
                        Histogram_comparator()));

  // Global post-check
  {
    if (m_buckets.empty()) {
      context->report_global(Message::JSON_IMPOSSIBLE_EMPTY_EQUI_HEIGHT);
      return true;
    } else {
      equi_height::Bucket<T> *last_bucket = &m_buckets[m_buckets.size() - 1];
      float sum =
          last_bucket->get_cumulative_frequency() + get_null_values_fraction();
      if (std::abs(sum - 1.0) > 0) {
        context->report_global(Message::JSON_INVALID_TOTAL_FREQUENCY);
        return true;
      }
    }
  }
  return false;
}

template <class T>
bool Equi_height<T>::add_bucket_from_json(const Json_array *json_bucket,
                                          Error_context *context) {
  const Json_dom *cumulative_frequency_dom = (*json_bucket)[2];
  if (cumulative_frequency_dom->json_type() != enum_json_type::J_DOUBLE) {
    context->report_node(cumulative_frequency_dom,
                         Message::JSON_WRONG_ATTRIBUTE_TYPE);
    return true;
  }

  const Json_double *cumulative_frequency =
      down_cast<const Json_double *>(cumulative_frequency_dom);

  const Json_dom *num_distinct_dom = (*json_bucket)[3];
  ulonglong num_distinct_v = 0;
  if (num_distinct_dom->json_type() == enum_json_type::J_UINT) {
    num_distinct_v = down_cast<const Json_uint *>(num_distinct_dom)->value();
  } else if (!context->binary() &&
             num_distinct_dom->json_type() == enum_json_type::J_INT) {
    const Json_int *num_distinct =
        down_cast<const Json_int *>(num_distinct_dom);
    if (num_distinct->value() < 1) {
      context->report_node(num_distinct_dom,
                           Message::JSON_INVALID_NUM_DISTINCT);
      return true;
    }
    num_distinct_v = num_distinct->value();
  } else {
    context->report_node(num_distinct_dom, Message::JSON_WRONG_ATTRIBUTE_TYPE);
    return true;
  }

  const Json_dom *lower_inclusive_dom = (*json_bucket)[0];
  const Json_dom *upper_inclusive_dom = (*json_bucket)[1];

  T upper_value;
  T lower_value;
  if (extract_json_dom_value(upper_inclusive_dom, &upper_value, context))
    return true;
  if (extract_json_dom_value(lower_inclusive_dom, &lower_value, context))
    return true;

  // Bucket-extraction post-check
  {
    // Check items in the bucket
    if ((cumulative_frequency->value() < 0.0) ||
        (cumulative_frequency->value() > 1.0)) {
      context->report_node(cumulative_frequency_dom,
                           Message::JSON_INVALID_FREQUENCY);
      return true;
    }
    if (context->check_value(&upper_value)) {
      context->report_node(upper_inclusive_dom,
                           Message::JSON_VALUE_OUT_OF_RANGE);
      return true;
    }
    if (context->check_value(&lower_value)) {
      context->report_node(lower_inclusive_dom,
                           Message::JSON_VALUE_OUT_OF_RANGE);
      return true;
    }

    // Check endpoint sequence and frequency sequence.
    if (histograms::Histogram_comparator()(upper_value, lower_value)) {
      context->report_node(lower_inclusive_dom,
                           Message::JSON_VALUE_DESCENDING_IN_BUCKET);
      return true;
    }
    if (!m_buckets.empty()) {
      equi_height::Bucket<T> *last_bucket = &m_buckets[m_buckets.size() - 1];
      if (!histograms::Histogram_comparator()(
              last_bucket->get_upper_inclusive(), lower_value)) {
        context->report_node(lower_inclusive_dom,
                             Message::JSON_VALUE_NOT_ASCENDING_2);
        return true;
      }
      if (last_bucket->get_cumulative_frequency() >=
          cumulative_frequency->value()) {
        context->report_node(cumulative_frequency_dom,
                             Message::JSON_CUMULATIVE_FREQUENCY_NOT_ASCENDING);
        return true;
      }
    }
  }
  equi_height::Bucket<T> bucket(lower_value, upper_value,
                                cumulative_frequency->value(), num_distinct_v);

  if (m_buckets.push_back(bucket)) return true;

  return false;
}
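
/*
  Illustrative JSON layout (derived from the indices used above): each
  equi-height bucket is a JSON array of the form

    [lower_inclusive, upper_inclusive, cumulative_frequency, num_distinct]

  For instance, a histogram over integers could contain the hypothetical
  buckets [1, 1, 0.5, 1] and [2, 4, 1.0, 3].
*/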

template <class T>
Histogram *Equi_height<T>::clone(MEM_ROOT *mem_root) const {
  DBUG_EXECUTE_IF("fail_histogram_clone", return nullptr;);
  bool error = false;
  Histogram *equi_height =
      new (mem_root) Equi_height<T>(mem_root, *this, &error);
  if (error) return nullptr;
  return equi_height;
}

/*
  This produces an estimate for the total number of distinct values by summing
  all the individual bucket estimates. A better estimate could perhaps be
  obtained by computing a single estimate for the entire histogram when it is
  built.
*/
template <class T>
size_t Equi_height<T>::get_num_distinct_values() const {
  size_t distinct_values = 0;
  for (const auto &bucket : m_buckets) {
    distinct_values += bucket.get_num_distinct();
  }
  return distinct_values;
}

template <class T>
double Equi_height<T>::get_equal_to_selectivity(const T &value) const {
  /*
    Find the first bucket where the upper inclusive value is not less than the
    provided value.
  */
  const auto found = std::lower_bound(m_buckets.begin(), m_buckets.end(), value,
                                      Histogram_comparator());

  // Check if we are after the last bucket
  if (found == m_buckets.end()) return 0.0;

  // Check if we are before the first bucket, or between two buckets.
  if (Histogram_comparator()(value, found->get_lower_inclusive())) return 0.0;

  double bucket_frequency;
  if (found == m_buckets.begin()) {
    /*
      If the value we are looking for is in the first bucket, we will end up
      here.
    */
    bucket_frequency = found->get_cumulative_frequency();
  } else {
    /*
      If the value we are looking for is NOT in the first bucket, we will end up
      here.
    */
    const auto previous = std::prev(found, 1);
    bucket_frequency = found->get_cumulative_frequency() -
                       previous->get_cumulative_frequency();

    assert(bucket_frequency >= 0.0);
    assert(bucket_frequency <= get_non_null_values_fraction());
  }

  return (bucket_frequency / found->get_num_distinct());
}
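
/*
  Worked example (illustrative, hypothetical buckets): given the buckets
  [1, 1] with cumulative frequency 0.5 and 1 distinct value, and [2, 4] with
  cumulative frequency 1.0 and 3 distinct values, a lookup of value = 3
  finds the second bucket, computes its frequency as 1.0 - 0.5 = 0.5, and
  returns 0.5 / 3, roughly 0.167.
*/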

template <class T>
double Equi_height<T>::get_less_than_selectivity(const T &value) const {
  /*
    Find the first bucket with endpoints [a, b] where the upper inclusive value
    b is not less than the provided value, i.e. we have value <= b.
    Buckets that come before the found bucket (previous buckets) have an upper
    inclusive value strictly less than the provided value, and will therefore
    count towards the selectivity.
  */
  const auto found = std::lower_bound(m_buckets.begin(), m_buckets.end(), value,
                                      Histogram_comparator());

  if (found == m_buckets.end()) return get_non_null_values_fraction();

  double previous_bucket_cumulative_frequency;
  double found_bucket_frequency;
  if (found == m_buckets.begin()) {
    previous_bucket_cumulative_frequency = 0.0;
    found_bucket_frequency = found->get_cumulative_frequency();
  } else {
    const auto previous = std::prev(found, 1);
    previous_bucket_cumulative_frequency = previous->get_cumulative_frequency();
    found_bucket_frequency = found->get_cumulative_frequency() -
                             previous->get_cumulative_frequency();
  }

  /*
    We now consider how the found bucket contributes to the selectivity.
    There are two cases:

    1) a < value <= b
    The value lies inside the bucket and we know that the bucket is
    non-singleton since a < b. We include a fraction of the bucket's frequency
    corresponding to the position of the value between a and b.

    2) value <= a <= b
    In this case the found bucket contributes nothing since the lower inclusive
    endpoint a is greater than or equal to the value.
  */
  if (Histogram_comparator()(found->get_lower_inclusive(), value)) {
    const double distance = found->get_distance_from_lower(value);
    assert(distance >= 0.0);
    assert(distance <= 1.0);
    return previous_bucket_cumulative_frequency +
           (found_bucket_frequency * distance);
  } else {
    return previous_bucket_cumulative_frequency;
  }
}
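
/*
  Worked example (illustrative, reusing the hypothetical buckets from
  get_equal_to_selectivity above): for "x < 3" with buckets [1, 1]
  (cumulative frequency 0.5) and [2, 4] (cumulative frequency 1.0), the
  found bucket is [2, 4] and case 1) applies since 2 < 3. Assuming linear
  interpolation in get_distance_from_lower(), the estimate is
  0.5 + 0.5 * ((3 - 2) / (4 - 2)) = 0.75.
*/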

template <class T>
double Equi_height<T>::get_greater_than_selectivity(const T &value) const {
  /*
    Find the first bucket with endpoints [a, b] where the upper inclusive value
    b is greater than the provided value, i.e. we have value < b.
    Buckets that come after the found bucket (next buckets) have a lower
    inclusive value greater than the provided value, and will therefore
    count towards the selectivity.
  */
  const auto found = std::upper_bound(m_buckets.begin(), m_buckets.end(), value,
                                      Histogram_comparator());

  if (found == m_buckets.end()) return 0.0;

  double found_bucket_frequency;
  if (found == m_buckets.begin()) {
    found_bucket_frequency = found->get_cumulative_frequency();
  } else {
    const auto previous = std::prev(found, 1);
    found_bucket_frequency = found->get_cumulative_frequency() -
                             previous->get_cumulative_frequency();
  }
  double next_buckets_frequency =
      get_non_null_values_fraction() - found->get_cumulative_frequency();

  /*
    We now consider how the found bucket contributes to the selectivity.
    There are two cases:

    1) value < a <= b
    The provided value is smaller than the inclusive lower endpoint and the
    entire bucket should be included.

    2) a <= value < b
    The value lies inside the bucket and we know that the bucket is
    non-singleton since a < b. We include a fraction of the bucket's frequency
    corresponding to the position of the value between a and b.
  */
  if (Histogram_comparator()(value, found->get_lower_inclusive())) {
    return found_bucket_frequency + next_buckets_frequency;
  } else {
    const double distance = found->get_distance_from_upper(value);
    assert(distance >= 0.0);
    assert(distance <= 1.0);
    return distance * found_bucket_frequency + next_buckets_frequency;
  }
}
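
/*
  Worked example (illustrative, same hypothetical buckets and no NULL
  values, so get_non_null_values_fraction() is 1.0): for "x > 3",
  upper_bound() finds [2, 4] since 4 > 3. Its frequency is 1.0 - 0.5 = 0.5
  and no buckets follow it. Case 2) applies since 2 <= 3, so assuming linear
  interpolation in get_distance_from_upper() the estimate is
  ((4 - 3) / (4 - 2)) * 0.5 + 0.0 = 0.25.
*/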

// Explicit template instantiations.
template class Equi_height<double>;
template class Equi_height<String>;
template class Equi_height<ulonglong>;
template class Equi_height<longlong>;
template class Equi_height<MYSQL_TIME>;
template class Equi_height<my_decimal>;

}  // namespace histograms