File: opt_histogram_json.cc

package info (click to toggle)
mariadb 1%3A11.8.3-1
links: PTS, VCS
area: main
in suites: sid
size: 772,520 kB
sloc: ansic: 2,414,714; cpp: 1,791,394; asm: 381,336; perl: 62,905; sh: 49,647; pascal: 40,897; java: 39,363; python: 20,791; yacc: 20,432; sql: 17,907; xml: 12,344; ruby: 8,544; cs: 6,542; makefile: 6,145; ada: 1,879; lex: 1,193; javascript: 996; objc: 80; tcl: 73; awk: 46; php: 22
file content (1208 lines) | stat: -rw-r--r-- 31,669 bytes
/*
   Copyright (c) 2021, 2022, MariaDB Corporation.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */

#include "mariadb.h"
#include "sql_base.h"
#include "my_json_writer.h"
#include "sql_statistics.h"
#include "opt_histogram_json.h"


/*
  @brief
    Un-escape a JSON string and save it into *out.

  @detail
    There's no way to tell how much space is needed for the output.
    Start with a small string and increase its size until json_unescape()
    succeeds.
*/

static bool json_unescape_to_string(const char *val, int val_len, String* out)
{
  // Make sure 'out' has some memory allocated.
  if (!out->alloced_length() && out->alloc(128))
    return true;

  while (1)
  {
    uchar *buf= (uchar*)out->ptr();
    out->length(out->alloced_length());

    int res= json_unescape(&my_charset_utf8mb4_bin,
                           (const uchar*)val,
                           (const uchar*)val + val_len,
                           out->charset(),
                           buf, buf + out->length());
    if (res >= 0)
    {
      out->length(res);
      return false; // Ok
    }
    if (res == JSON_ERROR_ILLEGAL_SYMBOL)
      return true; // Invalid character

    // We get here if the unescaped string didn't fit into memory.
    if (res == JSON_ERROR_OUT_OF_SPACE)
    {
      if (out->alloc(out->alloced_length()*2))
        return true;
    }
    else
      return true; // unknown error
  }
}


/*
  @brief
    Escape a JSON string and save it into *out.

  @detail
    There's no way to tell how much space is needed for the output.
    Start with a small string and increase its size until json_escape()
    succeeds.
*/

static int json_escape_to_string(const String *str, String* out)
{
  // Make sure 'out' has some memory allocated.
  if (!out->alloced_length() && out->alloc(128))
    return JSON_ERROR_OUT_OF_SPACE;

  while (1)
  {
    uchar *buf= (uchar*)out->ptr();
    out->length(out->alloced_length());
    const uchar *str_ptr= (const uchar*)str->ptr();

    int res= json_escape(str->charset(),
                         str_ptr,
                         str_ptr + str->length(),
                         &my_charset_utf8mb4_bin,
                         buf, buf + out->length());
    if (res >= 0)
    {
      out->length(res);
      return 0; // Ok
    }

    if (res != JSON_ERROR_OUT_OF_SPACE)
      return res; // Some conversion error

    // Out of space error. Try with a bigger buffer
    if (out->alloc(out->alloced_length()*2))
      return JSON_ERROR_OUT_OF_SPACE;
  }
}


class Histogram_json_builder : public Histogram_builder
{
  Histogram_json_hb *histogram;
  /* Number of buckets in the histogram */
  uint hist_width;

  /*
    Number of rows that we intend to have in the bucket. That is, this is

      n_rows_in_table / hist_width

    Actual number of rows in the buckets we produce may vary because of
    "popular values" and rounding.
  */
  longlong bucket_capacity;

  /* Number of the buckets already collected */
  uint n_buckets_collected;

  /*
    TRUE means do not try to represent values as UTF-8 text in histogram
    storage. Use start_hex/end_hex for all values.
  */
  bool force_binary;

  /* Data about the bucket we are filling now */
  struct CurBucket
  {
    /* Number of values in the bucket so far. */
    longlong size;

    /* Number of distinct values in the bucket */
    int ndv;
  };
  CurBucket bucket;

  /* Used to create the JSON representation of the histogram. */
  Json_writer writer;

public:

  Histogram_json_builder(Histogram_json_hb *hist, Field *col, uint col_len,
                         ha_rows rows)
    : Histogram_builder(col, col_len, rows), histogram(hist)
  {
    /*
      When computing number of rows in the bucket, round it UP. This way, we
      will not end up with a histogram that has more buckets than intended.

      We may end up producing a histogram with fewer buckets than intended, but
      this is considered tolerable.
    */
    bucket_capacity= (longlong)round(rows2double(records) / histogram->get_width() + 0.5);
    if (bucket_capacity == 0)
      bucket_capacity= 1;
    hist_width= histogram->get_width();
    n_buckets_collected= 0;
    bucket.ndv= 0;
    bucket.size= 0;
    force_binary= (col->type() == MYSQL_TYPE_BIT);

    writer.start_object();
    append_histogram_params();

    writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
  }

  ~Histogram_json_builder() override = default;

private:
  bool bucket_is_empty() { return bucket.ndv == 0; }

  void append_histogram_params()
  {
    char buf[128];
    String str(buf, sizeof(buf), system_charset_info);
    THD *thd= current_thd;

    Timestamp(thd->query_start(), 0).to_datetime(thd).to_string(&str, 0);
    writer.add_member("target_histogram_size").add_ull(hist_width);
    writer.add_member("collected_at").add_str(str.ptr());
    writer.add_member("collected_by").add_str(server_version);
  }
  /*
    Flush the current bucket out (to JSON output), and set it to be empty.
  */
  void finalize_bucket()
  {
    double fract= (double) bucket.size / records;
    writer.add_member("size").add_double(fract);
    writer.add_member("ndv").add_ll(bucket.ndv);
    writer.end_object();
    n_buckets_collected++;

    bucket.ndv= 0;
    bucket.size= 0;
  }

  /*
    Same as finalize_bucket() but also provide the bucket's end value.
  */
  bool finalize_bucket_with_end_value(void *elem)
  {
    if (append_column_value(elem, false))
      return true;
    finalize_bucket();
    return false;
  }

  /*
    Write the first value group to the bucket.
    @param elem  The value we are writing
    @param cnt   The number of such values.
  */
  bool start_bucket(void *elem, longlong cnt)
  {
    DBUG_ASSERT(bucket.size == 0);
    writer.start_object();
    if (append_column_value(elem, true))
      return true;

    bucket.ndv= 1;
    bucket.size= cnt;
    return false;
  }
  
  /*
    Append the passed value into the JSON writer as string value
  */
  bool append_column_value(void *elem, bool is_start)
  {
    StringBuffer<MAX_FIELD_WIDTH> val;

    // Get the text representation of the value
    column->store_field_value((uchar*) elem, col_length);
    String *str= column->val_str(&val);

    // Escape the value for JSON
    StringBuffer<MAX_FIELD_WIDTH> escaped_val;
    int rc= JSON_ERROR_ILLEGAL_SYMBOL;
    if (!force_binary)
    {
      rc= json_escape_to_string(str, &escaped_val);
      if (!rc)
      {
        writer.add_member(is_start? "start": "end");
        writer.add_str(escaped_val.c_ptr_safe());
        return false;
      }
    }
    if (rc == JSON_ERROR_ILLEGAL_SYMBOL)
    {
      escaped_val.set_hex(val.ptr(), val.length());
      writer.add_member(is_start? "start_hex": "end_hex");
      writer.add_str(escaped_val.c_ptr_safe());
      return false;
    }
    return true;
  }

  /*
    Append a value group of cnt values.
  */
  void append_to_bucket(longlong cnt)
  {
    bucket.ndv++;
    bucket.size += cnt;
  }

public:
  /*
    @brief
      Add data to the histogram.

    @detail
      The call signals to add a "value group" of elem_cnt rows, each of which
      has the same value that is provided in *elem.

      Subsequent next() calls will add values that are greater than the
      current one.

    @return
      0 - OK
  */
  int next(void *elem, element_count elem_cnt) override
  {
    counters.next(elem, elem_cnt);
    ulonglong count= counters.get_count();

    /*
      Ok, we've got a "value group" of elem_cnt identical values.

      If we take the values from the value group and put them into
      the current bucket, how many values will be left after we've
      filled the bucket?
    */
    longlong overflow= bucket.size + elem_cnt - bucket_capacity;

    /*
      Case #1: This value group should be put into a separate bucket, if
       A. It fills the current bucket and also fills the next bucket, OR
       B. It fills the current bucket, which was empty.
    */
    if (overflow >= bucket_capacity || (bucket_is_empty() && overflow >= 0))
    {
      // Finalize the current bucket
      if (!bucket_is_empty())
        finalize_bucket();

      // Start/end the separate bucket for this value group.
      if (start_bucket(elem, elem_cnt))
        return 1; // OOM

      if (records == count)
      {
        if (finalize_bucket_with_end_value(elem))
          return 1;
      }
      else
        finalize_bucket();
    }
    else if (overflow >= 0)
    {
      /*
        Case #2: is when Case#1 doesn't hold, but we can still fill the
        current bucket.
      */

      // If the bucket was empty, it would have been case #1.
      DBUG_ASSERT(!bucket_is_empty());

      /*
        Finalize the current bucket. Put there enough values to make it hold
        bucket_capacity values.
      */
      append_to_bucket(bucket_capacity - bucket.size);
      if (records == count && !overflow)
      {
        if (finalize_bucket_with_end_value(elem))
          return 1;
      }
      else
        finalize_bucket();

      if (overflow > 0)
      {
        // Then, start the new bucket with the remaining values.
        if (start_bucket(elem, overflow))
          return 1;
      }
    }
    else
    {
      // Case #3: there's not enough values to fill the current bucket.
      if (bucket_is_empty())
      {
        if (start_bucket(elem, elem_cnt))
          return 1;
      }
      else
        append_to_bucket(elem_cnt);
    }

    if (records == count)
    {
      // This is the final value group.
      if (!bucket_is_empty())
      {
        if (finalize_bucket_with_end_value(elem))
          return 1;
      }
    }
    return 0;
  }

  /*
    @brief
      Finalize the creation of histogram
  */
  void finalize() override
  {
    writer.end_array();
    writer.end_object();
    Binary_string *json_string= (Binary_string *) writer.output.get_string();
    histogram->set_json_text(n_buckets_collected,
                             json_string->c_ptr(),
                             (size_t)json_string->length());
  }
};


Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
                                                     ha_rows rows)
{
  return new Histogram_json_builder(this, col, col_len, rows);
}


void Histogram_json_hb::init_for_collection(MEM_ROOT *mem_root,
                                            Histogram_type htype_arg,
                                            ulonglong size_arg)
{
  DBUG_ASSERT(htype_arg == JSON_HB);
  size= (size_t)size_arg;
}


/*
  A syntax sugar interface to json_string_t
*/
class Json_string
{
  json_string_t str;
public:
  explicit Json_string(const char *name)
  {
    json_string_set_str(&str, (const uchar*)name,
                        (const uchar*)name + strlen(name));
    json_string_set_cs(&str, system_charset_info);
  }
  json_string_t *get() { return &str; }
};


/*
  This [partially] saves the JSON parser state and then can rollback the parser
  to it.

  The goal of this is to be able to make multiple json_key_matches() calls:

    Json_saved_parser_state save(je);
    if (json_key_matches(je, KEY_NAME_1)) {
      ...
      return;
    }
    save.restore_to(je);
    if (json_key_matches(je, KEY_NAME_2)) {
      ...
    }

  This allows one to parse JSON objects where [optional] members come in any
  order.
*/

class Json_saved_parser_state
{
  const uchar *c_str;
  my_wc_t c_next;
  int state;
public:
  explicit Json_saved_parser_state(const json_engine_t *je) :
    c_str(je->s.c_str),
    c_next(je->s.c_next),
    state(je->state)
  {}
  void restore_to(json_engine_t *je)
  {
    je->s.c_str= c_str;
    je->s.c_next= c_next;
    je->state= state;
  }
};


/*
  @brief
    Read a constant from JSON document and save it in *out.

  @detail
    The JSON document stores constant in text form, we need to save it in
    KeyTupleFormat. String constants in JSON may be escaped.
*/

bool read_bucket_endpoint(json_engine_t *je, Field *field, String *out,
                          const char **err)
{
  if (json_read_value(je))
    return true;

  if (je->value_type != JSON_VALUE_STRING &&
      je->value_type != JSON_VALUE_NUMBER)
  {
    *err= "String or number expected";
    return true;
  }

  const char* je_value= (const char*)je->value;
  if (je->value_type == JSON_VALUE_STRING && je->value_escaped)
  {
    StringBuffer<128> unescape_buf(field->charset() ? field->charset() : &my_charset_bin);
    if (json_unescape_to_string(je_value, je->value_len, &unescape_buf))
    {
      *err= "Un-escape error";
      return true;
    }
    field->store_text(unescape_buf.ptr(), unescape_buf.length(),
                      unescape_buf.charset());
  }
  else
    field->store_text(je_value, je->value_len, &my_charset_utf8mb4_bin);

  out->alloc(field->pack_length());
  uint bytes= field->get_key_image((uchar*)out->ptr(),
                                   field->key_length(), Field::itRAW);
  out->length(bytes);
  return false;
}


bool read_hex_bucket_endpoint(json_engine_t *je, Field *field, String *out,
                              const char **err)
{
  if (json_read_value(je))
    return true;

  if (je->value_type != JSON_VALUE_STRING || je->value_escaped ||
      (je->value_len & 1))
  {
    *err= "Expected a hex string";
    return true;
  }
  StringBuffer<128> buf;
    
  for (auto pc= je->value; pc < je->value + je->value_len; pc+=2)
  {
    int hex_char1= hexchar_to_int(pc[0]);
    int hex_char2= hexchar_to_int(pc[1]);
    if (hex_char1 == -1 || hex_char2 == -1)
    {
      *err= "Expected a hex string";
      return true;
    }
    buf.append((hex_char1 << 4) | hex_char2);
  }

  field->store_text(buf.ptr(), buf.length(), field->charset());
  out->alloc(field->pack_length());
  uint bytes= field->get_key_image((uchar*)out->ptr(),
                                   field->key_length(), Field::itRAW);
  out->length(bytes);
  return false;
}


/*
  @brief  Parse a JSON reprsentation for one histogram bucket

  @param je     The JSON parser object
  @param field  Table field we are using histogram (used to convert
                               endpoints from text representation to binary)
  @param total_size  INOUT  Fraction of the table rows in the buckets parsed so
                            far.
  @param assigned_last_end  OUT  TRUE<=> The bucket had "end" members, the
                                 function has saved it in
                                 this->last_bucket_end_endp
  @param err  OUT  If function returns 1, this *may* be set to point to text
                   describing the error.

  @detail

    Parse a JSON object in this form:

      { "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"}

   Unknown members are ignored.

  @return
    0  OK
    1  Parse Error
   -1  EOF
*/
int Histogram_json_hb::parse_bucket(json_engine_t *je, Field *field,
                                    double *total_size,
                                    bool *assigned_last_end,
                                    const char **err)
{
  *assigned_last_end= false;
  if (json_scan_next(je))
    return 1;
  if (je->state != JST_VALUE)
  {
    if (je->state == JST_ARRAY_END)
      return -1; // EOF
    else
      return 1; // An error
  }

  if (json_scan_next(je) || je->state != JST_OBJ_START)
  {
    *err= "Expected an object in the buckets array";
    return 1;
  }

  bool have_start= false;
  bool have_size= false;
  bool have_ndv= false;
  CHARSET_INFO *cs;

  if (!(cs= field->charset()))
    cs= &my_charset_bin;

  double size_d;
  longlong ndv_ll= 0;
  StringBuffer<128> value_buf(cs);
  int rc;

  while (!(rc= json_scan_next(je)) && je->state != JST_OBJ_END)
  {
    Json_saved_parser_state save1(je);
    Json_string start_str("start");
    if (json_key_matches(je, start_str.get()))
    {
      if (read_bucket_endpoint(je, field, &value_buf, err))
        return 1;

      have_start= true;
      continue;
    }
    save1.restore_to(je);

    Json_string size_str("size");
    if (json_key_matches(je, size_str.get()))
    {
      if (json_read_value(je))
        return 1;

      const char *size= (const char*)je->value_begin;
      char *size_end= (char*)je->value_end;
      int conv_err;
      size_d= my_strtod(size, &size_end, &conv_err);
      if (conv_err)
      {
        *err= ".size member must be a floating-point value";
        return 1;
      }
      have_size= true;
      continue;
    }
    save1.restore_to(je);

    Json_string ndv_str("ndv");
    if (json_key_matches(je, ndv_str.get()))
    {
      if (json_read_value(je))
        return 1;

      const char *ndv= (const char*)je->value_begin;
      char *ndv_end= (char*)je->value_end;
      int conv_err;
      ndv_ll= my_strtoll10(ndv, &ndv_end, &conv_err);
      if (conv_err)
      {
        *err= ".ndv member must be an integer value";
        return 1;
      }
      have_ndv= true;
      continue;
    }
    save1.restore_to(je);

    Json_string end_str("end");
    if (json_key_matches(je, end_str.get()))
    {
      if (read_bucket_endpoint(je, field, &value_buf, err))
        return 1;
      last_bucket_end_endp.assign(value_buf.ptr(), value_buf.length());
      *assigned_last_end= true;
      continue;
    }
    save1.restore_to(je);

    // Less common endoints:
    Json_string start_hex_str("start_hex");
    if (json_key_matches(je, start_hex_str.get()))
    {
      if (read_hex_bucket_endpoint(je, field, &value_buf, err))
        return 1;

      have_start= true;
      continue;
    }
    save1.restore_to(je);

    Json_string end_hex_str("end_hex");
    if (json_key_matches(je, end_hex_str.get()))
    {
      if (read_hex_bucket_endpoint(je, field, &value_buf, err))
        return 1;
      last_bucket_end_endp.assign(value_buf.ptr(), value_buf.length());
      *assigned_last_end= true;
      continue;
    }
    save1.restore_to(je);


    // Some unknown member. Skip it.
    if (json_skip_key(je))
      return 1;
  }

  if (rc)
    return 1;

  if (!have_start)
  {
    *err= "\"start\" element not present";
    return 1;
  }
  if (!have_size)
  {
    *err= "\"size\" element not present";
    return 1;
  }
  if (!have_ndv)
  {
    *err= "\"ndv\" element not present";
    return 1;
  }

  *total_size += size_d;

  buckets.push_back({std::string(value_buf.ptr(), value_buf.length()),
                     *total_size, ndv_ll});

  return 0; // Ok, continue reading
}


/*
  @brief
    Parse the histogram from its on-disk JSON representation

  @detail
    See opt_histogram_json.h, class Histogram_json_hb for description of the
    data format.

  @return
     false  OK
     True   Error
*/

bool Histogram_json_hb::parse(MEM_ROOT *mem_root, const char *db_name,
                              const char *table_name, Field *field,
                              const char *hist_data, size_t hist_data_len)
{
  json_engine_t je;
  int rc;
  const char *err= "JSON parse error";
  double total_size;
  int end_element;
  bool end_assigned;
  DBUG_ENTER("Histogram_json_hb::parse");

  json_scan_start(&je, &my_charset_utf8mb4_bin,
                  (const uchar*)hist_data,
                  (const uchar*)hist_data+hist_data_len);

  if (json_scan_next(&je))
    goto err;

  if (je.state != JST_OBJ_START)
  {
    err= "Root JSON element must be a JSON object";
    goto err;
  }

  while (1)
  {
    if (json_scan_next(&je))
      goto err;
    if (je.state == JST_OBJ_END)
      break; // End of object

    if (je.state != JST_KEY)
      goto err; // Can' really have this: JSON object has keys in it

    Json_string hist_key_name(JSON_NAME);
    if (json_key_matches(&je, hist_key_name.get()))
    {
      total_size= 0.0;
      end_element= -1;
      if (json_scan_next(&je))
        goto err;

      if (je.state != JST_ARRAY_START)
      {
        err= "histogram_hb must contain an array";
        goto err;
      }

      while (!(rc= parse_bucket(&je, field, &total_size, &end_assigned, &err)))
      {
        if (end_assigned && end_element != -1)
          end_element= (int)buckets.size();
      }
      if (rc > 0)  // Got error other than EOF
        goto err;
    }
    else
    {
      // Some unknown member. Skip it.
      if (json_skip_key(&je))
        return 1;
    }
  }

  if (buckets.size() < 1)
  {
    err= "Histogram must have at least one bucket";
    goto err;
  }

  if (end_element == -1)
  {
    buckets.back().start_value= last_bucket_end_endp;
  }
  else if (end_element < (int)buckets.size())
  {
    err= ".end is only allowed in the last bucket";
    goto err;
  }

  DBUG_RETURN(false); // Ok
err:
  THD *thd= current_thd;
  push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                      ER_JSON_HISTOGRAM_PARSE_FAILED,
                      ER_THD(thd, ER_JSON_HISTOGRAM_PARSE_FAILED),
                      db_name, table_name,
                      err, (je.s.c_str - (const uchar*)hist_data));
  sql_print_error(ER_DEFAULT(ER_JSON_HISTOGRAM_PARSE_FAILED),
                  db_name, table_name, err,
                  (je.s.c_str - (const uchar*)hist_data));

  DBUG_RETURN(true);
}


static
void store_key_image_to_rec_no_null(Field *field, const char *ptr, size_t len)
{
  MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
                                    &field->table->write_set);
  field->set_key_image((const uchar*)ptr, (uint)len);
  dbug_tmp_restore_column_map(&field->table->write_set, old_map);
}


static
double position_in_interval(Field *field, const  uchar *key, uint key_len,
                            const std::string& left, const std::string& right)
{
  double res;
  if (field->pos_through_val_str())
  {
    StringBuffer<64> buf1, buf2, buf3;

    store_key_image_to_rec_no_null(field, left.data(), left.size());
    String *min_str= field->val_str(&buf1);
    /*
      Make sure we've saved a copy of the data, not a pointer into the
      field->ptr. We will overwrite the contents of field->ptr with the next
      store_key_image_to_rec_no_null call
    */
    if (&buf1 != min_str)
      buf1.copy(*min_str);
    else
      buf1.copy();

    store_key_image_to_rec_no_null(field, right.data(), right.size());
    String *max_str= field->val_str(&buf2);
    /* Same as above */
    if (&buf2 != max_str)
      buf2.copy(*max_str);
    else
      buf2.copy();

    store_key_image_to_rec_no_null(field, (const char*)key, key_len);
    String *midp_str= field->val_str(&buf3);

    res= pos_in_interval_for_string(field->charset(),
           (const uchar*)midp_str->ptr(), midp_str->length(),
           (const uchar*)buf1.ptr(), buf1.length(),
           (const uchar*)buf2.ptr(), buf2.length());
  }
  else
  {
    store_key_image_to_rec_no_null(field, left.data(), field->key_length());
    double min_val_real= field->val_real();

    store_key_image_to_rec_no_null(field, right.data(), field->key_length());
    double max_val_real= field->val_real();

    store_key_image_to_rec_no_null(field, (const char*)key, field->key_length());
    double midp_val_real= field->val_real();

    res= pos_in_interval_for_double(midp_val_real, min_val_real, max_val_real);
  }
  return res;
}


double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
                                            double avg_sel)
{
  const uchar *key = endpoint->key;
  if (field->real_maybe_null())
    key++;

  // If the value is outside of the histogram's range, this will "clip" it to
  // first or last bucket.
  int endp_cmp;
  int idx= find_bucket(field, key, &endp_cmp);

  double sel;

  if (buckets[idx].ndv == 1 && (endp_cmp!=0))
  {
    /*
      The bucket has a single value and it doesn't match! Return a very
      small value.
    */
    sel= 0.0;
  }
  else
  {
    /*
      We get here when:
      * The bucket has one value and this is the value we are looking for.
      * The bucket has multiple values. Then, assume
    */
    sel= (buckets[idx].cum_fract - get_left_fract(idx)) / buckets[idx].ndv;
  }
  return sel;
}


double Histogram_json_hb::get_left_fract(int idx)
{
  if (!idx)
    return 0.0;
  else
    return buckets[idx-1].cum_fract;
}

std::string& Histogram_json_hb::get_end_value(int idx)
{
  if (idx == (int)buckets.size()-1)
    return last_bucket_end_endp;
  else
    return buckets[idx+1].start_value;
}

/*
  @param field    The table field histogram is for.  We don't care about the
                  field's current value, we only need its virtual functions to
                  perform various operations
  @param min_endp Left endpoint, or NULL if there is none
  @param max_endp Right endpoint, or NULL if there is none
  @param avg_sel  Average selectivity of "field=const" equality for this field

  @return
     Range selectivity: a number between 0.0 and 1.0.

  @note
     This may return 0.0. Adjustments to avoid multiply-by-zero meltdown are
     made elsewhere.
*/

double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
                                            key_range *max_endp, double avg_sel)
{
  double min, max;

  if (min_endp && !(field->real_maybe_null() && min_endp->key[0]))
  {
    bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY)? true: false;
    const uchar *min_key= min_endp->key;
    uint min_key_len= min_endp->length;
    if (field->real_maybe_null())
    {
      min_key++;
      min_key_len--;
    }

    // Find the leftmost bucket that contains the lookup value.
    // (If the lookup value is to the left of all buckets, find bucket #0)
    int endp_cmp;
    int idx= find_bucket(field, min_key, &endp_cmp);

    double sel;
    // Special handling for buckets with ndv=1:
    if (buckets[idx].ndv == 1)
    {
      if (endp_cmp < 0)
        sel= 0.0;
      else if (endp_cmp > 0)
        sel= 1.0;
      else // endp_cmp == 0.0
        sel= (exclusive_endp)? 1.0 : 0.0;
    }
    else
    {
      sel= position_in_interval(field, min_key, min_key_len,
				buckets[idx].start_value,
				get_end_value(idx));
    }
    double left_fract= get_left_fract(idx);
    min= left_fract + sel * (buckets[idx].cum_fract - left_fract);
  }
  else
    min= 0.0;

  if (max_endp)
  {
    // The right endpoint cannot be NULL
    DBUG_ASSERT(!(field->real_maybe_null() && max_endp->key[0]));
    bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY)? true: false;
    const uchar *max_key= max_endp->key;
    uint max_key_len= max_endp->length;
    if (field->real_maybe_null())
    {
      max_key++;
      max_key_len--;
    }
    int endp_cmp;
    int idx= find_bucket(field, max_key, &endp_cmp);

    if ((endp_cmp == 0) && !inclusive_endp)
    {
      /*
        The range is "col < $CONST" and we've found a bucket starting with
        $CONST.
      */
      if (idx > 0)
      {
        // Move to the previous bucket
        endp_cmp= 1;
        idx--;
      }
      else
        endp_cmp= -1;
    }
    double sel;

    // Special handling for buckets with ndv=1:
    if (buckets[idx].ndv == 1)
    {
      if (endp_cmp < 0)
        sel= 0.0;
      else if (endp_cmp > 0)
        sel= 1.0;
      else // endp_cmp == 0.0
        sel= inclusive_endp? 1.0 : 0.0;
    }
    else
    {
      sel= position_in_interval(field, max_key, max_key_len,
                                buckets[idx].start_value,
                                get_end_value(idx));
    }
    double left_fract= get_left_fract(idx);
    max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
  }
  else
    max= 1.0;

  if (min > max)
  {
    /*
      This can happen due to rounding errors.

      What is the acceptable error size? Json_writer::add_double() uses
      %.11lg format. This gives 9 digits after the dot. A histogram may have
      hundreds of buckets, let's multiply the error by 1000. 9-3=6
    */
    DBUG_ASSERT(max < min + 1e-6);
    max= min;
  }
  return max - min;
}


void Histogram_json_hb::serialize(Field *field)
{
  field->store(json_text.data(), json_text.size(), &my_charset_bin);
}


#ifndef DBUG_OFF
static int SGN(int x)
{
  if (!x)
    return 0;
  return (x < 0)? -1 : 1;
}
#endif


/*
  @brief
   Find the leftmost histogram bucket such that "lookup_val >= start_value".

  @param field        Field object (used to do value comparisons)
  @param lookup_val   The lookup value in KeyTupleFormat.
  @param cmp  OUT     How the lookup_val compares to found_bucket.left_bound:
                      0  - lookup_val == bucket.left_bound
                      >0 - lookup_val > bucket.left_bound (the most typical)
                      <0 - lookup_val < bucket.left_bound. This can only happen
                      for the first bucket, for all other buckets we would just
                      pick the previous bucket and have cmp>=0.
  @return
     The bucket index
*/

int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
                                   int *cmp)
{
  int res;
  int low= 0;
  int high= (int)buckets.size() - 1;
  *cmp= 1; // By default, (bucket[retval].start_value < *lookup_val)

  while (low + 1 < high)
  {
    int middle= (low + high) / 2;
    res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
    if (!res)
    {
      *cmp= res;
      low= middle;
      goto end;
    }
    else if (res < 0)
      low= middle;
    else //res > 0
      high= middle;
  }

  /*
    If low and high were assigned a value in the above loop and we got here,
    then the following holds:

      bucket[low].start_value < lookup_val < bucket[high].start_value

    Besides that, there are two special cases: low=0 and high=last_bucket.
    Handle them below.
  */
  if (low == 0)
  {
    res= field->key_cmp(lookup_val, (uchar*)buckets[0].start_value.data());
    if (res <= 0)
      *cmp= res;
    else // res>0, lookup_val > buckets[0].start_value
    {
      res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
      if (res >= 0)  // lookup_val >= buckets[high].start_value
      {
        // Move to that bucket
        low= high;
        *cmp= res;
      }
      else
        *cmp= 1;
    }
  }
  else if (high == (int)buckets.size() - 1)
  {
    res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
    if (res >= 0)
    {
      // Ok the value is in the last bucket.
      *cmp= res;
      low= high;
    }
    else
    {
      // The value is in the 'low' bucket.
      res= field->key_cmp(lookup_val, (uchar*)buckets[low].start_value.data());
      *cmp= res;
    }
  }

end:
  // Verification: *cmp has correct value
  DBUG_ASSERT(SGN(*cmp) ==
              SGN(field->key_cmp(lookup_val,
                                 (uchar*)buckets[low].start_value.data())));
  // buckets[low] <= lookup_val, with one exception of the first bucket.
  DBUG_ASSERT(low == 0 ||
              field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
  // buckets[low+1] > lookup_val, with one exception of the last bucket
  DBUG_ASSERT(low == (int)buckets.size()-1 ||
              field->key_cmp((uchar*)buckets[low+1].start_value.data(), lookup_val)> 0);
  return low;
}