File: crn_dxt_fast.cpp

package info (click to toggle)
crunch-dxtc 0.55.5-1
links: PTS, VCS
area: main
in suites: forky
size: 3,600 kB
sloc: cpp: 64,979; ansic: 633; python: 321; makefile: 112
file content (838 lines) | stat: -rw-r--r-- 23,492 bytes
parent folder | download | duplicates (2)
// File: crn_dxt_fast.cpp
// See Copyright Notice and license at the end of inc/crnlib.h
// Parts of this module are derived from RYG's excellent public domain DXTx compressor.
#include "crn_core.h"
#include "crn_dxt_fast.h"
#include "crn_ryg_dxt.hpp"

namespace crnlib {
namespace dxt_fast {
static inline int mul_8bit(int a, int b) {
  int t = a * b + 128;
  return (t + (t >> 8)) >> 8;
}

static inline color_quad_u8& unpack_color(color_quad_u8& c, uint v) {
  uint rv = (v & 0xf800) >> 11;
  uint gv = (v & 0x07e0) >> 5;
  uint bv = (v & 0x001f) >> 0;

  c.r = ryg_dxt::Expand5[rv];
  c.g = ryg_dxt::Expand6[gv];
  c.b = ryg_dxt::Expand5[bv];
  c.a = 0;

  return c;
}

static inline uint pack_color(const color_quad_u8& c) {
  return (mul_8bit(c.r, 31) << 11) + (mul_8bit(c.g, 63) << 5) + mul_8bit(c.b, 31);
}

#if 0
static inline void lerp_color(color_quad_u8& result, const color_quad_u8& p1, const color_quad_u8& p2, uint f) {
  CRNLIB_ASSERT(f <= 255);

  result.r = static_cast<uint8>(p1.r + mul_8bit(p2.r - p1.r, f));
  result.g = static_cast<uint8>(p1.g + mul_8bit(p2.g - p1.g, f));
  result.b = static_cast<uint8>(p1.b + mul_8bit(p2.b - p1.b, f));
}
#endif

static inline void eval_colors(color_quad_u8* pColors, uint c0, uint c1) {
  unpack_color(pColors[0], c0);
  unpack_color(pColors[1], c1);

#if 0
         lerp_color(pColors[2], pColors[0], pColors[1], 0x55);
         lerp_color(pColors[3], pColors[0], pColors[1], 0xAA);
#else
  pColors[2].r = (pColors[0].r * 2 + pColors[1].r) / 3;
  pColors[2].g = (pColors[0].g * 2 + pColors[1].g) / 3;
  pColors[2].b = (pColors[0].b * 2 + pColors[1].b) / 3;

  pColors[3].r = (pColors[1].r * 2 + pColors[0].r) / 3;
  pColors[3].g = (pColors[1].g * 2 + pColors[0].g) / 3;
  pColors[3].b = (pColors[1].b * 2 + pColors[0].b) / 3;
#endif
}

// false if all selectors equal
static bool match_block_colors(uint n, const color_quad_u8* pBlock, const color_quad_u8* pColors, uint8* pSelectors) {
  int dirr = pColors[0].r - pColors[1].r;
  int dirg = pColors[0].g - pColors[1].g;
  int dirb = pColors[0].b - pColors[1].b;

  int stops[4];
  for (int i = 0; i < 4; i++)
    stops[i] = pColors[i].r * dirr + pColors[i].g * dirg + pColors[i].b * dirb;

  // 0 2 3 1
  int c0Point = stops[1] + stops[3];
  int halfPoint = stops[3] + stops[2];
  int c3Point = stops[2] + stops[0];

  //dirr *= 2;
  //dirg *= 2;
  //dirb *= 2;
  c0Point >>= 1;
  halfPoint >>= 1;
  c3Point >>= 1;

  bool status = false;
  for (uint i = 0; i < n; i++) {
    int dot = pBlock[i].r * dirr + pBlock[i].g * dirg + pBlock[i].b * dirb;

    uint8 s;
    if (dot < halfPoint)
      s = (dot < c0Point) ? 1 : 3;
    else
      s = (dot < c3Point) ? 2 : 0;

    pSelectors[i] = s;

    if (s != pSelectors[0])
      status = true;
  }

  return status;
}

static bool optimize_block_colors(uint n, const color_quad_u8* block, uint& max16, uint& min16, uint ave_color[3], float axis[3]) {
  int min[3], max[3];

  for (uint ch = 0; ch < 3; ch++) {
    const uint8* bp = ((const uint8*)block) + ch;
    int minv, maxv;

    int64 muv = bp[0];
    minv = maxv = bp[0];

    const uint l = n << 2;
    for (uint i = 4; i < l; i += 4) {
      muv += bp[i];
      minv = math::minimum<int>(minv, bp[i]);
      maxv = math::maximum<int>(maxv, bp[i]);
    }

    ave_color[ch] = static_cast<int>((muv + (n / 2)) / n);
    min[ch] = minv;
    max[ch] = maxv;
  }

  if ((min[0] == max[0]) && (min[1] == max[1]) && (min[2] == max[2]))
    return false;

  // determine covariance matrix
  double cov[6];
  for (int i = 0; i < 6; i++)
    cov[i] = 0;

  for (uint i = 0; i < n; i++) {
    double r = (int)block[i].r - (int)ave_color[0];
    double g = (int)block[i].g - (int)ave_color[1];
    double b = (int)block[i].b - (int)ave_color[2];

    cov[0] += r * r;
    cov[1] += r * g;
    cov[2] += r * b;
    cov[3] += g * g;
    cov[4] += g * b;
    cov[5] += b * b;
  }

  double covf[6], vfr, vfg, vfb;
  for (int i = 0; i < 6; i++)
    covf[i] = cov[i] * (1.0f / 255.0f);

  vfr = max[0] - min[0];
  vfg = max[1] - min[1];
  vfb = max[2] - min[2];

  static const uint nIterPower = 4;
  for (uint iter = 0; iter < nIterPower; iter++) {
    double r = vfr * covf[0] + vfg * covf[1] + vfb * covf[2];
    double g = vfr * covf[1] + vfg * covf[3] + vfb * covf[4];
    double b = vfr * covf[2] + vfg * covf[4] + vfb * covf[5];

    vfr = r;
    vfg = g;
    vfb = b;
  }

  double magn = math::maximum(math::maximum(fabs(vfr), fabs(vfg)), fabs(vfb));
  int v_r, v_g, v_b;

  if (magn < 4.0f)  // too small, default to luminance
  {
    v_r = 148;
    v_g = 300;
    v_b = 58;

    axis[0] = (float)v_r;
    axis[1] = (float)v_g;
    axis[2] = (float)v_b;
  } else {
    magn = 512.0f / magn;
    vfr *= magn;
    vfg *= magn;
    vfb *= magn;
    v_r = static_cast<int>(vfr);
    v_g = static_cast<int>(vfg);
    v_b = static_cast<int>(vfb);

    axis[0] = (float)vfr;
    axis[1] = (float)vfg;
    axis[2] = (float)vfb;
  }

  int mind = block[0].r * v_r + block[0].g * v_g + block[0].b * v_b;
  int maxd = mind;
  color_quad_u8 minp(block[0]);
  color_quad_u8 maxp(block[0]);

  for (uint i = 1; i < n; i++) {
    int dot = block[i].r * v_r + block[i].g * v_g + block[i].b * v_b;

    if (dot < mind) {
      mind = dot;
      minp = block[i];
    }

    if (dot > maxd) {
      maxd = dot;
      maxp = block[i];
    }
  }

  max16 = pack_color(maxp);
  min16 = pack_color(minp);

  return true;
}

// The refinement function. (Clever code, part 2)
// Tries to optimize colors to suit block contents better.
// (By solving a least squares system via normal equations+Cramer's rule)
static bool refine_block(uint n, const color_quad_u8* block, uint& max16, uint& min16, const uint8* pSelectors) {
  static const int w1Tab[4] = {3, 0, 2, 1};

  static const int prods_0[4] = {0x00, 0x00, 0x02, 0x02};
  static const int prods_1[4] = {0x00, 0x09, 0x01, 0x04};
  static const int prods_2[4] = {0x09, 0x00, 0x04, 0x01};

  double akku_0 = 0;
  double akku_1 = 0;
  double akku_2 = 0;
  double At1_r, At1_g, At1_b;
  double At2_r, At2_g, At2_b;

  At1_r = At1_g = At1_b = 0;
  At2_r = At2_g = At2_b = 0;
  for (uint i = 0; i < n; i++) {
    double r = block[i].r;
    double g = block[i].g;
    double b = block[i].b;
    int step = pSelectors[i];

    int w1 = w1Tab[step];

    akku_0 += prods_0[step];
    akku_1 += prods_1[step];
    akku_2 += prods_2[step];
    At1_r += w1 * r;
    At1_g += w1 * g;
    At1_b += w1 * b;
    At2_r += r;
    At2_g += g;
    At2_b += b;
  }

  At2_r = 3 * At2_r - At1_r;
  At2_g = 3 * At2_g - At1_g;
  At2_b = 3 * At2_b - At1_b;

  double xx = akku_2;
  double yy = akku_1;
  double xy = akku_0;

  double t = xx * yy - xy * xy;
  if (!yy || !xx || (fabs(t) < .0000125f))
    return false;

  double frb = (3.0f * 31.0f / 255.0f) / t;
  double fg = frb * (63.0f / 31.0f);

  uint oldMin = min16;
  uint oldMax = max16;

  // solve.
  max16 = math::clamp<int>(static_cast<int>((At1_r * yy - At2_r * xy) * frb + 0.5f), 0, 31) << 11;
  max16 |= math::clamp<int>(static_cast<int>((At1_g * yy - At2_g * xy) * fg + 0.5f), 0, 63) << 5;
  max16 |= math::clamp<int>(static_cast<int>((At1_b * yy - At2_b * xy) * frb + 0.5f), 0, 31) << 0;

  min16 = math::clamp<int>(static_cast<int>((At2_r * xx - At1_r * xy) * frb + 0.5f), 0, 31) << 11;
  min16 |= math::clamp<int>(static_cast<int>((At2_g * xx - At1_g * xy) * fg + 0.5f), 0, 63) << 5;
  min16 |= math::clamp<int>(static_cast<int>((At2_b * xx - At1_b * xy) * frb + 0.5f), 0, 31) << 0;

  return (oldMin != min16) || (oldMax != max16);
}

// false if all selectors equal
static bool determine_selectors(uint n, const color_quad_u8* block, uint min16, uint max16, uint8* pSelectors) {
  color_quad_u8 color[4];

  if (max16 != min16) {
    eval_colors(color, min16, max16);

    return match_block_colors(n, block, color, pSelectors);
  }

  memset(pSelectors, 0, n);
  return false;
}

static uint64 determine_error(uint n, const color_quad_u8* block, uint min16, uint max16, uint64 early_out_error) {
  color_quad_u8 color[4];

  eval_colors(color, min16, max16);

  int dirr = color[0].r - color[1].r;
  int dirg = color[0].g - color[1].g;
  int dirb = color[0].b - color[1].b;

  int stops[4];
  for (int i = 0; i < 4; i++)
    stops[i] = color[i].r * dirr + color[i].g * dirg + color[i].b * dirb;

  // 0 2 3 1
  int c0Point = stops[1] + stops[3];
  int halfPoint = stops[3] + stops[2];
  int c3Point = stops[2] + stops[0];

  c0Point >>= 1;
  halfPoint >>= 1;
  c3Point >>= 1;

  uint64 total_error = 0;

  for (uint i = 0; i < n; i++) {
    const color_quad_u8& a = block[i];

    uint s = 0;
    if (min16 != max16) {
      int dot = a.r * dirr + a.g * dirg + a.b * dirb;

      if (dot < halfPoint)
        s = (dot < c0Point) ? 1 : 3;
      else
        s = (dot < c3Point) ? 2 : 0;
    }

    const color_quad_u8& b = color[s];

    int e = a[0] - b[0];
    total_error += e * e;

    e = a[1] - b[1];
    total_error += e * e;

    e = a[2] - b[2];
    total_error += e * e;

    if (total_error >= early_out_error)
      break;
  }

  return total_error;
}

static bool refine_endpoints(uint n, const color_quad_u8* pBlock, uint& low16, uint& high16, uint8* pSelectors) {
  bool optimized = false;

  const int limits[3] = {31, 63, 31};

  for (uint trial = 0; trial < 2; trial++) {
    color_quad_u8 color[4];
    eval_colors(color, low16, high16);

    uint64 total_error[3] = {0, 0, 0};

    for (uint i = 0; i < n; i++) {
      const color_quad_u8& a = pBlock[i];

      const uint s = pSelectors[i];
      const color_quad_u8& b = color[s];

      int e = a[0] - b[0];
      total_error[0] += e * e;

      e = a[1] - b[1];
      total_error[1] += e * e;

      e = a[2] - b[2];
      total_error[2] += e * e;
    }

    color_quad_u8 endpoints[2];
    endpoints[0] = dxt1_block::unpack_color((uint16)low16, false);
    endpoints[1] = dxt1_block::unpack_color((uint16)high16, false);

    color_quad_u8 expanded_endpoints[2];
    expanded_endpoints[0] = dxt1_block::unpack_color((uint16)low16, true);
    expanded_endpoints[1] = dxt1_block::unpack_color((uint16)high16, true);

    bool trial_optimized = false;

    for (uint axis = 0; axis < 3; axis++) {
      if (!total_error[axis])
        continue;

      const sU8* const pExpand = (axis == 1) ? ryg_dxt::Expand6 : ryg_dxt::Expand5;

      for (uint e = 0; e < 2; e++) {
        uint v[4];
        v[e ^ 1] = expanded_endpoints[e ^ 1][axis];

        for (int t = -1; t <= 1; t += 2) {
          int a = endpoints[e][axis] + t;
          if ((a < 0) || (a > limits[axis]))
            continue;

          v[e] = pExpand[a];

          //int delta = v[1] - v[0];
          //v[2] = v[0] + mul_8bit(delta, 0x55);
          //v[3] = v[0] + mul_8bit(delta, 0xAA);

          v[2] = (v[0] * 2 + v[1]) / 3;
          v[3] = (v[0] + v[1] * 2) / 3;

          uint64 axis_error = 0;

          for (uint i = 0; i < n; i++) {
            const color_quad_u8& p = pBlock[i];

            int e = v[pSelectors[i]] - p[axis];

            axis_error += e * e;

            if (axis_error >= total_error[axis])
              break;
          }

          if (axis_error < total_error[axis]) {
            //total_error[axis] = axis_error;

            endpoints[e][axis] = (uint8)a;
            expanded_endpoints[e][axis] = (uint8)v[e];

            if (e)
              high16 = dxt1_block::pack_color(endpoints[1], false);
            else
              low16 = dxt1_block::pack_color(endpoints[0], false);

            determine_selectors(n, pBlock, low16, high16, pSelectors);

            eval_colors(color, low16, high16);

            utils::zero_object(total_error);

            for (uint i = 0; i < n; i++) {
              const color_quad_u8& a = pBlock[i];

              const uint s = pSelectors[i];
              const color_quad_u8& b = color[s];

              int e = a[0] - b[0];
              total_error[0] += e * e;

              e = a[1] - b[1];
              total_error[1] += e * e;

              e = a[2] - b[2];
              total_error[2] += e * e;
            }

            trial_optimized = true;
          }

        }  // t

      }  // e
    }    // axis

    if (!trial_optimized)
      break;

    optimized = true;

  }  // for ( ; ; )

  return optimized;
}

static void refine_endpoints2(uint n, const color_quad_u8* pBlock, uint& low16, uint& high16, uint8* pSelectors, float axis[3]) {
  uint64 orig_error = determine_error(n, pBlock, low16, high16, cUINT64_MAX);
  if (!orig_error)
    return;

  float l = 1.0f / sqrt(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
  vec3F principle_axis(axis[0] * l, axis[1] * l, axis[2] * l);

  const float dist_per_trial = 0.027063293f;

  const uint cMaxProbeRange = 8;
  uint probe_low[cMaxProbeRange * 2 + 1];
  uint probe_high[cMaxProbeRange * 2 + 1];

  int probe_range = 8;
  uint num_iters = 4;

  const uint num_trials = probe_range * 2 + 1;

  vec3F scaled_principle_axis(principle_axis * dist_per_trial);
  scaled_principle_axis[0] *= 31.0f;
  scaled_principle_axis[1] *= 63.0f;
  scaled_principle_axis[2] *= 31.0f;
  vec3F initial_ofs(scaled_principle_axis * (float)-probe_range);
  initial_ofs[0] += .5f;
  initial_ofs[1] += .5f;
  initial_ofs[2] += .5f;

  uint64 cur_error = orig_error;

  for (uint iter = 0; iter < num_iters; iter++) {
    color_quad_u8 endpoints[2];

    endpoints[0] = dxt1_block::unpack_color((uint16)low16, false);
    endpoints[1] = dxt1_block::unpack_color((uint16)high16, false);

    vec3F low_color(endpoints[0][0], endpoints[0][1], endpoints[0][2]);
    vec3F high_color(endpoints[1][0], endpoints[1][1], endpoints[1][2]);

    vec3F probe_low_color(low_color + initial_ofs);
    for (uint i = 0; i < num_trials; i++) {
      int r = math::clamp((int)floor(probe_low_color[0]), 0, 31);
      int g = math::clamp((int)floor(probe_low_color[1]), 0, 63);
      int b = math::clamp((int)floor(probe_low_color[2]), 0, 31);
      probe_low[i] = b | (g << 5U) | (r << 11U);

      probe_low_color += scaled_principle_axis;
    }

    vec3F probe_high_color(high_color + initial_ofs);
    for (uint i = 0; i < num_trials; i++) {
      int r = math::clamp((int)floor(probe_high_color[0]), 0, 31);
      int g = math::clamp((int)floor(probe_high_color[1]), 0, 63);
      int b = math::clamp((int)floor(probe_high_color[2]), 0, 31);
      probe_high[i] = b | (g << 5U) | (r << 11U);

      probe_high_color += scaled_principle_axis;
    }

    uint best_l = low16;
    uint best_h = high16;

    enum { cMaxHash = 4 };
    uint64 hash[cMaxHash];
    for (uint i = 0; i < cMaxHash; i++)
      hash[i] = 0;

    uint c = best_l | (best_h << 16);
    c = fast_hash(&c, sizeof(c));
    hash[(c >> 6) & 3] = 1ULL << (c & 63);

    for (uint i = 0; i < num_trials; i++) {
      for (uint j = 0; j < num_trials; j++) {
        uint l = probe_low[i];
        uint h = probe_high[j];
        if (l < h)
          utils::swap(l, h);

        uint c = l | (h << 16);
        c = fast_hash(&c, sizeof(c));
        uint64 mask = 1ULL << (c & 63);
        uint ofs = (c >> 6) & 3;
        if (hash[ofs] & mask)
          continue;

        hash[ofs] |= mask;

        uint64 new_error = determine_error(n, pBlock, l, h, cur_error);
        if (new_error < cur_error) {
          best_l = l;
          best_h = h;
          cur_error = new_error;
        }
      }
    }

    bool improved = false;

    if ((best_l != low16) || (best_h != high16)) {
      low16 = best_l;
      high16 = best_h;

      determine_selectors(n, pBlock, low16, high16, pSelectors);
      improved = true;
    }

    if (refine_endpoints(n, pBlock, low16, high16, pSelectors)) {
      improved = true;

      uint64 cur_error = determine_error(n, pBlock, low16, high16, cUINT64_MAX);
      if (!cur_error)
        return;
    }

    if (!improved)
      break;

  }  // iter

  //uint64 end_error = determine_error(n, pBlock, low16, high16, UINT64_MAX);
  //if (end_error > orig_error) DebugBreak();
}

static void compress_solid_block(uint n, uint ave_color[3], uint& low16, uint& high16, uint8* pSelectors) {
  uint r = ave_color[0];
  uint g = ave_color[1];
  uint b = ave_color[2];

  memset(pSelectors, 2, n);

  low16 = (ryg_dxt::OMatch5[r][0] << 11) | (ryg_dxt::OMatch6[g][0] << 5) | ryg_dxt::OMatch5[b][0];
  high16 = (ryg_dxt::OMatch5[r][1] << 11) | (ryg_dxt::OMatch6[g][1] << 5) | ryg_dxt::OMatch5[b][1];
}

void compress_color_block(uint n, const color_quad_u8* block, uint& low16, uint& high16, uint8* pSelectors, bool refine) {
  CRNLIB_ASSERT((n & 15) == 0);

  uint ave_color[3];
  float axis[3];

  if (!optimize_block_colors(n, block, low16, high16, ave_color, axis)) {
    compress_solid_block(n, ave_color, low16, high16, pSelectors);
  } else {
    if (!determine_selectors(n, block, low16, high16, pSelectors))
      compress_solid_block(n, ave_color, low16, high16, pSelectors);
    else {
      if (refine_block(n, block, low16, high16, pSelectors))
        determine_selectors(n, block, low16, high16, pSelectors);

      if (refine)
        refine_endpoints2(n, block, low16, high16, pSelectors, axis);
    }
  }

  if (low16 < high16) {
    utils::swap(low16, high16);
    for (uint i = 0; i < n; i++)
      pSelectors[i] ^= 1;
  }
}

void compress_color_block(dxt1_block* pDXT1_block, const color_quad_u8* pBlock, bool refine) {
  uint8 color_selectors[16];
  uint low16, high16;
  dxt_fast::compress_color_block(16, pBlock, low16, high16, color_selectors, refine);

  pDXT1_block->set_low_color(static_cast<uint16>(low16));
  pDXT1_block->set_high_color(static_cast<uint16>(high16));

  uint mask = 0;
  for (int i = 15; i >= 0; i--) {
    mask <<= 2;
    mask |= color_selectors[i];
  }

  pDXT1_block->m_selectors[0] = (uint8)(mask & 0xFF);
  pDXT1_block->m_selectors[1] = (uint8)((mask >> 8) & 0xFF);
  pDXT1_block->m_selectors[2] = (uint8)((mask >> 16) & 0xFF);
  pDXT1_block->m_selectors[3] = (uint8)((mask >> 24) & 0xFF);
}

void compress_alpha_block(uint n, const color_quad_u8* block, uint& low8, uint& high8, uint8* pSelectors, uint comp_index) {
  int min, max;
  min = max = block[0][comp_index];

  for (uint i = 1; i < n; i++) {
    min = math::minimum<int>(min, block[i][comp_index]);
    max = math::maximum<int>(max, block[i][comp_index]);
  }

  low8 = max;
  high8 = min;

  int dist = max - min;
  int bias = min * 7 - (dist >> 1);
  int dist4 = dist * 4;
  int dist2 = dist * 2;

  for (uint i = 0; i < n; i++) {
    int a = block[i][comp_index] * 7 - bias;
    int ind, t;

    t = (dist4 - a) >> 31;
    ind = t & 4;
    a -= dist4 & t;
    t = (dist2 - a) >> 31;
    ind += t & 2;
    a -= dist2 & t;
    t = (dist - a) >> 31;
    ind += t & 1;

    ind = -ind & 7;
    ind ^= (2 > ind);

    pSelectors[i] = static_cast<uint8>(ind);
  }
}

void compress_alpha_block(dxt5_block* pDXT5_block, const color_quad_u8* pBlock, uint comp_index) {
  uint8 selectors[16];
  uint low8, high8;

  compress_alpha_block(16, pBlock, low8, high8, selectors, comp_index);

  pDXT5_block->set_low_alpha(low8);
  pDXT5_block->set_high_alpha(high8);

  uint mask = 0;
  uint bits = 0;
  uint8* pDst = pDXT5_block->m_selectors;

  for (uint i = 0; i < 16; i++) {
    mask |= (selectors[i] << bits);

    if ((bits += 3) >= 8) {
      *pDst++ = static_cast<uint8>(mask);
      mask >>= 8;
      bits -= 8;
    }
  }
}

void find_representative_colors(uint n, const color_quad_u8* pBlock, color_quad_u8& lo, color_quad_u8& hi) {
  uint64 ave64[3];
  ave64[0] = 0;
  ave64[1] = 0;
  ave64[2] = 0;

  for (uint i = 0; i < n; i++) {
    ave64[0] += pBlock[i].r;
    ave64[1] += pBlock[i].g;
    ave64[2] += pBlock[i].b;
  }

  uint ave[3];
  ave[0] = static_cast<uint>((ave64[0] + (n / 2)) / n);
  ave[1] = static_cast<uint>((ave64[1] + (n / 2)) / n);
  ave[2] = static_cast<uint>((ave64[2] + (n / 2)) / n);

  int furthest_dist = -1;
  uint furthest_index = 0;
  for (uint i = 0; i < n; i++) {
    int r = pBlock[i].r - ave[0];
    int g = pBlock[i].g - ave[1];
    int b = pBlock[i].b - ave[2];
    int dist = r * r + g * g + b * b;
    if (dist > furthest_dist) {
      furthest_dist = dist;
      furthest_index = i;
    }
  }

  color_quad_u8 lo_color(pBlock[furthest_index]);

  int opp_dist = -1;
  uint opp_index = 0;
  for (uint i = 0; i < n; i++) {
    int r = pBlock[i].r - lo_color.r;
    int g = pBlock[i].g - lo_color.g;
    int b = pBlock[i].b - lo_color.b;
    int dist = r * r + g * g + b * b;
    if (dist > opp_dist) {
      opp_dist = dist;
      opp_index = i;
    }
  }

  color_quad_u8 hi_color(pBlock[opp_index]);

  for (uint i = 0; i < 3; i++) {
    lo_color[i] = static_cast<uint8>((lo_color[i] + ave[i]) >> 1);
    hi_color[i] = static_cast<uint8>((hi_color[i] + ave[i]) >> 1);
  }

  const uint cMaxIters = 4;
  for (uint iter_index = 0; iter_index < cMaxIters; iter_index++) {
    if ((lo_color[0] == hi_color[0]) && (lo_color[1] == hi_color[1]) && (lo_color[2] == hi_color[2]))
      break;

    uint64 new_color[2][3];
    uint weight[2];

    utils::zero_object(new_color);
    utils::zero_object(weight);

    int vec_r = hi_color[0] - lo_color[0];
    int vec_g = hi_color[1] - lo_color[1];
    int vec_b = hi_color[2] - lo_color[2];

    int lo_dot = vec_r * lo_color[0] + vec_g * lo_color[1] + vec_b * lo_color[2];
    int hi_dot = vec_r * hi_color[0] + vec_g * hi_color[1] + vec_b * hi_color[2];
    int mid_dot = lo_dot + hi_dot;

    vec_r *= 2;
    vec_g *= 2;
    vec_b *= 2;

    for (uint i = 0; i < n; i++) {
      const color_quad_u8& c = pBlock[i];

      const int dot = c[0] * vec_r + c[1] * vec_g + c[2] * vec_b;
      const uint match_index = (dot > mid_dot);

      new_color[match_index][0] += c.r;
      new_color[match_index][1] += c.g;
      new_color[match_index][2] += c.b;
      weight[match_index]++;
    }

    if ((!weight[0]) || (!weight[1]))
      break;

    uint8 new_color8[2][3];

    for (uint j = 0; j < 2; j++)
      for (uint i = 0; i < 3; i++)
        new_color8[j][i] = static_cast<uint8>((new_color[j][i] + (weight[j] / 2)) / weight[j]);

    if ((new_color8[0][0] == lo_color[0]) && (new_color8[0][1] == lo_color[1]) && (new_color8[0][2] == lo_color[2]) &&
        (new_color8[1][0] == hi_color[0]) && (new_color8[1][1] == hi_color[1]) && (new_color8[1][2] == hi_color[2]))
      break;

    for (uint i = 0; i < 3; i++) {
      lo_color[i] = new_color8[0][i];
      hi_color[i] = new_color8[1][i];
    }
  }

  uint energy[2] = {0, 0};
  for (uint i = 0; i < 3; i++) {
    energy[0] += lo_color[i] * lo_color[i];
    energy[1] += hi_color[i] * hi_color[i];
  }

  if (energy[0] > energy[1])
    utils::swap(lo_color, hi_color);

  lo = lo_color;
  hi = hi_color;
}

}  // namespace dxt_fast

}  // namespace crnlib