#ifndef CAFFE_TEST_GRADIENT_CHECK_UTIL_H_
#define CAFFE_TEST_GRADIENT_CHECK_UTIL_H_
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "caffe/layer.hpp"
#include "caffe/net.hpp"
namespace caffe {
// The gradient checker adds an L2 loss (half the sum of squares of the top
// blobs' values) on top of the top blobs, and checks the gradient.
template <typename Dtype>
class GradientChecker {
public:
// kink and kink_range specify an ignored nonsmooth region of the form
// kink - kink_range <= |feature value| <= kink + kink_range,
// which accounts for all of the nonsmoothness used by Caffe layers.
GradientChecker(const Dtype stepsize, const Dtype threshold,
const unsigned int seed = 1701, const Dtype kink = 0.,
const Dtype kink_range = -1)
: stepsize_(stepsize), threshold_(threshold), seed_(seed),
kink_(kink), kink_range_(kink_range) {}
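// For example, a layer that is nonsmooth at zero (such as ReLU) might be
// checked with settings like the following (illustrative values, similar to
// those used in Caffe's neuron layer tests):
//   GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
// so that features with |value| <= 0.01 are skipped. With the default
// kink_range = -1 the ignored region is empty and every feature is checked.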
// Checks the gradient of a layer, with provided bottom layers and top
// layers.
// Note that after the gradient check, we do not guarantee that the data
// stored in the layer parameters and the blobs are unchanged.
void CheckGradient(Layer<Dtype>* layer, const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top, int check_bottom = -1) {
layer->SetUp(bottom, top);
CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1);
}
void CheckGradientExhaustive(Layer<Dtype>* layer,
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
int check_bottom = -1);
// CheckGradientEltwise can be used to test layers that perform element-wise
// computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when
// i != j.
void CheckGradientEltwise(Layer<Dtype>* layer,
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
// Checks the gradient of a single output with respect to particular input
// blob(s). If check_bottom == i >= 0, check only the ith bottom Blob (the
// layer's param Blobs are always checked as well). If check_bottom == -1,
// check everything -- all bottom Blobs and all param Blobs. Otherwise
// (if check_bottom < -1), check only the param Blobs.
void CheckGradientSingle(Layer<Dtype>* layer,
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
int check_bottom, int top_id, int top_data_id, bool element_wise = false);
// Checks the gradient of a network. This network should not have any data
// layers or loss layers, since the function does not explicitly deal with
// such cases yet. All input blobs and parameter blobs are checked,
// layer by layer, to keep numerical errors from accumulating.
// The net is taken by non-const reference because Forward() mutates it.
void CheckGradientNet(Net<Dtype>& net, const vector<Blob<Dtype>*>& input);
protected:
Dtype GetObjAndGradient(const Layer<Dtype>& layer,
const vector<Blob<Dtype>*>& top, int top_id = -1, int top_data_id = -1);
Dtype stepsize_;
Dtype threshold_;
unsigned int seed_;
Dtype kink_;
Dtype kink_range_;
};
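// A minimal usage sketch (assuming a gtest fixture that fills
// blob_bottom_vec_ and blob_top_vec_ with appropriately shaped blobs, as the
// Caffe layer tests do):
//
//   LayerParameter layer_param;
//   TanHLayer<Dtype> layer(layer_param);
//   GradientChecker<Dtype> checker(1e-2, 1e-3);
//   // Element-wise layers can use the cheaper element-wise check; other
//   // layers typically use CheckGradientExhaustive.
//   checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
//       this->blob_top_vec_);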
template <typename Dtype>
void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
int check_bottom, int top_id, int top_data_id, bool element_wise) {
if (element_wise) {
CHECK_EQ(0, layer->blobs().size());
CHECK_LE(0, top_id);
CHECK_LE(0, top_data_id);
const int top_count = top[top_id]->count();
for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) {
CHECK_EQ(top_count, bottom[blob_id]->count());
}
}
// First, figure out which blobs we need to check against, and zero-initialize
// the parameter blobs' diffs.
vector<Blob<Dtype>*> blobs_to_check;
vector<bool> propagate_down(bottom.size(), check_bottom == -1);
for (int i = 0; i < layer->blobs().size(); ++i) {
Blob<Dtype>* blob = layer->blobs()[i].get();
caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
blobs_to_check.push_back(blob);
}
if (check_bottom == -1) {
for (int i = 0; i < bottom.size(); ++i) {
blobs_to_check.push_back(bottom[i]);
}
} else if (check_bottom >= 0) {
CHECK_LT(check_bottom, bottom.size());
blobs_to_check.push_back(bottom[check_bottom]);
propagate_down[check_bottom] = true;
}
CHECK_GT(blobs_to_check.size(), 0) << "No blobs to check.";
// Compute the gradient analytically using Backward
Caffe::set_random_seed(seed_);
// Ignore the loss from the layer (it's just the weighted sum of the losses
// from the top blobs, whose gradients we may want to test individually).
layer->Forward(bottom, top);
// Get additional loss from the objective
GetObjAndGradient(*layer, top, top_id, top_data_id);
layer->Backward(top, propagate_down, bottom);
// Store computed gradients for all checked blobs
vector<shared_ptr<Blob<Dtype> > >
computed_gradient_blobs(blobs_to_check.size());
for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
Blob<Dtype>* current_blob = blobs_to_check[blob_id];
computed_gradient_blobs[blob_id].reset(new Blob<Dtype>());
computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob);
const int count = blobs_to_check[blob_id]->count();
const Dtype* diff = blobs_to_check[blob_id]->cpu_diff();
Dtype* computed_gradients =
computed_gradient_blobs[blob_id]->mutable_cpu_data();
caffe_copy(count, diff, computed_gradients);
}
// Compute derivative of top w.r.t. each bottom and parameter input using
// finite differencing.
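// The numerical estimate below is the symmetric (central) difference
//   dL/dx ~= (L(x + h) - L(x - h)) / (2 * h),   h = stepsize_,
// which has O(h^2) truncation error for smooth objectives.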
// LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
Blob<Dtype>* current_blob = blobs_to_check[blob_id];
const Dtype* computed_gradients =
computed_gradient_blobs[blob_id]->cpu_data();
// LOG(ERROR) << "Blob " << blob_id << ": checking "
// << current_blob->count() << " parameters.";
for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
// For an element-wise layer, we only need to do finite differencing to
// compute the derivative of top[top_id][top_data_id] w.r.t.
// bottom[blob_id][i] for i == top_data_id. For any other i != top_data_id,
// the derivative is 0 by definition, and we simply check that this is true.
Dtype estimated_gradient = 0;
Dtype positive_objective = 0;
Dtype negative_objective = 0;
if (!element_wise || (feat_id == top_data_id)) {
// Do finite differencing.
// Compute loss with stepsize_ added to input.
current_blob->mutable_cpu_data()[feat_id] += stepsize_;
Caffe::set_random_seed(seed_);
layer->Forward(bottom, top);
positive_objective =
GetObjAndGradient(*layer, top, top_id, top_data_id);
// Compute loss with stepsize_ subtracted from input.
current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
Caffe::set_random_seed(seed_);
layer->Forward(bottom, top);
negative_objective =
GetObjAndGradient(*layer, top, top_id, top_data_id);
// Recover original input value.
current_blob->mutable_cpu_data()[feat_id] += stepsize_;
estimated_gradient = (positive_objective - negative_objective) /
stepsize_ / 2.;
}
Dtype computed_gradient = computed_gradients[feat_id];
Dtype feature = current_blob->cpu_data()[feat_id];
// LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
// << current_blob->cpu_diff()[feat_id];
if (kink_ - kink_range_ > fabs(feature)
|| fabs(feature) > kink_ + kink_range_) {
// We check relative accuracy, but for values that are too small we clamp
// the scale factor at 1.
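// Concretely, the check below requires
//   |computed - estimated| <= threshold_ * max(|computed|, |estimated|, 1).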
Dtype scale = std::max<Dtype>(
std::max(fabs(computed_gradient), fabs(estimated_gradient)),
Dtype(1.));
EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale)
<< "debug: (top_id, top_data_id, blob_id, feat_id)="
<< top_id << "," << top_data_id << "," << blob_id << "," << feat_id
<< "; feat = " << feature
<< "; objective+ = " << positive_objective
<< "; objective- = " << negative_objective;
}
// LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
// LOG(ERROR) << "computed gradient: " << computed_gradient
// << " estimated_gradient: " << estimated_gradient;
}
}
}
template <typename Dtype>
void GradientChecker<Dtype>::CheckGradientExhaustive(Layer<Dtype>* layer,
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
int check_bottom) {
layer->SetUp(bottom, top);
CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob.";
// LOG(ERROR) << "Exhaustive Mode.";
for (int i = 0; i < top.size(); ++i) {
// LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count();
for (int j = 0; j < top[i]->count(); ++j) {
// LOG(ERROR) << "Exhaustive: blob " << i << " data " << j;
CheckGradientSingle(layer, bottom, top, check_bottom, i, j);
}
}
}
template <typename Dtype>
void GradientChecker<Dtype>::CheckGradientEltwise(Layer<Dtype>* layer,
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
layer->SetUp(bottom, top);
CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob.";
const int check_bottom = -1;
const bool element_wise = true;
for (int i = 0; i < top.size(); ++i) {
for (int j = 0; j < top[i]->count(); ++j) {
CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise);
}
}
}
template <typename Dtype>
void GradientChecker<Dtype>::CheckGradientNet(
Net<Dtype>& net, const vector<Blob<Dtype>*>& input) {
const vector<shared_ptr<Layer<Dtype> > >& layers = net.layers();
const vector<vector<Blob<Dtype>*> >& bottom_vecs = net.bottom_vecs();
const vector<vector<Blob<Dtype>*> >& top_vecs = net.top_vecs();
for (int i = 0; i < layers.size(); ++i) {
net.Forward(input);
LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name();
// CheckGradientExhaustive takes the layer by pointer.
CheckGradientExhaustive(layers[i].get(), bottom_vecs[i], top_vecs[i]);
}
}
template <typename Dtype>
Dtype GradientChecker<Dtype>::GetObjAndGradient(const Layer<Dtype>& layer,
const vector<Blob<Dtype>*>& top, int top_id, int top_data_id) {
Dtype loss = 0;
if (top_id < 0) {
// the loss will be half of the sum of squares of all outputs
for (int i = 0; i < top.size(); ++i) {
Blob<Dtype>* top_blob = top[i];
const Dtype* top_blob_data = top_blob->cpu_data();
Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
int count = top_blob->count();
for (int j = 0; j < count; ++j) {
loss += top_blob_data[j] * top_blob_data[j];
}
// Set the diff: since loss = 1/2 * sum_j data[j]^2, d(loss)/d(data[j]) is
// simply the data.
caffe_copy(top_blob->count(), top_blob_data, top_blob_diff);
}
loss /= 2.;
} else {
// The loss is the top_data_id-th element in the top_id-th blob, scaled by
// loss_weight.
for (int i = 0; i < top.size(); ++i) {
Blob<Dtype>* top_blob = top[i];
Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
caffe_set(top_blob->count(), Dtype(0), top_blob_diff);
}
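// Using a loss weight of 2 (rather than 1) also verifies that Backward
// scales the gradient by the top diff instead of ignoring it.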
const Dtype loss_weight = 2;
loss = top[top_id]->cpu_data()[top_data_id] * loss_weight;
top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight;
}
return loss;
}
} // namespace caffe
#endif // CAFFE_TEST_GRADIENT_CHECK_UTIL_H_