// Copyright (c) 2017 GeometryFactory Sarl (France).
// All rights reserved.
//
// This file is part of CGAL (www.cgal.org).
//
// $URL: https://github.com/CGAL/cgal/blob/v6.1/Classification/include/CGAL/Classification/OpenCV/Random_forest_classifier.h $
// $Id: include/CGAL/Classification/OpenCV/Random_forest_classifier.h b26b07a1242 $
// SPDX-License-Identifier: GPL-3.0-or-later OR LicenseRef-Commercial
//
// Author(s) : Simon Giraudot
#ifndef CGAL_CLASSIFICATION_OPENCV_RANDOM_FOREST_CLASSIFIER_H
#define CGAL_CLASSIFICATION_OPENCV_RANDOM_FOREST_CLASSIFIER_H
#include <CGAL/license/Classification.h>
#include <CGAL/Classification/Feature_set.h>
#include <CGAL/Classification/Label_set.h>
#include <opencv2/opencv.hpp>
//In opencv version 2.X the first digit is named EPOCH,
//until version 3.0 where EPOCH disappears and it becomes MAJOR. Hence this
//weird condition
#ifdef CV_VERSION_EPOCH
#if CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR>= 11
#include <opencv2/ml.hpp>
#else
#include <opencv2/ml/ml.hpp>
#endif
#else
#include <opencv2/ml.hpp>
#endif
namespace CGAL {
namespace Classification {
namespace OpenCV {
/*!
\ingroup PkgClassificationClassifiersOpenCV
\brief %Classifier based on the OpenCV version of the random forest algorithm.
\note This class requires the \ref thirdpartyOpenCV library.
\cgalModels{CGAL::Classification::Classifier}
*/
class Random_forest_classifier
{
const Label_set& m_labels;
const Feature_set& m_features;
int m_max_depth;
int m_min_sample_count;
int m_max_categories;
int m_max_number_of_trees_in_the_forest;
float m_forest_accuracy;
#if (CV_MAJOR_VERSION < 3)
CvRTrees* rtree;
#else
cv::Ptr<cv::ml::RTrees> rtree;
#endif
public:
/// \name Constructor
/// @{
/*!
\brief instantiates the classifier using the sets of `labels` and `features`.
Parameters documentation is copy-pasted from [the official documentation of OpenCV](https://docs.opencv.org/2.4/modules/ml/doc/random_trees.html). For more details on this method, please refer to it.
\param labels label set used.
\param features feature set used.
\param max_depth the depth of the tree. A low value will likely underfit and conversely a high value will likely overfit. The optimal value can be obtained using cross validation or other suitable methods.
\param min_sample_count minimum samples required at a leaf node for it to be split. A reasonable value is a small percentage of the total data e.g. 1%.
\param max_categories Cluster possible values of a categorical variable into \f$ K \leq max\_categories \f$ clusters to find a suboptimal split. If a discrete variable, on which the training procedure tries to make a split, takes more than max_categories values, the precise best subset estimation may take a very long time because the algorithm is exponential. Instead, many decision trees engines (including ML) try to find sub-optimal split in this case by clustering all the samples into max_categories clusters that is some categories are merged together. The clustering is applied only in \f$ n>2-class \f$ classification problems for categorical variables with \f$ N > max\_categories \f$ possible values. In case of regression and 2-class classification the optimal split can be found efficiently without employing clustering, thus the parameter is not used in these cases.
\param max_number_of_trees_in_the_forest The maximum number of trees in the forest (surprise, surprise). Typically the more trees you have the better the accuracy. However, the improvement in accuracy generally diminishes and asymptotes pass a certain number of trees. Also to keep in mind, the number of tree increases the prediction time linearly.
\param forest_accuracy Sufficient accuracy (OOB error).
*/
Random_forest_classifier (const Label_set& labels,
const Feature_set& features,
int max_depth = 20,
int min_sample_count = 5,
int max_categories = 15,
int max_number_of_trees_in_the_forest = 100,
float forest_accuracy = 0.01f)
: m_labels (labels), m_features (features),
m_max_depth (max_depth), m_min_sample_count (min_sample_count),
m_max_categories (max_categories),
m_max_number_of_trees_in_the_forest (max_number_of_trees_in_the_forest),
m_forest_accuracy (forest_accuracy)
#if (CV_MAJOR_VERSION < 3)
, rtree (nullptr)
#endif
{ }
/// \cond SKIP_IN_MANUAL
~Random_forest_classifier ()
{
#if (CV_MAJOR_VERSION < 3)
if (rtree != nullptr)
delete rtree;
#endif
}
/// \endcond
/// @}
/// \name Parameters
/// @{
void set_max_depth (int max_depth) { m_max_depth = max_depth; }
void set_min_sample_count (int min_sample_count) { m_min_sample_count = min_sample_count; }
void set_max_categories (int max_categories) { m_max_categories = max_categories; }
void set_max_number_of_trees_in_the_forest (int max_number_of_trees_in_the_forest)
{ m_max_number_of_trees_in_the_forest = max_number_of_trees_in_the_forest; }
void set_forest_accuracy (float forest_accuracy) { m_forest_accuracy = forest_accuracy; }
/// @}
/// \name Training
/// @{
/*!
\brief runs the training algorithm.
From the set of provided ground truth, this algorithm estimates
sets up the random trees that produce the most accurate result
with respect to this ground truth.
\pre At least one ground truth item should be assigned to each
label.
\param ground_truth vector of label indices. It should contain for
each input item, in the same order as the input set, the index of
the corresponding label in the `Label_set` provided in the
constructor. Input items that do not have a ground truth
information should be given the value `-1`.
*/
template <typename LabelIndexRange>
void train (const LabelIndexRange& ground_truth)
{
CGAL_precondition (m_labels.is_valid_ground_truth (ground_truth));
#if (CV_MAJOR_VERSION < 3)
if (rtree != nullptr)
delete rtree;
#endif
#ifdef CGAL_CLASSIFICATION_VERBOSE
std::cerr << "Training random forest (OpenCV "
<< CV_MAJOR_VERSION << "."
<< CV_MINOR_VERSION << ")" << std::endl;
#endif
std::size_t nb_samples = 0;
for (const auto& gt_value : ground_truth)
if (int(gt_value) != -1)
++ nb_samples;
cv::Mat training_features (int(nb_samples), int(m_features.size()), CV_32FC1);
cv::Mat training_labels (int(nb_samples), 1, CV_32FC1);
std::size_t i = 0, index = 0;
for (const auto& gt_value : ground_truth)
{
if (int(gt_value) != -1)
{
for (std::size_t f = 0; f < m_features.size(); ++ f)
training_features.at<float>(int(index), int(f)) = m_features[f]->value(i);
training_labels.at<float>(int(index), 0) = static_cast<float>(gt_value);
++ index;
}
++ i;
}
#if (CV_MAJOR_VERSION < 3)
float* priors = new float[m_labels.size()];
for (std::size_t i = 0; i < m_labels.size(); ++ i)
priors[i] = 1.;
CvRTParams params;
if (m_forest_accuracy == 0.f)
params = CvRTParams
(m_max_depth, m_min_sample_count,
0, false, m_max_categories, priors, false, 0,
m_max_number_of_trees_in_the_forest,
m_forest_accuracy, CV_TERMCRIT_ITER);
else
params = CvRTParams
(m_max_depth, m_min_sample_count,
0, false, m_max_categories, priors, false, 0,
m_max_number_of_trees_in_the_forest,
m_forest_accuracy, CV_TERMCRIT_EPS | CV_TERMCRIT_ITER);
cv::Mat var_type (m_features.size() + 1, 1, CV_8U);
var_type.setTo (cv::Scalar(CV_VAR_NUMERICAL));
rtree = new CvRTrees;
rtree->train (training_features, CV_ROW_SAMPLE, training_labels,
cv::Mat(), cv::Mat(), var_type, cv::Mat(), params);
delete[] priors;
#else
rtree = cv::ml::RTrees::create();
rtree->setMaxDepth (m_max_depth);
rtree->setMinSampleCount (m_min_sample_count);
rtree->setMaxCategories (m_max_categories);
rtree->setCalculateVarImportance (false);
rtree->setRegressionAccuracy (m_forest_accuracy);
rtree->setUseSurrogates(false);
rtree->setPriors(cv::Mat());
rtree->setCalculateVarImportance(false);
cv::TermCriteria criteria;
if (m_forest_accuracy == 0.f)
criteria = cv::TermCriteria (cv::TermCriteria::COUNT, m_max_number_of_trees_in_the_forest, m_forest_accuracy);
else
criteria = cv::TermCriteria (cv::TermCriteria::EPS + cv::TermCriteria::COUNT, m_max_number_of_trees_in_the_forest, m_forest_accuracy);
rtree->setTermCriteria (criteria);
cv::Ptr<cv::ml::TrainData> tdata = cv::ml::TrainData::create
(training_features, cv::ml::ROW_SAMPLE, training_labels);
rtree->train (tdata);
#endif
}
/// @}
/// \cond SKIP_IN_MANUAL
void operator() (std::size_t item_index, std::vector<float>& out) const
{
out.resize (m_labels.size(), 0.);
cv::Mat feature (1, int(m_features.size()), CV_32FC1);
for (std::size_t f = 0; f < m_features.size(); ++ f)
feature.at<float>(0, int(f)) = m_features[f]->value(item_index);
//compute the result of each tree
#if (CV_MAJOR_VERSION < 3)
std::size_t nb_trees = std::size_t(rtree->get_tree_count());
for (std::size_t i = 0; i < nb_trees; i++)
{
std::size_t l = rtree->get_tree(int(i))->predict(feature, cv::Mat())->value;
out[l] += 1.;
}
for (std::size_t i = 0; i < out.size(); ++ i)
out[i] = out[i] / nb_trees;
#else
std::vector<float> result (1, 0);
rtree->predict (feature, result);
for (std::size_t i = 0; i < out.size(); ++ i)
if (i == std::size_t(result[0]))
out[i] = 1.f;
else
out[i] = 0.f;
#endif
}
/// \endcond
/// \name Input/Output
/// @{
/*!
\brief saves the current configuration in the file named `filename`.
This allows to easily save and recover a specific classification
configuration.
The output file is written in an XML format that is readable by
the `load_configuration()` method.
*/
void save_configuration (const char* filename)
{
rtree->save(filename);
}
/*!
\brief loads a configuration from the file named `filename`.
The input file should be in the XML format written by the
`save_configuration()` method. The feature set of the classifier
should contain the exact same features in the exact same order as
the ones present when the file was generated using
`save_configuration()`.
*/
void load_configuration (const char* filename)
{
#if (CV_MAJOR_VERSION < 3)
if (rtree != nullptr)
delete rtree;
rtree = new CvRTrees;
rtree->load(filename);
#else
rtree = cv::ml::StatModel::load<cv::ml::RTrees> (filename);
#endif
}
/// @}
};
}
/// \cond SKIP_IN_MANUAL
// Backward compatibility
typedef OpenCV::Random_forest_classifier OpenCV_random_forest_classifier;
/// \endcond
}
}
#endif // CGAL_CLASSIFICATION_OPENCV_RANDOM_FOREST_CLASSIFIER_H