1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
|
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/zucchini/heuristic_ensemble_matcher.h"
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "base/functional/bind.h"
#include "base/logging.h"
#include "base/memory/raw_ref.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/stringprintf.h"
#include "components/zucchini/binary_data_histogram.h"
#include "components/zucchini/element_detection.h"
#include "components/zucchini/image_utils.h"
#include "components/zucchini/io_utils.h"
namespace zucchini {
namespace {
/******** Helper Functions ********/
// Uses |detector| to find embedded executables inside |image|, and returns the
// result on success, or std::nullopt on failure, which occurs if too many
// (> |kElementLimit|) elements are found.
std::optional<std::vector<Element>> FindEmbeddedElements(
ConstBufferView image,
const std::string& name,
offset_t start_scan_at,
ElementDetector&& detector) {
// Maximum number of Elements in a file. This is enforced because our matching
// algorithm is O(n^2), which suffices for regular archive files that should
// have up to 10's of executable files. An archive containing 100's of
// executables is likely pathological, and is rejected to prevent exploits.
static constexpr size_t kElementLimit = 256;
std::vector<Element> elements;
std::unique_ptr<ElementFinder> element_finder =
std::make_unique<ElementFinder>(image, std::move(detector),
start_scan_at);
for (auto element = element_finder->GetNext();
element.has_value() && elements.size() <= kElementLimit;
element = element_finder->GetNext()) {
elements.push_back(*element);
}
if (elements.size() >= kElementLimit) {
LOG(WARNING) << name << ": Found too many elements.";
return std::nullopt;
}
LOG(INFO) << name << ": Found " << elements.size() << " elements.";
return elements;
}
// Determines whether a proposed comparison between Elements should be rejected
// early, to decrease the likelihood of creating false-positive matches, which
// may be costly for patching. Our heuristic simply prohibits big difference in
// size (relative and absolute) between matched elements.
bool UnsafeDifference(const Element& old_element, const Element& new_element) {
static constexpr double kMaxBloat = 2.0;
static constexpr size_t kMinWorrysomeDifference = 2 << 20; // 2MB
size_t lo_size = std::min(old_element.size, new_element.size);
size_t hi_size = std::max(old_element.size, new_element.size);
if (hi_size - lo_size < kMinWorrysomeDifference)
return false;
if (hi_size < lo_size * kMaxBloat)
return false;
return true;
}
std::ostream& operator<<(std::ostream& stream, const Element& elt) {
stream << "(" << CastExecutableTypeToString(elt.exe_type) << ", "
<< AsHex<8, size_t>(elt.offset) << " +" << AsHex<8, size_t>(elt.size)
<< ")";
return stream;
}
/******** MatchingInfoOut ********/
// A class to output detailed information during ensemble matching. Extracting
// the functionality to a separate class decouples formatting and printing logic
// from matching logic. The base class consists of stubs.
class MatchingInfoOut {
protected:
MatchingInfoOut() = default;
MatchingInfoOut(const MatchingInfoOut&) = delete;
const MatchingInfoOut& operator=(const MatchingInfoOut&) = delete;
public:
virtual ~MatchingInfoOut() = default;
virtual void InitSizes(size_t old_size, size_t new_size) {}
virtual void DeclareTypeMismatch(int iold, int inew) {}
virtual void DeclareUnsafeDistance(int iold, int inew) {}
virtual void DeclareCandidate(int iold, int inew) {}
virtual void DeclareMatch(int iold,
int inew,
double dist,
bool is_identical) {}
virtual void DeclareOutlier(int iold, int inew) {}
virtual void OutputCompare(const Element& old_element,
const Element& new_element,
double dist) {}
virtual void OutputMatch(const Element& best_old_element,
const Element& new_element,
bool is_identical,
double best_dist) {}
virtual void OutputScores(const std::string& stats) {}
virtual void OutputTextGrid() {}
};
/******** MatchingInfoTerse ********/
// A terse MatchingInfoOut that prints only basic information, using LOG().
class MatchingInfoOutTerse : public MatchingInfoOut {
public:
MatchingInfoOutTerse() = default;
MatchingInfoOutTerse(const MatchingInfoOutTerse&) = delete;
const MatchingInfoOutTerse& operator=(const MatchingInfoOutTerse&) = delete;
~MatchingInfoOutTerse() override = default;
void OutputScores(const std::string& stats) override {
LOG(INFO) << "Best dists: " << stats;
}
};
/******** MatchingInfoOutVerbose ********/
// A verbose MatchingInfoOut that prints detailed information using |out_|,
// including comparison pairs, scores, and a text grid representation of
// pairwise matching results.
class MatchingInfoOutVerbose : public MatchingInfoOut {
public:
explicit MatchingInfoOutVerbose(std::ostream& out) : out_(out) {}
MatchingInfoOutVerbose(const MatchingInfoOutVerbose&) = delete;
const MatchingInfoOutVerbose& operator=(const MatchingInfoOutVerbose&) =
delete;
~MatchingInfoOutVerbose() override = default;
// Outputs sizes and initializes |text_grid_|.
void InitSizes(size_t old_size, size_t new_size) override {
*out_ << "Comparing old (" << old_size << " elements) and new (" << new_size
<< " elements)" << std::endl;
text_grid_.assign(new_size, std::string(old_size, '-'));
best_dist_.assign(new_size, -1.0);
}
// Functions to update match status in text grid representation.
void DeclareTypeMismatch(int iold, int inew) override {
text_grid_[inew][iold] = 'T';
}
void DeclareUnsafeDistance(int iold, int inew) override {
text_grid_[inew][iold] = 'U';
}
void DeclareCandidate(int iold, int inew) override {
text_grid_[inew][iold] = 'C'; // Provisional.
}
void DeclareMatch(int iold,
int inew,
double dist,
bool is_identical) override {
text_grid_[inew][iold] = is_identical ? 'I' : 'M';
best_dist_[inew] = dist;
}
void DeclareOutlier(int iold, int inew) override {
text_grid_[inew][iold] = 'O';
}
// Functions to print detailed information.
void OutputCompare(const Element& old_element,
const Element& new_element,
double dist) override {
*out_ << "Compare old" << old_element << " to new" << new_element << " --> "
<< base::StringPrintf("%.5f", dist) << std::endl;
}
void OutputMatch(const Element& best_old_element,
const Element& new_element,
bool is_identical,
double best_dist) override {
if (is_identical) {
*out_ << "Skipped old" << best_old_element << " - identical to new"
<< new_element;
} else {
*out_ << "Matched old" << best_old_element << " to new" << new_element
<< " --> " << base::StringPrintf("%.5f", best_dist);
}
*out_ << std::endl;
}
void OutputScores(const std::string& stats) override {
*out_ << "Best dists: " << stats << std::endl;
}
void OutputTextGrid() override {
int new_size = static_cast<int>(text_grid_.size());
for (int inew = 0; inew < new_size; ++inew) {
const std::string& line = text_grid_[inew];
*out_ << " ";
for (char ch : line) {
char prefix = (ch == 'I' || ch == 'M') ? '(' : ' ';
char suffix = (ch == 'I' || ch == 'M') ? ')' : ' ';
*out_ << prefix << ch << suffix;
}
if (best_dist_[inew] >= 0)
*out_ << " " << base::StringPrintf("%.5f", best_dist_[inew]);
*out_ << std::endl;
}
if (!text_grid_.empty()) {
*out_ << " Legend: I = identical, M = matched, T = type mismatch, "
"U = unsafe distance, C = candidate, O = outlier, - = skipped."
<< std::endl;
}
}
private:
const raw_ref<std::ostream> out_;
// Text grid representation of matches. Rows correspond to "old" and columns
// correspond to "new".
std::vector<std::string> text_grid_;
// For each "new" element, distance of best match. -1 denotes no match.
std::vector<double> best_dist_;
};
} // namespace
/******** HeuristicEnsembleMatcher ********/
HeuristicEnsembleMatcher::HeuristicEnsembleMatcher(offset_t start_scan_at,
std::ostream* out)
: start_scan_at_(start_scan_at), out_(out) {}
HeuristicEnsembleMatcher::~HeuristicEnsembleMatcher() = default;
bool HeuristicEnsembleMatcher::RunMatch(ConstBufferView old_image,
ConstBufferView new_image) {
DCHECK(matches_.empty());
LOG(INFO) << "Start matching.";
// Find all elements in "old" and "new".
std::optional<std::vector<Element>> old_elements =
FindEmbeddedElements(old_image, "Old file", start_scan_at_,
base::BindRepeating(DetectElementFromDisassembler));
if (!old_elements.has_value())
return false;
std::optional<std::vector<Element>> new_elements =
FindEmbeddedElements(new_image, "New file", start_scan_at_,
base::BindRepeating(DetectElementFromDisassembler));
if (!new_elements.has_value())
return false;
std::unique_ptr<MatchingInfoOut> info_out;
if (out_)
info_out = std::make_unique<MatchingInfoOutVerbose>(*out_);
else
info_out = std::make_unique<MatchingInfoOutTerse>();
const int num_new_elements = base::checked_cast<int>(new_elements->size());
const int num_old_elements = base::checked_cast<int>(old_elements->size());
info_out->InitSizes(num_old_elements, num_new_elements);
// For each "new" element, match it with the "old" element that's nearest to
// it, with distance determined by BinaryDataHistogram. The resulting
// "old"-"new" pairs are stored into |results|. Possibilities:
// - Type mismatch: No match.
// - UnsafeDifference() heuristics fail: No match.
// - Identical match: Skip "new" since this is a trivial case.
// - Non-identical match: Match "new" with "old" with min distance.
// - No match: Skip "new".
struct Results {
int iold;
int inew;
double dist;
};
std::vector<Results> results;
// Precompute histograms for "old" since they get reused.
std::vector<BinaryDataHistogram> old_his(num_old_elements);
for (int iold = 0; iold < num_old_elements; ++iold) {
ConstBufferView sub_image(old_image[(*old_elements)[iold]]);
old_his[iold].Compute(sub_image);
// ProgramDetector should have imposed minimal size limit to |sub_image|.
// Therefore resulting histogram are expected to be valid.
CHECK(old_his[iold].IsValid());
}
const int kUninitIold = num_old_elements;
for (int inew = 0; inew < num_new_elements; ++inew) {
const Element& cur_new_element = (*new_elements)[inew];
ConstBufferView cur_new_sub_image(new_image[cur_new_element.region()]);
BinaryDataHistogram new_his;
new_his.Compute(cur_new_sub_image);
CHECK(new_his.IsValid());
double best_dist = HUGE_VAL;
int best_iold = kUninitIold;
bool is_identical = false;
for (int iold = 0; iold < num_old_elements; ++iold) {
const Element& cur_old_element = (*old_elements)[iold];
if (cur_old_element.exe_type != cur_new_element.exe_type) {
info_out->DeclareTypeMismatch(iold, inew);
continue;
}
if (UnsafeDifference(cur_old_element, cur_new_element)) {
info_out->DeclareUnsafeDistance(iold, inew);
continue;
}
double dist = old_his[iold].Distance(new_his);
info_out->DeclareCandidate(iold, inew);
info_out->OutputCompare(cur_old_element, cur_new_element, dist);
if (best_dist > dist) { // Tie resolution: First-one, first-serve.
best_iold = iold;
best_dist = dist;
if (best_dist == 0) {
ConstBufferView sub_image(old_image[cur_old_element.region()]);
if (sub_image.equals(cur_new_sub_image)) {
is_identical = true;
break;
}
}
}
}
if (best_iold != kUninitIold) {
const Element& best_old_element = (*old_elements)[best_iold];
info_out->DeclareMatch(best_iold, inew, best_dist, is_identical);
if (is_identical) // Skip "new" if identical match is found.
++num_identical_;
else
results.push_back({best_iold, inew, best_dist});
info_out->OutputMatch(best_old_element, cur_new_element, is_identical,
best_dist);
}
}
// Populate |matches_| from |result|. To reduce that chance of false-positive
// matches, statistics on dists are computed. If a match's |dist| is an
// outlier then it is rejected.
if (results.size() > 0) {
OutlierDetector detector;
for (const auto& result : results) {
if (result.dist > 0)
detector.Add(result.dist);
}
detector.Prepare();
info_out->OutputScores(detector.RenderStats());
for (const Results& result : results) {
if (detector.DecideOutlier(result.dist) > 0) {
info_out->DeclareOutlier(result.iold, result.inew);
} else {
matches_.push_back(
{(*old_elements)[result.iold], (*new_elements)[result.inew]});
}
}
info_out->OutputTextGrid();
}
Trim();
return true;
}
} // namespace zucchini
|