File: phishing_classifier.h

package info (click to toggle)
chromium 139.0.7258.138-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,120,676 kB
  • sloc: cpp: 35,100,869; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (193 lines) | stat: -rw-r--r-- 7,752 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This class handles the process of extracting all of the features from a
// page and computing a phishiness score.  The basic steps are:
//  - Run each feature extractor over the page, building up a FeatureMap of
//    feature -> value.
//  - SHA-256 hash all of the feature names in the map so that they match the
//    supplied model.
//  - Hand the hashed map off to a Scorer, which computes the probability that
//    the page is phishy.
//  - If the page is phishy, run the supplied callback.
//
// For more details, see phishing_*_feature_extractor.h, scorer.h, and
// client_model.proto.

#ifndef COMPONENTS_SAFE_BROWSING_CONTENT_RENDERER_PHISHING_CLASSIFIER_PHISHING_CLASSIFIER_H_
#define COMPONENTS_SAFE_BROWSING_CONTENT_RENDERER_PHISHING_CLASSIFIER_PHISHING_CLASSIFIER_H_

#include <stdint.h>

#include <memory>
#include <set>
#include <string>
#include <vector>

#include "base/functional/callback.h"
#include "base/memory/raw_ptr.h"
#include "base/memory/ref_counted_memory.h"
#include "base/memory/scoped_refptr.h"
#include "base/memory/weak_ptr.h"
#include "base/time/time.h"
#include "components/safe_browsing/content/renderer/phishing_classifier/scorer.h"
#include "third_party/skia/include/core/SkBitmap.h"

// Forward declaration: this header only refers to content::RenderFrame
// through pointers (a raw pointer parameter and a raw_ptr<> member), so the
// full definition is not needed here.
namespace content {
class RenderFrame;
}

namespace safe_browsing {
// Forward declarations for types that the class below refers to only by
// reference or through std::unique_ptr, so their full definitions are not
// needed in this header.
// NOTE(review): Scorer is not referenced by any declaration in this header,
// and scorer.h is already included above -- this forward declaration looks
// redundant; confirm before removing.
class ClientPhishingRequest;
class VisualFeatures;
class FeatureMap;
class PhishingDOMFeatureExtractor;
class PhishingTermFeatureExtractor;
class PhishingUrlFeatureExtractor;
class PhishingVisualFeatureExtractor;
class Scorer;

class PhishingClassifier {
 public:
  // Outcome of a classification attempt.  It is passed to DoneCallback
  // together with the verdict: kSuccess accompanies a valid client score, and
  // every other value names the step that failed.
  // NOTE(review): the values are explicitly numbered, presumably because they
  // are recorded outside this class (e.g. in metrics) -- confirm before
  // renumbering or reusing a value.
  enum class Result {
    kSuccess = 0,
    kInvalidScore = 1,
    kInvalidURLFormatRequest = 2,
    kInvalidDocumentLoader = 3,
    kURLFeatureExtractionFailed = 4,
    kDOMExtractionFailed = 5,
    kTermExtractionFailed = 6,
    kVisualExtractionFailed = 7,
  };

  // Callback to be run when phishing classification finishes. The verdict
  // is a ClientPhishingRequest which contains the verdict computed by the
  // classifier as well as the extracted features.  If the verdict.is_phishing()
  // is true, the page is considered phishy by the client-side model,
  // and the browser should ping back to get a final verdict.  The
  // verdict.client_score() is set to -1 if the classification failed. If the
  // client_score() is not -1, the Result will be kSuccess,
  // and one of other results otherwise.
  using DoneCallback =
      base::OnceCallback<void(const ClientPhishingRequest& /* verdict */,
                              Result /*result*/)>;

  // Sentinel score for a failed classification.  The value is defined out of
  // line.
  // NOTE(review): presumably this is the -1 client_score() mentioned in the
  // DoneCallback comment above -- confirm against the .cc file.
  static const int kClassifierFailed;

  // Creates a new PhishingClassifier object that will operate on
  // |render_frame|. Note that the classifier will not be 'ready' until
  // set_phishing_scorer() is called.
  explicit PhishingClassifier(content::RenderFrame* render_frame);

  // Not copyable or assignable.
  PhishingClassifier(const PhishingClassifier&) = delete;
  PhishingClassifier& operator=(const PhishingClassifier&) = delete;

  virtual ~PhishingClassifier();

  // Returns true if the classifier is ready to classify pages, i.e. it
  // has had a scorer set via set_phishing_scorer().
  bool is_ready() const;

  // Called by the RenderView when a page has finished loading.  This begins
  // the feature extraction and scoring process. |page_text| should contain
  // the plain text of a web page, including any subframes, as returned by
  // RenderView::CaptureText().  |page_text| is passed as a ref-counted handle.
  // NOTE(review): the class has a |page_text_| member of the same
  // scoped_refptr type, so it presumably holds its own reference until
  // |callback| is run or CancelPendingClassification() is called -- confirm
  // against the .cc file.
  //
  // To avoid blocking the render thread for too long, phishing classification
  // may run in several chunks of work, posting a task to the current
  // MessageLoop to continue processing.  Once the scoring process is complete,
  // |callback| is run on the current thread.  PhishingClassifier takes
  // ownership of the callback.
  //
  // It is an error to call BeginClassification if the classifier is not yet
  // ready.
  virtual void BeginClassification(
      scoped_refptr<const base::RefCountedString16> page_text,
      DoneCallback callback);

  // Called by the RenderView (on the render thread) when a page is unloading
  // or the RenderView is being destroyed.  This cancels any extraction that
  // is in progress.  It is an error to call CancelPendingClassification if
  // the classifier is not yet ready.
  virtual void CancelPendingClassification();

 private:
  // Any score equal to or above this value is considered phishy.
  static const float kPhishyThreshold;

  // Begins the feature extraction process, by extracting URL features and
  // beginning DOM feature extraction.
  void BeginFeatureExtraction();

  // Callback to be run when DOM feature extraction is complete.
  // If it was successful, begins term feature extraction, otherwise
  // runs the DoneCallback with a non-phishy verdict.
  void DOMExtractionFinished(bool success);

  // Callback to be run when term feature extraction is complete.
  // If it was successful, begins visual feature extraction, otherwise runs the
  // DoneCallback with a non-phishy verdict.
  void TermExtractionFinished(bool success);

  // Called to extract the visual features of the current page.
  void ExtractVisualFeatures();

  // Callback when off-thread playback of the recorded paint operations is
  // complete.
  void OnPlaybackDone(std::unique_ptr<SkBitmap> bitmap);

  // Callback when visual features have been extracted from the screenshot.
  void OnVisualFeaturesExtracted(
      std::unique_ptr<VisualFeatures> visual_features);

  // Callback when visual feature extraction is complete.
  // If it was successful, computes a score and runs the DoneCallback.
  // If extraction was unsuccessful, runs the DoneCallback with a
  // non-phishy verdict.
  void VisualExtractionFinished(bool success);

  // Callback when the visual TFLite model has been applied, and returned a list
  // of scores.
  void OnVisualTfLiteModelDone(std::unique_ptr<ClientPhishingRequest> verdict,
                               std::vector<double> result);

  // Helper method to run the DoneCallback and clear the state.
  void RunCallback(const ClientPhishingRequest& verdict,
                   Result phishing_classifier_result);

  // Helper to run the DoneCallback when feature extraction has failed.
  // This always signals a non-phishy verdict for the page, with
  // |kInvalidScore|.
  void RunFailureCallback(Result failure_event);

  // Clears the current state of the PhishingClassifier.
  void Clear();

  raw_ptr<content::RenderFrame, DanglingUntriaged> render_frame_;  // owns us

  // One extractor per feature family (URL, DOM, term, visual).
  std::unique_ptr<PhishingUrlFeatureExtractor> url_extractor_;
  std::unique_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
  std::unique_ptr<PhishingTermFeatureExtractor> term_extractor_;
  std::unique_ptr<PhishingVisualFeatureExtractor> visual_extractor_;

  // State for any in-progress extraction.
  std::unique_ptr<FeatureMap> features_;
  std::unique_ptr<std::set<uint32_t>> shingle_hashes_;
  scoped_refptr<const base::RefCountedString16> page_text_;
  std::unique_ptr<SkBitmap> bitmap_;
  std::unique_ptr<VisualFeatures> visual_features_;
  DoneCallback done_callback_;

  // Used to record the duration of visual feature scoring.
  base::TimeTicks visual_matching_start_;

  // Used in scheduling BeginFeatureExtraction tasks.
  // These pointers are invalidated if classification is cancelled.
  // Keep this as the last member: members are destroyed in reverse
  // declaration order, so the factory is destroyed before the state above.
  base::WeakPtrFactory<PhishingClassifier> weak_factory_{this};
};

}  // namespace safe_browsing

#endif  // COMPONENTS_SAFE_BROWSING_CONTENT_RENDERER_PHISHING_CLASSIFIER_PHISHING_CLASSIFIER_H_