File: identifiability_study_state.h

package info (click to toggle)
chromium 138.0.7204.157-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,864 kB
  • sloc: cpp: 34,936,859; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,967; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (402 lines) | stat: -rw-r--r-- 15,553 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_
#define CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_

#include <stdint.h>

#include <cstddef>
#include <iosfwd>
#include <string>
#include <vector>

#include "base/containers/flat_map.h"
#include "base/containers/flat_set.h"
#include "base/memory/raw_ptr.h"
#include "base/sequence_checker.h"
#include "base/thread_annotations.h"
#include "chrome/browser/privacy_budget/encountered_surface_tracker.h"
#include "chrome/browser/privacy_budget/mesa_distribution.h"
#include "chrome/browser/privacy_budget/privacy_budget_prefs.h"
#include "chrome/browser/privacy_budget/representative_surface_set.h"
#include "chrome/browser/privacy_budget/surface_set_equivalence.h"
#include "chrome/browser/privacy_budget/surface_set_valuation.h"
#include "chrome/browser/privacy_budget/surface_set_with_valuation.h"
#include "chrome/common/privacy_budget/order_preserving_set.h"
#include "chrome/common/privacy_budget/types.h"
#include "components/prefs/pref_service.h"
#include "identifiability_study_group_settings.h"
#include "third_party/blink/public/common/privacy_budget/identifiability_study_settings.h"
#include "third_party/blink/public/common/privacy_budget/identifiable_surface.h"

class PrefService;
class SurfaceSetEquivalence;

namespace blink {
class IdentifiableSurface;
}  // namespace blink

namespace content {
class RenderProcessHost;
}  // namespace content

namespace test_utils {
class InspectableIdentifiabilityStudyState;
}  // namespace test_utils

// Current state of the identifiability study.
//
// Persists mutable state in a `PrefService`. In normal operation the
// `PrefService` is `LocalState`. The persisted state corresponds to the prefs
// named in `privacy_budget_prefs.h`:
//
// * The list of "active" identifiable surfaces, i.e. the set of surfaces for
//   which this client is reporting sampled values.
//
// * The list of "seen" identifiable surfaces, i.e. a list of surfaces that
//   this client has seen in the order in which they were observed.
//
// In addition, this object also tracks per-session state which is not
// persisted. This state includes:
//
// * The list of "seen" surfaces that this client has reported to the server.
//
// Instances are neither copyable nor assignable.
class IdentifiabilityStudyState {
 public:
  // Type used for offsets, i.e. indices into `seen_surfaces_`.
  using OffsetType = unsigned int;

  // Construct from a `PrefService`. `pref_service` is used to retrieve and
  // store study state and MUST outlive this.
  explicit IdentifiabilityStudyState(PrefService* pref_service);

  // Not copyable or assignable. The copy constructor takes the canonical
  // `const&` parameter so that it mirrors the copy-assignment operator.
  IdentifiabilityStudyState(const IdentifiabilityStudyState&) = delete;
  IdentifiabilityStudyState& operator=(const IdentifiabilityStudyState&) =
      delete;

  ~IdentifiabilityStudyState();

  // Returns the active experiment generation as defined by the server-side
  // configuration.
  //
  // See kIdentifiabilityStudyGeneration.
  int generation() const;

  // Returns true if metrics collection is enabled for `surface`.
  //
  // Calling this method may alter the state of the study settings.
  bool ShouldRecordSurface(blink::IdentifiableSurface surface);

  // Should be called from unit-tests if multiple IdentifiabilityStudyState
  // instances are to be constructed.
  static void ResetGlobalStudySettingsForTesting();

  // Returns true if tracking metrics should be recorded for this
  // source_id/surface combination.
  bool ShouldReportEncounteredSurface(uint64_t source_id,
                                      blink::IdentifiableSurface surface);

  // Resets the state associated with a single report.
  //
  // It should be called each time the UKM service constructs a UKM client
  // report.
  void ResetPerReportState();

  // Clears all persisted and ephemeral state.
  //
  // It should be called when the UKM client ID changes or if the experiment
  // generation changes.
  void ResetPersistedState();

  // NOTE(review): these two initializers are undocumented in this header.
  // Presumably each one sets up the state for the corresponding sampling mode
  // (`selected_block_offset_` is documented as only being used with assigned
  // block sampling) -- confirm against the implementation.
  void InitStateForAssignedBlockSampling();
  void InitStateForRandomSurfaceSampling();

  // NOTE(review): undocumented in this header. Presumably returns an index
  // into `weights` drawn at random with probability proportional to the
  // corresponding weight -- confirm against the implementation, including the
  // behavior for an empty `weights`.
  static int SelectMultinomialChoice(const std::vector<double>& weights);

  // Initializes from fields persisted in `pref_service_`.
  void InitFromPrefs();

  // Initializes a new renderer process.
  void InitializeRenderer(content::RenderProcessHost* render_process_host);

  // The largest offset that we can select. At worst `seen_surfaces_` must keep
  // track of this many (+1) surfaces. This value is approximately based on the
  // 90ᵗʰ percentile surface encounter rate as measured in June 2021.
  static constexpr OffsetType kMaxSelectedSurfaceOffset = 1999;

  // A knob that we can use to split data sets from different versions of the
  // implementation where the differences could have material effects on the
  // data distribution.
  //
  // Increment this whenever a non-backwards-compatible change is made in the
  // code. This value is independent of any server controlled study parameters.
  static constexpr int kGeneratorVersion = 1;

  // The ratio between the linear region of the Mesa distribution and the entire
  // range. See `MesaDistribution` for details. The distribution is the source
  // of random numbers for selecting identifiable surface for measurement.
  static constexpr double kMesaDistributionRatio = 0.9;

  // The parameter of the geometric distribution used for the tail of the Mesa
  // distribution.
  static constexpr double kMesaDistributionGeometricDistributionParam = 0.5;

 private:
  // Grants tests access to private state.
  friend class test_utils::InspectableIdentifiabilityStudyState;

  // Maps keyed by an individual surface and by a surface type respectively,
  // each with an integer value. NOTE(review): neither alias is referenced
  // elsewhere in this header; the meaning of the integer "rate" should be
  // confirmed against the code that uses them.
  using SurfaceSelectionRateMap =
      base::flat_map<blink::IdentifiableSurface, int>;
  using TypeSelectionRateMap =
      base::flat_map<blink::IdentifiableSurface::Type, int>;

  // Initializes global study settings based on FeatureLists and FieldTrial
  // lists.
  void InitializeGlobalStudySettings();

  // Determines if the meta experiment must be activated for this client.
  bool IsMetaExperimentActive();

  // Checks that the invariants hold. When DCHECK_IS_ON() this call is
  // expensive. Noop otherwise.
  void CheckInvariants() const;

  // Returns true if at least one more identifiable surface can be added to the
  // active surface set. This is an estimate since each surface costs different
  // amounts.
  bool CanAddOneMoreActiveSurface() const;

  // Attempts to add `surface` to `seen_surfaces_`.
  //
  // Returns false if `surface` was already included in `seen_surfaces_` or if
  // the `seen_surfaces_` set has reached its cap. Returns true otherwise.
  bool TryAddNewlySeenSurface(blink::IdentifiableSurface surface);

  // Writes individual fields to prefs.
  void WriteSeenSurfacesToPrefs() const;
  void WriteSelectedOffsetsToPrefs() const;

  // Contains all the logic for determining whether a newly observed surface
  // should be added to the active list or not. Should only be called if
  // `active_surfaces_` does not contain `surface`.
  bool DecideInclusionForNewSurface(blink::IdentifiableSurface surface);

  // On exit, ensures that `selected_offsets_` is non-empty and satisfies our
  // invariants.
  void MaybeUpdateSelectedOffsets();

  // NOTE(review): undocumented in this header. Presumably updates
  // `selected_offsets_` so that it reflects `expected_offset_count` selected
  // offsets -- confirm against the implementation.
  void UpdateSelectedOffsets(unsigned expected_offset_count);

  // Resets all in-memory state, but doesn't touch any persisted state. This
  // operation invalidates the relationship between persistent and in-memory
  // states. A call to this function should be immediately followed by either
  // reading from or clearing associated preferences.
  void ResetInMemoryState();

  // Determines the number of extra offsets that should be a part of the study
  // state in order to guide surface selection.
  //
  // It attempts to answer the following question:
  //
  //    Given that `active_surfaces_.Cost()` of `active_surface_budget_` has
  //    been consumed, what's the expected number of surfaces we'd need to
  //    select in order to saturate the budget?
  //
  unsigned GetCountOfOffsetsToSelect() const;

  // Verifies that the offset `o` is within the range that's considered valid.
  // The valid range may change between versions.
  static bool IsValidOffset(OffsetType o);

  // Removes disallowed surfaces from `container` and returns the offsets of
  // removed elements relative to the original order of elements.
  //
  // Modifies `container` in-place. Appends removed offsets to `dropped_offsets`
  // in ascending order. (Note that offsets already present in
  // `dropped_offsets` are not removed from it.)
  //
  // On input, `container` should have no duplicate items nor internal
  // meta-surfaces (i.e. surfaces of type kReservedInternal). Returns `false` if
  // these conditions are violated.
  //
  // E.g.:
  //   Before:
  //       container       == {1,2,3,4}
  //       dropped_offsets == {}
  //
  //       Surface #3 (at offset 2) is blocked, and should therefore be removed.
  //
  //   After:
  //       container       == {1,2,4}
  //       dropped_offsets == {2}
  static bool StripDisallowedSurfaces(IdentifiableSurfaceList& container,
                                      std::vector<OffsetType>& dropped_offsets);

  // Given a list of offsets and a list of offsets to remove, returns the list
  // of offsets adjusted to reflect now missing offsets.
  //
  // So, for example:
  //
  //   Before:
  //     offsets = {1, 2, 3}
  //     dropped_offsets = {1}
  //   After:
  //     offsets = {1, 2} # Formerly offsets 2, and 3, but are now shifted one
  //                      # position.
  //
  //   ~ or ~
  //
  //   Before:
  //     offsets = {1,2,4,6}
  //     dropped_offsets = {2,3,5}
  //   After:
  //     offsets = {1,2,3}
  //
  // Note the parameter order: `dropped_offsets` comes first, `offsets` second.
  static std::vector<OffsetType> AdjustForDroppedOffsets(
      std::vector<OffsetType> dropped_offsets,
      std::vector<OffsetType> offsets);

  // Wrapper around some of the experiment field trial params.
  IdentifiabilityStudyGroupSettings settings_;

  // `pref_service_` pointee must outlive `this`. Used for persistent state.
  raw_ptr<PrefService> pref_service_ = nullptr;

  // Offset of selected block. Only used when using assigned block sampling.
  //
  // Persisted in kPrivacyBudgetSelectedBlock within a single study generation.
  int selected_block_offset_ = -1;

  // `equivalence_` contains a model that determines the equivalence of
  // identifiable information for two or more surfaces. See
  // SurfaceSetEquivalence for more details.
  const SurfaceSetEquivalence equivalence_;

  // `valuation_` contains a model that determines an identifiability measure (a
  // cost or valuation, in budget parlance) for a set of identifiable surfaces.
  const SurfaceSetValuation valuation_;

  // Set of identifiable surfaces for which we will collect metrics. This set is
  // extended as we go unless it is already saturated.
  //
  // The set is considered saturated when the cost has reached
  // `active_surface_budget_`. It can also be saturated when the cost is near
  // `active_surface_budget_` but the remaining budget doesn't accommodate any
  // surface.
  //
  // Invariants:
  //
  //   * active_surfaces_ ∩ kSettings.blocked_surfaces() = Ø.
  //
  //   * s ∈ active_surfaces_ ⇒  s.GetType() ∉ kSettings.blocked_types().
  //
  //   * i ∈ selected_offsets_ ∧ i < seen_surfaces_.size()
  //                          ⇒  seen_surfaces_[i] ∈ active_surfaces_.
  //
  //   * Cost(active_surfaces_) ≤ active_surface_budget_.
  //
  // Where kSettings is the PrivacyBudgetSettingsProvider singleton.
  SurfaceSetWithValuation active_surfaces_;

  // Surfaces that the client has encountered in the order in which they were
  // encountered. The set is for fast lookup, and the list is for preserving the
  // order.
  //
  // Invariants:
  //
  //   * seen_surfaces_.CheckModel() passes.
  //
  //   * seen_surfaces_ ∩ kSettings.blocked_surfaces() = Ø.
  //
  //   * s ∈ seen_surfaces_ ⇒  s.GetType() ∉ kSettings.blocked_types().
  //
  //   * seen_surfaces_.size() <= kMaxSelectedSurfaceOffset + 1.
  //
  // Where kSettings is the PrivacyBudgetSettingsProvider singleton.
  OrderPreservingSet<blink::IdentifiableSurface> seen_surfaces_;

  // Incremental serialization of `seen_surfaces_`. Profiling indicates that as
  // the size of the list grows, the serialization consumes a non-negligible
  // amount of time during tight loops.
  //
  // Invariants:
  //
  //   * seen_surface_sequence_string_ = SerializationOf(seen_surfaces_)
  std::string seen_surface_sequence_string_;

  // Indices into `seen_surfaces_` for surfaces that are *active*.
  //
  // Only offsets that are less than `seen_surfaces_.size()` are in use. Others
  // are kept around until we have sufficient surfaces.
  //
  // Invariants:
  //
  //   * i ∈ selected_offsets_ ⇒  i <= kMaxSelectedSurfaceOffset.
  base::flat_set<OffsetType> selected_offsets_;

  // Count of offsets `i` in `selected_offsets_` which satisfy
  // `seen_surfaces_[i] ∈ active_surfaces_`.
  //
  // Invariants:
  //
  //   * active_offset_count_ = O.size() where
  //                            O = { i | i ∈ selected_offsets_ ∧
  //                                      seen_surfaces_[i] ∈ active_surfaces_}
  int active_offset_count_ = 0;

  // Contains kIdentifiabilityStudyGeneration as defined by the server-side
  // experiment.
  //
  // All valid `generation_` values are positive and non-zero. A value of zero
  // implies that the study is not active.
  const int generation_;

  // Hard cap on the number of identifiable surfaces we will sample per client.
  // The limit is specified based on the surface valuation as known to
  // SurfaceSetValuation.
  //
  // This setting can be tweaked experimentally via
  // `kIdentifiabilityStudyActiveSurfaceBudget`.
  //
  // Invariants:
  //
  //   * active_surface_budget_ ≤ kMaxIdentifiabilityStudyActiveSurfaceBudget.
  //
  const int active_surface_budget_;

  // Source of random offsets for selection. The returned offsets are in the
  // range [0, UINT_MAX]. See mesa_distribution.h for details on the random
  // distribution.
  //
  // This distribution is initialized with the expected number of surfaces as
  // the distribution's pivot point. I.e.
  // `random_offset_generator_.pivot_point()` is
  // `features::kIdentifiabilityStudyExpectedSurfaceCount`.
  MesaDistribution<OffsetType> random_offset_generator_;

  // Keeps track of which identifiable surfaces have been exposed to which UKM
  // sources. Each document and worker context within a document tree has
  // a unique source. Hence this field keeps track of identifiable surfaces
  // exposed to all execution contexts in all the document trees.
  //
  // This field resets each time a new UKM report is generated. Hence the
  // tracked value is essentially "which surfaces have been exposed to which
  // sources since the last UKM report."
  //
  // Invariants:
  //
  //   * surface_encounters_ ∩ kSettings.blocked_surfaces() = Ø.
  //
  //   * ∀ s ∈ surface_encounters_[i], s.GetType() ∉ kSettings.blocked_types().
  //
  // Where kSettings is the PrivacyBudgetSettingsProvider singleton.
  EncounteredSurfaceTracker surface_encounters_;

  // Whether the meta experiment (i.e. reporting the meta surfaces, which
  // include information only about usage of APIs) is active or not. Note that
  // this setting is independent from the rest of the Identifiability Study, and
  // can be enabled / disabled separately.
  const bool meta_experiment_active_;

  // Declares `sequence_checker_`. NOTE(review): presumably the implementation
  // uses it to assert that this object is only accessed from a single
  // sequence -- confirm against the .cc file.
  SEQUENCE_CHECKER(sequence_checker_);
};

#endif  // CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_