1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
|
// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_
#define CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_
#include <stdint.h>
#include <cstddef>
#include <iosfwd>
#include <string>
#include <vector>
#include "base/containers/flat_map.h"
#include "base/containers/flat_set.h"
#include "base/memory/raw_ptr.h"
#include "base/sequence_checker.h"
#include "base/thread_annotations.h"
#include "chrome/browser/privacy_budget/encountered_surface_tracker.h"
#include "chrome/browser/privacy_budget/mesa_distribution.h"
#include "chrome/browser/privacy_budget/privacy_budget_prefs.h"
#include "chrome/browser/privacy_budget/representative_surface_set.h"
#include "chrome/browser/privacy_budget/surface_set_equivalence.h"
#include "chrome/browser/privacy_budget/surface_set_valuation.h"
#include "chrome/browser/privacy_budget/surface_set_with_valuation.h"
#include "chrome/common/privacy_budget/order_preserving_set.h"
#include "chrome/common/privacy_budget/types.h"
#include "components/prefs/pref_service.h"
#include "identifiability_study_group_settings.h"
#include "third_party/blink/public/common/privacy_budget/identifiability_study_settings.h"
#include "third_party/blink/public/common/privacy_budget/identifiable_surface.h"
// Forward declarations of types that IdentifiabilityStudyState names in its
// interface.

// Referenced through `PrefService*` and `raw_ptr<PrefService>`.
class PrefService;
// Held by value as the `equivalence_` member, which needs the complete type.
// NOTE(review): the full definition presumably comes from
// surface_set_equivalence.h -- confirm.
class SurfaceSetEquivalence;
namespace blink {
// Passed by value to the surface-related methods and used as a map key type.
class IdentifiableSurface;
} // namespace blink
namespace content {
// Only referenced as `content::RenderProcessHost*` by InitializeRenderer().
class RenderProcessHost;
} // namespace content
namespace test_utils {
// Declared a friend of IdentifiabilityStudyState.
class InspectableIdentifiabilityStudyState;
} // namespace test_utils
// Current state of the identifiability study.
//
// Persists mutable state in a `PrefService`. In normal operation the
// `PrefService` is `LocalState`. The persisted state corresponds to the prefs
// named in `privacy_budget_prefs.h`.
//
//   * The list of "active" identifiable surfaces. I.e. the set of surfaces for
//     which this client is reporting sampled values.
//
//   * The list of "seen" identifiable surfaces. I.e. a list of surfaces that
//     this client has seen in the order in which they were observed.
//
// In addition, this object also tracks per-session state which is not
// persisted. This state includes:
//
//   * The list of "seen" surfaces that this client has reported to the server.
//
// Instances are neither copy-constructible nor copy-assignable.
class IdentifiabilityStudyState {
 public:
  using OffsetType = unsigned int;

  // Construct from a `PrefService`. `pref_service` is used to retrieve and
  // store study state and MUST outlive this.
  explicit IdentifiabilityStudyState(PrefService* pref_service);

  // Not copyable. Both operations take a `const&` so that the deleted
  // signatures are the canonical copy operations.
  IdentifiabilityStudyState(const IdentifiabilityStudyState&) = delete;
  IdentifiabilityStudyState& operator=(const IdentifiabilityStudyState&) =
      delete;

  ~IdentifiabilityStudyState();

  // Returns the active experiment generation as defined by the server-side
  // configuration.
  //
  // See kIdentifiabilityStudyGeneration.
  int generation() const;

  // Returns true if metrics collection is enabled for `surface`.
  //
  // Calling this method may alter the state of the study settings.
  bool ShouldRecordSurface(blink::IdentifiableSurface surface);

  // Should be called from unit-tests if multiple IdentifiabilityStudyState
  // instances are to be constructed.
  static void ResetGlobalStudySettingsForTesting();

  // Returns true if tracking metrics should be recorded for this
  // source_id/surface combination.
  bool ShouldReportEncounteredSurface(uint64_t source_id,
                                      blink::IdentifiableSurface surface);

  // Resets the state associated with a single report.
  //
  // It should be called each time the UKM service constructs a UKM client
  // report.
  void ResetPerReportState();

  // Clears all persisted and ephemeral state.
  //
  // It should be called when the UKM client ID changes or if the experiment
  // generation changes.
  void ResetPersistedState();

  // NOTE(review): the two initializers below are undocumented in the original
  // header. Judging by their names and by `selected_block_offset_` ("only used
  // when using assigned block sampling"), they presumably set up in-memory
  // state for the respective sampling mode -- confirm against the .cc file.
  void InitStateForAssignedBlockSampling();
  void InitStateForRandomSurfaceSampling();

  // NOTE(review): presumably returns an index into `weights`, drawn at random
  // in proportion to the weights -- confirm against the implementation.
  static int SelectMultinomialChoice(const std::vector<double>& weights);

  // Initializes from fields persisted in `pref_service_`.
  void InitFromPrefs();

  // Initializes a new renderer process.
  void InitializeRenderer(content::RenderProcessHost* render_process_host);

  // The largest offset that we can select. At worst `seen_surfaces_` must keep
  // track of this many (+1) surfaces. This value is approximately based on the
  // 90ᵗʰ percentile surface encounter rate as measured in June 2021.
  static constexpr OffsetType kMaxSelectedSurfaceOffset = 1999;

  // A knob that we can use to split data sets from different versions of the
  // implementation where the differences could have material effects on the
  // data distribution.
  //
  // Increment this whenever a non-backwards-compatible change is made in the
  // code. This value is independent of any server controlled study parameters.
  static constexpr int kGeneratorVersion = 1;

  // The ratio between the linear region of the Mesa distribution and the entire
  // range. See `MesaDistribution` for details. The distribution is the source
  // of random numbers for selecting identifiable surface for measurement.
  static constexpr double kMesaDistributionRatio = 0.9;

  // The parameter of the geometric distribution used for the tail of the Mesa
  // distribution.
  static constexpr double kMesaDistributionGeometricDistributionParam = 0.5;

 private:
  friend class test_utils::InspectableIdentifiabilityStudyState;

  using SurfaceSelectionRateMap =
      base::flat_map<blink::IdentifiableSurface, int>;
  using TypeSelectionRateMap =
      base::flat_map<blink::IdentifiableSurface::Type, int>;

  // Initializes global study settings based on FeatureLists and FieldTrial
  // lists.
  void InitializeGlobalStudySettings();

  // Determines if the meta experiment must be activated for this client.
  bool IsMetaExperimentActive();

  // Checks that the invariants hold. When DCHECK_IS_ON() this call is
  // expensive. Noop otherwise.
  void CheckInvariants() const;

  // Returns true if at least one more identifiable surface can be added to the
  // active surface set. This is an estimate since each surface costs different
  // amounts.
  bool CanAddOneMoreActiveSurface() const;

  // Attempts to add `surface` to `seen_surfaces_`.
  //
  // Returns false if `surface` was already included in `seen_surfaces_` or if
  // the `seen_surfaces_` set has reached its cap. Returns true otherwise.
  bool TryAddNewlySeenSurface(blink::IdentifiableSurface surface);

  // Writes individual fields to prefs.
  void WriteSeenSurfacesToPrefs() const;
  void WriteSelectedOffsetsToPrefs() const;

  // Contains all the logic for determining whether a newly observed surface
  // should be added to the active list or not. Should only be called if
  // `active_surfaces_` does not contain `surface`.
  bool DecideInclusionForNewSurface(blink::IdentifiableSurface surface);

  // On exit, ensures that `selected_offsets_` is non-empty and satisfies our
  // invariants.
  void MaybeUpdateSelectedOffsets();

  // NOTE(review): undocumented in the original header. Presumably brings
  // `selected_offsets_` up to `expected_offset_count` entries -- confirm
  // against the implementation.
  void UpdateSelectedOffsets(unsigned expected_offset_count);

  // Resets all in-memory state, but doesn't touch any persisted state. This
  // operation invalidates the relationship between persistent and in-memory
  // states. A call to this function should be immediately followed by either
  // reading from or clearing associated preferences.
  void ResetInMemoryState();

  // Determines the number of extra offsets that should be a part of the study
  // state in order to guide surface selection.
  //
  // It attempts to answer the following question:
  //
  //    Given that `active_surfaces_.Cost()` of `active_surface_budget_` has
  //    been consumed, what's the expected number of surfaces we'd need to
  //    select in order to saturate the budget?
  //
  unsigned GetCountOfOffsetsToSelect() const;

  // Verifies that the offset `o` is within the range that's considered valid.
  // The valid range may change between versions.
  static bool IsValidOffset(OffsetType o);

  // Removes disallowed surfaces from `container` and returns the offsets of
  // removed elements relative to the original order of elements.
  //
  // Modifies `container` in-place. Appends removed offsets to `dropped_offsets`
  // in ascending order. (Note that offsets already present in
  // `dropped_offsets` are not removed; new ones are only appended.)
  //
  // On input, `container` should have no duplicate items nor internal
  // meta-surfaces (i.e. surfaces of type kReservedInternal). Returns `false` if
  // these conditions are violated.
  //
  // E.g.:
  //   Before:
  //     container == {1,2,3,4}
  //     dropped_offsets == {}
  //
  //     Surface #3 (at offset 2) is blocked, and should therefore be removed.
  //
  //   After:
  //     container == {1,2,4}
  //     dropped_offsets == {2}
  static bool StripDisallowedSurfaces(IdentifiableSurfaceList& container,
                                      std::vector<OffsetType>& dropped_offsets);

  // Given a list of offsets and a list of offsets to remove, returns the list
  // of offsets adjusted to reflect now missing offsets.
  //
  // So, for example:
  //
  //   Before:
  //     offsets = {1, 2, 3}
  //     dropped_offsets = {1}
  //   After:
  //     offsets = {1, 2}  # Formerly offsets 2, and 3, but are now shifted one
  //                       # position.
  //
  //   ~ or ~
  //
  //   Before:
  //     offsets = {1,2,4,6}
  //     dropped_offsets = {2,3,5}
  //   After:
  //     offsets = {1,2,3}
  //
  static std::vector<OffsetType> AdjustForDroppedOffsets(
      std::vector<OffsetType> dropped_offsets,
      std::vector<OffsetType> offsets);

  // Wrapper around some of the experiment field trial params.
  IdentifiabilityStudyGroupSettings settings_;

  // `pref_service_` pointee must outlive `this`. Used for persistent state.
  raw_ptr<PrefService> pref_service_ = nullptr;

  // Offset of selected block. Only used when using assigned block sampling.
  //
  // Persisted in kPrivacyBudgetSelectedBlock within a single study generation.
  int selected_block_offset_ = -1;

  // `equivalence_` contains a model that determines the equivalence of
  // identifiable information for two or more surfaces. See
  // SurfaceSetEquivalence for more details.
  const SurfaceSetEquivalence equivalence_;

  // `valuation_` contains a model that determines an identifiability measure (a
  // cost or valuation, in budget parlance) for a set of identifiable surfaces.
  const SurfaceSetValuation valuation_;

  // Set of identifiable surfaces for which we will collect metrics. This set is
  // extended as we go unless it is already saturated.
  //
  // The set is considered saturated when the cost has reached
  // `active_surface_budget_`. It can also be saturated when the cost is near
  // `active_surface_budget_` but the remaining budget doesn't accommodate any
  // surface.
  //
  // Invariants:
  //
  //   * active_surfaces_ ∩ kSettings.blocked_surfaces() = Ø.
  //
  //   * s ∈ active_surfaces_ ⇒ s.GetType() ∉ kSettings.blocked_types().
  //
  //   * i ∈ selected_offsets_ ∧ i < seen_surfaces_.size()
  //       ⇒ seen_surfaces_[i] ∈ active_surfaces_.
  //
  //   * Cost(active_surfaces_) ≤ active_surface_budget_.
  //
  // Where kSettings is the PrivacyBudgetSettingsProvider singleton.
  SurfaceSetWithValuation active_surfaces_;

  // Surfaces that the client has encountered in the order in which they were
  // encountered. The set is for fast lookup, and the list is for preserving the
  // order.
  //
  // Invariants:
  //
  //   * seen_surfaces_.CheckModel() passes.
  //
  //   * seen_surfaces_ ∩ kSettings.blocked_surfaces() = Ø.
  //
  //   * s ∈ seen_surfaces_ ⇒ s.GetType() ∉ kSettings.blocked_types().
  //
  //   * seen_surfaces_.size() <= kMaxSelectedSurfaceOffset + 1.
  //
  // Where kSettings is the PrivacyBudgetSettingsProvider singleton.
  OrderPreservingSet<blink::IdentifiableSurface> seen_surfaces_;

  // Incremental serialization of `seen_surfaces_`. Profiling indicates that as
  // the size of the list grows, the serialization consumes a non-negligible
  // amount of time during tight loops.
  //
  // Invariants:
  //
  //   * seen_surface_sequence_string_ = SerializationOf(seen_surfaces_)
  std::string seen_surface_sequence_string_;

  // Indices into `seen_surfaces_` for surfaces that are *active*.
  //
  // Only offsets that are less than |seen_surfaces_.size()| are in use. Others
  // are kept around until we have sufficient surfaces.
  //
  // Invariants:
  //
  //   * i ∈ selected_offsets_ ⇒ i <= kMaxSelectedSurfaceOffset.
  base::flat_set<OffsetType> selected_offsets_;

  // Count of offsets `i` in `selected_offsets_` which satisfy
  // `seen_surfaces_[i] ∈ active_surfaces_`.
  //
  // Invariants:
  //
  //   * active_offset_count_ = O.size() where
  //       O = { i | i ∈ selected_offsets_ ∧
  //                 seen_surfaces_[i] ∈ active_surfaces_}
  int active_offset_count_ = 0;

  // Contains kIdentifiabilityStudyGeneration as defined by the server-side
  // experiment.
  //
  // All valid `generation_` values are positive and non-zero. A value of zero
  // implies that the study is not active.
  const int generation_;

  // Hard cap on the number of identifiable surfaces we will sample per client.
  // The limit is specified based on the surface valuation as known to
  // SurfaceSetValuation.
  //
  // This setting can be tweaked experimentally via
  // `kIdentifiabilityStudyActiveSurfaceBudget`.
  //
  // Invariants:
  //
  //   * active_surface_budget_ ≤ kMaxIdentifiabilityStudyActiveSurfaceBudget.
  //
  const int active_surface_budget_;

  // Source of random offsets for selection. The returned offsets are in the
  // range [0, UINT_MAX]. See mesa_distribution.h for details on the random
  // distribution.
  //
  // This distribution is initialized with the expected number of surfaces as
  // the distribution's pivot point. I.e.
  // `random_offset_generator_.pivot_point()` is
  // `features::kIdentifiabilityStudyExpectedSurfaceCount`.
  MesaDistribution<OffsetType> random_offset_generator_;

  // Keeps track of which identifiable surfaces have been exposed to which UKM
  // sources. Each document and worker context within a document tree has
  // a unique source. Hence this field keeps track of identifiable surfaces
  // exposed to all execution contexts in all the document trees.
  //
  // This field resets each time a new UKM report is generated. Hence the
  // tracked value is essentially "which surfaces have been exposed to which
  // sources since the last UKM report."
  //
  // Invariants:
  //
  //   * surface_encounters_ ∩ kSettings.blocked_surfaces() = Ø.
  //
  //   * ∀ s ∈ surface_encounters_[i], s.GetType() ∉ kSettings.blocked_types().
  //
  // Where kSettings is the PrivacyBudgetSettingsProvider singleton.
  EncounteredSurfaceTracker surface_encounters_;

  // Whether the meta experiment (i.e. reporting the meta surfaces, which
  // include information only about usage of APIs) is active or not. Note that
  // this setting is independent from the rest of the Identifiability Study, and
  // can be enabled / disabled separately.
  const bool meta_experiment_active_;

  SEQUENCE_CHECKER(sequence_checker_);
};
#endif // CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_
|