File: model_metadata.proto

package info (click to toggle)
chromium 139.0.7258.138-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 6,120,676 kB
  • sloc: cpp: 35,100,869; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (441 lines) | stat: -rw-r--r-- 17,374 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

syntax = "proto2";
option optimize_for = LITE_RUNTIME;

package segmentation_platform.proto;

import "components/segmentation_platform/public/proto/aggregation.proto";
import "components/segmentation_platform/public/proto/output_config.proto";
import "components/segmentation_platform/public/proto/types.proto";

// The version is used to verify whether the metadata provided by the server is
// supported by the current version of the code. Update the version number for
// any new feature added to the metadata proto, and add a log of the new changes
// in the current version in this file.
// Version 0 supports UMA features and aggregation in |features| field.
// Version 1 supports UMA features, custom inputs and sql features in
// |input_features| field.
// Version 2 supports training data output collection in |training_outputs|
// field.
// Version 3 supports trigger configurations for training data collection.
enum CurrentVersion {
  // The metadata version described by this file. Matches the newest entry in
  // the version log above.
  METADATA_VERSION = 3;
}

// Version information for segmentation models.
message VersionInfo {
  // Minimum model metadata version that is supported. Some newer
  // features/fields might not be available before this version. This field is
  // set on the server and read by the client to verify that the model is
  // valid.
  optional int32 metadata_min_version = 1;

  // Current model metadata version. This field is set by the client while
  // sending a model download request to the optimization guide server so that
  // the server knows the capabilities of the client.
  optional int32 metadata_cur_version = 2;
}

// Identifies the source of a model: whether it is a client side or a server
// side model.
enum ModelSource {
  // The model source is unknown.
  UNKNOWN_MODEL_SOURCE = 0;
  SERVER_MODEL_SOURCE = 1;   // Represents a server side model.
  DEFAULT_MODEL_SOURCE = 2;  // Represents a client side model.
}

// Describes a feature based on a UMA signal (a histogram or a user action) and
// how the recorded data is aggregated into a result of |tensor_length| values.
message UMAFeature {
  // The type of signal this feature refers to.
  // Note: SignalType::UKM_EVENT type is only used for SignalStorageConfig and
  // should not be used as a UMA feature's signal type.
  optional SignalType type = 1;

  // The human readable name of the histogram or user action.
  optional string name = 2;

  // The hash of the histogram name or user action. Must match the result of
  // base::HashMetricName.
  optional fixed64 name_hash = 3;

  // Number of buckets to include in the result. If set to 0, no data will be
  // collected. This can be used to start storing data before it should be used.
  // See documentation for Aggregation for details.
  optional uint64 bucket_count = 4;

  // The required length of the calculated result. See documentation for
  // Aggregation for details.
  optional uint64 tensor_length = 5;

  // The type of aggregation to use for this particular feature.
  optional Aggregation aggregation = 6;

  // Only set if type == HISTOGRAM_ENUM.
  // Matches are only valid when the enum ID matches any of these.
  // Works like an OR condition, e.g.: [url, search, ...] or just [url].
  repeated int32 enum_ids = 7;

  // Only set if aggregation == LATEST_OR_DEFAULT.
  // Values used for the model if the latest value requested is not available
  // in the database. The number of entries should be equal to the
  // |tensor_length|.
  repeated float default_values = 8;
}

// Describes an input that is filled by a custom function provided by the
// engine. The result is used either directly as an input tensor field to the
// model, or as a bind value for a SQL feature.
message CustomInput {
  // This parameter is required.
  // 1. If the param is directly used as the input tensor field to the model,
  // then this specifies the number of columns to fill in the tensor. In this
  // case the value should be float.
  // 2. If the param is used as a bind value for sql features, then this
  // specifies the number of sql bindings to fill in the sql query.
  optional int32 tensor_length = 1;

  // Used to distinguish between different types of custom inputs. Each policy
  // names a custom function provided by the engine that fills in the input
  // feature to the model.
  enum FillPolicy {
    // No fill policy specified.
    UNKNOWN_FILL_POLICY = 0;
    // Output is the time at which model prediction is needed. Can be used to
    // bind TIME type param to queries.
    // Output type: Time
    // Output length: 1
    FILL_PREDICTION_TIME = 1;
    // Output is two timestamps, the beginning and the end of last x days. Can
    // be used to bind TIME type param to query within a time interval.
    // Output type: Time
    // Output length: 2
    // Additional arg:
    //   `bucket_count`: Required. Number of buckets to include in the result.
    TIME_RANGE_BEFORE_PREDICTION = 2;

    // Used to determine whether a given page is a product details page and can
    // be price tracked.
    PRICE_TRACKING_HINTS = 3;

    // This type of custom input is used directly to fill the input tensor to
    // the model or to another query.
    // Output type: ProcessedValue
    // Output length: 1
    // Additional arg:
    //   `name`: Optional. The name of the field to be looked up in input
    //    context. If missing then the |name| field is used.
    FILL_FROM_INPUT_CONTEXT = 4;

    // Output is a tensor of length 10 consisting of float values denoting
    // various devices count by type with different form factor and os type.
    // See `SyncDeviceInfoObserver` for description of each value.
    // Output type: float
    // Output length: 10
    // Additional arg:
    //   `wait_for_device_info_in_seconds`: Number of seconds to wait for sync
    //   device info before timeout. If 0, then does not wait for sync and times
    //   out immediately if device info is not available.
    // InputContext arg:
    //   `active_days_limit`: Number of days after which the device is
    //   considered not active after last sync. Must be INT.
    FILL_SYNC_DEVICE_INFO = 5;

    // Output is a tensor of length 1 consisting of the device RAM in MB.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_RAM_MB = 6;

    // Output is a tensor of length 1 describing device OS level.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_OS_VERSION_NUMBER = 7;

    // Output is a tensor of length 1 giving pixels per inch for the current
    // device used by the user.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_PPI = 8;

    // Fills metrics about a given tab. A `tab_id` and a `session_tag` are
    // expected from input_context.
    // Output type: float
    // Output length: `TabSessionSource::kNumInputs`
    FILL_TAB_METRICS = 9;

    // Fills a random number in the range [0, 1).
    // Output type: float
    // Output length: 1
    FILL_RANDOM = 10;

    // Fills various metrics from the shopping service. Currently only supports
    // shopping bookmark count.
    // Output type: float
    // Output length: 1
    FILL_FROM_SHOPPING_SERVICE = 11;
  }

  // The fill type of the custom input.
  optional FillPolicy fill_policy = 2;

  // If the current chrome version does not support this fill type, use this
  // value. If this is not specified and the function is unavailable, the model
  // will not run due to missing input. The number of entries should be equal to
  // the |tensor_length|.
  repeated float default_value = 3;

  // If the fill type needs additional arguments, use this value. The argument
  // names understood by each fill type are listed under "Additional arg" in
  // the FillPolicy documentation.
  map<string, string> additional_args = 4;

  // The human readable name of the custom input.
  optional string name = 5;
}

// Configuration for storing signals in the SQL database.
message SignalFilterConfig {
  // Defines a single UKM event that should be stored.
  message UkmEvent {
    // Event hash of the UKM event.
    optional uint64 event_hash = 1;
    // List of metric hashes for the event, to store in the database. It is
    // required to provide the list of necessary metrics.
    // TODO: Support empty metric hash list, the database will store all the
    // metrics for the UKM event.
    repeated uint64 metric_hash_filter = 2;
  }
  // List of UKM events to store in the database.
  repeated UkmEvent ukm_events = 1;
}

// An input feature computed by running a SQL query, with optional bind values
// supplied by custom inputs.
message SqlFeature {
  // The query should select a single float column. The query can contain '?'
  // which can be used to bind values using |bind_values| list.
  // TODO(ssid): Consider expanding this to return multiple input tensor
  // features.
  optional string sql = 1;

  // List of signals needed in the storage for the query.
  optional SignalFilterConfig signal_filter = 2;

  // Used to bind value for the SQL query.
  message BindValue {
    // The bind field numbers, in range of 0 to n-1, for n question marks in the
    // SQL query.
    repeated int32 bind_field_index = 1;

    // Used to call Bind*() in sql::Statement.
    enum ParamType {
      UNKNOWN = 0;
      NULL = 1;
      BOOL = 2;
      INT = 3;
      INT64 = 4;
      DOUBLE = 5;
      STRING = 6;
      TIME = 7;
    }
    // The type the bound value is expected to have; see ParamType.
    optional ParamType param_type = 2;

    // Value of the input to bind the query. The custom function should return
    // the specified param type. The |tensor_length| should be 0 since these
    // inputs can only be used for SQL bind values.
    optional CustomInput value = 3;
  }
  // The values to bind to the '?' placeholders in |sql|.
  repeated BindValue bind_values = 3;

  // The human readable name of the ukm event and metric.
  optional string name = 4;
}

// Contains a feature used as an input to the ML model. Exactly one of the
// feature kinds in the oneof can be set at a time.
message InputFeature {
  oneof Feature {
    // A UMAFeature type of input feature.
    UMAFeature uma_feature = 1;

    // A custom input type of input feature.
    CustomInput custom_input = 2;

    // Input feature computed using a SQL query.
    SqlFeature sql_feature = 3;
  }
}

// Contains a list of training output generators. The ML model pipeline can
// iterate on different output candidates and select the final output generator.
message TrainingOutputs {
  // The candidate training output generators.
  repeated TrainingOutput outputs = 1;

  // Config for triggering the training outputs data collection for the current
  // model.
  message TriggerConfig {
    // Describes how the training outputs are collected.
    enum DecisionType {
      // By default considered as PERIODIC type.
      UNKNOWN = 0;
      // The on demand scheduler will trigger training data collection when the
      // client asks for a model execution with input context.
      ONDEMAND = 1;
      // The periodic scheduler will trigger training data collection every day.
      // Currently this period is fixed on the client to 1 day.
      PERIODIC = 2;
    }
    // Which scheduler triggers the training data collection.
    optional DecisionType decision_type = 1;

    // A single condition that ends the observation period.
    message ObservationTrigger {
      oneof trigger {
        // The delay, in seconds, to collect output tensors after input tensors
        // are collected. For example, output labels can be collected one week
        // after input tensors are collected. Set to 0 if output tensors need to
        // be collected in the same time period as input tensors.
        uint64 delay_sec = 1;
        // The user action or histogram to trigger a training data output
        // collection. Note: Only the name and type should be used with
        // bucket_duration = 0.
        // TODO(crbug.com/40239034): Figure out how to include the trigger as
        // one of the outputs automatically.
        UMAOutput uma_trigger = 2;
      }
    }
    // List of triggers, whichever is hit first is used to upload the training
    // data.
    repeated ObservationTrigger observation_trigger = 2;

    // Only for PERIODIC trigger. The prediction and observation times can be
    // exact or flexible. The exact prediction setting forces the prediction
    // time to be the time at which the segment selection or classification
    // result was changed. The input features will be collected till the
    // prediction time. Flexible prediction time setting allows the collector to
    // pick any point in the past as the prediction time, usually pick the
    // current time. The training data collection is triggered once a day with a
    // rolling window whenever Chrome is active. This setting uploads more
    // training data samples. By default the prediction time is FLEXIBLE. The
    // exact observation time setting will be used only in case of exact
    // prediction case and the observation starts exactly after prediction time.
    // Flexible observation can be used to get most recent user behavior by
    // setting observation time to the time of upload, which could be later than
    // end of the observation period. By default the observation time is EXACT.
    //
    // |use_exact_prediction_time|: selects the exact prediction time setting
    // instead of the default FLEXIBLE one.
    // |use_flexible_observation_time|: selects the flexible observation time
    // setting instead of the default EXACT one.
    optional bool use_exact_prediction_time = 3;
    optional bool use_flexible_observation_time = 4;
  }
  // The trigger configuration for collecting the training outputs.
  optional TriggerConfig trigger_config = 2;
}

// Generic type that defines how to generate the training data output.
// TODO(xingliu): Add more implementation details about how output training data
// is generated.
message TrainingOutput {
  // The source of the training data output. Only UMA-based output is defined
  // at present.
  oneof output {
    // Training data output is generated from UMA metrics.
    UMAOutput uma_output = 1;
  }
}

// Contains the information to generate the output for training data based on a
// particular UMA metric.
message UMAOutput {
  // The UMA metric used to generate the training data output.
  optional UMAFeature uma_feature = 1;

  // The duration to trigger a training data collection, in units of TimeUnit.
  // If not specified or 0, the training data will be generated immediately
  // after certain UMA is recorded.
  optional uint64 duration = 2;
}

// Metadata about a segmentation model for a given segment. Contains information
// on how to use the model such as collecting signals, interpreting results etc.
// Next tag: 18
message SegmentationModelMetadata {
  // Tag numbers of obsolete fields; never reuse them.
  reserved 15;

  // The version information needed to validate segmentation models.
  optional VersionInfo version_info = 9;

  // DEPRECATED: Use |input_features.uma_feature| instead. Only one of
  // |features| or |input_features| can be used in the config, not both. An
  // ordered list of required features.
  repeated UMAFeature features = 1;

  // An ordered list of required features and custom inputs. Only one of
  // |features| or |input_features| can be used in the config, not both.
  repeated InputFeature input_features = 10;

  // A list of training data output definitions.
  optional TrainingOutputs training_outputs = 11;

  // The time unit to be used for the rest of this proto.
  optional TimeUnit time_unit = 2;

  // The size of each interval the data should be aggregated over.
  optional uint64 bucket_duration = 3;

  // For how long should data be stored for this model.
  optional int64 signal_storage_length = 4;

  // For how long do we have to have captured data for this model. If the
  // relevant signals have been captured for a shorter amount of time than this,
  // this model can never be selected.
  optional int64 min_signal_collection_length = 5;

  // Describes how long after a valid result has been calculated for this model
  // it is OK to cache the result without recalculating with updated data.
  optional int64 result_time_to_live = 6;

  // The model always executes with a fixed timestamp. This is used when the
  // model is trained on data from a specific time period, and needs to evaluate
  // on the same date.
  optional int64 fixed_prediction_timestamp = 17;

  // An ordered list of thresholds used to convert a raw model result into a
  // discrete rank.
  message DiscreteMapping {
    // A mapping result from the raw continuous result to a discrete and
    // comparable value based on |rank|.
    message Entry {
      // The minimum result of the model to be allowed to choose this mapping.
      optional float min_result = 1;

      // A feature specific rank.
      optional int64 rank = 2;
    }

    // An ordered (based on their |min_result|) list of discrete mappings.
    // To map a model evaluation result to a DiscreteMapping, choose the highest
    // |min_result| that the evaluation result is at or above.
    // E.g. for these mappings: [(0.0, 0), (0.4, 1), (0.7, 2), (0.9, 3)], a
    // result of 0.7 would yield (0.7, 2), and 0.69 would yield (0.4, 1).
    repeated Entry entries = 1;
  }
  // The available discrete mappings, looked up by key. See
  // |default_discrete_mapping| for the key used when none is provided.
  map<string, DiscreteMapping> discrete_mappings = 7;

  // The default key to use during the mapping process if no key has been
  // provided.
  optional string default_discrete_mapping = 8;

  // The delay, in seconds, to collect output tensors after input tensors are
  // collected. For example, output labels can be collected one week after input
  // tensors are collected. If not specified, output tensors are collected in
  // the same time period as input tensors.
  // DEPRECATED: optional int64 output_collection_delay_sec = 12;
  // Both the tag and the name stay reserved so that neither can be reused with
  // a different meaning.
  reserved 12;
  reserved "output_collection_delay_sec";

  // Whether the client should upload the input and output tensors through UKM.
  optional bool upload_tensors = 13;

  // Describes the return type of the model score. Used for recording
  // histograms.
  enum OutputDescription {
    UNKNOWN_RETURN_TYPE = 0;
    // Model returns either 0 or 1.
    RETURN_TYPE_HEURISTIC = 1;
    // Model returns an int corresponding to a specific subsegment. Assume
    // between 0 and 100.
    RETURN_TYPE_MULTISEGMENT = 2;
    // Model returns a float between 0 and 1.
    RETURN_TYPE_PROBABILITY = 3;
    // Model returns any integer value.
    RETURN_TYPE_INTEGER = 4;
  }
  // TODO(ritikagup@): Deprecate the field.
  optional OutputDescription return_type = 14;

  // Contains information about the model results. Supplied by the client. It
  // describes what the results should look like and how to interpret them.
  optional OutputConfig output_config = 16;
}