File: metadata_proto.proto

syntax = "proto3";

package Criteo.Glup;
option java_package = "com.criteo.glup";

/**
 * GLUP messages SHALL:
 *   - Have an Origin field with id 1.
 *   - Have a Partition field with id 2.
 *   - Not use ids 3 and 4.
 *   - Have a ControlMessage field with id 2097151.
 * You can create a new one by copying BaseGlupMessage and adding your
 * own fields.
 */
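
/* Illustrative sketch (not part of this schema) of a new GLUP message that
 * follows the rules above. The message name and the payload field are
 * hypothetical; the field ids and types mirror BaseGlupMessage below:
 *
 *   message MyNewGlupMessage {
 *     option (contains_nullable_fields) = true;
 *     Origin glup_origin = 1;
 *     Partition partition = 2;
 *     // ids 3 and 4 are intentionally left unused
 *     string my_payload_field = 10;        // hypothetical business field
 *     map<int32, bool> set_fields = 50010; // nullable tracking special field
 *     repeated ControlMessage.Watermark control_message = 2097151 [ (json).name = "__metadata" ];
 *   }
 */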

import "google/protobuf/descriptor.proto";

// Holds the Kafka topic name(s)
message KafkaMessageOptions {
  repeated string topic = 1;
}


// Describes the currently supported partitioning schemes
enum PartitionScheme {
  UNSUPPORTED_PARTITION_SCHEME = 0;
  DAILY = 2; /* Deprecated, use PLATFORM_HOURLY instead */
  HOURLY = 3;
  PLATFORM_HOURLY = 4;
  EVENTTYPE_PLATFORM_HOURLY = 5;
}

enum MessageFormat {
  UNSUPPORTED_FORMAT = 0;
  JSON = 1;
  PROTOBUF = 2;
}

// Describes the currently supported data formats on HDFS, usually partitioned into Pail files
enum HDFSDataFormat {
  UNSUPPORTED_DATA_FORMAT = 0;
  JSON_PAIL = 2;
  PROTOBUF_SEQ = 3;
  PROTOBUF_PARQUET = 4;
}

enum DataSetKind {
  UNSUPPORTED_KIND = 0;
  TIMESERIES = 1;
}

enum MonitoringLevel {
  DEFAULT = 0;
  REMOVE_MONITORING = 1; // consensus and last_run monitoring will be skipped
  INFORMATIVE_MONITORING = 2; // No page for consensus or last_run monitoring
  CONSENSUS_IGNORED = 3;  // consensus monitoring will be skipped but last_run will be monitored (can page)
  CONSENSUS_IGNORED_AND_INFORMATIVE_MONITORING = 4; // consensus monitoring will be skipped and last_run will be monitored as informative (no page)
}

// Defines a data set with its underlying available formats
message DataSet {
  // A unique identifier for this dataset.
  // This string must be lowercase and only contain alphanumeric characters
  // or underscores.
  string id = 1;
  // A set of formats available for this dataset.
  repeated DataSetFormat format = 2;
  // Partition scheme used to store data inside the path.
  PartitionScheme partition_scheme = 3;
  // (Mandatory) This should be the Java class that represents the dataset's schema.
  string java_class = 4;
  // is this dataset a unit test dataset
  bool for_tests = 5;
  // owner of this dataset
  string owner = 6;
  // is this dataset private
  bool private = 7;
  // kind of dataset
  DataSetKind kind = 8;
  // retention period (in days)
  int32 retention_days = 9;
}

// Defines a set of dataset partitions
message DataSetChunk {
  repeated Partition partition = 1;
  DataSetFormat format = 2;
  string datasetId = 3;
}

// Defines a dataset format.
message DataSetFormat {
  reserved 6; // used to be the flag 'prefered_format'

  // Location of the data.
  string path = 1;
  // File format used to store records.
  HDFSDataFormat file_format = 2;
  // Partition scheme used to store data inside the path.
  PartitionScheme partition_scheme = 3;
  // The minimum defined partition for this format, inclusive.
  HDFSPartition start_partition = 4;
  // The maximum defined partition for this format, exclusive.
  HDFSPartition end_partition = 5;
  // Retention in days
  int32 retention_days = 7;
  // Priority; a bigger number means higher priority (0 is the lowest). Used to decide which format to use when several are available.
  int32 priority = 8;
  // Label, should be unique across all formats in a same dataset.
  string label = 9;
  // Monitoring level; if not set, DEFAULT is applied
  MonitoringLevel monitoring_level = 10;
}


message HDFSOptions {
  message ImportOptions {
    message View {
      // Defines a table view of a surrounding hdfs import into hive.
      // All other settings (owner, namespace, name) are already
      // in the surrounding block.
      message HiveOptions {
        /*
          The database will be glup
          The table name will be:
          - PROD:
            glup_${name} prod data
          - PREPROD:
            - sync: glup_${name} copy of [sampled] prod data
            - others: glup_${name}_preprod preprod data
        */
        // if not provided it will default to the glup partition_scheme
        PartitionScheme partitioning = 3;
      }

      // define table mapping in hive
      HiveOptions hive = 10;

      // Add new views of the data in HDFS here
    }
    message Generator {
      message DedupOptions {
        // Name of input dataset to deduplicate
        string input_dataset_id = 1;
        // Label of input format
        string input_format_label = 2;
        // Name of output dataset to deduplicate
        string output_dataset_id = 3;
        // Label of output format
        string output_format_label = 4;
        // Generate the conf for the cuttle job and not for the legacy lobster job
        // Check RIVERS-3614 for details
        // TODO: remove this property when the lobster job is fully decommissioned
        bool use_hippo_cuttle_job = 5;
      }

      message Kafka2HdfsOptions {
        reserved 2; // deprecated: enable per-datacenter import
        // settings specific to kafka2hdfs
        string topic = 1;
        // should the output be deduplicated
        bool deduplicate = 3;
        // Name of the dataset the import will produce
        string output_dataset_id = 4;
        // Label of output format
        string output_format_label = 5;
      }

      message KacohaConfig {
        // How many partitions per task
        int32 partitions_per_task = 1;
        // The size of the poll buffer used in KaCoHa
        int32 poll_buffer_size_bytes = 2;
      }

      message KacohaConfigPerDc {
        // The Data center to which this config applies
        DataCenter dc = 1;
        // The configuration for this data center
        KacohaConfig config = 2;
      }

      message KaCoHaOptions {
        // settings specific to KaCoHa
        string topic = 1;
        // Name of the dataset the import will produce
        string output_dataset_id = 2;
        // Deduplicate dataset in the consolidation
        bool deduplicate = 3;
        // Common configs for all data centers
        KacohaConfig config = 4;
        // Label of output format
        string output_format_label = 5;
        // Override specific configs for each data center
        repeated KacohaConfigPerDc config_per_dc = 6;
      }

      message DataloaderOptions {
        // settings specific to dataloader
        // If no platform is defined, it will default to all platforms
        repeated Platform platform = 1;
      }

      message SyncOptions {
        // defines the source of the sync
        Location from = 1;
        // The default is glup
        string source_namespace = 3;
        // string start_date = 4;  // REMOVED because duplicates generator.start_date
        // string stop_date = 5;  // REMOVED because duplicates generator.stop_date
        // default is all platforms.
        repeated Platform platforms = 6;
        // define if the sync is a backfilling (from now to the oldest partitions)
        bool is_backfilling = 8;
        // define where data will be sync'ed, based on dataset_id and label
        string to_label = 9;
        // define where data will be sync'ed, based on dataset_id and label
        string to_dataset_id = 10;
        // define backfilling in addition to sync at the same time
        bool with_backfilling = 11;
        // defines whether the generator should be scheduled on the 'to' or the 'from' location
        bool is_scheduled_on_source = 12;
      }

      message BackupOptions {
        // defines the source of the backup
        Location from = 1;
        // The default is glup
        string source_namespace = 2;
        // default is all platforms.
        repeated Platform platforms = 3;
      }

      // Defines how to transcode data from one format to an other
      message TranscodingOptions {
          // id of the source data set
          string input_dataset_id = 1;
          // id of the target data set, it can be the same as the source
          string output_dataset_id = 2;
          // format of the source data set
          HDFSDataFormat input_format = 3;
          // format of the target data set
          HDFSDataFormat output_format = 4;
          // input format label
          string input_dataset_label = 5;
          // output format label
          string output_dataset_label = 6;
          // transcoding job should work by platform
          bool is_by_platform = 7;
      }

      message SamplerOptions {
        // Name of input dataset to sample
        string input_dataset_id = 1;
        // Label of input format
        string input_format_label = 2;
        // Name of output dataset to sample
        string output_dataset_id = 3;
        // Label of output format
        string output_format_label = 4;
        // Sampling rate between 0 and 1
        float sampling_rate = 5;
      }

      // Used by the DataComparator
      // NB: the comparison is symmetric; left and right are just names and
      // do not refer to a join strategy
      message ComparatorOptions {
        // The first data set to compare
        string left_dataset_id = 1;
        // The specific format of the first data set
        string left_format_label = 2;
        // The second data set to compare
        string right_dataset_id = 3;
        // The specific format of the second data set
        string right_format_label = 4;
        // If set, filters the left and right datasets to a specific hostname.
        // Please note the hostname must be resolvable in order to get the IP (example: web-rtb254-par.par.ad.criteo.prod)
        string hostname = 5;
        // If set, fields to ignore in the comparison, separated by ','
        string ignored_fields = 6;
      }

      message ExternalOptions {
      }

      // oneof is not currently supported by protonet
      DataloaderOptions dataloader = 1;
      Kafka2HdfsOptions kafka2hdfs = 2;
      SyncOptions sync = 3;
      ExternalOptions external = 4;
      BackupOptions backup = 5;
      TranscodingOptions transcoding = 6;
      KaCoHaOptions kacoha = 7;
      DedupOptions deduplicate = 8;
      SamplerOptions sampler = 9;
      ComparatorOptions comparator = 10;
      // Add any new hdfs producer here

      // defines where it will run and push data
      repeated Location to = 250;
      string namespace = 251;

      // define the beginning date, inclusive
      string start_date = 253;
      // define the stop date, exclusive
      string stop_date = 254;

      // TODO: remove this property when RIVERS-3723 (remove CN platform) is done
      bool ignore_cn = 255;
    }


    // location on hdfs
    string owner = 1;
    string name = 2;
    PartitionScheme partitioning = 4;
    HDFSDataFormat format = 5;
    // if set to true, imported data won't be readable by other users
    // (permission o-rwx)
    bool private = 6;

    repeated Generator generator = 11;
    repeated View view = 12;
  }

  repeated ImportOptions import = 3;
}

/* Describes the producers to use when emitting a Glup
 * If the producer is not set or is set to false, then the producer is not used
 */
message ProducerTransportOptions {
  // send glup through syslog. Deprecated, do not use for new glup
  bool syslog = 1;
  // send glup through kafka
  bool kafka = 2;
}

/* Describes additional glup properties
 * It is used to help select the most appropriate pipeline configuration when sending a glup
 */
message PropertyOptions {
  // can be used to provide a different QoS for important glups
  bool valuable = 1;
  // can be used to select suitable instances capable of handling a high volume of glups
  bool high_volume = 2;
}

message GlupOptions {
  // kafka settings
  KafkaMessageOptions kafka = 1;
  // hdfs import settings
  HDFSOptions hdfs = 2;
  // field sampling rate
  uint32 sampling_pct = 3;
  // preprod message sampling rate
  uint32 preprod_sampling_pct = 4;
  // dataset definition
  repeated DataSet dataset = 5;
  // message sampling rate
  uint32 message_sampling_pct = 6;
  // producer(s) to use
  ProducerTransportOptions producers = 7;
  // additional glup properties
  PropertyOptions properties = 8;
}

// Internal extensions use 50000 to 99999
// https://github.com/google/protobuf/blob/master/src/google/protobuf/descriptor.proto#L267
extend google.protobuf.MessageOptions {
  GlupOptions glup = 50000;
  // Does the message contain nullable fields. This will be subject to special code generation that makes it
  // possible to know whether a specific field in the corresponding message or sub-message has been set.
  bool contains_nullable_fields = 50001;
}
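
/* Illustrative sketch (not part of this schema) of how a message can attach
 * these options; the topic, dataset id and class name below are hypothetical:
 *
 *   message MyGlupMessage {
 *     option (glup) = {
 *       kafka: { topic: "my_hypothetical_topic" }
 *       sampling_pct: 10
 *       dataset: {
 *         id: "my_hypothetical_dataset"
 *         java_class: "com.example.MyGlupMessage"
 *         partition_scheme: PLATFORM_HOURLY
 *         retention_days: 30
 *       }
 *     };
 *     option (contains_nullable_fields) = true;
 *     // ... fields ...
 *   }
 */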

/* It describes glup field options:
disabled_platform => list of platforms where the field will never be sent
*/
message GlupFieldOptions {
  bool sampled = 1;
  bool sampling_key = 2;
  repeated Platform disabled_platform = 3;
  bool should_clean_pii = 4;

  // The next fields are part of the Try&Wipe hackathon 2018 project:
  // When true, this field cannot be used by GLUP consumers anymore
  bool pending_deletion = 5;
  // Date at which the field has been added to the schema.
  string added_at = 6;
}

// Internal extensions use 50000 to 99999
// https://github.com/google/protobuf/blob/master/src/google/protobuf/descriptor.proto#L267
extend google.protobuf.FieldOptions {
  GlupFieldOptions glupfield = 50000;
}
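
/* Illustrative sketch (not part of this schema) of a field annotated with
 * these options; the field name and id are hypothetical:
 *
 *   string user_email = 12 [ (glupfield).should_clean_pii = true,
 *                            (glupfield).sampled = true ];
 */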


// This option is used when exposing a message to hive
message JsonMapping {
  string name = 1; // alias to use for json serialization
  bool skip = 2; // whether to skip this field during json serialization
}

// This option is used by the transcoder
// Note that both options could be merged, see RIVERS-1894
message JsonAlias {
  // alias for json serialization
  string name = 1;
  // When the field type is an enum in protobuf, the json value of this field
  // is the numeric id of the enum value (an integer) rather than its name.
  bool use_enum_field_id = 3;
}


// Internal extensions use 50000 to 99999
// https://github.com/google/protobuf/blob/master/src/google/protobuf/descriptor.proto#L267
extend google.protobuf.FieldOptions {
  JsonMapping json_mapping = 50001; // add JsonProxy extension
  JsonAlias json = 50002; // add JsonAlias extension
}
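
/* Illustrative sketches of fields annotated with these options. The first two
 * mirror usages found in the messages below; the last one is hypothetical:
 *
 *   fixed32 ip4 = 2 [ (json_mapping).name = "host_ip" ];       // rename in json
 *   EventType event_type = 3 [ (json_mapping).skip = true ];   // drop from json
 *   Platform platform = 4 [ (json).use_enum_field_id = true ]; // serialize the enum as its numeric id
 */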




/* It is safe to read any GLUP message as a BaseGlupMessage */
message BaseGlupMessage {
  option (contains_nullable_fields) = true;
  Origin glup_origin = 1;
  Partition partition = 2;
  // ID 3 is reserved, ask rivers-team@ first.
  // ID 4 is reserved, ask rivers-team@ first.
  map<int32, bool> set_fields = 50010; // nullable tracking special field
  repeated ControlMessage.Watermark control_message = 2097151 [ (json).name = "__metadata" ]; // standard glup field
}

/* Message used to forward watermarks in KaCoHa */
message ForwardedWatermarkMessage {
  int64  original_kafka_offset = 5; // Where it was taken
  int64  timestamp = 6; // When the message was created
  bool   consolidation_enabled = 7; // True to update consensus in the consolidation folder, false for consensus in the dataset folder
  string dataset_id = 10; // DatasetId that we should process
  string dataset_format_label = 11;  // Exact format that we're processing (when overriding the default format)
  repeated ControlMessage.Watermark control_message = 2097151 [ (json).name = "__metadata" ]; // original watermark
}



/* Describes the emitter of a message. */
enum DataCenter {
  // If a MARS datacenter gets added, code built before that will see
  // UNSUPPORTED_DATACENTER as the value.
  // Never emit the actual value of UNSUPPORTED_DATACENTER.
  UNSUPPORTED_DATACENTER = 0;
  AM5 = 2;
  HK5 = 3;
  NY8 = 4;
  PAR = 5;
  PA4 = 6;
  SH5 = 7;
  SV6 = 8;
  TY5 = 9;
  VA1 = 10;
  AM6 = 11;
  DA1 = 12;
}

enum Environment {
  UNSUPPORTED_ENVIRONMENT = 0;
  PREPROD = 1;
  PROD = 2;
}

message Location {
  Environment env = 1;
  DataCenter dc = 2;
  // label, associated with dataset_id (both required), is used to override the source path of data
  string label = 3;
  // dataset_id, associated with label (both required), is used to override the source path of data
  string dataset_id = 4;
}


/* Identifies the emitter of a message. */
message Origin {
  // DataCenter of the emitting server.
  DataCenter datacenter = 1;
  // Mandatory for TLAs and chef-deployed producers.
  // IPv4 address in network byte order
  fixed32 ip4 = 2 [ (json_mapping).name = "host_ip" ];
  // Mandatory for TLAs and chef-deployed producers.
  string hostname = 3;
  // Mandatory on container environment.
  string container_task = 4 [ (json_mapping).skip = true ];
  string container_app = 5 [ (json_mapping).skip = true ];
}


/* For use by infrastructure teams. */
message ControlMessage {
  enum Cluster {
    UNSUPPORTED_CLUSTER = 0;
    LOCAL = 2;
    CENTRAL = 3;
    STREAM = 4;
  }
  message WatermarkOrigin {
    string kafka_topic = 1;
    DataCenter datacenter = 2;
    Cluster cluster = 3;
  }
  message Watermark {
    option (contains_nullable_fields) = true;
    string type = 1; // backward compatibility watermark type
    string hostname = 2; // hostname of sender
    string kafka_topic = 3; // topic concerned
    int32 partition = 4; // partition id
    int32 partition_count = 5; // partition count
    bytes process_uuid = 6; // process uuid of sender
    string region = 7; // region of sender
    int32 timestamp_seconds = 8 [ (json).name = "timestamp" ]; // timestamp in second
    string cluster = 9; // cluster
    string environment = 10; // prod or preprod
    map<int32, bool> set_fields = 50010; // nullable map to track set fields
  }
  Watermark watermark = 1;
}


/* Used in HourlyPlatformPartition to describe where the message should go on HDFS. */
enum Platform {
  // If a MARS platform gets added, code built before that will see
  // UNSUPPORTED_PLATFORM as the value.
  // Never emit the actual value of UNSUPPORTED_PLATFORM.
  UNSUPPORTED_PLATFORM = 0;
  EU = 2;
  US = 3;
  AS = 4;
  CN = 5;
}

enum EventType {
  UNSUPPORTED_EVENT_TYPE = 0;
  ItemPageView = 2;
  Sales = 3;
  Basket = 4;
  Other = 5;
}

/* Describes where messages should go on HDFS.
 * This information is carried by all glup messages.
 */
message Partition {
  // Unix timestamp in seconds
  uint64 timestamp_seconds = 1 [ (json_mapping).name = "timestamp" ];
  Platform host_platform = 2;
  EventType event_type = 3 [ (json_mapping).skip = true ];
  /* Other partition keys... */
}

/* Internally used by the datadiscovery framework to handle the
 * datadiscovery path and the datadisco filesystem
 */
message HDFSPartition {
  // Unix timestamp in seconds
  uint64 timestamp_seconds = 1;
  Platform host_platform = 2;
  EventType event_type = 3;
  // Define the depth of a partition. This is used in order to distinguish partition
  // membership within a parent partition:
  // 2017-02-01/12/EU with depth 3 is a parent partition of 2017-02-01/12 with depth 2
  int32 depth = 4;
}

/* Three-state boolean to use when one needs to express the absence of
 * information for a field. */
enum YesNo {
    UNKNOWN = 0;
    NO = 1;
    YES = 2;
}

/* 128-bit hash to pass publisher information through the request URL */
message Hash128 {
  fixed64 most_sig_bits = 1; // most significant bits
  fixed64 least_sig_bits = 2; // least significant bits
  map<int32, bool> set_fields = 50010; // nullable map to track set fields
}