-
Notifications
You must be signed in to change notification settings - Fork 49
/
schema.proto
960 lines (832 loc) · 35.5 KB
/
schema.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
syntax = "proto2";
package tensorflow.metadata.v0;
import "google/protobuf/any.proto";
import "tensorflow_metadata/proto/v0/derived_feature.proto";
import "tensorflow_metadata/proto/v0/path.proto";
// GOOGLE-LEGACY option jspb_use_correct_proto2_semantics = false;
option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;
// Lifecycle stage of a feature.
// Only features whose stage is UNKNOWN_STAGE, BETA, PRODUCTION, or
// VALIDATION_DERIVED are actually validated.
// The PLANNED, ALPHA, DISABLED, and DEBUG_ONLY stages are treated as
// DEPRECATED.
enum LifecycleStage {
  // Value 8 is reserved and must not be assigned to a new stage.
  reserved 8;

  // The stage is unknown.
  UNKNOWN_STAGE = 0;
  // The feature is planned and may not be created yet.
  PLANNED = 1;
  // The feature is a prototype that is not used in experiments yet.
  ALPHA = 2;
  // The feature is used in user-facing experiments.
  BETA = 3;
  // The feature is used in a significant fraction of user traffic.
  PRODUCTION = 4;
  // The feature is no longer supported: do not use in new models.
  DEPRECATED = 5;
  // The feature only exists for debugging purposes.
  DEBUG_ONLY = 6;
  // Generic indication that the feature is disabled / excluded from models,
  // regardless of the specific reason.
  DISABLED = 7;
  // The feature was derived from ordinary features for the purposes of
  // statistics generation or validation. Consumers should expect that such a
  // feature may be present in DatasetFeatureStatistics, but not in input data.
  // Experimental and subject to change.
  VALIDATION_DERIVED = 9;
}
//
// Message to represent schema information.
// NOTE: field numbers 2 and 7 are used by the GOOGLE-LEGACY declarations that
// are kept in comments inside this message; do not assign them to new fields.
// NextID: 15
message Schema {
// Features described in this schema.
repeated Feature feature = 1;
// Sparse features described in this schema.
repeated SparseFeature sparse_feature = 6;
// Weighted features described in this schema.
repeated WeightedFeature weighted_feature = 12;
// Use StructDomain instead.
// Sequences described in this schema. A sequence may be described in terms of
// several features. Any features appearing within a sequence must *not* be
// declared as top-level features in <feature>.
// GOOGLE-LEGACY repeated Sequence sequence = 2;
// String domains referenced in the features.
repeated StringDomain string_domain = 4;
// TOP LEVEL FLOAT AND INT DOMAINS ARE UNSUPPORTED IN TFDV.
// TODO(b/63664182): Support this.
// Top-level float domains that can be reused by features.
repeated FloatDomain float_domain = 9;
// Top-level int domains that can be reused by features.
repeated IntDomain int_domain = 10;
// Default environments for each feature.
// An environment represents both a type of location (e.g. a server or phone)
// and a time (e.g. right before model X is run). In the standard scenario,
// 99% of the features should be in the default environments TRAINING,
// SERVING, and the LABEL (or labels) AND WEIGHT is only available at TRAINING
// (not at serving).
// Other possible variations:
// 1. There may be TRAINING_MOBILE, SERVING_MOBILE, TRAINING_SERVICE,
// and SERVING_SERVICE.
// 2. If one is ensembling three models, where the predictions of the first
// three models are available for the ensemble model, there may be
// TRAINING, SERVING_INITIAL, SERVING_ENSEMBLE.
// See Feature::not_in_environment and Feature::in_environment.
repeated string default_environment = 5;
/* BEGIN GOOGLE-LEGACY
// TODO(b/73109633): Change default to false, before removing this field.
optional bool generate_legacy_feature_spec = 7 [default = true];
END GOOGLE-LEGACY */
// Whether to represent variable length features as RaggedTensors. By default
// they are represented as ragged left-aligned SparseTensors. RaggedTensor
// representation is more memory efficient. Therefore, turning this on will
// likely yield data processing performance improvement.
// Experimental and may be subject to change.
optional bool represent_variable_length_as_ragged = 14;
// Additional information about the schema as a whole. Features may also
// be annotated individually.
optional Annotation annotation = 8;
// Dataset-level constraints. This is currently used for specifying
// information about changes in num_examples.
optional DatasetConstraints dataset_constraints = 11;
// TensorRepresentation groups. The keys are the names of the groups.
// Key "" (empty string) denotes the "default" group, which is what should
// be used when a group name is not provided.
// See the documentation at TensorRepresentationGroup for more info.
// Under development.
map<string, TensorRepresentationGroup> tensor_representation_group = 13;
}
// A list of ValueCount messages. Feature.value_counts uses it to carry one
// ValueCount per nest level of a feature with nested values.
message ValueCountList {
  // The ValueCount entries.
  repeated ValueCount value_count = 1;
}
// Describes schema-level information about a specific feature.
// NOTE: field number 3 is used by the GOOGLE-LEGACY `comment` declaration that
// is kept in a comment inside this message; do not assign it to a new field.
// NextID: 36
message Feature {
// The name of the feature.
optional string name = 1; // required
// This field is no longer supported. Instead, use:
// lifecycle_stage: DEPRECATED
// TODO(b/111450258): remove this.
optional bool deprecated = 2 [deprecated = true];
// Comment field for a human readable description of the field.
// TODO(b/123518108): remove this.
// GOOGLE-LEGACY optional string comment = 3 [deprecated = true];
oneof presence_constraints {
// Constraints on the presence of this feature in the examples.
FeaturePresence presence = 14;
// Only used in the context of a "group" context, e.g., inside a sequence.
FeaturePresenceWithinGroup group_presence = 17;
}
// The shape of the feature which governs the number of values that appear in
// each example.
oneof shape_type {
// The feature has a fixed shape corresponding to a multi-dimensional
// tensor.
FixedShape shape = 23;
// The feature doesn't have a well defined shape. All we know are limits on
// the minimum and maximum number of values.
ValueCount value_count = 5;
// Captures the same information as value_count but for features with
// nested values. A ValueCount is provided for each nest level.
ValueCountList value_counts = 32;
}
// Physical type of the feature's values.
// Note that you can have:
// type: BYTES
// int_domain: {
// min: 0
// max: 3
// }
// This would be a field that is syntactically BYTES (i.e. strings), but
// semantically an int, i.e. it would be "0", "1", "2", or "3".
optional FeatureType type = 6;
// Domain for the values of the feature.
oneof domain_info {
// Reference to a domain defined at the schema level.
// NOTE THAT TFDV ONLY SUPPORTS STRING DOMAINS AT THE TOP LEVEL.
// TODO(b/63664182): Support this.
string domain = 7;
// Inline definitions of domains.
IntDomain int_domain = 9;
FloatDomain float_domain = 10;
StringDomain string_domain = 11;
BoolDomain bool_domain = 13;
StructDomain struct_domain = 29;
// Supported semantic domains.
NaturalLanguageDomain natural_language_domain = 24;
ImageDomain image_domain = 25;
MIDDomain mid_domain = 26;
URLDomain url_domain = 27;
TimeDomain time_domain = 28;
TimeOfDayDomain time_of_day_domain = 30;
}
// Constraints on the distribution of the feature values.
// Only supported for StringDomains.
optional DistributionConstraints distribution_constraints = 15;
// Additional information about the feature for documentation purpose.
optional Annotation annotation = 16;
// Tests comparing the distribution to the associated serving data.
optional FeatureComparator skew_comparator = 18;
// Tests comparing the distribution between two consecutive spans (e.g. days).
optional FeatureComparator drift_comparator = 21;
// List of environments this feature is present in.
// Should be disjoint from not_in_environment.
// This feature is in environment "foo" if:
// ("foo" is in in_environment or default_environment) AND
// "foo" is not in not_in_environment.
// See Schema::default_environment.
repeated string in_environment = 20;
// List of environments this feature is not present in.
// Should be disjoint from in_environment.
// See Schema::default_environment and in_environment.
repeated string not_in_environment = 19;
// The lifecycle stage of a feature. It can also apply to its descendants.
// i.e., if a struct is DEPRECATED, its children are implicitly deprecated.
optional LifecycleStage lifecycle_stage = 22;
// Constraints on the number of unique values for a given feature.
// This is supported for string and categorical features only.
optional UniqueConstraints unique_constraints = 31;
// If set, indicates that this feature is derived, and stores metadata
// about its source. If this field is set, this feature should have a
// disabled stage (PLANNED, ALPHA, DEPRECATED, DISABLED, DEBUG_ONLY), or
// lifecycle_stage VALIDATION_DERIVED.
// Experimental and subject to change.
optional DerivedFeatureSource validation_derived_source = 34;
// Field number 33 is reserved and must not be assigned to a new field.
reserved 33;
// This field specifies if this feature could be treated as a sequence
// feature which has meaningful element order.
optional SequenceMetadata sequence_metadata = 35;
}
// Extra information attached to the schema as a whole or to a single feature.
message Annotation {
  // Tags that can be used to mark features. For example, the user_age feature
  // can carry the tag `user_feature`, and the user_country feature can carry
  // the tags `location_feature` and `user_feature`.
  repeated string tag = 1;

  // Free-text comments, usable e.g. as a description of the feature or as
  // developer notes.
  repeated string comment = 2;

  // Slot for attaching application-specific metadata.
  repeated .google.protobuf.Any extra_metadata = 3;
}
// Bounds the ratio of the current value to the previous value: the ratio must
// be neither below min_fraction_threshold nor above max_fraction_threshold.
// That is,
//   previous value * min_fraction_threshold
//       <= current value
//       <= previous value * max_fraction_threshold.
// To specify that the value cannot change, set both min_fraction_threshold and
// max_fraction_threshold to 1.0.
message NumericValueComparator {
  // Lower bound on the ratio of the current value to the previous value.
  optional double min_fraction_threshold = 1;

  // Upper bound on the ratio of the current value to the previous value.
  optional double max_fraction_threshold = 2;
}
// Constraints that apply to the dataset as a whole.
message DatasetConstraints {
  // Tests differences in the number of examples between the current data and
  // the previous span.
  optional NumericValueComparator num_examples_drift_comparator = 1;

  // Tests comparisons in the number of examples between the current data and
  // the previous version of that data.
  optional NumericValueComparator num_examples_version_comparator = 2;

  // Lower bound on the number of examples in the dataset.
  optional int64 min_examples_count = 3;

  // Upper bound on the number of examples in the dataset.
  optional int64 max_examples_count = 4;
}
// A fixed shape for the values of a feature. The immediate implication is that
// each feature has a fixed number of values. Moreover, these values can be
// parsed into a multi-dimensional tensor using the specified axis sizes.
//
// The FixedShape defines a lexicographical ordering of the data. For
// instance, given the FixedShape
//   {
//     dim {size:3} dim {size:2}
//   }
// the values map onto the tensor as follows:
//   tensor[0][0] = field[0]
//   tensor[0][1] = field[1]
//   tensor[1][0] = field[2]
//   tensor[1][1] = field[3]
//   tensor[2][0] = field[4]
//   tensor[2][1] = field[5]
//
// For fully defined shapes the FixedShape message is identical with the
// tensorflow.TensorShape proto message. Unknown dimensions and unknown rank
// cannot be represented by FixedShape.
message FixedShape {
  // Dimensions that make up the shape. The product of the sizes of all
  // dimensions is the total number of values in each example.
  repeated Dim dim = 2;

  // One axis of a multi-dimensional feature representation.
  message Dim {
    // Size of this dimension.
    optional int64 size = 1;

    // Optional name of the tensor dimension.
    optional string name = 2;
  }
}
// Bounds on the number of values in a single example (when the feature is
// present). Meant for the case where the minimum value count can differ from
// the maximum value count; otherwise prefer FixedShape.
message ValueCount {
  // Minimum number of values.
  optional int64 min = 1;

  // Maximum number of values.
  optional int64 max = 2;
}
/* BEGIN GOOGLE-LEGACY
// Constraint on the number of elements in a sequence.
message LengthConstraint {
optional int64 min = 1;
optional int64 max = 2;
}
// A sequence is a logical feature that comprises several "raw" features that
// encode values at different "steps" within the sequence.
// TODO(b/110490010): Delete this. This is a special case of StructDomain.
message Sequence {
// An optional name for this sequence. Used mostly for debugging and
// presentation.
optional string name = 1;
// Features that comprise the sequence. These features are "zipped" together
// to form the values for the sequence at different steps.
// - Use group_presence within each feature to encode presence constraints
// within the sequence.
// - If all features have the same value-count constraints then
// declare this once using the shape_constraint below.
repeated Feature feature = 2;
// Constraints on the presence of the sequence across all examples in the
// dataset. The sequence is assumed to be present if at least one of its
// features is present.
optional FeaturePresence presence = 3;
// Shape constraints that apply on all the features that comprise the
// sequence. If this is set then the value_count in 'feature' is
// ignored.
// TODO(martinz): delete: there is no reason to believe the shape of the
// fields in a sequence will be the same. Use the fields in Feature instead.
oneof shape_constraint {
ValueCount value_count = 4;
FixedShape fixed_shape = 5;
}
// Constraint on the number of elements in a sequence.
optional LengthConstraint length_constraint = 6;
}
END GOOGLE-LEGACY */
// A weighted feature, encoded as a combination of raw base features. The
// `weight_feature` should be a float feature whose shape is identical to that
// of the `feature`. Useful for representing weights associated with
// categorical tokens (e.g. a TFIDF weight associated with each token).
// TODO(b/142122960): Handle WeightedCategorical end to end in TFX (validation,
// TFX Unit Testing, etc)
message WeightedFeature {
  // Name of the weighted feature. It should not clash with other features in
  // the same schema.
  optional string name = 1;  // required

  // Path of the base feature that is weighted. Required.
  optional Path feature = 2;

  // Path of the weight feature associated with the base feature. Must have the
  // same shape as feature. Required.
  optional Path weight_feature = 3;

  // Determines where a feature is expected to be used, and therefore how
  // important issues with it are.
  optional LifecycleStage lifecycle_stage = 4;
}
// A sparse feature represents a sparse tensor that is encoded with a
// combination of raw features, namely index features and a value feature. Each
// index feature defines a list of indices in a different dimension.
// NOTE: field number 3 is used by the GOOGLE-LEGACY `comment` declaration that
// is kept in a comment inside this message; do not assign it to a new field.
message SparseFeature {
// Field number 11 is reserved and must not be assigned to a new field.
reserved 11;
// Name for the sparse feature. This should not clash with other features in
// the same schema.
optional string name = 1; // required
// This field is no longer supported. Instead, use:
// lifecycle_stage: DEPRECATED
// TODO(b/111450258): remove this.
optional bool deprecated = 2 [deprecated = true];
// The lifecycle_stage determines where a feature is expected to be used,
// and therefore how important issues with it are.
optional LifecycleStage lifecycle_stage = 7;
// Comment field for a human readable description of the field.
// TODO(martinz): delete, convert to annotation.
// GOOGLE-LEGACY optional string comment = 3 [deprecated = true];
// Constraints on the presence of this feature in examples.
// Deprecated, this is inferred by the referred features.
optional FeaturePresence presence = 4 [deprecated = true];
// Shape of the sparse tensor that this SparseFeature represents.
// Currently not supported.
// TODO(b/109669962): Consider deriving this from the referred features.
optional FixedShape dense_shape = 5;
// Features that represent indexes. Should be integers >= 0.
repeated IndexFeature index_feature = 6; // at least one
// Reference to a feature that provides indices.
message IndexFeature {
// Name of the index-feature. This should be a reference to an existing
// feature in the schema.
optional string name = 1;
}
// If true then the index values are already sorted lexicographically.
optional bool is_sorted = 8;
// The value feature of the sparse tensor.
optional ValueFeature value_feature = 9; // required
// Reference to the feature that provides values.
message ValueFeature {
// Name of the value-feature. This should be a reference to an existing
// feature in the schema.
optional string name = 1;
}
// Type of value feature.
// Deprecated, this is inferred by the referred features.
optional FeatureType type = 10 [deprecated = true];
}
// Constraints on how the values of a feature are distributed.
// TODO(martinz): replace min_domain_mass with max_off_domain (but slowly).
message DistributionConstraints {
  // Minimum fraction (in [0,1]) of values, across all examples, that should
  // come from the domain of the feature. For example:
  //   1.0 => All values must come from the domain.
  //   .9  => At least 90% of the values must come from the domain.
  optional double min_domain_mass = 1 [default = 1.0];
}
// Constraints on vocabulary coverage.
message FeatureCoverageConstraints {
  // Fraction of feature values that map to a vocab entry (i.e. are not oov).
  optional float min_coverage = 1;

  // Average length of tokens. Used for cases such as wordpiece that fall back
  // to character-level tokenization.
  optional float min_avg_token_length = 2;

  // String tokens left out when calculating min_coverage and
  // min_avg_token_length. Useful for tokens such as [PAD].
  repeated string excluded_string_tokens = 3;

  // Integer tokens left out when calculating min_coverage and
  // min_avg_token_length.
  repeated int64 excluded_int_tokens = 4 [packed = true];

  // String tokens treated as oov tokens (e.g. [UNK]). These tokens are also
  // left out when calculating avg token length.
  repeated string oov_string_tokens = 5;
}
// Constraints on a specific value occurring in sequences.
message SequenceValueConstraints {
  // The value the constraints are expressed for: either an integer or a
  // string.
  oneof value {
    int64 int_value = 1;
    string string_value = 2;
  }

  // Minimum number of times the value can occur in a sequence.
  optional int64 min_per_sequence = 3;
  // Maximum number of times the value can occur in a sequence.
  optional int64 max_per_sequence = 4;

  // Minimum fraction of sequences that must contain the value.
  optional float min_fraction_of_sequences = 5;
  // Maximum fraction of sequences that must contain the value.
  optional float max_fraction_of_sequences = 6;
}
// Constraints on the length of sequences.
message SequenceLengthConstraints {
  // Integer token values that are excluded when calculating sequence length.
  repeated int64 excluded_int_value = 1;
  // String token values that are excluded when calculating sequence length.
  repeated string excluded_string_value = 2;

  // Minimum sequence length.
  optional int64 min_sequence_length = 3;
  // Maximum sequence length.
  optional int64 max_sequence_length = 4;
}
// Domain of integer values.
// Note that FeatureType could be either INT or BYTES.
message IntDomain {
  // Id of the domain. Required when the domain is defined at the schema
  // level, in which case the name must be unique within the schema.
  optional string name = 1;

  // Minimum value of the domain.
  optional int64 min = 3;
  // Maximum value of the domain.
  optional int64 max = 4;

  // If true, the domain encodes categorical values (i.e., ids) rather than
  // ordinal values.
  optional bool is_categorical = 5;
}
// Domain of float values.
// Note that FeatureType could be either INT or BYTES.
// NOTE(review): the line above matches the IntDomain note word for word;
// FLOAT or BYTES is presumably what is meant here -- confirm.
message FloatDomain {
  // Id of the domain. Required when the domain is defined at the schema
  // level, in which case the name must be unique within the schema.
  optional string name = 1;

  // Minimum value of the domain.
  optional float min = 3;
  // Maximum value of the domain.
  optional float max = 4;

  // If true, the feature should not contain NaNs.
  optional bool disallow_nan = 5;

  // If true, the feature should not contain Inf or -Inf.
  optional bool disallow_inf = 6;

  // If true, the feature is semantically an embedding. This can be useful for
  // distinguishing fixed dimensional numeric features that should be fed to a
  // model unmodified.
  optional bool is_embedding = 7;

  // If true, the domain encodes categorical values (i.e., ids) rather than
  // continuous values.
  optional bool is_categorical = 8;

  // The embedding dimension; only applicable if is_embedding is true. Useful
  // for use cases such as restoring shapes for flattened sequence of
  // embeddings.
  optional int64 embedding_dim = 9;

  // The semantic type of the embedding, e.g. sbv4_semantic or pulsar.
  optional string embedding_type = 10;
}
// Domain of a recursive struct.
// NOTE: When a feature with a StructDomain is deprecated, all of its child
// features (the features and sparse_features of the StructDomain) are
// considered to be deprecated as well. Similarly, child features can only be
// in environments of the parent feature.
message StructDomain {
  // Child features of the struct.
  repeated Feature feature = 1;

  // Child sparse features of the struct.
  repeated SparseFeature sparse_feature = 2;
}
// Domain of string values.
message StringDomain {
  // Id of the domain. Required when the domain is defined at the schema
  // level, in which case the name must be unique within the schema.
  optional string name = 1;

  // Values that appear in the domain.
  repeated string value = 2;

  // Currently unused.
  // StringDomain consists of Categorical. This enum lets the user specify
  // whether to treat the feature as categorical.
  enum Categorical {
    CATEGORICAL_UNSPECIFIED = 0;
    CATEGORICAL_YES = 1;
    CATEGORICAL_NO = 2;
  }
  optional Categorical is_categorical = 3;
}
// Domain of a boolean attribute whose TRUE/FALSE values are encoded as
// strings, or as 0=false, 1=true.
// Note that FeatureType could be either INT or BYTES.
message BoolDomain {
  // Id of the domain. Required when the domain is defined at the schema
  // level, in which case the name must be unique within the schema.
  optional string name = 1;

  // String value for TRUE.
  optional string true_value = 2;
  // String value for FALSE.
  optional string false_value = 3;
}
// BEGIN SEMANTIC-TYPES-PROTOS
// Semantic domains are specialized feature domains. For example a string
// Feature might represent a Time of a specific format.
// Semantic domains are defined as protocol buffers to allow further sub-types /
// specialization, e.g: NaturalLanguageDomain can provide information on the
// language of the text.
// Domain of natural language text.
message NaturalLanguageDomain {
  // Field number 4 is reserved and must not be assigned to a new field.
  reserved 4;

  // Name of the vocabulary associated with the NaturalLanguageDomain.
  // When stats are computed and validated using TFDV,
  // tfdv.StatsOptions.vocab_paths should map this name to a vocabulary file.
  optional string vocabulary = 1;

  // Vocabulary coverage constraints.
  optional FeatureCoverageConstraints coverage = 2;

  // Constraints on specific token values in sequences.
  repeated SequenceValueConstraints token_constraints = 3;

  // Constraints on sequence lengths.
  optional SequenceLengthConstraints sequence_length_constraints = 5;
}
// Domain of image data.
message ImageDomain {
  // When set, at least this fraction of values should be TensorFlow supported
  // images.
  optional float minimum_supported_image_fraction = 1;

  // When set, the undecoded byte size of an image should be less than this
  // value.
  optional int64 max_image_byte_size = 2;
}
// Domain of knowledge graph IDs, see:
// https://www.wikidata.org/wiki/Property:P646
message MIDDomain {
  // No fields are defined.
}
// Domain of URLs, see: https://en.wikipedia.org/wiki/URL
message URLDomain {
  // No fields are defined.
}
// Domain of time or date representations.
message TimeDomain {
  // Formats for times that are encoded as integers.
  enum IntegerTimeFormat {
    FORMAT_UNKNOWN = 0;
    // Number of days since 1970-01-01.
    UNIX_DAYS = 5;
    UNIX_SECONDS = 1;
    UNIX_MILLISECONDS = 2;
    UNIX_MICROSECONDS = 3;
    UNIX_NANOSECONDS = 4;
  }

  oneof format {
    // Expected format, made up of regular characters combined with special
    // format specifiers. The format specifiers are a subset of the strptime
    // standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeFormat integer_format = 2;
  }
}
// Domain of time-of-day values that carry no particular date.
message TimeOfDayDomain {
  // Formats for times of day that are encoded as integers.
  enum IntegerTimeOfDayFormat {
    FORMAT_UNKNOWN = 0;
    // Time values, containing hour/minute/second/nanos, encoded into 8-byte
    // bit fields following the ZetaSQL convention:
    //        6         5         4         3         2         1
    // MSB 3210987654321098765432109876543210987654321098765432109876543210 LSB
    //                      | H ||  M ||  S ||---------- nanos -----------|
    PACKED_64_NANOS = 1;
  }

  oneof format {
    // Expected format, made up of regular characters combined with special
    // format specifiers. The format specifiers are a subset of the strptime
    // standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeOfDayFormat integer_format = 2;
  }
}
// END SEMANTIC-TYPES-PROTOS
// Physical representation of a feature. It may differ from the logical
// representation, which is expressed as a Domain.
enum FeatureType {
  TYPE_UNKNOWN = 0;
  BYTES = 1;
  INT = 2;
  FLOAT = 3;
  STRUCT = 4;
}
// Constraints on how often the feature is present in the data.
message FeaturePresence {
  // Lower bound on the fraction of examples that have this feature.
  optional double min_fraction = 1;

  // Lower bound on the number of examples that have this feature.
  optional int64 min_count = 2;
}
// Constraints on the presence of a feature inside a "group" context (e.g.,
// .presence inside a group of features that define a sequence).
message FeaturePresenceWithinGroup {
  // Whether the feature is required within the group.
  optional bool required = 1;
}
// Checks that the L-infinity norm between the two discrete distributions is
// below a certain threshold. Because it is applied to a
// FeatureNameStatistics, only the top k are considered.
//   L_infty(p,q) = max_i |p_i-q_i|
message InfinityNorm {
  // The InfinityNorm lies in the interval [0.0, 1.0], so sensible bounds
  // should lie in the interval [0.0, 1.0).
  optional double threshold = 1;
}
// Selects which histogram is used for numeric drift and skew calculations.
message HistogramSelection {
  // Controls the source of the histogram used for numeric drift and skew
  // calculations. Currently the default is STANDARD. Calculations based on
  // QUANTILES are more robust to outliers.
  enum Type {
    DEFAULT = 0;
    QUANTILES = 1;
    STANDARD = 2;
  }

  // The selected histogram source.
  optional Type type = 1;
}
// Checks that the approximate Jensen-Shannon Divergence between the two
// distributions is below a certain threshold.
message JensenShannonDivergence {
  // The JensenShannonDivergence lies in the interval [0.0, 1.0], so sensible
  // bounds should lie in the interval [0.0, 1.0).
  optional double threshold = 1;

  // Selection of the histogram to use.
  optional HistogramSelection source = 2;
}
// Checks that the absolute count difference is small relative to the total
// count of both datasets. This metric suits comparisons of datasets that are
// expected to have similar absolute counts, and not necessarily just similar
// distributions.
// Computed as
//   max_i | x_i - y_i | / sum_i(x_i + y_i)
// for aligned datasets x and y. Results lie in the interval [0.0, 1.0], so
// sensible bounds should lie in the interval [0.0, 1.0).
message NormalizedAbsoluteDifference {
  // Threshold for the check described above.
  optional double threshold = 1;
}
// Groups the comparison checks that can be configured for a feature; see
// InfinityNorm, JensenShannonDivergence and NormalizedAbsoluteDifference.
message FeatureComparator {
  // L-infinity norm check.
  optional InfinityNorm infinity_norm = 1;

  // Approximate Jensen-Shannon Divergence check.
  optional JensenShannonDivergence jensen_shannon_divergence = 2;

  // Normalized absolute count difference check.
  optional NormalizedAbsoluteDifference normalized_abs_difference = 3;
}
// Checks that the number of unique values is at least min and at most max
// (both bounds inclusive).
message UniqueConstraints {
  // Lower bound on the number of unique values.
  optional int64 min = 1;

  // Upper bound on the number of unique values.
  optional int64 max = 2;
}
// Describes how columns of a dataset are meant to be converted into
// TensorFlow Tensors (or, more generally, tf.CompositeTensors).
//
// A single tf.CompositeTensor may draw on several columns: for example, an
// N-dimensional tf.SparseTensor may need N + 1 columns to supply the sparse
// indices and the values.
//
// The "column name" a TensorRepresentation refers to is a string, not a Path.
// It therefore identifies a top-level Feature in the schema (i.e. a Feature
// nested in a STRUCT Feature cannot be specified).
message TensorRepresentation {
  // A default value, expressed as exactly one of the kinds below.
  message DefaultValue {
    oneof kind {
      double float_value = 1;

      // The data column may be of a shorter integral type. It is the user's
      // responsibility to make sure the default value fits that type.
      int64 int_value = 2;

      bytes bytes_value = 3;

      // Only use uint_value when the default value cannot fit in an int64
      // (`int_value`).
      uint64 uint_value = 4;
    }
  }

  // A tf.Tensor.
  message DenseTensor {
    // Name of the dataset column that provides the values of this Tensor.
    optional string column_name = 1;

    // Shape of each row of the data (the batch dimension is not included).
    optional FixedShape shape = 2;

    // When this column is missing values in a row, that row is filled with
    // default_value.
    optional DefaultValue default_value = 3;
  }

  // A ragged tf.SparseTensor that models nested lists.
  message VarLenSparseTensor {
    // Name of the dataset column to convert to the VarLenSparseTensor.
    optional string column_name = 1;
  }

  // A tf.SparseTensor whose indices and values come from separate data
  // columns. This will replace Schema.sparse_feature eventually.
  //
  // The index columns must be of INT type, and all the columns must co-occur
  // and have the same valency at the same row.
  message SparseTensor {
    // Dense shape of the resulting SparseTensor (the batch dimension is not
    // included).
    optional FixedShape dense_shape = 1;

    // The columns that constitute the coordinates of the values: the j-th
    // entry of the column named by index_column_names[i] is the coordinate of
    // the j-th value along the i-th dimension.
    repeated string index_column_names = 2;

    // Name of the column that contains the values.
    optional string value_column_name = 3;

    // Whether the values are already sorted by their index position.
    optional bool already_sorted = 4;
  }

  // A tf.RaggedTensor that models nested lists.
  //
  // Currently the user cannot specify the shape of the leaf value (the
  // innermost value tensor of the RaggedTensor). The leaf value will always be
  // a 1-D tensor.
  message RaggedTensor {
    // Identifies the leaf feature that provides the values of the
    // RaggedTensor. The first step of the path refers to a top-level feature
    // in the data; the remaining steps refer to STRUCT features under that
    // top-level feature, recursively.
    //
    // If the feature has N outer ragged lists, they become the first N
    // dimensions of the resulting RaggedTensor and the contents become the
    // flat_values.
    optional Path feature_path = 1;  // required.

    // A further partition of the feature values at the leaf level.
    message Partition {
      oneof kind {
        // When the trailing element(s) of `partition` are uniform_row_lengths
        // [U0, U1, ...], the flat values of the resulting RaggedTensor (a
        // dense tensor) have shape [U0, U1, ...]. In any other position, a
        // uniform_row_length simply denotes a ragged dimension whose
        // row_lengths are [uniform_row_length]*nrows.
        int64 uniform_row_length = 1;

        // Identifies a leaf feature that shares the same parent as
        // feature_path and contains the partition row lengths.
        string row_length = 2;
      }
    }

    // The resulting RaggedTensor has shape:
    //   [B, D_0, D_1, ..., D_N, P_0, P_1, ..., P_M, U_0, U_1, ..., U_P]
    //
    // The dimensions fall into the following categories:
    //   * B: Batch size dimension.
    //   * D_n: Dimensions given by the nested structure along the value path
    //     down to the leaf node. n>=1.
    //   * P_m: Dimensions given by the partitions that do not define any
    //     fixed dimension size. m>=0.
    //   * U_p: Dimensions given by the last partitions of type
    //     uniform_row_length, which can define the fixed inner shape of the
    //     tensor. Iterating over the partitions from the end to the
    //     beginning, these dimensions are defined by all the contiguous
    //     uniform_row_length partitions present. p>=0.
    repeated Partition partition = 3;

    // Data type of the ragged tensor's row partitions. Defaults to INT64 when
    // not specified.
    optional RowPartitionDType row_partition_dtype = 2;
  }

  // A RaggedTensor consists of RowPartitions; this enum lets the user choose
  // the dtype of those RowPartitions. UNSPECIFIED is treated as INT64.
  enum RowPartitionDType {
    UNSPECIFIED = 0;
    INT64 = 1;
    INT32 = 2;
  }

  // The concrete kind of tensor this representation describes.
  oneof kind {
    DenseTensor dense_tensor = 1;
    VarLenSparseTensor varlen_sparse_tensor = 2;
    SparseTensor sparse_tensor = 3;
    RaggedTensor ragged_tensor = 4;
  }
}
// A named collection of TensorRepresentations. The names may serve as
// identifiers when the dataset is converted to a collection of Tensors or
// tf.CompositeTensors.
//
// For example, given the following group:
//   {
//     key: "dense_tensor"
//     tensor_representation {
//       dense_tensor {
//         column_name: "univalent_feature"
//         shape {
//           dim {
//             size: 1
//           }
//         }
//         default_value {
//           float_value: 0
//         }
//       }
//     }
//   }
//   {
//     key: "varlen_sparse_tensor"
//     tensor_representation {
//       varlen_sparse_tensor {
//         column_name: "multivalent_feature"
//       }
//     }
//   }
//
// the schema is expected to have the features "univalent_feature" and
// "multivalent_feature". When a batch of data is converted to Tensors using
// this TensorRepresentationGroup, the result may be the following dict:
//   {
//     "dense_tensor": tf.Tensor(...),
//     "varlen_sparse_tensor": tf.SparseTensor(...),
//   }
message TensorRepresentationGroup {
  // Maps each name to its TensorRepresentation.
  map<string, TensorRepresentation> tensor_representation = 1;
}
// Metadata describing whether, and how, a feature is treated as a sequence.
message SequenceMetadata {
  // Whether to treat the feature as a sequence, i.e. one whose element order
  // is meaningful.
  enum SequentialStatus {
    SEQUENTIAL_UNSPECIFIED = 0;
    SEQUENTIAL_YES = 1;
    SEQUENTIAL_NO = 2;
  }

  // Sequential status of the feature.
  optional SequentialStatus sequential_status = 3;

  // An arbitrary string naming a "group" of features that could be modeled as
  // a single joint sequence. For example, take a dataset with three sequential
  // features "purchase_time", "product_id" and "purchase_price". They belong
  // to the same sequence of purchases and could be modeled jointly. Setting
  // joint_group = "purchase" on all three sequences communicates that the
  // features can be considered part of a single conceptual sequence.
  optional string joint_group = 4;

  // Maximum sequence length that should be processed. Sequences may exceed
  // this limit but are expected to be truncated by modeling layers.
  optional int64 sequence_truncation_limit = 5;
}