-
Notifications
You must be signed in to change notification settings - Fork 466
GH-531: Add parquet flatbuf schema #544
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,224 @@ | ||||||
| namespace parquet.format3; | ||||||
|
|
||||||
| // Optimization notes | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we expand this comment to be explicit about the relationship between this FBS and parquet.thrift? |
||||||
| // 1. Statistics are stored in integral types if their size is fixed, otherwise prefix + suffix | ||||||
| // 2. ColumnMetaData.encoding_stats are removed, they are replaced with | ||||||
| // ColumnMetaData.is_fully_dict_encoded. | ||||||
| // 3. RowGroups are limited to 2GB in size, so we can use int for sizes. | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this and the item below are out of date (we are using `long`). |
||||||
| // 4. ColumnChunk/ColumnMetaData offsets are now relative to the start of the row group, so we can | ||||||
| // use int for offsets. | ||||||
| // 5. Remove ordinal. | ||||||
| // 6. Restrict RowGroups to 2^31-1 rows. | ||||||
| // 7. Remove offset/column indexes, they are small and just their offsets are of similar size. | ||||||
|
|
||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Physical types. | ||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|
|
||||||
| enum Type : byte { | ||||||
| BOOLEAN = 0, | ||||||
| INT32 = 1, | ||||||
| INT64 = 2, | ||||||
| INT96 = 3, | ||||||
| FLOAT = 4, | ||||||
| DOUBLE = 5, | ||||||
| BYTE_ARRAY = 6, | ||||||
| FIXED_LEN_BYTE_ARRAY = 7, | ||||||
| } | ||||||
|
|
||||||
| enum FieldRepetitionType : byte { | ||||||
| REQUIRED = 0, | ||||||
| OPTIONAL = 1, | ||||||
| REPEATED = 2, | ||||||
| } | ||||||
|
|
||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Encodings. | ||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|
|
||||||
| // Note: Match the thrift enum values so that we can cast between them. | ||||||
| enum Encoding : byte { | ||||||
| PLAIN = 0, | ||||||
| // GROUP_VAR_INT = 1, | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Call out commented-out entries as deprecated to make it clear why they are commented out? |
||||||
| PLAIN_DICTIONARY = 2, | ||||||
| RLE = 3, | ||||||
| // BIT_PACKED = 4, | ||||||
| DELTA_BINARY_PACKED = 5, | ||||||
| DELTA_LENGTH_BYTE_ARRAY = 6, | ||||||
| DELTA_BYTE_ARRAY = 7, | ||||||
| RLE_DICTIONARY = 8, | ||||||
| BYTE_STREAM_SPLIT = 9, | ||||||
| } | ||||||
|
|
||||||
| // Note: Match the thrift enum values so that we can cast between them. | ||||||
| enum CompressionCodec : byte { | ||||||
| UNCOMPRESSED = 0, | ||||||
| SNAPPY = 1, | ||||||
| GZIP = 2, | ||||||
| LZO = 3, | ||||||
| BROTLI = 4, | ||||||
| // LZ4 = 5, | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment on deprecation. |
||||||
| ZSTD = 6, | ||||||
| LZ4_RAW = 7, | ||||||
| } | ||||||
|
|
||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Logical types. | ||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|
|
||||||
| table Empty {} | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we want detailed docs (at the same level as parquet.thrift, if we intend this to be the new footer)? |
||||||
| table DecimalOpts { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Should we spell out type names to make it easier on the reader? |
||||||
| precision: int; | ||||||
| scale: int; | ||||||
| } | ||||||
| enum TimeUnit : byte { | ||||||
| MS = 0, | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we please make these names match parquet.thrift (Millisecond, Microsecond, Nanosecond)? |
||||||
| US = 1, | ||||||
| NS = 2, | ||||||
| } | ||||||
| table TimeOpts { | ||||||
| is_adjusted_to_utc: bool; | ||||||
| unit: TimeUnit; | ||||||
| } | ||||||
| table IntOpts { | ||||||
| bit_width: byte = 8; | ||||||
| is_signed: bool; | ||||||
| } | ||||||
| table GeometryType { | ||||||
| crs: string; | ||||||
| } | ||||||
| enum EdgeInterpolationAlgorithm : byte { | ||||||
| SPHERICAL = 0, | ||||||
| VINCENTY = 1, | ||||||
| THOMAS = 2, | ||||||
| ANDOYER = 3, | ||||||
| KARNEY = 4, | ||||||
| } | ||||||
| table GeographyType { | ||||||
| crs: string; | ||||||
| algorithm: EdgeInterpolationAlgorithm; | ||||||
| } | ||||||
| union LogicalType { | ||||||
| StringType:Empty, | ||||||
| MapType:Empty, | ||||||
| ListType:Empty, | ||||||
| EnumType:Empty, | ||||||
| DecimalType:DecimalOpts, | ||||||
| DateType:Empty, | ||||||
| TimeType:TimeOpts, | ||||||
| TimestampType:TimeOpts, | ||||||
| IntType:IntOpts, | ||||||
| NullType:Empty, | ||||||
| JsonType:Empty, | ||||||
| BsonType:Empty, | ||||||
| UUIDType:Empty, | ||||||
| Float16Type:Empty, | ||||||
| VariantType:Empty, | ||||||
| GeometryType:GeometryType, | ||||||
| GeographyType:GeographyType, | ||||||
| } | ||||||
|
|
||||||
| table Statistics { | ||||||
| null_count: int = null; | ||||||
| // Store min/max values as fixed-size entities depending on the physical type. If len is present | ||||||
| // then the min/max value is present. | ||||||
| // | ||||||
| // - BOOLEAN: none | ||||||
| // - INT32/FLOAT: lo4 (little-endian) | ||||||
| // - INT64/DOUBLE: lo8 (little-endian) | ||||||
| // - INT96: lo4+lo8 (little-endian) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For composite values, I think this is complicated enough that providing concrete examples would be helpful for implementors? |
||||||
| // - FIXED_LEN_BYTE_ARRAY: | ||||||
| // - BYTE_ARRAY: | ||||||
| // prefix: the longest common prefix of min/max | ||||||
| // lo8+hi8 zero padded 16 bytes (big-endian) of the suffix | ||||||
| // len: the length for the suffix of the value after removing the prefix. If > 16 then the | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| // value is inexact | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| min_lo4: uint; | ||||||
| min_lo8: ulong; | ||||||
| min_hi8: ulong; | ||||||
| min_len: byte = null; | ||||||
| max_lo4: uint; | ||||||
| max_lo8: ulong; | ||||||
| max_hi8: ulong; | ||||||
| max_len: byte = null; | ||||||
| prefix: string; | ||||||
| } | ||||||
|
|
||||||
| union ColumnOrder { | ||||||
| TypeDefinedOrder:Empty, | ||||||
| } | ||||||
|
|
||||||
| table SchemaElement { | ||||||
| name: string; | ||||||
| type: Type = null; | ||||||
| repetition_type: FieldRepetitionType; | ||||||
| logical_type: LogicalType; | ||||||
| type_length: int = null; | ||||||
| num_children: int = 0; | ||||||
| field_id: int = null; | ||||||
| column_order: ColumnOrder; // only present for leaf nodes | ||||||
| } | ||||||
|
|
||||||
| enum PageType : byte { | ||||||
| DATA_PAGE = 0, | ||||||
| INDEX_PAGE = 1, | ||||||
| DICTIONARY_PAGE = 2, | ||||||
| DATA_PAGE_V2 = 3, | ||||||
| } | ||||||
|
|
||||||
| table KV { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit:
Suggested change
Let's keep the name consistent if possible? |
||||||
| key: string; | ||||||
| val: string; | ||||||
| } | ||||||
|
|
||||||
| table ColumnMetadata { | ||||||
| codec: CompressionCodec; | ||||||
| num_values: long = null; // only present if not equal to rg.num_rows | ||||||
| total_uncompressed_size: long; | ||||||
| total_compressed_size: long; | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nice to keep the total unencoded size here, which I think is generally useful? But I suppose it can be added afterwards? |
||||||
| key_value_metadata: [KV]; | ||||||
| data_page_offset: long; | ||||||
| index_page_offset: long = null; | ||||||
| dictionary_page_offset: long = null; | ||||||
| statistics: Statistics; | ||||||
| is_fully_dict_encoded: bool; | ||||||
| bloom_filter_offset: long = null; | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this be made a struct/value to make the bloom filter info more self-contained? |
||||||
| bloom_filter_length: int = null; | ||||||
| } | ||||||
|
|
||||||
| table ColumnChunk { | ||||||
| file_path: string; | ||||||
| meta_data: ColumnMetadata; | ||||||
| // crypto_metadata: ColumnCryptoMetadata; // TODO | ||||||
| // encrypted_column_metadata: [byte]; // TODO | ||||||
| } | ||||||
|
|
||||||
| table SortingColumn { | ||||||
| column_idx: int; | ||||||
| descending: bool; | ||||||
| nulls_first: bool; | ||||||
| } | ||||||
|
|
||||||
| table RowGroup { | ||||||
| columns: [ColumnChunk]; | ||||||
| total_byte_size: long; | ||||||
| num_rows: long; | ||||||
| sorting_columns: [SortingColumn]; | ||||||
| file_offset: long; | ||||||
| total_compressed_size: long; | ||||||
| ordinal: short = null; | ||||||
| } | ||||||
|
|
||||||
| table FileMetaData { | ||||||
| version: int; | ||||||
| schema: [SchemaElement]; | ||||||
| num_rows: long; | ||||||
| row_groups: [RowGroup]; | ||||||
| kv: [KV]; | ||||||
| created_by: string; | ||||||
| // column_orders: [ColumnOrder]; // moved to SchemaElement | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove this row for now? |
||||||
| // encryption_algorithm: [EncryptionAlgorithm]; // TODO | ||||||
| // footer_signing_key_metadata: binary; // TODO | ||||||
| } | ||||||
|
|
||||||
| root_type FileMetaData; | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's just name this parquet.format for now?