This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-ballista.git
The following commit(s) were added to refs/heads/main by this push:
new e39a7e68 Update datafusion protobuf definitions (#1057)
e39a7e68 is described below
commit e39a7e68da093ff6f0f002e7c6553d6fd4763dbb
Author: Baris Palaska <[email protected]>
AuthorDate: Thu Sep 19 20:52:24 2024 +0200
Update datafusion protobuf definitions (#1057)
* update datafusion proto defs
* allow optionals in proto3
update docker environment for higher protoc version
---
ballista/core/build.rs | 3 +
ballista/core/proto/ballista.proto | 11 +-
ballista/core/proto/datafusion.proto | 789 +++++++++++---------------
ballista/core/proto/datafusion_common.proto | 541 ++++++++++++++++++
ballista/core/src/serde/generated/ballista.rs | 10 +-
dev/docker/ballista-builder.Dockerfile | 4 +-
6 files changed, 896 insertions(+), 462 deletions(-)
diff --git a/ballista/core/build.rs b/ballista/core/build.rs
index a13da777..6e501b88 100644
--- a/ballista/core/build.rs
+++ b/ballista/core/build.rs
@@ -37,10 +37,13 @@ fn main() -> Result<(), String> {
// We don't include the proto files in releases so that downstreams
// do not need to have PROTOC included
if Path::new("proto/datafusion.proto").exists() {
+ println!("cargo:rerun-if-changed=proto/datafusion_common.proto");
println!("cargo:rerun-if-changed=proto/datafusion.proto");
println!("cargo:rerun-if-changed=proto/ballista.proto");
tonic_build::configure()
+ .extern_path(".datafusion_common", "::datafusion_proto_common")
.extern_path(".datafusion", "::datafusion_proto::protobuf")
+ .protoc_arg("--experimental_allow_proto3_optional")
.compile(&["proto/ballista.proto"], &["proto"])
.map_err(|e| format!("protobuf compilation failed: {e}"))?;
let generated_source_path = out.join("ballista.protobuf.rs");
diff --git a/ballista/core/proto/ballista.proto b/ballista/core/proto/ballista.proto
index 78613a72..eab1d801 100644
--- a/ballista/core/proto/ballista.proto
+++ b/ballista/core/proto/ballista.proto
@@ -25,6 +25,7 @@ option java_package = "org.apache.arrow.ballista.protobuf";
option java_outer_classname = "BallistaProto";
import "datafusion.proto";
+import "datafusion_common.proto";
///////////////////////////////////////////////////////////////////////////////////////////////////
// Ballista Physical Plan
@@ -48,13 +49,13 @@ message ShuffleWriterExecNode {
message UnresolvedShuffleExecNode {
uint32 stage_id = 1;
- datafusion.Schema schema = 2;
+ datafusion_common.Schema schema = 2;
uint32 output_partition_count = 4;
}
message ShuffleReaderExecNode {
repeated ShuffleReaderPartition partition = 1;
- datafusion.Schema schema = 2;
+ datafusion_common.Schema schema = 2;
// The stage to read from
uint32 stage_id = 3;
}
@@ -236,8 +237,8 @@ message PartitionStats {
}
message ColumnStats {
- datafusion.ScalarValue min_value = 1;
- datafusion.ScalarValue max_value = 2;
+ datafusion_common.ScalarValue min_value = 1;
+ datafusion_common.ScalarValue max_value = 2;
uint32 null_count = 3;
uint32 distinct_count = 4;
}
@@ -634,7 +635,7 @@ message GetFileMetadataParams {
}
message GetFileMetadataResult {
- datafusion.Schema schema = 1;
+ datafusion_common.Schema schema = 1;
}
message FilePartitionMetadata {
diff --git a/ballista/core/proto/datafusion.proto b/ballista/core/proto/datafusion.proto
index 8a9a8c0f..8402b92f 100644
--- a/ballista/core/proto/datafusion.proto
+++ b/ballista/core/proto/datafusion.proto
@@ -24,24 +24,7 @@ option java_multiple_files = true;
option java_package = "org.apache.arrow.datafusion.protobuf";
option java_outer_classname = "DatafusionProto";
-message ColumnRelation {
- string relation = 1;
-}
-
-message Column {
- string name = 1;
- ColumnRelation relation = 2;
-}
-
-message DfField{
- Field field = 1;
- ColumnRelation qualifier = 2;
-}
-
-message DfSchema {
- repeated DfField columns = 1;
- map<string, string> metadata = 2;
-}
+import "datafusion_common.proto";
// logical plan
// LogicalPlan is a nested type
@@ -72,6 +55,10 @@ message LogicalPlanNode {
ViewTableScanNode view_scan = 24;
CustomTableScanNode custom_scan = 25;
PrepareNode prepare = 26;
+ DropViewNode drop_view = 27;
+ DistinctOnNode distinct_on = 28;
+ CopyToNode copy_to = 29;
+ UnnestNode unnest = 30;
}
}
@@ -84,49 +71,44 @@ message ProjectionColumns {
repeated string columns = 1;
}
-message CsvFormat {
- bool has_header = 1;
- string delimiter = 2;
+message LogicalExprNodeCollection {
+ repeated LogicalExprNode logical_expr_nodes = 1;
}
-message ParquetFormat {
- // Used to be bool enable_pruning = 1;
- reserved 1;
-}
-
-message AvroFormat {}
-
message ListingTableScanNode {
- string table_name = 1;
+ reserved 1; // was string table_name
+ TableReference table_name = 14;
repeated string paths = 2;
string file_extension = 3;
ProjectionColumns projection = 4;
- Schema schema = 5;
+ datafusion_common.Schema schema = 5;
repeated LogicalExprNode filters = 6;
repeated string table_partition_cols = 7;
bool collect_stat = 8;
uint32 target_partitions = 9;
oneof FileFormatType {
- CsvFormat csv = 10;
- ParquetFormat parquet = 11;
- AvroFormat avro = 12;
+ datafusion_common.CsvFormat csv = 10;
+ datafusion_common.ParquetFormat parquet = 11;
+ datafusion_common.AvroFormat avro = 12;
}
- repeated LogicalExprNode file_sort_order = 13;
+ repeated LogicalExprNodeCollection file_sort_order = 13;
}
message ViewTableScanNode {
- string table_name = 1;
+ reserved 1; // was string table_name
+ TableReference table_name = 6;
LogicalPlanNode input = 2;
- Schema schema = 3;
+ datafusion_common.Schema schema = 3;
ProjectionColumns projection = 4;
string definition = 5;
}
// Logical Plan to Scan a CustomTableProvider registered at runtime
message CustomTableScanNode {
- string table_name = 1;
+ reserved 1; // was string table_name
+ TableReference table_name = 6;
ProjectionColumns projection = 2;
- Schema schema = 3;
+ datafusion_common.Schema schema = 3;
repeated LogicalExprNode filters = 4;
bytes custom_table_data = 5;
}
@@ -170,40 +152,47 @@ message EmptyRelationNode {
message CreateExternalTableNode {
reserved 1; // was string name
- OwnedTableReference name = 12;
+ TableReference name = 9;
string location = 2;
string file_type = 3;
- bool has_header = 4;
- DfSchema schema = 5;
- repeated string table_partition_cols = 6;
- bool if_not_exists = 7;
- string delimiter = 8;
- string definition = 9;
- string file_compression_type = 10;
- map<string, string> options = 11;
-}
+ datafusion_common.DfSchema schema = 4;
+ repeated string table_partition_cols = 5;
+ bool if_not_exists = 6;
+ string definition = 7;
+ repeated LogicalExprNodeCollection order_exprs = 10;
+ bool unbounded = 11;
+ map<string, string> options = 8;
+ datafusion_common.Constraints constraints = 12;
+ map<string, LogicalExprNode> column_defaults = 13;
+ }
message PrepareNode {
string name = 1;
- repeated ArrowType data_types = 2;
+ repeated datafusion_common.ArrowType data_types = 2;
LogicalPlanNode input = 3;
}
message CreateCatalogSchemaNode {
string schema_name = 1;
bool if_not_exists = 2;
- DfSchema schema = 3;
+ datafusion_common.DfSchema schema = 3;
}
message CreateCatalogNode {
string catalog_name = 1;
bool if_not_exists = 2;
- DfSchema schema = 3;
+ datafusion_common.DfSchema schema = 3;
+}
+
+message DropViewNode {
+ TableReference name = 1;
+ bool if_exists = 2;
+ datafusion_common.DfSchema schema = 3;
}
message CreateViewNode {
reserved 1; // was string name
- OwnedTableReference name = 5;
+ TableReference name = 5;
LogicalPlanNode input = 2;
bool or_replace = 3;
string definition = 4;
@@ -237,27 +226,11 @@ message WindowNode {
repeated LogicalExprNode window_expr = 2;
}
-enum JoinType {
- INNER = 0;
- LEFT = 1;
- RIGHT = 2;
- FULL = 3;
- LEFTSEMI = 4;
- LEFTANTI = 5;
- RIGHTSEMI = 6;
- RIGHTANTI = 7;
-}
-
-enum JoinConstraint {
- ON = 0;
- USING = 1;
-}
-
message JoinNode {
LogicalPlanNode left = 1;
LogicalPlanNode right = 2;
- JoinType join_type = 3;
- JoinConstraint join_constraint = 4;
+ datafusion_common.JoinType join_type = 3;
+ datafusion_common.JoinConstraint join_constraint = 4;
repeated LogicalExprNode left_join_key = 5;
repeated LogicalExprNode right_join_key = 6;
bool null_equals_null = 7;
@@ -268,6 +241,40 @@ message DistinctNode {
LogicalPlanNode input = 1;
}
+message DistinctOnNode {
+ repeated LogicalExprNode on_expr = 1;
+ repeated LogicalExprNode select_expr = 2;
+ repeated LogicalExprNode sort_expr = 3;
+ LogicalPlanNode input = 4;
+}
+
+message CopyToNode {
+ LogicalPlanNode input = 1;
+ string output_url = 2;
+ oneof format_options {
+ datafusion_common.CsvOptions csv = 8;
+ datafusion_common.JsonOptions json = 9;
+ datafusion_common.TableParquetOptions parquet = 10;
+ datafusion_common.AvroOptions avro = 11;
+ datafusion_common.ArrowOptions arrow = 12;
+ }
+ repeated string partition_by = 7;
+}
+
+message UnnestNode {
+ LogicalPlanNode input = 1;
+ repeated datafusion_common.Column exec_columns = 2;
+ repeated uint64 list_type_columns = 3;
+ repeated uint64 struct_type_columns = 4;
+ repeated uint64 dependency_indices = 5;
+ datafusion_common.DfSchema schema = 6;
+ UnnestOptions options = 7;
+}
+
+message UnnestOptions {
+ bool preserve_nulls = 1;
+}
+
message UnionNode {
repeated LogicalPlanNode inputs = 1;
}
@@ -290,20 +297,21 @@ message SelectionExecNode {
}
message SubqueryAliasNode {
+ reserved 2; // Was string alias
LogicalPlanNode input = 1;
- string alias = 2;
+ TableReference alias = 3;
}
// logical expressions
message LogicalExprNode {
oneof ExprType {
// column references
- Column column = 1;
+ datafusion_common.Column column = 1;
// alias
AliasNode alias = 2;
- ScalarValue literal = 3;
+ datafusion_common.ScalarValue literal = 3;
// binary expressions
BinaryExprNode binary_expr = 4;
@@ -322,8 +330,8 @@ message LogicalExprNode {
SortExprNode sort = 12;
NegativeNode negative = 13;
InListNode in_list = 14;
- bool wildcard = 15;
- ScalarFunctionNode scalar_function = 16;
+ Wildcard wildcard = 15;
+ // was ScalarFunctionNode scalar_function = 16;
TryCastNode try_cast = 17;
// window expressions
@@ -335,7 +343,7 @@ message LogicalExprNode {
// Scalar UDF expressions
ScalarUDFExprNode scalar_udf_expr = 20;
- GetIndexedField get_indexed_field = 21;
+ // GetIndexedField get_indexed_field = 21;
GroupingSetNode grouping_set = 22;
@@ -355,12 +363,18 @@ message LogicalExprNode {
PlaceholderNode placeholder = 34;
+ Unnest unnest = 35;
+
}
}
+message Wildcard {
+ string qualifier = 1;
+}
+
message PlaceholderNode {
string id = 1;
- ArrowType data_type = 2;
+ datafusion_common.ArrowType data_type = 2;
}
message LogicalExprList {
@@ -379,11 +393,18 @@ message RollupNode {
repeated LogicalExprNode expr = 1;
}
+message NamedStructField {
+ datafusion_common.ScalarValue name = 1;
+}
+message ListIndex {
+ LogicalExprNode key = 1;
+}
-message GetIndexedField {
- LogicalExprNode expr = 1;
- ScalarValue key = 2;
+message ListRange {
+ LogicalExprNode start = 1;
+ LogicalExprNode stop = 2;
+ LogicalExprNode stride = 3;
}
message IsNull {
@@ -425,6 +446,7 @@ message Not {
message AliasNode {
LogicalExprNode expr = 1;
string alias = 2;
+ repeated TableReference relation = 3;
}
message BinaryExprNode {
@@ -439,93 +461,16 @@ message NegativeNode {
LogicalExprNode expr = 1;
}
+message Unnest {
+ repeated LogicalExprNode exprs = 1;
+}
+
message InListNode {
LogicalExprNode expr = 1;
repeated LogicalExprNode list = 2;
bool negated = 3;
}
-enum ScalarFunction {
- Abs = 0;
- Acos = 1;
- Asin = 2;
- Atan = 3;
- Ascii = 4;
- Ceil = 5;
- Cos = 6;
- Digest = 7;
- Exp = 8;
- Floor = 9;
- Ln = 10;
- Log = 11;
- Log10 = 12;
- Log2 = 13;
- Round = 14;
- Signum = 15;
- Sin = 16;
- Sqrt = 17;
- Tan = 18;
- Trunc = 19;
- Array = 20;
- RegexpMatch = 21;
- BitLength = 22;
- Btrim = 23;
- CharacterLength = 24;
- Chr = 25;
- Concat = 26;
- ConcatWithSeparator = 27;
- DatePart = 28;
- DateTrunc = 29;
- InitCap = 30;
- Left = 31;
- Lpad = 32;
- Lower = 33;
- Ltrim = 34;
- MD5 = 35;
- NullIf = 36;
- OctetLength = 37;
- Random = 38;
- RegexpReplace = 39;
- Repeat = 40;
- Replace = 41;
- Reverse = 42;
- Right = 43;
- Rpad = 44;
- Rtrim = 45;
- SHA224 = 46;
- SHA256 = 47;
- SHA384 = 48;
- SHA512 = 49;
- SplitPart = 50;
- StartsWith = 51;
- Strpos = 52;
- Substr = 53;
- ToHex = 54;
- ToTimestamp = 55;
- ToTimestampMillis = 56;
- ToTimestampMicros = 57;
- ToTimestampSeconds = 58;
- Now = 59;
- Translate = 60;
- Trim = 61;
- Upper = 62;
- Coalesce = 63;
- Power = 64;
- StructFun = 65;
- FromUnixtime = 66;
- Atan2 = 67;
- DateBin = 68;
- ArrowTypeof = 69;
- CurrentDate = 70;
- CurrentTime = 71;
- Uuid = 72;
-}
-
-message ScalarFunctionNode {
- ScalarFunction fun = 1;
- repeated LogicalExprNode args = 2;
-}
-
enum AggregateFunction {
MIN = 0;
MAX = 1;
@@ -534,10 +479,10 @@ enum AggregateFunction {
COUNT = 4;
APPROX_DISTINCT = 5;
ARRAY_AGG = 6;
- VARIANCE = 7;
+ // VARIANCE = 7;
VARIANCE_POP = 8;
- COVARIANCE = 9;
- COVARIANCE_POP = 10;
+ // COVARIANCE = 9;
+ // COVARIANCE_POP = 10;
STDDEV = 11;
STDDEV_POP = 12;
CORRELATION = 13;
@@ -545,7 +490,23 @@ enum AggregateFunction {
APPROX_MEDIAN = 15;
APPROX_PERCENTILE_CONT_WITH_WEIGHT = 16;
GROUPING = 17;
- MEDIAN = 18;
+ // MEDIAN = 18;
+ BIT_AND = 19;
+ BIT_OR = 20;
+ BIT_XOR = 21;
+ BOOL_AND = 22;
+ BOOL_OR = 23;
+ REGR_SLOPE = 26;
+ REGR_INTERCEPT = 27;
+ REGR_COUNT = 28;
+ REGR_R2 = 29;
+ REGR_AVGX = 30;
+ REGR_AVGY = 31;
+ REGR_SXX = 32;
+ REGR_SYY = 33;
+ REGR_SXY = 34;
+ STRING_AGG = 35;
+ NTH_VALUE_AGG = 36;
}
message AggregateExprNode {
@@ -553,17 +514,20 @@ message AggregateExprNode {
repeated LogicalExprNode expr = 2;
bool distinct = 3;
LogicalExprNode filter = 4;
+ repeated LogicalExprNode order_by = 5;
}
message AggregateUDFExprNode {
string fun_name = 1;
repeated LogicalExprNode args = 2;
LogicalExprNode filter = 3;
+ repeated LogicalExprNode order_by = 4;
}
message ScalarUDFExprNode {
string fun_name = 1;
repeated LogicalExprNode args = 2;
+ optional bytes fun_definition = 3;
}
enum BuiltInWindowFunction {
@@ -584,7 +548,8 @@ message WindowExprNode {
oneof window_function {
AggregateFunction aggr_function = 1;
BuiltInWindowFunction built_in_function = 2;
- // udaf = 3
+ string udaf = 3;
+ string udwf = 9;
}
LogicalExprNode expr = 4;
repeated LogicalExprNode partition_by = 5;
@@ -634,12 +599,12 @@ message WhenThen {
message CastNode {
LogicalExprNode expr = 1;
- ArrowType arrow_type = 2;
+ datafusion_common.ArrowType arrow_type = 2;
}
message TryCastNode {
LogicalExprNode expr = 1;
- ArrowType arrow_type = 2;
+ datafusion_common.ArrowType arrow_type = 2;
}
message SortExprNode {
@@ -672,242 +637,26 @@ enum WindowFrameBoundType {
message WindowFrameBound {
WindowFrameBoundType window_frame_bound_type = 1;
- ScalarValue bound_value = 2;
+ datafusion_common.ScalarValue bound_value = 2;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Arrow Data Types
///////////////////////////////////////////////////////////////////////////////////////////////////
-message Schema {
- repeated Field columns = 1;
-}
-
-message Field {
- // name of the field
- string name = 1;
- ArrowType arrow_type = 2;
- bool nullable = 3;
- // for complex data types like structs, unions
- repeated Field children = 4;
-}
-
message FixedSizeBinary{
int32 length = 1;
}
-message Timestamp{
- TimeUnit time_unit = 1;
- string timezone = 2;
-}
-
enum DateUnit{
Day = 0;
DateMillisecond = 1;
}
-enum TimeUnit{
- Second = 0;
- Millisecond = 1;
- Microsecond = 2;
- Nanosecond = 3;
-}
-
-enum IntervalUnit{
- YearMonth = 0;
- DayTime = 1;
- MonthDayNano = 2;
-}
-
-message Decimal{
- reserved 1, 2;
- uint32 precision = 3;
- int32 scale = 4;
-}
-
-message List{
- Field field_type = 1;
-}
-
-message FixedSizeList{
- Field field_type = 1;
- int32 list_size = 2;
-}
-
-message Dictionary{
- ArrowType key = 1;
- ArrowType value = 2;
-}
-
-message Struct{
- repeated Field sub_field_types = 1;
-}
-
-enum UnionMode{
- sparse = 0;
- dense = 1;
-}
-
-message Union{
- repeated Field union_types = 1;
- UnionMode union_mode = 2;
- repeated int32 type_ids = 3;
-}
-
-message ScalarListValue{
- // encode null explicitly to distinguish a list with a null value
- // from a list with no values)
- bool is_null = 3;
- Field field = 1;
- repeated ScalarValue values = 2;
+message AnalyzedLogicalPlanType {
+ string analyzer_name = 1;
}
-message ScalarTime32Value {
- oneof value {
- int32 time32_second_value = 1;
- int32 time32_millisecond_value = 2;
- };
-}
-
-message ScalarTime64Value {
- oneof value {
- int64 time64_microsecond_value = 1;
- int64 time64_nanosecond_value = 2;
- };
-}
-
-message ScalarTimestampValue {
- oneof value {
- int64 time_microsecond_value = 1;
- int64 time_nanosecond_value = 2;
- int64 time_second_value = 3;
- int64 time_millisecond_value = 4;
- };
- string timezone = 5;
-}
-
-message ScalarDictionaryValue {
- ArrowType index_type = 1;
- ScalarValue value = 2;
-}
-
-message IntervalMonthDayNanoValue {
- int32 months = 1;
- int32 days = 2;
- int64 nanos = 3;
-}
-
-message StructValue {
- // Note that a null struct value must have one or more fields, so we
- // encode a null StructValue as one witth an empty field_values
- // list.
- repeated ScalarValue field_values = 2;
- repeated Field fields = 3;
-}
-
-message ScalarFixedSizeBinary{
- bytes values = 1;
- int32 length = 2;
-}
-
-message ScalarValue{
- // was PrimitiveScalarType null_value = 19;
- reserved 19;
-
- oneof value {
- // was PrimitiveScalarType null_value = 19;
- // Null value of any type
- ArrowType null_value = 33;
-
- bool bool_value = 1;
- string utf8_value = 2;
- string large_utf8_value = 3;
- int32 int8_value = 4;
- int32 int16_value = 5;
- int32 int32_value = 6;
- int64 int64_value = 7;
- uint32 uint8_value = 8;
- uint32 uint16_value = 9;
- uint32 uint32_value = 10;
- uint64 uint64_value = 11;
- float float32_value = 12;
- double float64_value = 13;
- // Literal Date32 value always has a unit of day
- int32 date_32_value = 14;
- ScalarTime32Value time32_value = 15;
- ScalarListValue list_value = 17;
- //WAS: ScalarType null_list_value = 18;
-
- Decimal128 decimal128_value = 20;
- int64 date_64_value = 21;
- int32 interval_yearmonth_value = 24;
- int64 interval_daytime_value = 25;
- ScalarTimestampValue timestamp_value = 26;
- ScalarDictionaryValue dictionary_value = 27;
- bytes binary_value = 28;
- bytes large_binary_value = 29;
- ScalarTime64Value time64_value = 30;
- IntervalMonthDayNanoValue interval_month_day_nano = 31;
- StructValue struct_value = 32;
- ScalarFixedSizeBinary fixed_size_binary_value = 34;
- }
-}
-
-message Decimal128{
- bytes value = 1;
- int64 p = 2;
- int64 s = 3;
-}
-
-// Serialized data type
-message ArrowType{
- oneof arrow_type_enum {
- EmptyMessage NONE = 1; // arrow::Type::NA
- EmptyMessage BOOL = 2; // arrow::Type::BOOL
- EmptyMessage UINT8 = 3; // arrow::Type::UINT8
- EmptyMessage INT8 = 4; // arrow::Type::INT8
- EmptyMessage UINT16 = 5; // represents arrow::Type fields in src/arrow/type.h
- EmptyMessage INT16 = 6;
- EmptyMessage UINT32 = 7;
- EmptyMessage INT32 = 8;
- EmptyMessage UINT64 = 9;
- EmptyMessage INT64 = 10 ;
- EmptyMessage FLOAT16 = 11 ;
- EmptyMessage FLOAT32 = 12 ;
- EmptyMessage FLOAT64 = 13 ;
- EmptyMessage UTF8 = 14 ;
- EmptyMessage LARGE_UTF8 = 32;
- EmptyMessage BINARY = 15 ;
- int32 FIXED_SIZE_BINARY = 16 ;
- EmptyMessage LARGE_BINARY = 31;
- EmptyMessage DATE32 = 17 ;
- EmptyMessage DATE64 = 18 ;
- TimeUnit DURATION = 19;
- Timestamp TIMESTAMP = 20 ;
- TimeUnit TIME32 = 21 ;
- TimeUnit TIME64 = 22 ;
- IntervalUnit INTERVAL = 23 ;
- Decimal DECIMAL = 24 ;
- List LIST = 25;
- List LARGE_LIST = 26;
- FixedSizeList FIXED_SIZE_LIST = 27;
- Struct STRUCT = 28;
- Union UNION = 29;
- Dictionary DICTIONARY = 30;
- }
-}
-
-//Useful for representing an empty enum variant in rust
-// E.G. enum example{One, Two(i32)}
-// maps to
-// message example{
-// oneof{
-// EmptyMessage One = 1;
-// i32 Two = 2;
-// }
-//}
-message EmptyMessage{}
-
message OptimizedLogicalPlanType {
string optimizer_name = 1;
}
@@ -918,12 +667,16 @@ message OptimizedPhysicalPlanType {
message PlanType {
oneof plan_type_enum {
- EmptyMessage InitialLogicalPlan = 1;
+ datafusion_common.EmptyMessage InitialLogicalPlan = 1;
+ AnalyzedLogicalPlanType AnalyzedLogicalPlan = 7;
+ datafusion_common.EmptyMessage FinalAnalyzedLogicalPlan = 8;
OptimizedLogicalPlanType OptimizedLogicalPlan = 2;
- EmptyMessage FinalLogicalPlan = 3;
- EmptyMessage InitialPhysicalPlan = 4;
+ datafusion_common.EmptyMessage FinalLogicalPlan = 3;
+ datafusion_common.EmptyMessage InitialPhysicalPlan = 4;
+ datafusion_common.EmptyMessage InitialPhysicalPlanWithStats = 9;
OptimizedPhysicalPlanType OptimizedPhysicalPlan = 5;
- EmptyMessage FinalPhysicalPlan = 6;
+ datafusion_common.EmptyMessage FinalPhysicalPlan = 6;
+ datafusion_common.EmptyMessage FinalPhysicalPlanWithStats = 10;
}
}
@@ -947,7 +700,7 @@ message FullTableReference {
string table = 3;
}
-message OwnedTableReference {
+message TableReference {
oneof table_reference_enum {
BareTableReference bare = 1;
PartialTableReference partial = 2;
@@ -980,9 +733,70 @@ message PhysicalPlanNode {
UnionExecNode union = 19;
ExplainExecNode explain = 20;
SortPreservingMergeExecNode sort_preserving_merge = 21;
+ NestedLoopJoinExecNode nested_loop_join = 22;
+ AnalyzeExecNode analyze = 23;
+ JsonSinkExecNode json_sink = 24;
+ SymmetricHashJoinExecNode symmetric_hash_join = 25;
+ InterleaveExecNode interleave = 26;
+ PlaceholderRowExecNode placeholder_row = 27;
+ CsvSinkExecNode csv_sink = 28;
+ ParquetSinkExecNode parquet_sink = 29;
}
}
+message PartitionColumn {
+ string name = 1;
+ datafusion_common.ArrowType arrow_type = 2;
+}
+
+
+message FileSinkConfig {
+ reserved 6; // writer_mode
+
+ string object_store_url = 1;
+ repeated PartitionedFile file_groups = 2;
+ repeated string table_paths = 3;
+ datafusion_common.Schema output_schema = 4;
+ repeated PartitionColumn table_partition_cols = 5;
+ bool overwrite = 8;
+}
+
+message JsonSink {
+ FileSinkConfig config = 1;
+ datafusion_common.JsonWriterOptions writer_options = 2;
+}
+
+message JsonSinkExecNode {
+ PhysicalPlanNode input = 1;
+ JsonSink sink = 2;
+ datafusion_common.Schema sink_schema = 3;
+ PhysicalSortExprNodeCollection sort_order = 4;
+}
+
+message CsvSink {
+ FileSinkConfig config = 1;
+ datafusion_common.CsvWriterOptions writer_options = 2;
+}
+
+message CsvSinkExecNode {
+ PhysicalPlanNode input = 1;
+ CsvSink sink = 2;
+ datafusion_common.Schema sink_schema = 3;
+ PhysicalSortExprNodeCollection sort_order = 4;
+}
+
+message ParquetSink {
+ FileSinkConfig config = 1;
+ datafusion_common.TableParquetOptions parquet_options = 2;
+}
+
+message ParquetSinkExecNode {
+ PhysicalPlanNode input = 1;
+ ParquetSink sink = 2;
+ datafusion_common.Schema sink_schema = 3;
+ PhysicalSortExprNodeCollection sort_order = 4;
+}
+
message PhysicalExtensionNode {
bytes node = 1;
repeated PhysicalPlanNode inputs = 2;
@@ -990,11 +804,14 @@ message PhysicalExtensionNode {
// physical expressions
message PhysicalExprNode {
+ // Was date_time_interval_expr
+ reserved 17;
+
oneof ExprType {
// column references
PhysicalColumn column = 1;
- ScalarValue literal = 2;
+ datafusion_common.ScalarValue literal = 2;
// binary expressions
PhysicalBinaryExprNode binary_expr = 3;
@@ -1012,15 +829,13 @@ message PhysicalExprNode {
PhysicalSortExprNode sort = 10;
PhysicalNegativeNode negative = 11;
PhysicalInListNode in_list = 12;
- PhysicalScalarFunctionNode scalar_function = 13;
+ // was PhysicalScalarFunctionNode scalar_function = 13;
PhysicalTryCastNode try_cast = 14;
-
// window expressions
PhysicalWindowExprNode window_expr = 15;
PhysicalScalarUdfNode scalar_udf = 16;
-
- PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17;
+ // was PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17;
PhysicalLikeExprNode like_expr = 18;
}
@@ -1029,12 +844,17 @@ message PhysicalExprNode {
message PhysicalScalarUdfNode {
string name = 1;
repeated PhysicalExprNode args = 2;
- ArrowType return_type = 4;
+ optional bytes fun_definition = 3;
+ datafusion_common.ArrowType return_type = 4;
}
message PhysicalAggregateExprNode {
- AggregateFunction aggr_function = 1;
+ oneof AggregateFunction {
+ AggregateFunction aggr_function = 1;
+ string user_defined_aggr_function = 4;
+ }
repeated PhysicalExprNode expr = 2;
+ repeated PhysicalSortExprNode ordering_req = 5;
bool distinct = 3;
}
@@ -1042,9 +862,13 @@ message PhysicalWindowExprNode {
oneof window_function {
AggregateFunction aggr_function = 1;
BuiltInWindowFunction built_in_function = 2;
- // udaf = 3
+ string user_defined_aggr_function = 3;
}
- PhysicalExprNode expr = 4;
+ repeated PhysicalExprNode args = 4;
+ repeated PhysicalExprNode partition_by = 5;
+ repeated PhysicalSortExprNode order_by = 6;
+ WindowFrame window_frame = 7;
+ string name = 8;
}
message PhysicalIsNull {
@@ -1106,21 +930,14 @@ message PhysicalCaseNode {
PhysicalExprNode else_expr = 3;
}
-message PhysicalScalarFunctionNode {
- string name = 1;
- ScalarFunction fun = 2;
- repeated PhysicalExprNode args = 3;
- ArrowType return_type = 4;
-}
-
message PhysicalTryCastNode {
PhysicalExprNode expr = 1;
- ArrowType arrow_type = 2;
+ datafusion_common.ArrowType arrow_type = 2;
}
message PhysicalCastNode {
PhysicalExprNode expr = 1;
- ArrowType arrow_type = 2;
+ datafusion_common.ArrowType arrow_type = 2;
}
message PhysicalNegativeNode {
@@ -1130,6 +947,7 @@ message PhysicalNegativeNode {
message FilterExecNode {
PhysicalPlanNode input = 1;
PhysicalExprNode expr = 2;
+ uint32 default_filter_selectivity = 3;
}
message FileGroup {
@@ -1141,29 +959,41 @@ message ScanLimit {
uint32 limit = 1;
}
+message PhysicalSortExprNodeCollection {
+ repeated PhysicalSortExprNode physical_sort_expr_nodes = 1;
+}
+
message FileScanExecConf {
// Was repeated ConfigOption options = 10;
reserved 10;
repeated FileGroup file_groups = 1;
- Schema schema = 2;
+ datafusion_common.Schema schema = 2;
repeated uint32 projection = 4;
ScanLimit limit = 5;
- Statistics statistics = 6;
+ datafusion_common.Statistics statistics = 6;
repeated string table_partition_cols = 7;
string object_store_url = 8;
- repeated PhysicalSortExprNode output_ordering = 9;
+ repeated PhysicalSortExprNodeCollection output_ordering = 9;
}
message ParquetScanExecNode {
FileScanExecConf base_conf = 1;
- LogicalExprNode pruning_predicate = 2;
+
+ // Was pruning predicate based on a logical expr.
+ reserved 2;
+
+ PhysicalExprNode predicate = 3;
}
message CsvScanExecNode {
FileScanExecConf base_conf = 1;
bool has_header = 2;
string delimiter = 3;
+ string quote = 4;
+ oneof optional_escape {
+ string escape = 5;
+ }
}
message AvroScanExecNode {
@@ -1180,10 +1010,32 @@ message HashJoinExecNode {
PhysicalPlanNode left = 1;
PhysicalPlanNode right = 2;
repeated JoinOn on = 3;
- JoinType join_type = 4;
+ datafusion_common.JoinType join_type = 4;
PartitionMode partition_mode = 6;
bool null_equals_null = 7;
JoinFilter filter = 8;
+ repeated uint32 projection = 9;
+}
+
+enum StreamPartitionMode {
+ SINGLE_PARTITION = 0;
+ PARTITIONED_EXEC = 1;
+}
+
+message SymmetricHashJoinExecNode {
+ PhysicalPlanNode left = 1;
+ PhysicalPlanNode right = 2;
+ repeated JoinOn on = 3;
+ datafusion_common.JoinType join_type = 4;
+ StreamPartitionMode partition_mode = 6;
+ bool null_equals_null = 7;
+ JoinFilter filter = 8;
+ repeated PhysicalSortExprNode left_sort_exprs = 9;
+ repeated PhysicalSortExprNode right_sort_exprs = 10;
+}
+
+message InterleaveExecNode {
+ repeated PhysicalPlanNode inputs = 1;
}
message UnionExecNode {
@@ -1191,11 +1043,18 @@ message UnionExecNode {
}
message ExplainExecNode {
- Schema schema = 1;
+ datafusion_common.Schema schema = 1;
repeated StringifiedPlan stringified_plans = 2;
bool verbose = 3;
}
+message AnalyzeExecNode {
+ bool verbose = 1;
+ bool show_statistics = 2;
+ PhysicalPlanNode input = 3;
+ datafusion_common.Schema schema = 4;
+}
+
message CrossJoinExecNode {
PhysicalPlanNode left = 1;
PhysicalPlanNode right = 2;
@@ -1207,13 +1066,16 @@ message PhysicalColumn {
}
message JoinOn {
- PhysicalColumn left = 1;
- PhysicalColumn right = 2;
+ PhysicalExprNode left = 1;
+ PhysicalExprNode right = 2;
}
message EmptyExecNode {
- bool produce_one_row = 1;
- Schema schema = 2;
+ datafusion_common.Schema schema = 1;
+}
+
+message PlaceholderRowExecNode {
+ datafusion_common.Schema schema = 1;
}
message ProjectionExecNode {
@@ -1226,13 +1088,37 @@ enum AggregateMode {
PARTIAL = 0;
FINAL = 1;
FINAL_PARTITIONED = 2;
+ SINGLE = 3;
+ SINGLE_PARTITIONED = 4;
+}
+
+message PartiallySortedInputOrderMode {
+ repeated uint64 columns = 6;
}
message WindowAggExecNode {
PhysicalPlanNode input = 1;
- repeated PhysicalExprNode window_expr = 2;
- repeated string window_expr_name = 3;
- Schema input_schema = 4;
+ repeated PhysicalWindowExprNode window_expr = 2;
+ repeated PhysicalExprNode partition_keys = 5;
+ // Set optional to `None` for `BoundedWindowAggExec`.
+ oneof input_order_mode {
+ datafusion_common.EmptyMessage linear = 7;
+ PartiallySortedInputOrderMode partially_sorted = 8;
+ datafusion_common.EmptyMessage sorted = 9;
+ }
+}
+
+message MaybeFilter {
+ PhysicalExprNode expr = 1;
+}
+
+message MaybePhysicalSortExprs {
+ repeated PhysicalSortExprNode sort_expr = 1;
+}
+
+message AggLimit {
+ // wrap into a message to make it optional
+ uint64 limit = 1;
}
message AggregateExecNode {
@@ -1243,9 +1129,11 @@ message AggregateExecNode {
repeated string group_expr_name = 5;
repeated string aggr_expr_name = 6;
// we need the input schema to the partial aggregate to pass to the final aggregate
- Schema input_schema = 7;
+ datafusion_common.Schema input_schema = 7;
repeated PhysicalExprNode null_expr = 8;
repeated bool groups = 9;
+ repeated MaybeFilter filter_expr = 10;
+ AggLimit limit = 11;
}
message GlobalLimitExecNode {
@@ -1266,11 +1154,21 @@ message SortExecNode {
repeated PhysicalExprNode expr = 2;
// Maximum number of highest/lowest rows to fetch; negative means no limit
int64 fetch = 3;
+ bool preserve_partitioning = 4;
}
message SortPreservingMergeExecNode {
PhysicalPlanNode input = 1;
repeated PhysicalExprNode expr = 2;
+ // Maximum number of highest/lowest rows to fetch; negative means no limit
+ int64 fetch = 3;
+}
+
+message NestedLoopJoinExecNode {
+ PhysicalPlanNode left = 1;
+ PhysicalPlanNode right = 2;
+ datafusion_common.JoinType join_type = 3;
+ JoinFilter filter = 4;
}
message CoalesceBatchesExecNode {
@@ -1289,35 +1187,40 @@ message PhysicalHashRepartition {
message RepartitionExecNode{
PhysicalPlanNode input = 1;
+ // oneof partition_method {
+ // uint64 round_robin = 2;
+ // PhysicalHashRepartition hash = 3;
+ // uint64 unknown = 4;
+ // }
+ Partitioning partitioning = 5;
+}
+
+message Partitioning {
oneof partition_method {
- uint64 round_robin = 2;
- PhysicalHashRepartition hash = 3;
- uint64 unknown = 4;
+ uint64 round_robin = 1;
+ PhysicalHashRepartition hash = 2;
+ uint64 unknown = 3;
}
}
message JoinFilter{
PhysicalExprNode expression = 1;
repeated ColumnIndex column_indices = 2;
- Schema schema = 3;
+ datafusion_common.Schema schema = 3;
}
message ColumnIndex{
uint32 index = 1;
- JoinSide side = 2;
-}
-
-enum JoinSide{
- LEFT_SIDE = 0;
- RIGHT_SIDE = 1;
+ datafusion_common.JoinSide side = 2;
}
message PartitionedFile {
string path = 1;
uint64 size = 2;
uint64 last_modified_ns = 3;
- repeated ScalarValue partition_values = 4;
+ repeated datafusion_common.ScalarValue partition_values = 4;
FileRange range = 5;
+ datafusion_common.Statistics statistics = 6;
}
message FileRange {
@@ -1329,19 +1232,5 @@ message PartitionStats {
int64 num_rows = 1;
int64 num_batches = 2;
int64 num_bytes = 3;
- repeated ColumnStats column_stats = 4;
+ repeated datafusion_common.ColumnStats column_stats = 4;
}
-
-message Statistics {
- int64 num_rows = 1;
- int64 total_byte_size = 2;
- repeated ColumnStats column_stats = 3;
- bool is_exact = 4;
-}
-
-message ColumnStats {
- ScalarValue min_value = 1;
- ScalarValue max_value = 2;
- uint32 null_count = 3;
- uint32 distinct_count = 4;
-}
\ No newline at end of file
diff --git a/ballista/core/proto/datafusion_common.proto b/ballista/core/proto/datafusion_common.proto
new file mode 100644
index 00000000..d9ec7dbb
--- /dev/null
+++ b/ballista/core/proto/datafusion_common.proto
@@ -0,0 +1,541 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto3";
+
+package datafusion_common;
+
+message ColumnRelation {
+ string relation = 1;
+}
+
+message Column {
+ string name = 1;
+ ColumnRelation relation = 2;
+}
+
+message DfField{
+ Field field = 1;
+ ColumnRelation qualifier = 2;
+}
+
+message DfSchema {
+ repeated DfField columns = 1;
+ map<string, string> metadata = 2;
+}
+
+message CsvFormat {
+ CsvOptions options = 5;
+}
+
+message ParquetFormat {
+ // Used to be bool enable_pruning = 1;
+ reserved 1;
+ TableParquetOptions options = 2;
+}
+
+message AvroFormat {}
+
+message PrimaryKeyConstraint{
+ repeated uint64 indices = 1;
+}
+
+message UniqueConstraint{
+ repeated uint64 indices = 1;
+}
+
+message Constraint{
+ oneof constraint_mode{
+ PrimaryKeyConstraint primary_key = 1;
+ UniqueConstraint unique = 2;
+ }
+}
+
+message Constraints{
+ repeated Constraint constraints = 1;
+}
+
+enum JoinType {
+ INNER = 0;
+ LEFT = 1;
+ RIGHT = 2;
+ FULL = 3;
+ LEFTSEMI = 4;
+ LEFTANTI = 5;
+ RIGHTSEMI = 6;
+ RIGHTANTI = 7;
+}
+
+enum JoinConstraint {
+ ON = 0;
+ USING = 1;
+}
+
+message AvroOptions {}
+message ArrowOptions {}
+
+message Schema {
+ repeated Field columns = 1;
+ map<string, string> metadata = 2;
+}
+
+message Field {
+ // name of the field
+ string name = 1;
+ ArrowType arrow_type = 2;
+ bool nullable = 3;
+ // for complex data types like structs, unions
+ repeated Field children = 4;
+ map<string, string> metadata = 5;
+ int64 dict_id = 6;
+ bool dict_ordered = 7;
+}
+
+message Timestamp{
+ TimeUnit time_unit = 1;
+ string timezone = 2;
+}
+
+enum TimeUnit{
+ Second = 0;
+ Millisecond = 1;
+ Microsecond = 2;
+ Nanosecond = 3;
+}
+
+enum IntervalUnit{
+ YearMonth = 0;
+ DayTime = 1;
+ MonthDayNano = 2;
+}
+
+message Decimal{
+ reserved 1, 2;
+ uint32 precision = 3;
+ int32 scale = 4;
+}
+
+message List{
+ Field field_type = 1;
+}
+
+message FixedSizeList{
+ Field field_type = 1;
+ int32 list_size = 2;
+}
+
+message Dictionary{
+ ArrowType key = 1;
+ ArrowType value = 2;
+}
+
+message Struct{
+ repeated Field sub_field_types = 1;
+}
+
+message Map {
+ Field field_type = 1;
+ bool keys_sorted = 2;
+}
+
+enum UnionMode{
+ sparse = 0;
+ dense = 1;
+}
+
+message Union{
+ repeated Field union_types = 1;
+ UnionMode union_mode = 2;
+ repeated int32 type_ids = 3;
+}
+
+// Used for List/FixedSizeList/LargeList/Struct
+message ScalarNestedValue {
+ message Dictionary {
+ bytes ipc_message = 1;
+ bytes arrow_data = 2;
+ }
+
+ bytes ipc_message = 1;
+ bytes arrow_data = 2;
+ Schema schema = 3;
+ repeated Dictionary dictionaries = 4;
+}
+
+message ScalarTime32Value {
+ oneof value {
+ int32 time32_second_value = 1;
+ int32 time32_millisecond_value = 2;
+ };
+}
+
+message ScalarTime64Value {
+ oneof value {
+ int64 time64_microsecond_value = 1;
+ int64 time64_nanosecond_value = 2;
+ };
+}
+
+message ScalarTimestampValue {
+ oneof value {
+ int64 time_microsecond_value = 1;
+ int64 time_nanosecond_value = 2;
+ int64 time_second_value = 3;
+ int64 time_millisecond_value = 4;
+ };
+ string timezone = 5;
+}
+
+message ScalarDictionaryValue {
+ ArrowType index_type = 1;
+ ScalarValue value = 2;
+}
+
+message IntervalDayTimeValue {
+ int32 days = 1;
+ int32 milliseconds = 2;
+}
+
+message IntervalMonthDayNanoValue {
+ int32 months = 1;
+ int32 days = 2;
+ int64 nanos = 3;
+}
+
+message UnionField {
+ int32 field_id = 1;
+ Field field = 2;
+}
+
+message UnionValue {
+ // Note that a null union value must have one or more fields, so we
+ // encode a null UnionValue as one with value_id == 128
+ int32 value_id = 1;
+ ScalarValue value = 2;
+ repeated UnionField fields = 3;
+ UnionMode mode = 4;
+}
+
+message ScalarFixedSizeBinary{
+ bytes values = 1;
+ int32 length = 2;
+}
+
+message ScalarValue{
+ // was PrimitiveScalarType null_value = 19;
+ reserved 19;
+
+ oneof value {
+ // was PrimitiveScalarType null_value = 19;
+ // Null value of any type
+ ArrowType null_value = 33;
+
+ bool bool_value = 1;
+ string utf8_value = 2;
+ string large_utf8_value = 3;
+ int32 int8_value = 4;
+ int32 int16_value = 5;
+ int32 int32_value = 6;
+ int64 int64_value = 7;
+ uint32 uint8_value = 8;
+ uint32 uint16_value = 9;
+ uint32 uint32_value = 10;
+ uint64 uint64_value = 11;
+ float float32_value = 12;
+ double float64_value = 13;
+ // Literal Date32 value always has a unit of day
+ int32 date_32_value = 14;
+ ScalarTime32Value time32_value = 15;
+ ScalarNestedValue large_list_value = 16;
+ ScalarNestedValue list_value = 17;
+ ScalarNestedValue fixed_size_list_value = 18;
+ ScalarNestedValue struct_value = 32;
+
+ Decimal128 decimal128_value = 20;
+ Decimal256 decimal256_value = 39;
+
+ int64 date_64_value = 21;
+ int32 interval_yearmonth_value = 24;
+
+ int64 duration_second_value = 35;
+ int64 duration_millisecond_value = 36;
+ int64 duration_microsecond_value = 37;
+ int64 duration_nanosecond_value = 38;
+
+ ScalarTimestampValue timestamp_value = 26;
+ ScalarDictionaryValue dictionary_value = 27;
+ bytes binary_value = 28;
+ bytes large_binary_value = 29;
+ ScalarTime64Value time64_value = 30;
+ IntervalDayTimeValue interval_daytime_value = 25;
+ IntervalMonthDayNanoValue interval_month_day_nano = 31;
+ ScalarFixedSizeBinary fixed_size_binary_value = 34;
+ UnionValue union_value = 42;
+ }
+}
+
+message Decimal128{
+ bytes value = 1;
+ int64 p = 2;
+ int64 s = 3;
+}
+
+message Decimal256{
+ bytes value = 1;
+ int64 p = 2;
+ int64 s = 3;
+}
+
+// Serialized data type
+message ArrowType{
+ oneof arrow_type_enum {
+ EmptyMessage NONE = 1; // arrow::Type::NA
+ EmptyMessage BOOL = 2; // arrow::Type::BOOL
+ EmptyMessage UINT8 = 3; // arrow::Type::UINT8
+ EmptyMessage INT8 = 4; // arrow::Type::INT8
+ EmptyMessage UINT16 = 5; // represents arrow::Type fields in src/arrow/type.h
+ EmptyMessage INT16 = 6;
+ EmptyMessage UINT32 = 7;
+ EmptyMessage INT32 = 8;
+ EmptyMessage UINT64 = 9;
+ EmptyMessage INT64 = 10 ;
+ EmptyMessage FLOAT16 = 11 ;
+ EmptyMessage FLOAT32 = 12 ;
+ EmptyMessage FLOAT64 = 13 ;
+ EmptyMessage UTF8 = 14 ;
+ EmptyMessage LARGE_UTF8 = 32;
+ EmptyMessage BINARY = 15 ;
+ int32 FIXED_SIZE_BINARY = 16 ;
+ EmptyMessage LARGE_BINARY = 31;
+ EmptyMessage DATE32 = 17 ;
+ EmptyMessage DATE64 = 18 ;
+ TimeUnit DURATION = 19;
+ Timestamp TIMESTAMP = 20 ;
+ TimeUnit TIME32 = 21 ;
+ TimeUnit TIME64 = 22 ;
+ IntervalUnit INTERVAL = 23 ;
+ Decimal DECIMAL = 24 ;
+ List LIST = 25;
+ List LARGE_LIST = 26;
+ FixedSizeList FIXED_SIZE_LIST = 27;
+ Struct STRUCT = 28;
+ Union UNION = 29;
+ Dictionary DICTIONARY = 30;
+ Map MAP = 33;
+ }
+}
+
+// Useful for representing an empty enum variant in Rust.
+// E.g. enum Example { One, Two(i32) }
+// maps to
+// message Example {
+//   oneof value {
+//     EmptyMessage One = 1;
+//     int32 Two = 2;
+//   }
+// }
+message EmptyMessage{}
+
+enum CompressionTypeVariant {
+ GZIP = 0;
+ BZIP2 = 1;
+ XZ = 2;
+ ZSTD = 3;
+ UNCOMPRESSED = 4;
+}
+
+message JsonWriterOptions {
+ CompressionTypeVariant compression = 1;
+}
+
+
+message CsvWriterOptions {
+ // Compression type
+ CompressionTypeVariant compression = 1;
+ // Optional column delimiter. Defaults to `b','`
+ string delimiter = 2;
+ // Whether to write column names as file headers. Defaults to `true`
+ bool has_header = 3;
+ // Optional date format for date arrays
+ string date_format = 4;
+ // Optional datetime format for datetime arrays
+ string datetime_format = 5;
+ // Optional timestamp format for timestamp arrays
+ string timestamp_format = 6;
+ // Optional time format for time arrays
+ string time_format = 7;
+ // Optional value to represent null
+ string null_value = 8;
+}
+
+// Options controlling CSV format
+message CsvOptions {
+ bytes has_header = 1; // Indicates if the CSV has a header row
+ bytes delimiter = 2; // Delimiter character as a byte
+ bytes quote = 3; // Quote character as a byte
+ bytes escape = 4; // Optional escape character as a byte
+ CompressionTypeVariant compression = 5; // Compression type
+ uint64 schema_infer_max_rec = 6; // Max records for schema inference
+ string date_format = 7; // Optional date format
+ string datetime_format = 8; // Optional datetime format
+ string timestamp_format = 9; // Optional timestamp format
+ string timestamp_tz_format = 10; // Optional timestamp with timezone format
+ string time_format = 11; // Optional time format
+ string null_value = 12; // Optional representation of null value
+}
+
+// Options controlling JSON format
+message JsonOptions {
+ CompressionTypeVariant compression = 1; // Compression type
+ uint64 schema_infer_max_rec = 2; // Max records for schema inference
+}
+
+message TableParquetOptions {
+ ParquetOptions global = 1;
+ repeated ColumnSpecificOptions column_specific_options = 2;
+}
+
+message ColumnSpecificOptions {
+ string column_name = 1;
+ ColumnOptions options = 2;
+}
+
+message ColumnOptions {
+ oneof bloom_filter_enabled_opt {
+ bool bloom_filter_enabled = 1;
+ }
+
+ oneof encoding_opt {
+ string encoding = 2;
+ }
+
+ oneof dictionary_enabled_opt {
+ bool dictionary_enabled = 3;
+ }
+
+ oneof compression_opt {
+ string compression = 4;
+ }
+
+ oneof statistics_enabled_opt {
+ string statistics_enabled = 5;
+ }
+
+ oneof bloom_filter_fpp_opt {
+ double bloom_filter_fpp = 6;
+ }
+
+ oneof bloom_filter_ndv_opt {
+ uint64 bloom_filter_ndv = 7;
+ }
+
+ oneof max_statistics_size_opt {
+ uint32 max_statistics_size = 8;
+ }
+}
+
+message ParquetOptions {
+ // Regular fields
+ bool enable_page_index = 1; // default = true
+ bool pruning = 2; // default = true
+ bool skip_metadata = 3; // default = true
+ bool pushdown_filters = 5; // default = false
+ bool reorder_filters = 6; // default = false
+ uint64 data_pagesize_limit = 7; // default = 1024 * 1024
+ uint64 write_batch_size = 8; // default = 1024
+ string writer_version = 9; // default = "1.0"
+ // bool bloom_filter_enabled = 20; // default = false
+ bool allow_single_file_parallelism = 23; // default = true
+ uint64 maximum_parallel_row_group_writers = 24; // default = 1
+ uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2
+ bool bloom_filter_on_read = 26; // default = true
+ bool bloom_filter_on_write = 27; // default = false
+
+ oneof metadata_size_hint_opt {
+ uint64 metadata_size_hint = 4;
+ }
+
+ oneof compression_opt {
+ string compression = 10;
+ }
+
+ oneof dictionary_enabled_opt {
+ bool dictionary_enabled = 11;
+ }
+
+ oneof statistics_enabled_opt {
+ string statistics_enabled = 13;
+ }
+
+ oneof max_statistics_size_opt {
+ uint64 max_statistics_size = 14;
+ }
+
+ oneof column_index_truncate_length_opt {
+ uint64 column_index_truncate_length = 17;
+ }
+
+ oneof encoding_opt {
+ string encoding = 19;
+ }
+
+ oneof bloom_filter_fpp_opt {
+ double bloom_filter_fpp = 21;
+ }
+
+ oneof bloom_filter_ndv_opt {
+ uint64 bloom_filter_ndv = 22;
+ }
+
+ uint64 dictionary_page_size_limit = 12;
+
+ uint64 data_page_row_count_limit = 18;
+
+ uint64 max_row_group_size = 15;
+
+ string created_by = 16;
+}
+
+enum JoinSide{
+ LEFT_SIDE = 0;
+ RIGHT_SIDE = 1;
+}
+
+message Precision{
+ PrecisionInfo precision_info = 1;
+ ScalarValue val = 2;
+}
+
+enum PrecisionInfo {
+ EXACT = 0;
+ INEXACT = 1;
+ ABSENT = 2;
+}
+
+message Statistics {
+ Precision num_rows = 1;
+ Precision total_byte_size = 2;
+ repeated ColumnStats column_stats = 3;
+}
+
+message ColumnStats {
+ Precision min_value = 1;
+ Precision max_value = 2;
+ Precision null_count = 3;
+ Precision distinct_count = 4;
+}
diff --git a/ballista/core/src/serde/generated/ballista.rs b/ballista/core/src/serde/generated/ballista.rs
index 67714873..c45bdeaf 100644
--- a/ballista/core/src/serde/generated/ballista.rs
+++ b/ballista/core/src/serde/generated/ballista.rs
@@ -45,7 +45,7 @@ pub struct UnresolvedShuffleExecNode {
#[prost(uint32, tag = "1")]
pub stage_id: u32,
#[prost(message, optional, tag = "2")]
- pub schema: ::core::option::Option<::datafusion_proto::protobuf::Schema>,
+ pub schema: ::core::option::Option<::datafusion_proto_common::Schema>,
#[prost(uint32, tag = "4")]
pub output_partition_count: u32,
}
@@ -55,7 +55,7 @@ pub struct ShuffleReaderExecNode {
#[prost(message, repeated, tag = "1")]
pub partition: ::prost::alloc::vec::Vec<ShuffleReaderPartition>,
#[prost(message, optional, tag = "2")]
- pub schema: ::core::option::Option<::datafusion_proto::protobuf::Schema>,
+ pub schema: ::core::option::Option<::datafusion_proto_common::Schema>,
/// The stage to read from
#[prost(uint32, tag = "3")]
pub stage_id: u32,
@@ -379,9 +379,9 @@ pub struct PartitionStats {
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ColumnStats {
#[prost(message, optional, tag = "1")]
- pub min_value: ::core::option::Option<::datafusion_proto::protobuf::ScalarValue>,
+ pub min_value: ::core::option::Option<::datafusion_proto_common::ScalarValue>,
#[prost(message, optional, tag = "2")]
- pub max_value: ::core::option::Option<::datafusion_proto::protobuf::ScalarValue>,
+ pub max_value: ::core::option::Option<::datafusion_proto_common::ScalarValue>,
#[prost(uint32, tag = "3")]
pub null_count: u32,
#[prost(uint32, tag = "4")]
@@ -1083,7 +1083,7 @@ pub struct GetFileMetadataParams {
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct GetFileMetadataResult {
#[prost(message, optional, tag = "1")]
- pub schema: ::core::option::Option<::datafusion_proto::protobuf::Schema>,
+ pub schema: ::core::option::Option<::datafusion_proto_common::Schema>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
diff --git a/dev/docker/ballista-builder.Dockerfile b/dev/docker/ballista-builder.Dockerfile
index abcd14f2..aa9a6fa5 100644
--- a/dev/docker/ballista-builder.Dockerfile
+++ b/dev/docker/ballista-builder.Dockerfile
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-FROM rust:1.74.1-buster
+FROM rust:1.81-bullseye
ARG EXT_UID
@@ -31,7 +31,7 @@ RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
apt-get install -y nodejs && \
npm install -g yarn
-# create build user with same UID as
+# create build user with same UID as
RUN adduser -q -u $EXT_UID builder --home /home/builder && \
mkdir -p /home/builder/workspace
USER builder
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]