This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-ballista.git


The following commit(s) were added to refs/heads/main by this push:
     new e39a7e68 Update datafusion protobuf definitions (#1057)
e39a7e68 is described below

commit e39a7e68da093ff6f0f002e7c6553d6fd4763dbb
Author: Baris Palaska <[email protected]>
AuthorDate: Thu Sep 19 20:52:24 2024 +0200

    Update datafusion protobuf definitions (#1057)
    
    * update datafusion proto defs
    
    * allow optionals in proto3
    
    update docker environment for higher protoc version
---
 ballista/core/build.rs                        |   3 +
 ballista/core/proto/ballista.proto            |  11 +-
 ballista/core/proto/datafusion.proto          | 789 +++++++++++---------------
 ballista/core/proto/datafusion_common.proto   | 541 ++++++++++++++++++
 ballista/core/src/serde/generated/ballista.rs |  10 +-
 dev/docker/ballista-builder.Dockerfile        |   4 +-
 6 files changed, 896 insertions(+), 462 deletions(-)

diff --git a/ballista/core/build.rs b/ballista/core/build.rs
index a13da777..6e501b88 100644
--- a/ballista/core/build.rs
+++ b/ballista/core/build.rs
@@ -37,10 +37,13 @@ fn main() -> Result<(), String> {
     // We don't include the proto files in releases so that downstreams
     // do not need to have PROTOC included
     if Path::new("proto/datafusion.proto").exists() {
+        println!("cargo:rerun-if-changed=proto/datafusion_common.proto");
         println!("cargo:rerun-if-changed=proto/datafusion.proto");
         println!("cargo:rerun-if-changed=proto/ballista.proto");
         tonic_build::configure()
+            .extern_path(".datafusion_common", "::datafusion_proto_common")
             .extern_path(".datafusion", "::datafusion_proto::protobuf")
+            .protoc_arg("--experimental_allow_proto3_optional")
             .compile(&["proto/ballista.proto"], &["proto"])
             .map_err(|e| format!("protobuf compilation failed: {e}"))?;
         let generated_source_path = out.join("ballista.protobuf.rs");
diff --git a/ballista/core/proto/ballista.proto b/ballista/core/proto/ballista.proto
index 78613a72..eab1d801 100644
--- a/ballista/core/proto/ballista.proto
+++ b/ballista/core/proto/ballista.proto
@@ -25,6 +25,7 @@ option java_package = "org.apache.arrow.ballista.protobuf";
 option java_outer_classname = "BallistaProto";
 
 import "datafusion.proto";
+import "datafusion_common.proto";
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Ballista Physical Plan
@@ -48,13 +49,13 @@ message ShuffleWriterExecNode {
 
 message UnresolvedShuffleExecNode {
   uint32 stage_id = 1;
-  datafusion.Schema schema = 2;
+  datafusion_common.Schema schema = 2;
   uint32 output_partition_count = 4;
 }
 
 message ShuffleReaderExecNode {
   repeated ShuffleReaderPartition partition = 1;
-  datafusion.Schema schema = 2;
+  datafusion_common.Schema schema = 2;
   // The stage to read from
   uint32 stage_id = 3;
 }
@@ -236,8 +237,8 @@ message PartitionStats {
 }
 
 message ColumnStats {
-  datafusion.ScalarValue min_value = 1;
-  datafusion.ScalarValue max_value = 2;
+  datafusion_common.ScalarValue min_value = 1;
+  datafusion_common.ScalarValue max_value = 2;
   uint32 null_count = 3;
   uint32 distinct_count = 4;
 }
@@ -634,7 +635,7 @@ message GetFileMetadataParams {
 }
 
 message GetFileMetadataResult {
-  datafusion.Schema schema = 1;
+  datafusion_common.Schema schema = 1;
 }
 
 message FilePartitionMetadata {
diff --git a/ballista/core/proto/datafusion.proto b/ballista/core/proto/datafusion.proto
index 8a9a8c0f..8402b92f 100644
--- a/ballista/core/proto/datafusion.proto
+++ b/ballista/core/proto/datafusion.proto
@@ -24,24 +24,7 @@ option java_multiple_files = true;
 option java_package = "org.apache.arrow.datafusion.protobuf";
 option java_outer_classname = "DatafusionProto";
 
-message ColumnRelation {
-  string relation = 1;
-}
-
-message Column {
-  string name = 1;
-  ColumnRelation relation = 2;
-}
-
-message DfField{
-  Field field = 1;
-  ColumnRelation qualifier = 2;
-}
-
-message DfSchema {
-  repeated DfField columns = 1;
-  map<string, string> metadata = 2;
-}
+import "datafusion_common.proto";
 
 // logical plan
 // LogicalPlan is a nested type
@@ -72,6 +55,10 @@ message LogicalPlanNode {
     ViewTableScanNode view_scan = 24;
     CustomTableScanNode custom_scan = 25;
     PrepareNode prepare = 26;
+    DropViewNode drop_view = 27;
+    DistinctOnNode distinct_on = 28;
+    CopyToNode copy_to = 29;
+    UnnestNode unnest = 30;
   }
 }
 
@@ -84,49 +71,44 @@ message ProjectionColumns {
   repeated string columns = 1;
 }
 
-message CsvFormat {
-  bool has_header = 1;
-  string delimiter = 2;
+message LogicalExprNodeCollection {
+  repeated LogicalExprNode logical_expr_nodes = 1;
 }
 
-message ParquetFormat {
-  // Used to be bool enable_pruning = 1;
-  reserved 1;
-}
-
-message AvroFormat {}
-
 message ListingTableScanNode {
-  string table_name = 1;
+  reserved 1; // was string table_name
+  TableReference table_name = 14;
   repeated string paths = 2;
   string file_extension = 3;
   ProjectionColumns projection = 4;
-  Schema schema = 5;
+  datafusion_common.Schema schema = 5;
   repeated LogicalExprNode filters = 6;
   repeated string table_partition_cols = 7;
   bool collect_stat = 8;
   uint32 target_partitions = 9;
   oneof FileFormatType {
-    CsvFormat csv = 10;
-    ParquetFormat parquet = 11;
-    AvroFormat avro = 12;
+    datafusion_common.CsvFormat csv = 10;
+    datafusion_common.ParquetFormat parquet = 11;
+    datafusion_common.AvroFormat avro = 12;
   }
-  repeated LogicalExprNode file_sort_order = 13;
+  repeated LogicalExprNodeCollection file_sort_order = 13;
 }
 
 message ViewTableScanNode {
-  string table_name = 1;
+  reserved 1; // was string table_name
+  TableReference table_name = 6;
   LogicalPlanNode input = 2;
-  Schema schema = 3;
+  datafusion_common.Schema schema = 3;
   ProjectionColumns projection = 4;
   string definition = 5;
 }
 
 // Logical Plan to Scan a CustomTableProvider registered at runtime
 message CustomTableScanNode {
-  string table_name = 1;
+  reserved 1; // was string table_name
+  TableReference table_name = 6;
   ProjectionColumns projection = 2;
-  Schema schema = 3;
+  datafusion_common.Schema schema = 3;
   repeated LogicalExprNode filters = 4;
   bytes custom_table_data = 5;
 }
@@ -170,40 +152,47 @@ message EmptyRelationNode {
 
 message CreateExternalTableNode {
   reserved 1; // was string name
-  OwnedTableReference name = 12;
+  TableReference name = 9;
   string location = 2;
   string file_type = 3;
-  bool has_header = 4;
-  DfSchema schema = 5;
-  repeated string table_partition_cols = 6;
-  bool if_not_exists = 7;
-  string delimiter = 8;
-  string definition = 9;
-  string file_compression_type = 10;
-  map<string, string> options = 11;
-}
+  datafusion_common.DfSchema schema = 4;
+  repeated string table_partition_cols = 5;
+  bool if_not_exists = 6;
+  string definition = 7;
+  repeated LogicalExprNodeCollection order_exprs = 10;
+  bool unbounded = 11;
+  map<string, string> options = 8;
+  datafusion_common.Constraints constraints = 12;
+  map<string, LogicalExprNode> column_defaults = 13;
+ }
 
 message PrepareNode {
   string name = 1;
-  repeated ArrowType data_types = 2;
+  repeated datafusion_common.ArrowType data_types = 2;
   LogicalPlanNode input = 3;
 }
 
 message CreateCatalogSchemaNode {
   string schema_name = 1;
   bool if_not_exists = 2;
-  DfSchema schema = 3;
+  datafusion_common.DfSchema schema = 3;
 }
 
 message CreateCatalogNode {
   string catalog_name = 1;
   bool if_not_exists = 2;
-  DfSchema schema = 3;
+  datafusion_common.DfSchema schema = 3;
+}
+
+message DropViewNode {
+  TableReference name = 1;
+  bool if_exists = 2;
+  datafusion_common.DfSchema schema = 3;
 }
 
 message CreateViewNode {
   reserved 1; // was string name
-  OwnedTableReference name = 5;
+  TableReference name = 5;
   LogicalPlanNode input = 2;
   bool or_replace = 3;
   string definition = 4;
@@ -237,27 +226,11 @@ message WindowNode {
   repeated LogicalExprNode window_expr = 2;
 }
 
-enum JoinType {
-  INNER = 0;
-  LEFT = 1;
-  RIGHT = 2;
-  FULL = 3;
-  LEFTSEMI = 4;
-  LEFTANTI = 5;
-  RIGHTSEMI = 6;
-  RIGHTANTI = 7;
-}
-
-enum JoinConstraint {
-  ON = 0;
-  USING = 1;
-}
-
 message JoinNode {
   LogicalPlanNode left = 1;
   LogicalPlanNode right = 2;
-  JoinType join_type = 3;
-  JoinConstraint join_constraint = 4;
+  datafusion_common.JoinType join_type = 3;
+  datafusion_common.JoinConstraint join_constraint = 4;
   repeated LogicalExprNode left_join_key = 5;
   repeated LogicalExprNode right_join_key = 6;
   bool null_equals_null = 7;
@@ -268,6 +241,40 @@ message DistinctNode {
   LogicalPlanNode input = 1;
 }
 
+message DistinctOnNode {
+  repeated LogicalExprNode on_expr = 1;
+  repeated LogicalExprNode select_expr = 2;
+  repeated LogicalExprNode sort_expr = 3;
+  LogicalPlanNode input = 4;
+}
+
+message CopyToNode {
+    LogicalPlanNode input = 1;
+    string output_url = 2;
+    oneof format_options {
+      datafusion_common.CsvOptions csv = 8;
+      datafusion_common.JsonOptions json = 9;
+      datafusion_common.TableParquetOptions parquet = 10;
+      datafusion_common.AvroOptions avro = 11;
+      datafusion_common.ArrowOptions arrow = 12;
+    }
+    repeated string partition_by = 7;
+}
+
+message UnnestNode {
+    LogicalPlanNode input = 1;
+    repeated datafusion_common.Column exec_columns = 2;
+    repeated uint64 list_type_columns = 3;
+    repeated uint64 struct_type_columns = 4;
+    repeated uint64 dependency_indices = 5;
+    datafusion_common.DfSchema schema = 6;
+    UnnestOptions options = 7;
+}
+
+message UnnestOptions {
+    bool preserve_nulls = 1;
+}
+
 message UnionNode {
   repeated LogicalPlanNode inputs = 1;
 }
@@ -290,20 +297,21 @@ message SelectionExecNode {
 }
 
 message SubqueryAliasNode {
+  reserved 2; // Was string alias
   LogicalPlanNode input = 1;
-  string alias = 2;
+  TableReference alias = 3;
 }
 
 // logical expressions
 message LogicalExprNode {
   oneof ExprType {
     // column references
-    Column column = 1;
+    datafusion_common.Column column = 1;
 
     // alias
     AliasNode alias = 2;
 
-    ScalarValue literal = 3;
+    datafusion_common.ScalarValue literal = 3;
 
     // binary expressions
     BinaryExprNode binary_expr = 4;
@@ -322,8 +330,8 @@ message LogicalExprNode {
     SortExprNode sort = 12;
     NegativeNode negative = 13;
     InListNode in_list = 14;
-    bool wildcard = 15;
-    ScalarFunctionNode scalar_function = 16;
+    Wildcard wildcard = 15;
+    // was  ScalarFunctionNode scalar_function = 16;
     TryCastNode try_cast = 17;
 
     // window expressions
@@ -335,7 +343,7 @@ message LogicalExprNode {
     // Scalar UDF expressions
     ScalarUDFExprNode scalar_udf_expr = 20;
 
-    GetIndexedField get_indexed_field = 21;
+    // GetIndexedField get_indexed_field = 21;
 
     GroupingSetNode grouping_set = 22;
 
@@ -355,12 +363,18 @@ message LogicalExprNode {
 
     PlaceholderNode placeholder = 34;
 
+    Unnest unnest = 35;
+
   }
 }
 
+message Wildcard {
+  string qualifier = 1;
+}
+
 message PlaceholderNode {
   string id = 1;
-  ArrowType data_type = 2;
+  datafusion_common.ArrowType data_type = 2;
 }
 
 message LogicalExprList {
@@ -379,11 +393,18 @@ message RollupNode {
   repeated LogicalExprNode expr = 1;
 }
 
+message NamedStructField {
+  datafusion_common.ScalarValue name = 1;
+}
 
+message ListIndex {
+  LogicalExprNode key = 1;
+}
 
-message GetIndexedField {
-  LogicalExprNode expr = 1;
-  ScalarValue key = 2;
+message ListRange {
+  LogicalExprNode start = 1;
+  LogicalExprNode stop = 2;
+  LogicalExprNode stride = 3;
 }
 
 message IsNull {
@@ -425,6 +446,7 @@ message Not {
 message AliasNode {
   LogicalExprNode expr = 1;
   string alias = 2;
+  repeated TableReference relation = 3;
 }
 
 message BinaryExprNode {
@@ -439,93 +461,16 @@ message NegativeNode {
   LogicalExprNode expr = 1;
 }
 
+message Unnest {
+  repeated LogicalExprNode exprs = 1;
+}
+
 message InListNode {
   LogicalExprNode expr = 1;
   repeated LogicalExprNode list = 2;
   bool negated = 3;
 }
 
-enum ScalarFunction {
-  Abs = 0;
-  Acos = 1;
-  Asin = 2;
-  Atan = 3;
-  Ascii = 4;
-  Ceil = 5;
-  Cos = 6;
-  Digest = 7;
-  Exp = 8;
-  Floor = 9;
-  Ln = 10;
-  Log = 11;
-  Log10 = 12;
-  Log2 = 13;
-  Round = 14;
-  Signum = 15;
-  Sin = 16;
-  Sqrt = 17;
-  Tan = 18;
-  Trunc = 19;
-  Array = 20;
-  RegexpMatch = 21;
-  BitLength = 22;
-  Btrim = 23;
-  CharacterLength = 24;
-  Chr = 25;
-  Concat = 26;
-  ConcatWithSeparator = 27;
-  DatePart = 28;
-  DateTrunc = 29;
-  InitCap = 30;
-  Left = 31;
-  Lpad = 32;
-  Lower = 33;
-  Ltrim = 34;
-  MD5 = 35;
-  NullIf = 36;
-  OctetLength = 37;
-  Random = 38;
-  RegexpReplace = 39;
-  Repeat = 40;
-  Replace = 41;
-  Reverse = 42;
-  Right = 43;
-  Rpad = 44;
-  Rtrim = 45;
-  SHA224 = 46;
-  SHA256 = 47;
-  SHA384 = 48;
-  SHA512 = 49;
-  SplitPart = 50;
-  StartsWith = 51;
-  Strpos = 52;
-  Substr = 53;
-  ToHex = 54;
-  ToTimestamp = 55;
-  ToTimestampMillis = 56;
-  ToTimestampMicros = 57;
-  ToTimestampSeconds = 58;
-  Now = 59;
-  Translate = 60;
-  Trim = 61;
-  Upper = 62;
-  Coalesce = 63;
-  Power = 64;
-  StructFun = 65;
-  FromUnixtime = 66;
-  Atan2 = 67;
-  DateBin = 68;
-  ArrowTypeof = 69;
-  CurrentDate = 70;
-  CurrentTime = 71;
-  Uuid = 72;
-}
-
-message ScalarFunctionNode {
-  ScalarFunction fun = 1;
-  repeated LogicalExprNode args = 2;
-}
-
 enum AggregateFunction {
   MIN = 0;
   MAX = 1;
@@ -534,10 +479,10 @@ enum AggregateFunction {
   COUNT = 4;
   APPROX_DISTINCT = 5;
   ARRAY_AGG = 6;
-  VARIANCE = 7;
+  // VARIANCE = 7;
   VARIANCE_POP = 8;
-  COVARIANCE = 9;
-  COVARIANCE_POP = 10;
+  // COVARIANCE = 9;
+  // COVARIANCE_POP = 10;
   STDDEV = 11;
   STDDEV_POP = 12;
   CORRELATION = 13;
@@ -545,7 +490,23 @@ enum AggregateFunction {
   APPROX_MEDIAN = 15;
   APPROX_PERCENTILE_CONT_WITH_WEIGHT = 16;
   GROUPING = 17;
-  MEDIAN = 18;
+  // MEDIAN = 18;
+  BIT_AND = 19;
+  BIT_OR = 20;
+  BIT_XOR = 21;
+  BOOL_AND = 22;
+  BOOL_OR = 23;
+  REGR_SLOPE = 26;
+  REGR_INTERCEPT = 27;
+  REGR_COUNT = 28;
+  REGR_R2 = 29;
+  REGR_AVGX = 30;
+  REGR_AVGY = 31;
+  REGR_SXX = 32;
+  REGR_SYY = 33;
+  REGR_SXY = 34;
+  STRING_AGG = 35;
+  NTH_VALUE_AGG = 36;
 }
 
 message AggregateExprNode {
@@ -553,17 +514,20 @@ message AggregateExprNode {
   repeated LogicalExprNode expr = 2;
   bool distinct = 3;
   LogicalExprNode filter = 4;
+  repeated LogicalExprNode order_by = 5;
 }
 
 message AggregateUDFExprNode {
   string fun_name = 1;
   repeated LogicalExprNode args = 2;
   LogicalExprNode filter = 3;
+  repeated LogicalExprNode order_by = 4;
 }
 
 message ScalarUDFExprNode {
   string fun_name = 1;
   repeated LogicalExprNode args = 2;
+  optional bytes fun_definition = 3;
 }
 
 enum BuiltInWindowFunction {
@@ -584,7 +548,8 @@ message WindowExprNode {
   oneof window_function {
     AggregateFunction aggr_function = 1;
     BuiltInWindowFunction built_in_function = 2;
-    // udaf = 3
+    string udaf = 3;
+    string udwf = 9;
   }
   LogicalExprNode expr = 4;
   repeated LogicalExprNode partition_by = 5;
@@ -634,12 +599,12 @@ message WhenThen {
 
 message CastNode {
   LogicalExprNode expr = 1;
-  ArrowType arrow_type = 2;
+  datafusion_common.ArrowType arrow_type = 2;
 }
 
 message TryCastNode {
   LogicalExprNode expr = 1;
-  ArrowType arrow_type = 2;
+  datafusion_common.ArrowType arrow_type = 2;
 }
 
 message SortExprNode {
@@ -672,242 +637,26 @@ enum WindowFrameBoundType {
 
 message WindowFrameBound {
   WindowFrameBoundType window_frame_bound_type = 1;
-  ScalarValue bound_value = 2;
+  datafusion_common.ScalarValue bound_value = 2;
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Arrow Data Types
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-message Schema {
-  repeated Field columns = 1;
-}
-
-message Field {
-  // name of the field
-  string name = 1;
-  ArrowType arrow_type = 2;
-  bool nullable = 3;
-  // for complex data types like structs, unions
-  repeated Field children = 4;
-}
-
 message FixedSizeBinary{
   int32 length = 1;
 }
 
-message Timestamp{
-  TimeUnit time_unit = 1;
-  string timezone = 2;
-}
-
 enum DateUnit{
   Day = 0;
   DateMillisecond = 1;
 }
 
-enum TimeUnit{
-  Second = 0;
-  Millisecond = 1;
-  Microsecond = 2;
-  Nanosecond = 3;
-}
-
-enum IntervalUnit{
-  YearMonth = 0;
-  DayTime = 1;
-  MonthDayNano = 2;
-}
-
-message Decimal{
-  reserved 1, 2;
-  uint32 precision = 3;
-  int32 scale = 4;
-}
-
-message List{
-  Field field_type = 1;
-}
-
-message FixedSizeList{
-  Field field_type = 1;
-  int32 list_size = 2;
-}
-
-message Dictionary{
-  ArrowType key = 1;
-  ArrowType value = 2;
-}
-
-message Struct{
-  repeated Field sub_field_types = 1;
-}
-
-enum UnionMode{
-  sparse = 0;
-  dense = 1;
-}
-
-message Union{
-  repeated Field union_types = 1;
-  UnionMode union_mode = 2;
-  repeated int32 type_ids = 3;
-}
-
-message ScalarListValue{
-  // encode null explicitly to distinguish a list with a null value
-  // from a list with no values)
-  bool is_null = 3;
-  Field field = 1;
-  repeated ScalarValue values = 2;
+message AnalyzedLogicalPlanType {
+  string analyzer_name = 1;
 }
 
-message ScalarTime32Value {
-  oneof value {
-    int32 time32_second_value = 1;
-    int32 time32_millisecond_value = 2;
-  };
-}
-
-message ScalarTime64Value {
-  oneof value {
-    int64 time64_microsecond_value = 1;
-    int64 time64_nanosecond_value = 2;
-  };
-}
-
-message ScalarTimestampValue {
-  oneof value {
-    int64 time_microsecond_value = 1;
-    int64 time_nanosecond_value = 2;
-    int64 time_second_value = 3;
-    int64 time_millisecond_value = 4;
-  };
-  string timezone = 5;
-}
-
-message ScalarDictionaryValue {
-  ArrowType index_type = 1;
-  ScalarValue value = 2;
-}
-
-message IntervalMonthDayNanoValue {
-  int32 months = 1;
-  int32 days = 2;
-  int64 nanos = 3;
-}
-
-message StructValue {
-  // Note that a null struct value must have one or more fields, so we
-  // encode a null StructValue as one witth an empty field_values
-  // list.
-  repeated ScalarValue field_values = 2;
-  repeated Field fields = 3;
-}
-
-message ScalarFixedSizeBinary{
-  bytes values = 1;
-  int32 length = 2;
-}
-
-message ScalarValue{
-  // was PrimitiveScalarType null_value = 19;
-  reserved 19;
-
-  oneof value {
-    // was PrimitiveScalarType null_value = 19;
-    // Null value of any type
-    ArrowType null_value = 33;
-
-    bool   bool_value = 1;
-    string utf8_value = 2;
-    string large_utf8_value = 3;
-    int32  int8_value = 4;
-    int32  int16_value = 5;
-    int32  int32_value = 6;
-    int64  int64_value = 7;
-    uint32 uint8_value = 8;
-    uint32 uint16_value = 9;
-    uint32 uint32_value = 10;
-    uint64 uint64_value = 11;
-    float  float32_value = 12;
-    double float64_value = 13;
-    // Literal Date32 value always has a unit of day
-    int32  date_32_value = 14;
-    ScalarTime32Value time32_value = 15;
-    ScalarListValue list_value = 17;
-    //WAS: ScalarType null_list_value = 18;
-
-    Decimal128 decimal128_value = 20;
-    int64 date_64_value = 21;
-    int32 interval_yearmonth_value = 24;
-    int64 interval_daytime_value = 25;
-    ScalarTimestampValue timestamp_value = 26;
-    ScalarDictionaryValue dictionary_value = 27;
-    bytes binary_value = 28;
-    bytes large_binary_value = 29;
-    ScalarTime64Value time64_value = 30;
-    IntervalMonthDayNanoValue interval_month_day_nano = 31;
-    StructValue struct_value = 32;
-    ScalarFixedSizeBinary fixed_size_binary_value = 34;
-  }
-}
-
-message Decimal128{
-  bytes value = 1;
-  int64 p = 2;
-  int64 s = 3;
-}
-
-// Serialized data type
-message ArrowType{
-  oneof arrow_type_enum {
-    EmptyMessage NONE = 1;     // arrow::Type::NA
-    EmptyMessage BOOL = 2;     // arrow::Type::BOOL
-    EmptyMessage UINT8 = 3;    // arrow::Type::UINT8
-    EmptyMessage INT8 = 4;     // arrow::Type::INT8
-    EmptyMessage UINT16 = 5;   // represents arrow::Type fields in src/arrow/type.h
-    EmptyMessage INT16 = 6;
-    EmptyMessage UINT32 = 7;
-    EmptyMessage INT32 = 8;
-    EmptyMessage UINT64 = 9;
-    EmptyMessage INT64 = 10 ;
-    EmptyMessage FLOAT16 = 11 ;
-    EmptyMessage FLOAT32 = 12 ;
-    EmptyMessage FLOAT64 = 13 ;
-    EmptyMessage UTF8 = 14 ;
-    EmptyMessage LARGE_UTF8 = 32;
-    EmptyMessage BINARY = 15 ;
-    int32 FIXED_SIZE_BINARY = 16 ;
-    EmptyMessage LARGE_BINARY = 31;
-    EmptyMessage DATE32 = 17 ;
-    EmptyMessage DATE64 = 18 ;
-    TimeUnit DURATION = 19;
-    Timestamp TIMESTAMP = 20 ;
-    TimeUnit TIME32 = 21 ;
-    TimeUnit TIME64 = 22 ;
-    IntervalUnit INTERVAL = 23 ;
-    Decimal DECIMAL = 24 ;
-    List LIST = 25;
-    List LARGE_LIST = 26;
-    FixedSizeList FIXED_SIZE_LIST = 27;
-    Struct STRUCT = 28;
-    Union UNION = 29;
-    Dictionary DICTIONARY = 30;
-  }
-}
-
-//Useful for representing an empty enum variant in rust
-// E.G. enum example{One, Two(i32)}
-// maps to
-// message example{
-//    oneof{
-//        EmptyMessage One = 1;
-//        i32 Two = 2;
-//   }
-//}
-message EmptyMessage{}
-
 message OptimizedLogicalPlanType {
   string optimizer_name = 1;
 }
@@ -918,12 +667,16 @@ message OptimizedPhysicalPlanType {
 
 message PlanType {
   oneof plan_type_enum {
-    EmptyMessage InitialLogicalPlan = 1;
+    datafusion_common.EmptyMessage InitialLogicalPlan = 1;
+    AnalyzedLogicalPlanType AnalyzedLogicalPlan = 7;
+    datafusion_common.EmptyMessage FinalAnalyzedLogicalPlan = 8;
     OptimizedLogicalPlanType OptimizedLogicalPlan = 2;
-    EmptyMessage FinalLogicalPlan = 3;
-    EmptyMessage InitialPhysicalPlan = 4;
+    datafusion_common.EmptyMessage FinalLogicalPlan = 3;
+    datafusion_common.EmptyMessage InitialPhysicalPlan = 4;
+    datafusion_common.EmptyMessage InitialPhysicalPlanWithStats = 9;
     OptimizedPhysicalPlanType OptimizedPhysicalPlan = 5;
-    EmptyMessage FinalPhysicalPlan = 6;
+    datafusion_common.EmptyMessage FinalPhysicalPlan = 6;
+    datafusion_common.EmptyMessage FinalPhysicalPlanWithStats = 10;
   }
 }
 
@@ -947,7 +700,7 @@ message FullTableReference {
   string table = 3;
 }
 
-message OwnedTableReference {
+message TableReference {
   oneof table_reference_enum {
     BareTableReference bare = 1;
     PartialTableReference partial = 2;
@@ -980,9 +733,70 @@ message PhysicalPlanNode {
     UnionExecNode union = 19;
     ExplainExecNode explain = 20;
     SortPreservingMergeExecNode sort_preserving_merge = 21;
+    NestedLoopJoinExecNode nested_loop_join = 22;
+    AnalyzeExecNode analyze = 23;
+    JsonSinkExecNode json_sink = 24;
+    SymmetricHashJoinExecNode symmetric_hash_join = 25;
+    InterleaveExecNode  interleave = 26;
+    PlaceholderRowExecNode placeholder_row = 27;
+    CsvSinkExecNode csv_sink = 28;
+    ParquetSinkExecNode parquet_sink = 29;
   }
 }
 
+message PartitionColumn {
+  string name = 1;
+  datafusion_common.ArrowType arrow_type = 2;
+}
+
+
+message FileSinkConfig {
+  reserved 6; // writer_mode
+
+  string object_store_url = 1;
+  repeated PartitionedFile file_groups = 2;
+  repeated string table_paths = 3;
+  datafusion_common.Schema output_schema = 4;
+  repeated PartitionColumn table_partition_cols = 5;
+  bool overwrite = 8;
+}
+
+message JsonSink {
+  FileSinkConfig config = 1;
+  datafusion_common.JsonWriterOptions writer_options = 2;
+}
+
+message JsonSinkExecNode {
+  PhysicalPlanNode input = 1;
+  JsonSink sink = 2;
+  datafusion_common.Schema sink_schema = 3;
+  PhysicalSortExprNodeCollection sort_order = 4;
+}
+
+message CsvSink {
+  FileSinkConfig config = 1;
+  datafusion_common.CsvWriterOptions writer_options = 2;
+}
+
+message CsvSinkExecNode {
+  PhysicalPlanNode input = 1;
+  CsvSink sink = 2;
+  datafusion_common.Schema sink_schema = 3;
+  PhysicalSortExprNodeCollection sort_order = 4;
+}
+
+message ParquetSink {
+  FileSinkConfig config = 1;
+  datafusion_common.TableParquetOptions parquet_options = 2;
+}
+
+message ParquetSinkExecNode {
+  PhysicalPlanNode input = 1;
+  ParquetSink sink = 2;
+  datafusion_common.Schema sink_schema = 3;
+  PhysicalSortExprNodeCollection sort_order = 4;
+}
+
 message PhysicalExtensionNode {
   bytes node = 1;
   repeated PhysicalPlanNode inputs = 2;
@@ -990,11 +804,14 @@ message PhysicalExtensionNode {
 
 // physical expressions
 message PhysicalExprNode {
+  // Was date_time_interval_expr
+  reserved 17;
+
   oneof ExprType {
     // column references
     PhysicalColumn column = 1;
 
-    ScalarValue literal = 2;
+    datafusion_common.ScalarValue literal = 2;
 
     // binary expressions
     PhysicalBinaryExprNode binary_expr = 3;
@@ -1012,15 +829,13 @@ message PhysicalExprNode {
     PhysicalSortExprNode sort = 10;
     PhysicalNegativeNode negative = 11;
     PhysicalInListNode in_list = 12;
-    PhysicalScalarFunctionNode scalar_function = 13;
+    //  was PhysicalScalarFunctionNode scalar_function = 13;
     PhysicalTryCastNode try_cast = 14;
-
     // window expressions
     PhysicalWindowExprNode window_expr = 15;
 
     PhysicalScalarUdfNode scalar_udf = 16;
-
-    PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17;
+    // was PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17;
 
     PhysicalLikeExprNode like_expr = 18;
   }
@@ -1029,12 +844,17 @@ message PhysicalExprNode {
 message PhysicalScalarUdfNode {
   string name = 1;
   repeated PhysicalExprNode args = 2;
-  ArrowType return_type = 4;
+  optional bytes fun_definition = 3;
+  datafusion_common.ArrowType return_type = 4;
 }
 
 message PhysicalAggregateExprNode {
-  AggregateFunction aggr_function = 1;
+  oneof AggregateFunction {
+    AggregateFunction aggr_function = 1;
+    string user_defined_aggr_function = 4;
+  }
   repeated PhysicalExprNode expr = 2;
+  repeated PhysicalSortExprNode ordering_req = 5;
   bool distinct = 3;
 }
 
@@ -1042,9 +862,13 @@ message PhysicalWindowExprNode {
   oneof window_function {
     AggregateFunction aggr_function = 1;
     BuiltInWindowFunction built_in_function = 2;
-    // udaf = 3
+    string user_defined_aggr_function = 3;
   }
-  PhysicalExprNode expr = 4;
+  repeated PhysicalExprNode args = 4;
+  repeated PhysicalExprNode partition_by = 5;
+  repeated PhysicalSortExprNode order_by = 6;
+  WindowFrame window_frame = 7;
+  string name = 8;
 }
 
 message PhysicalIsNull {
@@ -1106,21 +930,14 @@ message PhysicalCaseNode {
   PhysicalExprNode else_expr = 3;
 }
 
-message PhysicalScalarFunctionNode {
-  string name = 1;
-  ScalarFunction fun = 2;
-  repeated PhysicalExprNode args = 3;
-  ArrowType return_type = 4;
-}
-
 message PhysicalTryCastNode {
   PhysicalExprNode expr = 1;
-  ArrowType arrow_type = 2;
+  datafusion_common.ArrowType arrow_type = 2;
 }
 
 message PhysicalCastNode {
   PhysicalExprNode expr = 1;
-  ArrowType arrow_type = 2;
+  datafusion_common.ArrowType arrow_type = 2;
 }
 
 message PhysicalNegativeNode {
@@ -1130,6 +947,7 @@ message PhysicalNegativeNode {
 message FilterExecNode {
   PhysicalPlanNode input = 1;
   PhysicalExprNode expr = 2;
+  uint32 default_filter_selectivity = 3;
 }
 
 message FileGroup {
@@ -1141,29 +959,41 @@ message ScanLimit {
   uint32 limit = 1;
 }
 
+message PhysicalSortExprNodeCollection {
+  repeated PhysicalSortExprNode physical_sort_expr_nodes = 1;
+}
+
 message FileScanExecConf {
   // Was repeated ConfigOption options = 10;
   reserved 10;
 
   repeated FileGroup file_groups = 1;
-  Schema schema = 2;
+  datafusion_common.Schema schema = 2;
   repeated uint32 projection = 4;
   ScanLimit limit = 5;
-  Statistics statistics = 6;
+  datafusion_common.Statistics statistics = 6;
   repeated string table_partition_cols = 7;
   string object_store_url = 8;
-  repeated PhysicalSortExprNode output_ordering = 9;
+  repeated PhysicalSortExprNodeCollection output_ordering = 9;
 }
 
 message ParquetScanExecNode {
   FileScanExecConf base_conf = 1;
-  LogicalExprNode pruning_predicate = 2;
+
+  // Was pruning predicate based on a logical expr.
+  reserved 2;
+
+  PhysicalExprNode predicate = 3;
 }
 
 message CsvScanExecNode {
   FileScanExecConf base_conf = 1;
   bool has_header = 2;
   string delimiter = 3;
+  string quote = 4;
+  oneof optional_escape {
+    string escape = 5;
+  }
 }
 
 message AvroScanExecNode {
@@ -1180,10 +1010,32 @@ message HashJoinExecNode {
   PhysicalPlanNode left = 1;
   PhysicalPlanNode right = 2;
   repeated JoinOn on = 3;
-  JoinType join_type = 4;
+  datafusion_common.JoinType join_type = 4;
   PartitionMode partition_mode = 6;
   bool null_equals_null = 7;
   JoinFilter filter = 8;
+  repeated uint32 projection = 9;
+}
+
+enum StreamPartitionMode {
+  SINGLE_PARTITION = 0;
+  PARTITIONED_EXEC = 1;
+}
+
+message SymmetricHashJoinExecNode {
+  PhysicalPlanNode left = 1;
+  PhysicalPlanNode right = 2;
+  repeated JoinOn on = 3;
+  datafusion_common.JoinType join_type = 4;
+  StreamPartitionMode partition_mode = 6;
+  bool null_equals_null = 7;
+  JoinFilter filter = 8;
+  repeated PhysicalSortExprNode left_sort_exprs = 9;
+  repeated PhysicalSortExprNode right_sort_exprs = 10;
+}
+
+message InterleaveExecNode {
+  repeated PhysicalPlanNode inputs = 1;
 }
 
 message UnionExecNode {
@@ -1191,11 +1043,18 @@ message UnionExecNode {
 }
 
 message ExplainExecNode {
-  Schema schema = 1;
+  datafusion_common.Schema schema = 1;
   repeated StringifiedPlan stringified_plans = 2;
   bool verbose = 3;
 }
 
+message AnalyzeExecNode {
+  bool verbose = 1;
+  bool show_statistics = 2;
+  PhysicalPlanNode input = 3;
+  datafusion_common.Schema schema = 4;
+}
+
 message CrossJoinExecNode {
   PhysicalPlanNode left = 1;
   PhysicalPlanNode right = 2;
@@ -1207,13 +1066,16 @@ message PhysicalColumn {
 }
 
 message JoinOn {
-  PhysicalColumn left = 1;
-  PhysicalColumn right = 2;
+  PhysicalExprNode left = 1;
+  PhysicalExprNode right = 2;
 }
 
 message EmptyExecNode {
-  bool produce_one_row = 1;
-  Schema schema = 2;
+  datafusion_common.Schema schema = 1;
+}
+
+message PlaceholderRowExecNode {
+  datafusion_common.Schema schema = 1;
 }
 
 message ProjectionExecNode {
@@ -1226,13 +1088,37 @@ enum AggregateMode {
   PARTIAL = 0;
   FINAL = 1;
   FINAL_PARTITIONED = 2;
+  SINGLE = 3;
+  SINGLE_PARTITIONED = 4;
+}
+
+message PartiallySortedInputOrderMode {
+  repeated uint64 columns = 6;
 }
 
 message WindowAggExecNode {
   PhysicalPlanNode input = 1;
-  repeated PhysicalExprNode window_expr = 2;
-  repeated string window_expr_name = 3;
-  Schema input_schema = 4;
+  repeated PhysicalWindowExprNode window_expr = 2;
+  repeated PhysicalExprNode partition_keys = 5;
+  // Set optional to `None` for `BoundedWindowAggExec`.
+  oneof input_order_mode {
+    datafusion_common.EmptyMessage linear = 7;
+    PartiallySortedInputOrderMode partially_sorted = 8;
+    datafusion_common.EmptyMessage sorted = 9;
+  }
+}
+
+message MaybeFilter {
+  PhysicalExprNode expr = 1;
+}
+
+message MaybePhysicalSortExprs {
+  repeated PhysicalSortExprNode sort_expr = 1;
+}
+
+message AggLimit {
+  // wrap into a message to make it optional
+  uint64 limit = 1;
 }
 
 message AggregateExecNode {
@@ -1243,9 +1129,11 @@ message AggregateExecNode {
   repeated string group_expr_name = 5;
   repeated string aggr_expr_name = 6;
   // we need the input schema to the partial aggregate to pass to the final aggregate
-  Schema input_schema = 7;
+  datafusion_common.Schema input_schema = 7;
   repeated PhysicalExprNode null_expr = 8;
   repeated bool groups = 9;
+  repeated MaybeFilter filter_expr = 10;
+  AggLimit limit = 11;
 }
 
 message GlobalLimitExecNode {
@@ -1266,11 +1154,21 @@ message SortExecNode {
   repeated PhysicalExprNode expr = 2;
   // Maximum number of highest/lowest rows to fetch; negative means no limit
   int64 fetch = 3;
+  bool preserve_partitioning = 4;
 }
 
 message SortPreservingMergeExecNode {
   PhysicalPlanNode input = 1;
   repeated PhysicalExprNode expr = 2;
+  // Maximum number of highest/lowest rows to fetch; negative means no limit
+  int64 fetch = 3;
+}
+
+message NestedLoopJoinExecNode {
+  PhysicalPlanNode left = 1;
+  PhysicalPlanNode right = 2;
+  datafusion_common.JoinType join_type = 3;
+  JoinFilter filter = 4;
 }
 
 message CoalesceBatchesExecNode {
@@ -1289,35 +1187,40 @@ message PhysicalHashRepartition {
 
 message RepartitionExecNode{
   PhysicalPlanNode input = 1;
+  // oneof partition_method {
+  //   uint64 round_robin = 2;
+  //   PhysicalHashRepartition hash = 3;
+  //   uint64 unknown = 4;
+  // }
+  Partitioning partitioning = 5;
+}
+
+message Partitioning {
   oneof partition_method {
-    uint64 round_robin = 2;
-    PhysicalHashRepartition hash = 3;
-    uint64 unknown = 4;
+    uint64 round_robin = 1;
+    PhysicalHashRepartition hash = 2;
+    uint64 unknown = 3;
   }
 }
 
 message JoinFilter{
   PhysicalExprNode expression = 1;
   repeated ColumnIndex column_indices = 2;
-  Schema schema = 3;
+  datafusion_common.Schema schema = 3;
 }
 
 message ColumnIndex{
   uint32 index = 1;
-  JoinSide side = 2;
-}
-
-enum JoinSide{
-  LEFT_SIDE = 0;
-  RIGHT_SIDE = 1;
+  datafusion_common.JoinSide side = 2;
 }
 
 message PartitionedFile {
   string path = 1;
   uint64 size = 2;
   uint64 last_modified_ns = 3;
-  repeated ScalarValue partition_values = 4;
+  repeated datafusion_common.ScalarValue partition_values = 4;
   FileRange range = 5;
+  datafusion_common.Statistics statistics = 6;
 }
 
 message FileRange {
@@ -1329,19 +1232,5 @@ message PartitionStats {
   int64 num_rows = 1;
   int64 num_batches = 2;
   int64 num_bytes = 3;
-  repeated ColumnStats column_stats = 4;
+  repeated datafusion_common.ColumnStats column_stats = 4;
 }
-
-message Statistics {
-  int64 num_rows = 1;
-  int64 total_byte_size = 2;
-  repeated ColumnStats column_stats = 3;
-  bool is_exact = 4;
-}
-
-message ColumnStats {
-  ScalarValue min_value = 1;
-  ScalarValue max_value = 2;
-  uint32 null_count = 3;
-  uint32 distinct_count = 4;
-}
\ No newline at end of file
diff --git a/ballista/core/proto/datafusion_common.proto b/ballista/core/proto/datafusion_common.proto
new file mode 100644
index 00000000..d9ec7dbb
--- /dev/null
+++ b/ballista/core/proto/datafusion_common.proto
@@ -0,0 +1,541 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto3";
+
+package datafusion_common;
+
+message ColumnRelation {
+  string relation = 1;
+}
+
+message Column {
+  string name = 1;
+  ColumnRelation relation = 2;
+}
+
+message DfField{
+  Field field = 1;
+  ColumnRelation qualifier = 2;
+}
+
+message DfSchema {
+  repeated DfField columns = 1;
+  map<string, string> metadata = 2;
+}
+
+message CsvFormat {
+  CsvOptions options = 5;
+}
+
+message ParquetFormat {
+  // Used to be bool enable_pruning = 1;
+  reserved 1;
+  TableParquetOptions options = 2;
+}
+
+message AvroFormat {}
+
+message PrimaryKeyConstraint{
+  repeated uint64 indices = 1;
+}
+
+message UniqueConstraint{
+  repeated uint64 indices = 1;
+}
+
+message Constraint{
+  oneof constraint_mode{
+    PrimaryKeyConstraint primary_key = 1;
+    UniqueConstraint unique = 2;
+  }
+}
+
+message Constraints{
+  repeated Constraint constraints = 1;
+}
+
+enum JoinType {
+  INNER = 0;
+  LEFT = 1;
+  RIGHT = 2;
+  FULL = 3;
+  LEFTSEMI = 4;
+  LEFTANTI = 5;
+  RIGHTSEMI = 6;
+  RIGHTANTI = 7;
+}
+
+enum JoinConstraint {
+  ON = 0;
+  USING = 1;
+}
+
+message AvroOptions {}
+message ArrowOptions {}
+
+message Schema {
+  repeated Field columns = 1;
+  map<string, string> metadata = 2;
+}
+
+message Field {
+  // name of the field
+  string name = 1;
+  ArrowType arrow_type = 2;
+  bool nullable = 3;
+  // for complex data types like structs, unions
+  repeated Field children = 4;
+  map<string, string> metadata = 5;
+  int64 dict_id = 6;
+  bool dict_ordered = 7;
+}
+
+message Timestamp{
+  TimeUnit time_unit = 1;
+  string timezone = 2;
+}
+
+enum TimeUnit{
+  Second = 0;
+  Millisecond = 1;
+  Microsecond = 2;
+  Nanosecond = 3;
+}
+
+enum IntervalUnit{
+  YearMonth = 0;
+  DayTime = 1;
+  MonthDayNano = 2;
+}
+
+message Decimal{
+  reserved 1, 2;
+  uint32 precision = 3;
+  int32 scale = 4;
+}
+
+message List{
+  Field field_type = 1;
+}
+
+message FixedSizeList{
+  Field field_type = 1;
+  int32 list_size = 2;
+}
+
+message Dictionary{
+  ArrowType key = 1;
+  ArrowType value = 2;
+}
+
+message Struct{
+  repeated Field sub_field_types = 1;
+}
+
+message Map {
+  Field field_type = 1;
+  bool keys_sorted = 2;
+}
+
+enum UnionMode{
+  sparse = 0;
+  dense = 1;
+}
+
+message Union{
+  repeated Field union_types = 1;
+  UnionMode union_mode = 2;
+  repeated int32 type_ids = 3;
+}
+
+// Used for List/FixedSizeList/LargeList/Struct
+message ScalarNestedValue {
+  message Dictionary {
+    bytes ipc_message = 1;
+    bytes arrow_data = 2;
+  }
+
+  bytes ipc_message = 1;
+  bytes arrow_data = 2;
+  Schema schema = 3;
+  repeated Dictionary dictionaries = 4;
+}
+
+message ScalarTime32Value {
+  oneof value {
+    int32 time32_second_value = 1;
+    int32 time32_millisecond_value = 2;
+  };
+}
+
+message ScalarTime64Value {
+  oneof value {
+    int64 time64_microsecond_value = 1;
+    int64 time64_nanosecond_value = 2;
+  };
+}
+
+message ScalarTimestampValue {
+  oneof value {
+    int64 time_microsecond_value = 1;
+    int64 time_nanosecond_value = 2;
+    int64 time_second_value = 3;
+    int64 time_millisecond_value = 4;
+  };
+  string timezone = 5;
+}
+
+message ScalarDictionaryValue {
+  ArrowType index_type = 1;
+  ScalarValue value = 2;
+}
+
+message IntervalDayTimeValue {
+  int32 days = 1;
+  int32 milliseconds = 2;
+}
+
+message IntervalMonthDayNanoValue {
+  int32 months = 1;
+  int32 days = 2;
+  int64 nanos = 3;
+}
+
+message UnionField {
+  int32 field_id = 1;
+  Field field = 2;
+}
+
+message UnionValue {
+  // Note that a null union value must have one or more fields, so we
+  // encode a null UnionValue as one with value_id == 128
+  int32 value_id = 1;
+  ScalarValue value = 2;
+  repeated UnionField fields = 3;
+  UnionMode mode = 4;
+}
+
+message ScalarFixedSizeBinary{
+  bytes values = 1;
+  int32 length = 2;
+}
+
+message ScalarValue{
+  // was PrimitiveScalarType null_value = 19;
+  reserved 19;
+
+  oneof value {
+    // was PrimitiveScalarType null_value = 19;
+    // Null value of any type
+    ArrowType null_value = 33;
+
+    bool   bool_value = 1;
+    string utf8_value = 2;
+    string large_utf8_value = 3;
+    int32  int8_value = 4;
+    int32  int16_value = 5;
+    int32  int32_value = 6;
+    int64  int64_value = 7;
+    uint32 uint8_value = 8;
+    uint32 uint16_value = 9;
+    uint32 uint32_value = 10;
+    uint64 uint64_value = 11;
+    float  float32_value = 12;
+    double float64_value = 13;
+    // Literal Date32 value always has a unit of day
+    int32  date_32_value = 14;
+    ScalarTime32Value time32_value = 15;
+    ScalarNestedValue large_list_value = 16;
+    ScalarNestedValue list_value = 17;
+    ScalarNestedValue fixed_size_list_value = 18;
+    ScalarNestedValue struct_value = 32;
+
+    Decimal128 decimal128_value = 20;
+    Decimal256 decimal256_value = 39;
+
+    int64 date_64_value = 21;
+    int32 interval_yearmonth_value = 24;
+
+    int64 duration_second_value = 35;
+    int64 duration_millisecond_value = 36;
+    int64 duration_microsecond_value = 37;
+    int64 duration_nanosecond_value = 38;
+
+    ScalarTimestampValue timestamp_value = 26;
+    ScalarDictionaryValue dictionary_value = 27;
+    bytes binary_value = 28;
+    bytes large_binary_value = 29;
+    ScalarTime64Value time64_value = 30;
+    IntervalDayTimeValue interval_daytime_value = 25;
+    IntervalMonthDayNanoValue interval_month_day_nano = 31;
+    ScalarFixedSizeBinary fixed_size_binary_value = 34;
+    UnionValue union_value = 42;
+  }
+}
+
+message Decimal128{
+  bytes value = 1;
+  int64 p = 2;
+  int64 s = 3;
+}
+
+message Decimal256{
+  bytes value = 1;
+  int64 p = 2;
+  int64 s = 3;
+}
+
+// Serialized data type
+message ArrowType{
+  oneof arrow_type_enum {
+    EmptyMessage NONE = 1;     // arrow::Type::NA
+    EmptyMessage BOOL = 2;     // arrow::Type::BOOL
+    EmptyMessage UINT8 = 3;    // arrow::Type::UINT8
+    EmptyMessage INT8 = 4;     // arrow::Type::INT8
+    EmptyMessage UINT16 = 5;   // represents arrow::Type fields in src/arrow/type.h
+    EmptyMessage INT16 = 6;
+    EmptyMessage UINT32 = 7;
+    EmptyMessage INT32 = 8;
+    EmptyMessage UINT64 = 9;
+    EmptyMessage INT64 = 10 ;
+    EmptyMessage FLOAT16 = 11 ;
+    EmptyMessage FLOAT32 = 12 ;
+    EmptyMessage FLOAT64 = 13 ;
+    EmptyMessage UTF8 = 14 ;
+    EmptyMessage LARGE_UTF8 = 32;
+    EmptyMessage BINARY = 15 ;
+    int32 FIXED_SIZE_BINARY = 16 ;
+    EmptyMessage LARGE_BINARY = 31;
+    EmptyMessage DATE32 = 17 ;
+    EmptyMessage DATE64 = 18 ;
+    TimeUnit DURATION = 19;
+    Timestamp TIMESTAMP = 20 ;
+    TimeUnit TIME32 = 21 ;
+    TimeUnit TIME64 = 22 ;
+    IntervalUnit INTERVAL = 23 ;
+    Decimal DECIMAL = 24 ;
+    List LIST = 25;
+    List LARGE_LIST = 26;
+    FixedSizeList FIXED_SIZE_LIST = 27;
+    Struct STRUCT = 28;
+    Union UNION = 29;
+    Dictionary DICTIONARY = 30;
+    Map MAP = 33;
+  }
+}
+
+//Useful for representing an empty enum variant in rust
+// E.G. enum example{One, Two(i32)}
+// maps to
+// message example{
+//    oneof{
+//        EmptyMessage One = 1;
+//        i32 Two = 2;
+//   }
+//}
+message EmptyMessage{}
+
+enum CompressionTypeVariant {
+  GZIP = 0;
+  BZIP2 = 1;
+  XZ = 2;
+  ZSTD = 3;
+  UNCOMPRESSED = 4;
+}
+
+message JsonWriterOptions {
+  CompressionTypeVariant compression = 1;
+}
+
+
+message CsvWriterOptions {
+  // Compression type
+  CompressionTypeVariant compression = 1;
+  // Optional column delimiter. Defaults to `b','`
+  string delimiter = 2;
+  // Whether to write column names as file headers. Defaults to `true`
+  bool has_header = 3;
+  // Optional date format for date arrays
+  string date_format = 4;
+  // Optional datetime format for datetime arrays
+  string datetime_format = 5;
+  // Optional timestamp format for timestamp arrays
+  string timestamp_format = 6;
+  // Optional time format for time arrays
+  string time_format = 7;
+  // Optional value to represent null
+  string null_value = 8;
+}
+
+// Options controlling CSV format
+message CsvOptions {
+  bytes has_header = 1; // Indicates if the CSV has a header row
+  bytes delimiter = 2; // Delimiter character as a byte
+  bytes quote = 3; // Quote character as a byte
+  bytes escape = 4; // Optional escape character as a byte
+  CompressionTypeVariant compression = 5; // Compression type
+  uint64 schema_infer_max_rec = 6; // Max records for schema inference
+  string date_format = 7; // Optional date format
+  string datetime_format = 8; // Optional datetime format
+  string timestamp_format = 9; // Optional timestamp format
+  string timestamp_tz_format = 10; // Optional timestamp with timezone format
+  string time_format = 11; // Optional time format
+  string null_value = 12; // Optional representation of null value
+}
+
+// Options controlling JSON format
+message JsonOptions {
+  CompressionTypeVariant compression = 1; // Compression type
+  uint64 schema_infer_max_rec = 2; // Max records for schema inference
+}
+
+message TableParquetOptions {
+  ParquetOptions global = 1;
+  repeated ColumnSpecificOptions column_specific_options = 2;
+}
+
+message ColumnSpecificOptions {
+  string column_name = 1;
+  ColumnOptions options = 2;
+}
+
+message ColumnOptions {
+  oneof bloom_filter_enabled_opt {
+    bool bloom_filter_enabled = 1;
+  }
+
+  oneof encoding_opt {
+    string encoding = 2;
+  }
+
+  oneof dictionary_enabled_opt {
+    bool dictionary_enabled = 3;
+  }
+
+  oneof compression_opt {
+    string compression = 4;
+  }
+
+  oneof statistics_enabled_opt {
+    string statistics_enabled = 5;
+  }
+
+  oneof bloom_filter_fpp_opt {
+    double bloom_filter_fpp = 6;
+  }
+
+  oneof bloom_filter_ndv_opt {
+    uint64 bloom_filter_ndv = 7;
+  }
+
+  oneof max_statistics_size_opt {
+    uint32 max_statistics_size = 8;
+  }
+}
+
+message ParquetOptions {
+  // Regular fields
+  bool enable_page_index = 1; // default = true
+  bool pruning = 2; // default = true
+  bool skip_metadata = 3; // default = true
+  bool pushdown_filters = 5; // default = false
+  bool reorder_filters = 6; // default = false
+  uint64 data_pagesize_limit = 7; // default = 1024 * 1024
+  uint64 write_batch_size = 8; // default = 1024
+  string writer_version = 9; // default = "1.0"
+  // bool bloom_filter_enabled = 20; // default = false
+  bool allow_single_file_parallelism = 23; // default = true
+  uint64 maximum_parallel_row_group_writers = 24; // default = 1
+  uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2
+  bool bloom_filter_on_read = 26; // default = true
+  bool bloom_filter_on_write = 27; // default = false
+
+  oneof metadata_size_hint_opt {
+    uint64 metadata_size_hint = 4;
+  }
+
+  oneof compression_opt {
+    string compression = 10;
+  }
+
+  oneof dictionary_enabled_opt {
+    bool dictionary_enabled = 11;
+  }
+
+  oneof statistics_enabled_opt {
+    string statistics_enabled = 13;
+  }
+
+  oneof max_statistics_size_opt {
+    uint64 max_statistics_size = 14;
+  }
+
+  oneof column_index_truncate_length_opt {
+    uint64 column_index_truncate_length = 17;
+  }
+
+  oneof encoding_opt {
+    string encoding = 19;
+  }
+
+  oneof bloom_filter_fpp_opt {
+    double bloom_filter_fpp = 21;
+  }
+
+  oneof bloom_filter_ndv_opt {
+    uint64 bloom_filter_ndv = 22;
+  }
+
+  uint64 dictionary_page_size_limit = 12;
+
+  uint64 data_page_row_count_limit = 18;
+
+  uint64 max_row_group_size = 15;
+
+  string created_by = 16;
+}
+
+enum JoinSide{
+  LEFT_SIDE = 0;
+  RIGHT_SIDE = 1;
+}
+
+message Precision{
+  PrecisionInfo precision_info = 1;
+  ScalarValue val = 2;
+}
+
+enum PrecisionInfo {
+  EXACT = 0;
+  INEXACT = 1;
+  ABSENT = 2;
+}
+
+message Statistics {
+  Precision num_rows = 1;
+  Precision total_byte_size = 2;
+  repeated ColumnStats column_stats = 3;
+}
+
+message ColumnStats {
+  Precision min_value = 1;
+  Precision max_value = 2;
+  Precision null_count = 3;
+  Precision distinct_count = 4;
+}
diff --git a/ballista/core/src/serde/generated/ballista.rs b/ballista/core/src/serde/generated/ballista.rs
index 67714873..c45bdeaf 100644
--- a/ballista/core/src/serde/generated/ballista.rs
+++ b/ballista/core/src/serde/generated/ballista.rs
@@ -45,7 +45,7 @@ pub struct UnresolvedShuffleExecNode {
     #[prost(uint32, tag = "1")]
     pub stage_id: u32,
     #[prost(message, optional, tag = "2")]
-    pub schema: ::core::option::Option<::datafusion_proto::protobuf::Schema>,
+    pub schema: ::core::option::Option<::datafusion_proto_common::Schema>,
     #[prost(uint32, tag = "4")]
     pub output_partition_count: u32,
 }
@@ -55,7 +55,7 @@ pub struct ShuffleReaderExecNode {
     #[prost(message, repeated, tag = "1")]
     pub partition: ::prost::alloc::vec::Vec<ShuffleReaderPartition>,
     #[prost(message, optional, tag = "2")]
-    pub schema: ::core::option::Option<::datafusion_proto::protobuf::Schema>,
+    pub schema: ::core::option::Option<::datafusion_proto_common::Schema>,
     /// The stage to read from
     #[prost(uint32, tag = "3")]
     pub stage_id: u32,
@@ -379,9 +379,9 @@ pub struct PartitionStats {
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ColumnStats {
     #[prost(message, optional, tag = "1")]
-    pub min_value: ::core::option::Option<::datafusion_proto::protobuf::ScalarValue>,
+    pub min_value: ::core::option::Option<::datafusion_proto_common::ScalarValue>,
     #[prost(message, optional, tag = "2")]
-    pub max_value: ::core::option::Option<::datafusion_proto::protobuf::ScalarValue>,
+    pub max_value: ::core::option::Option<::datafusion_proto_common::ScalarValue>,
     #[prost(uint32, tag = "3")]
     pub null_count: u32,
     #[prost(uint32, tag = "4")]
@@ -1083,7 +1083,7 @@ pub struct GetFileMetadataParams {
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct GetFileMetadataResult {
     #[prost(message, optional, tag = "1")]
-    pub schema: ::core::option::Option<::datafusion_proto::protobuf::Schema>,
+    pub schema: ::core::option::Option<::datafusion_proto_common::Schema>,
 }
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
diff --git a/dev/docker/ballista-builder.Dockerfile b/dev/docker/ballista-builder.Dockerfile
index abcd14f2..aa9a6fa5 100644
--- a/dev/docker/ballista-builder.Dockerfile
+++ b/dev/docker/ballista-builder.Dockerfile
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-FROM rust:1.74.1-buster
+FROM rust:1.81-bullseye
 
 ARG EXT_UID
 
@@ -31,7 +31,7 @@ RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
     apt-get install -y nodejs && \
     npm install -g yarn
 
-# create build user with same UID as 
+# create build user with same UID as
 RUN adduser -q -u $EXT_UID builder --home /home/builder && \
     mkdir -p /home/builder/workspace
 USER builder


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to