This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 0f1c515179e [SPARK-41345][CONNECT] Add Hint to Connect Proto 0f1c515179e is described below commit 0f1c515179e5ed34aca27c51f500c26ca19cc748 Author: Rui Wang <rui.w...@databricks.com> AuthorDate: Thu Dec 1 19:07:33 2022 +0800 [SPARK-41345][CONNECT] Add Hint to Connect Proto ### What changes were proposed in this pull request? This PR adds `Hint` to Connect proto. The Hint can technically accept a parameter of `ANY` type; however, in this version of the proto implementation, it is limited to supporting only literals. For comparison, the PySpark hint API supports a few parameter types (str, list, float, int). ### Why are the changes needed? Proto API completeness. ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? N/A Closes #38857 from amaliujia/hint. Authored-by: Rui Wang <rui.w...@databricks.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- .../main/protobuf/spark/connect/relations.proto | 17 ++ python/pyspark/sql/connect/proto/relations_pb2.py | 174 +++++++++++---------- python/pyspark/sql/connect/proto/relations_pb2.pyi | 56 +++++++ 3 files changed, 167 insertions(+), 80 deletions(-) diff --git a/connector/connect/src/main/protobuf/spark/connect/relations.proto b/connector/connect/src/main/protobuf/spark/connect/relations.proto index f4df95fdd73..bb891d63670 100644 --- a/connector/connect/src/main/protobuf/spark/connect/relations.proto +++ b/connector/connect/src/main/protobuf/spark/connect/relations.proto @@ -53,6 +53,7 @@ message Relation { Drop drop = 21; Tail tail = 22; WithColumns with_columns = 23; + Hint hint = 24; // NA functions NAFill fill_na = 90; @@ -541,3 +542,19 @@ message WithColumns { // An exception is thrown when duplicated names are present in the mapping. repeated Expression.Alias name_expr_list = 2; } + +// Specify a hint over a relation. Hint should have a name and optional parameters. +message Hint { + // (Required) The input relation. 
+ Relation input = 1; + + // (Required) Hint name. + // + // Supported Join hints include BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL. + // + // Supported partitioning hints include COALESCE, REPARTITION, REPARTITION_BY_RANGE. + string name = 2; + + // (Optional) Hint parameters. + repeated Expression.Literal parameters = 3; +} diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py index 7c4cbf17b85..47be485b546 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.py +++ b/python/pyspark/sql/connect/proto/relations_pb2.py @@ -33,7 +33,7 @@ from pyspark.sql.connect.proto import expressions_pb2 as spark_dot_connect_dot_e DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xa5\r\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\ [...] + b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xd0\r\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\ [...] 
) @@ -75,6 +75,7 @@ _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY = ( _RENAMECOLUMNSBYNAMETONAMEMAP.nested_types_by_name["RenameColumnsMapEntry"] ) _WITHCOLUMNS = DESCRIPTOR.message_types_by_name["WithColumns"] +_HINT = DESCRIPTOR.message_types_by_name["Hint"] _JOIN_JOINTYPE = _JOIN.enum_types_by_name["JoinType"] _SETOPERATION_SETOPTYPE = _SETOPERATION.enum_types_by_name["SetOpType"] _SORT_SORTDIRECTION = _SORT.enum_types_by_name["SortDirection"] @@ -469,6 +470,17 @@ WithColumns = _reflection.GeneratedProtocolMessageType( ) _sym_db.RegisterMessage(WithColumns) +Hint = _reflection.GeneratedProtocolMessageType( + "Hint", + (_message.Message,), + { + "DESCRIPTOR": _HINT, + "__module__": "spark.connect.relations_pb2" + # @@protoc_insertion_point(class_scope:spark.connect.Hint) + }, +) +_sym_db.RegisterMessage(Hint) + if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None @@ -478,83 +490,85 @@ if _descriptor._USE_C_DESCRIPTORS == False: _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._options = None _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_options = b"8\001" _RELATION._serialized_start = 82 - _RELATION._serialized_end = 1783 - _UNKNOWN._serialized_start = 1785 - _UNKNOWN._serialized_end = 1794 - _RELATIONCOMMON._serialized_start = 1796 - _RELATIONCOMMON._serialized_end = 1845 - _SQL._serialized_start = 1847 - _SQL._serialized_end = 1874 - _READ._serialized_start = 1877 - _READ._serialized_end = 2303 - _READ_NAMEDTABLE._serialized_start = 2019 - _READ_NAMEDTABLE._serialized_end = 2080 - _READ_DATASOURCE._serialized_start = 2083 - _READ_DATASOURCE._serialized_end = 2290 - _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 2221 - _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2279 - _PROJECT._serialized_start = 2305 - _PROJECT._serialized_end = 2422 - _FILTER._serialized_start = 2424 - _FILTER._serialized_end = 2536 - _JOIN._serialized_start = 2539 - _JOIN._serialized_end = 3010 - _JOIN_JOINTYPE._serialized_start = 2802 - 
_JOIN_JOINTYPE._serialized_end = 3010 - _SETOPERATION._serialized_start = 3013 - _SETOPERATION._serialized_end = 3409 - _SETOPERATION_SETOPTYPE._serialized_start = 3272 - _SETOPERATION_SETOPTYPE._serialized_end = 3386 - _LIMIT._serialized_start = 3411 - _LIMIT._serialized_end = 3487 - _OFFSET._serialized_start = 3489 - _OFFSET._serialized_end = 3568 - _TAIL._serialized_start = 3570 - _TAIL._serialized_end = 3645 - _AGGREGATE._serialized_start = 3648 - _AGGREGATE._serialized_end = 3858 - _SORT._serialized_start = 3861 - _SORT._serialized_end = 4411 - _SORT_SORTFIELD._serialized_start = 4015 - _SORT_SORTFIELD._serialized_end = 4203 - _SORT_SORTDIRECTION._serialized_start = 4205 - _SORT_SORTDIRECTION._serialized_end = 4313 - _SORT_SORTNULLS._serialized_start = 4315 - _SORT_SORTNULLS._serialized_end = 4397 - _DROP._serialized_start = 4413 - _DROP._serialized_end = 4513 - _DEDUPLICATE._serialized_start = 4516 - _DEDUPLICATE._serialized_end = 4687 - _LOCALRELATION._serialized_start = 4689 - _LOCALRELATION._serialized_end = 4724 - _SAMPLE._serialized_start = 4727 - _SAMPLE._serialized_end = 4951 - _RANGE._serialized_start = 4954 - _RANGE._serialized_end = 5099 - _SUBQUERYALIAS._serialized_start = 5101 - _SUBQUERYALIAS._serialized_end = 5215 - _REPARTITION._serialized_start = 5218 - _REPARTITION._serialized_end = 5360 - _SHOWSTRING._serialized_start = 5363 - _SHOWSTRING._serialized_end = 5504 - _STATSUMMARY._serialized_start = 5506 - _STATSUMMARY._serialized_end = 5598 - _STATCROSSTAB._serialized_start = 5600 - _STATCROSSTAB._serialized_end = 5701 - _NAFILL._serialized_start = 5704 - _NAFILL._serialized_end = 5838 - _NADROP._serialized_start = 5841 - _NADROP._serialized_end = 5975 - _NAREPLACE._serialized_start = 5978 - _NAREPLACE._serialized_end = 6274 - _NAREPLACE_REPLACEMENT._serialized_start = 6133 - _NAREPLACE_REPLACEMENT._serialized_end = 6274 - _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 6276 - _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 6390 - 
_RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 6393 - _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 6652 - _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 6585 - _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 6652 - _WITHCOLUMNS._serialized_start = 6655 - _WITHCOLUMNS._serialized_end = 6786 + _RELATION._serialized_end = 1826 + _UNKNOWN._serialized_start = 1828 + _UNKNOWN._serialized_end = 1837 + _RELATIONCOMMON._serialized_start = 1839 + _RELATIONCOMMON._serialized_end = 1888 + _SQL._serialized_start = 1890 + _SQL._serialized_end = 1917 + _READ._serialized_start = 1920 + _READ._serialized_end = 2346 + _READ_NAMEDTABLE._serialized_start = 2062 + _READ_NAMEDTABLE._serialized_end = 2123 + _READ_DATASOURCE._serialized_start = 2126 + _READ_DATASOURCE._serialized_end = 2333 + _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 2264 + _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2322 + _PROJECT._serialized_start = 2348 + _PROJECT._serialized_end = 2465 + _FILTER._serialized_start = 2467 + _FILTER._serialized_end = 2579 + _JOIN._serialized_start = 2582 + _JOIN._serialized_end = 3053 + _JOIN_JOINTYPE._serialized_start = 2845 + _JOIN_JOINTYPE._serialized_end = 3053 + _SETOPERATION._serialized_start = 3056 + _SETOPERATION._serialized_end = 3452 + _SETOPERATION_SETOPTYPE._serialized_start = 3315 + _SETOPERATION_SETOPTYPE._serialized_end = 3429 + _LIMIT._serialized_start = 3454 + _LIMIT._serialized_end = 3530 + _OFFSET._serialized_start = 3532 + _OFFSET._serialized_end = 3611 + _TAIL._serialized_start = 3613 + _TAIL._serialized_end = 3688 + _AGGREGATE._serialized_start = 3691 + _AGGREGATE._serialized_end = 3901 + _SORT._serialized_start = 3904 + _SORT._serialized_end = 4454 + _SORT_SORTFIELD._serialized_start = 4058 + _SORT_SORTFIELD._serialized_end = 4246 + _SORT_SORTDIRECTION._serialized_start = 4248 + _SORT_SORTDIRECTION._serialized_end = 4356 + _SORT_SORTNULLS._serialized_start = 4358 + _SORT_SORTNULLS._serialized_end = 
4440 + _DROP._serialized_start = 4456 + _DROP._serialized_end = 4556 + _DEDUPLICATE._serialized_start = 4559 + _DEDUPLICATE._serialized_end = 4730 + _LOCALRELATION._serialized_start = 4732 + _LOCALRELATION._serialized_end = 4767 + _SAMPLE._serialized_start = 4770 + _SAMPLE._serialized_end = 4994 + _RANGE._serialized_start = 4997 + _RANGE._serialized_end = 5142 + _SUBQUERYALIAS._serialized_start = 5144 + _SUBQUERYALIAS._serialized_end = 5258 + _REPARTITION._serialized_start = 5261 + _REPARTITION._serialized_end = 5403 + _SHOWSTRING._serialized_start = 5406 + _SHOWSTRING._serialized_end = 5547 + _STATSUMMARY._serialized_start = 5549 + _STATSUMMARY._serialized_end = 5641 + _STATCROSSTAB._serialized_start = 5643 + _STATCROSSTAB._serialized_end = 5744 + _NAFILL._serialized_start = 5747 + _NAFILL._serialized_end = 5881 + _NADROP._serialized_start = 5884 + _NADROP._serialized_end = 6018 + _NAREPLACE._serialized_start = 6021 + _NAREPLACE._serialized_end = 6317 + _NAREPLACE_REPLACEMENT._serialized_start = 6176 + _NAREPLACE_REPLACEMENT._serialized_end = 6317 + _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 6319 + _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 6433 + _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 6436 + _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 6695 + _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 6628 + _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 6695 + _WITHCOLUMNS._serialized_start = 6698 + _WITHCOLUMNS._serialized_end = 6829 + _HINT._serialized_start = 6832 + _HINT._serialized_end = 6972 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index dbc69a48d88..996afbedcdc 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -82,6 +82,7 @@ class Relation(google.protobuf.message.Message): DROP_FIELD_NUMBER: builtins.int 
TAIL_FIELD_NUMBER: builtins.int WITH_COLUMNS_FIELD_NUMBER: builtins.int + HINT_FIELD_NUMBER: builtins.int FILL_NA_FIELD_NUMBER: builtins.int DROP_NA_FIELD_NUMBER: builtins.int REPLACE_FIELD_NUMBER: builtins.int @@ -135,6 +136,8 @@ class Relation(google.protobuf.message.Message): @property def with_columns(self) -> global___WithColumns: ... @property + def hint(self) -> global___Hint: ... + @property def fill_na(self) -> global___NAFill: """NA functions""" @property @@ -174,6 +177,7 @@ class Relation(google.protobuf.message.Message): drop: global___Drop | None = ..., tail: global___Tail | None = ..., with_columns: global___WithColumns | None = ..., + hint: global___Hint | None = ..., fill_na: global___NAFill | None = ..., drop_na: global___NADrop | None = ..., replace: global___NAReplace | None = ..., @@ -200,6 +204,8 @@ class Relation(google.protobuf.message.Message): b"fill_na", "filter", b"filter", + "hint", + b"hint", "join", b"join", "limit", @@ -265,6 +271,8 @@ class Relation(google.protobuf.message.Message): b"fill_na", "filter", b"filter", + "hint", + b"hint", "join", b"join", "limit", @@ -336,6 +344,7 @@ class Relation(google.protobuf.message.Message): "drop", "tail", "with_columns", + "hint", "fill_na", "drop_na", "replace", @@ -1865,3 +1874,50 @@ class WithColumns(google.protobuf.message.Message): ) -> None: ... global___WithColumns = WithColumns + +class Hint(google.protobuf.message.Message): + """Specify a hint over a relation. Hint should have a name and optional parameters.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + INPUT_FIELD_NUMBER: builtins.int + NAME_FIELD_NUMBER: builtins.int + PARAMETERS_FIELD_NUMBER: builtins.int + @property + def input(self) -> global___Relation: + """(Required) The input relation.""" + name: builtins.str + """(Required) Hint name. + + Supported Join hints include BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL. + + Supported partitioning hints include COALESCE, REPARTITION, REPARTITION_BY_RANGE. 
+ """ + @property + def parameters( + self, + ) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[ + pyspark.sql.connect.proto.expressions_pb2.Expression.Literal + ]: + """(Optional) Hint parameters.""" + def __init__( + self, + *, + input: global___Relation | None = ..., + name: builtins.str = ..., + parameters: collections.abc.Iterable[ + pyspark.sql.connect.proto.expressions_pb2.Expression.Literal + ] + | None = ..., + ) -> None: ... + def HasField( + self, field_name: typing_extensions.Literal["input", b"input"] + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "input", b"input", "name", b"name", "parameters", b"parameters" + ], + ) -> None: ... + +global___Hint = Hint --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org