This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 012d19d8e9b2 [SPARK-48227][PYTHON][DOC] Document the requirement of seed in protos 012d19d8e9b2 is described below commit 012d19d8e9b28f7ce266753bcfff4a76c9510245 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Thu May 9 16:58:44 2024 -0700 [SPARK-48227][PYTHON][DOC] Document the requirement of seed in protos ### What changes were proposed in this pull request? Document the requirement of seed in protos ### Why are the changes needed? The seed should be set at the client side; document it to avoid cases like https://github.com/apache/spark/pull/46456 ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #46518 from zhengruifeng/doc_random. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../common/src/main/protobuf/spark/connect/relations.proto | 8 ++++++-- python/pyspark/sql/connect/plan.py | 10 ++++------ python/pyspark/sql/connect/proto/relations_pb2.pyi | 10 ++++++++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto index 3882b2e85396..0b3c9d4253e8 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto @@ -467,7 +467,9 @@ message Sample { // (Optional) Whether to sample with replacement. optional bool with_replacement = 4; - // (Optional) The random seed. + // (Required) The random seed. + // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details); + // however, it is still kept 'optional' here for backward compatibility. 
optional int64 seed = 5; // (Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it. @@ -687,7 +689,9 @@ message StatSampleBy { // If a stratum is not specified, we treat its fraction as zero. repeated Fraction fractions = 3; - // (Optional) The random seed. + // (Required) The random seed. + // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details); + // however, it is still kept 'optional' here for backward compatibility. optional int64 seed = 5; message Fraction { diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index 4ac4946745f5..3d3303fb15c5 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -717,7 +717,7 @@ class Sample(LogicalPlan): lower_bound: float, upper_bound: float, with_replacement: bool, - seed: Optional[int], + seed: int, deterministic_order: bool = False, ) -> None: super().__init__(child) @@ -734,8 +734,7 @@ class Sample(LogicalPlan): plan.sample.lower_bound = self.lower_bound plan.sample.upper_bound = self.upper_bound plan.sample.with_replacement = self.with_replacement - if self.seed is not None: - plan.sample.seed = self.seed + plan.sample.seed = self.seed plan.sample.deterministic_order = self.deterministic_order return plan @@ -1526,7 +1525,7 @@ class StatSampleBy(LogicalPlan): child: Optional["LogicalPlan"], col: Column, fractions: Sequence[Tuple[Column, float]], - seed: Optional[int], + seed: int, ) -> None: super().__init__(child) @@ -1554,8 +1553,7 @@ class StatSampleBy(LogicalPlan): fraction.stratum.CopyFrom(k.to_plan(session).literal) fraction.fraction = float(v) plan.sample_by.fractions.append(fraction) - if self._seed is not None: - plan.sample_by.seed = self._seed + plan.sample_by.seed = self._seed return plan diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index 5dfb47da67a9..9b6f4b43544f --- 
a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -1865,7 +1865,10 @@ class Sample(google.protobuf.message.Message): with_replacement: builtins.bool """(Optional) Whether to sample with replacement.""" seed: builtins.int - """(Optional) The random seed.""" + """(Required) The random seed. + This field is required to avoid generating mutable dataframes (see SPARK-48184 for details); + however, it is still kept 'optional' here for backward compatibility. + """ deterministic_order: builtins.bool """(Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it. This flag is true when invoking `dataframe.randomSplit` to randomly splits DataFrame with the @@ -2545,7 +2548,10 @@ class StatSampleBy(google.protobuf.message.Message): If a stratum is not specified, we treat its fraction as zero. """ seed: builtins.int - """(Optional) The random seed.""" + """(Required) The random seed. + This field is required to avoid generating mutable dataframes (see SPARK-48184 for details); + however, it is still kept 'optional' here for backward compatibility. + """ def __init__( self, *, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org