This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 012d19d8e9b2 [SPARK-48227][PYTHON][DOC] Document the requirement of 
seed in protos
012d19d8e9b2 is described below

commit 012d19d8e9b28f7ce266753bcfff4a76c9510245
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Thu May 9 16:58:44 2024 -0700

    [SPARK-48227][PYTHON][DOC] Document the requirement of seed in protos
    
    ### What changes were proposed in this pull request?
    Document the requirement of seed in protos
    
    ### Why are the changes needed?
    The seed should be set on the client side.
    
    document it to avoid cases like https://github.com/apache/spark/pull/46456
    
    ### Does this PR introduce _any_ user-facing change?
    no
    
    ### How was this patch tested?
    ci
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #46518 from zhengruifeng/doc_random.
    
    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../common/src/main/protobuf/spark/connect/relations.proto     |  8 ++++++--
 python/pyspark/sql/connect/plan.py                             | 10 ++++------
 python/pyspark/sql/connect/proto/relations_pb2.pyi             | 10 ++++++++--
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git 
a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto 
b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
index 3882b2e85396..0b3c9d4253e8 100644
--- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
+++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
@@ -467,7 +467,9 @@ message Sample {
   // (Optional) Whether to sample with replacement.
   optional bool with_replacement = 4;
 
-  // (Optional) The random seed.
+  // (Required) The random seed.
+  // This field is required to avoid generating mutable dataframes (see 
SPARK-48184 for details);
+  // however, it is still kept 'optional' here for backward compatibility.
   optional int64 seed = 5;
 
   // (Required) Explicitly sort the underlying plan to make the ordering 
deterministic or cache it.
@@ -687,7 +689,9 @@ message StatSampleBy {
   // If a stratum is not specified, we treat its fraction as zero.
   repeated Fraction fractions = 3;
 
-  // (Optional) The random seed.
+  // (Required) The random seed.
+  // This field is required to avoid generating mutable dataframes (see 
SPARK-48184 for details);
+  // however, it is still kept 'optional' here for backward compatibility.
   optional int64 seed = 5;
 
   message Fraction {
diff --git a/python/pyspark/sql/connect/plan.py 
b/python/pyspark/sql/connect/plan.py
index 4ac4946745f5..3d3303fb15c5 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -717,7 +717,7 @@ class Sample(LogicalPlan):
         lower_bound: float,
         upper_bound: float,
         with_replacement: bool,
-        seed: Optional[int],
+        seed: int,
         deterministic_order: bool = False,
     ) -> None:
         super().__init__(child)
@@ -734,8 +734,7 @@ class Sample(LogicalPlan):
         plan.sample.lower_bound = self.lower_bound
         plan.sample.upper_bound = self.upper_bound
         plan.sample.with_replacement = self.with_replacement
-        if self.seed is not None:
-            plan.sample.seed = self.seed
+        plan.sample.seed = self.seed
         plan.sample.deterministic_order = self.deterministic_order
         return plan
 
@@ -1526,7 +1525,7 @@ class StatSampleBy(LogicalPlan):
         child: Optional["LogicalPlan"],
         col: Column,
         fractions: Sequence[Tuple[Column, float]],
-        seed: Optional[int],
+        seed: int,
     ) -> None:
         super().__init__(child)
 
@@ -1554,8 +1553,7 @@ class StatSampleBy(LogicalPlan):
                 fraction.stratum.CopyFrom(k.to_plan(session).literal)
                 fraction.fraction = float(v)
                 plan.sample_by.fractions.append(fraction)
-        if self._seed is not None:
-            plan.sample_by.seed = self._seed
+        plan.sample_by.seed = self._seed
         return plan
 
 
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi 
b/python/pyspark/sql/connect/proto/relations_pb2.pyi
index 5dfb47da67a9..9b6f4b43544f 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi
@@ -1865,7 +1865,10 @@ class Sample(google.protobuf.message.Message):
     with_replacement: builtins.bool
     """(Optional) Whether to sample with replacement."""
     seed: builtins.int
-    """(Optional) The random seed."""
+    """(Required) The random seed.
+    This field is required to avoid generating mutable dataframes (see 
SPARK-48184 for details);
+    however, it is still kept 'optional' here for backward compatibility.
+    """
     deterministic_order: builtins.bool
     """(Required) Explicitly sort the underlying plan to make the ordering 
deterministic or cache it.
     This flag is true when invoking `dataframe.randomSplit` to randomly splits 
DataFrame with the
@@ -2545,7 +2548,10 @@ class StatSampleBy(google.protobuf.message.Message):
         If a stratum is not specified, we treat its fraction as zero.
         """
     seed: builtins.int
-    """(Optional) The random seed."""
+    """(Required) The random seed.
+    This field is required to avoid generating mutable dataframes (see 
SPARK-48184 for details);
+    however, it is still kept 'optional' here for backward compatibility.
+    """
     def __init__(
         self,
         *,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to