Re: [PR] [SPARK-46812][SQL][PYTHON] Make mapInPandas / mapInArrow support ResourceProfile [spark]

via GitHub Thu, 01 Feb 2024 17:56:00 -0800


zhengruifeng commented on code in PR #44852:
URL: https://github.com/apache/spark/pull/44852#discussion_r1475419041



##########
python/pyspark/sql/pandas/map_ops.py:
##########
@@ -65,6 +74,12 @@ def mapInPandas(
 
             .. versionadded: 3.5.0
 
+        profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile
+            to be used for mapInPandas.
+
+            .. versionadded: 3.5.1

Review Comment:
   We don't add new features in maintenance releases.



##########
python/pyspark/sql/tests/test_resources.py:
##########
@@ -0,0 +1,104 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark import SparkContext, TaskContext
+from pyspark.resource import TaskResourceRequests, ResourceProfileBuilder
+from pyspark.sql import SparkSession
+from pyspark.testing.sqlutils import (
+    have_pandas,
+    have_pyarrow,
+    pandas_requirement_message,
+    pyarrow_requirement_message,
+)
+from pyspark.testing.utils import ReusedPySparkTestCase
+
+
+@unittest.skipIf(
+    not have_pandas or not have_pyarrow,
+    pandas_requirement_message or pyarrow_requirement_message,
+)
+class ResourceProfileTestsMixin(object):
+    def test_map_in_arrow_without_profile(self):
+        def func(iterator):
+            tc = TaskContext.get()
+            assert tc.cpus() == 1
+            for batch in iterator:
+                yield batch
+
+        df = self.spark.range(10)
+        df.mapInArrow(func, "id long").collect()
+
+    def test_map_in_arrow_with_profile(self):
+        def func(iterator):
+            tc = TaskContext.get()
+            assert tc.cpus() == 3
+            for batch in iterator:
+                yield batch
+
+        df = self.spark.range(10)
+
+        treqs = TaskResourceRequests().cpus(3)
+        rp = ResourceProfileBuilder().require(treqs).build
+        df.mapInArrow(func, "id long", False, rp).collect()
+
+    def test_map_in_pandas_without_profile(self):
+        def func(iterator):
+            tc = TaskContext.get()
+            assert tc.cpus() == 1
+            for batch in iterator:
+                yield batch
+
+        df = self.spark.range(10)
+        df.mapInPandas(func, "id long").collect()
+
+    def test_map_in_pandas_with_profile(self):
+        def func(iterator):
+            tc = TaskContext.get()
+            assert tc.cpus() == 3
+            for batch in iterator:
+                yield batch
+
+        df = self.spark.range(10)
+
+        treqs = TaskResourceRequests().cpus(3)
+        rp = ResourceProfileBuilder().require(treqs).build
+        df.mapInPandas(func, "id long", False, rp).collect()
+
+
+class ResourceProfileTests(ResourceProfileTestsMixin, ReusedPySparkTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.sc = SparkContext("local-cluster[1, 4, 1024]", cls.__name__, conf=cls.conf())
+        cls.spark = SparkSession(cls.sc)
+
+    @classmethod
+    def tearDownClass(cls):
+        super(ResourceProfileTests, cls).tearDownClass()
+        cls.spark.stop()
+
+
+if __name__ == "__main__":
+    from pyspark.sql.tests.test_resources import *  # noqa: F401

Review Comment:
   Please add this test to `dev/sparktestsupport/modules.py`; otherwise it is skipped in GitHub Actions (GA).



##########
python/pyspark/sql/pandas/map_ops.py:
##########
@@ -175,6 +196,11 @@ def mapInArrow(
 
             .. versionadded: 3.5.0
 
+        profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile
+            to be used for mapInPandas.
+
+            .. versionadded: 3.5.1

Review Comment:
   ```suggestion
               .. versionadded: 4.0.0
   ```



##########
python/pyspark/sql/pandas/map_ops.py:
##########
@@ -65,6 +74,12 @@ def mapInPandas(
 
             .. versionadded: 3.5.0
 
+        profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile
+            to be used for mapInPandas.
+
+            .. versionadded: 3.5.1

Review Comment:
   ```suggestion
               .. versionadded: 4.0.0
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-46812][SQL][PYTHON] Make mapInPandas / mapInArrow support ResourceProfile [spark]

Reply via email to