This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 2a8ed47aaeb [SPARK-41036][CONNECT][PYTHON] `columns` API should use `schema` API to avoid data fetching
2a8ed47aaeb is described below

commit 2a8ed47aaeb226dff45fe6cca38e30833bb0eae6
Author: Rui Wang <rui.w...@databricks.com>
AuthorDate: Fri Nov 11 11:22:03 2022 +0800

    [SPARK-41036][CONNECT][PYTHON] `columns` API should use `schema` API to avoid data fetching

    ### What changes were proposed in this pull request?

    Currently, `columns` is implemented on top of `limit`, which runs a job to fetch data and derives the column names from the collected result. A more efficient way is to call the `schema` API, which only needs to analyze the plan without collecting any data. This should be cheaper in most cases. (A minimal sketch contrasting the two approaches appears after the diff.)

    ### Why are the changes needed?

    Efficiency.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    UT: `test_columns` added to `test_connect_basic.py`.

    Closes #38546 from amaliujia/improve_python_columns.

    Authored-by: Rui Wang <rui.w...@databricks.com>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/sql/connect/dataframe.py                | 9 ++-------
 python/pyspark/sql/tests/connect/test_connect_basic.py | 5 +++++
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py
index 0c19c67309d..f2e528fc83c 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -140,13 +140,8 @@ class DataFrame(object):
         """Returns the list of columns of the current data frame."""
         if self._plan is None:
             return []
-        if "columns" not in self._cache and self._plan is not None:
-            pdd = self.limit(0).toPandas()
-            if pdd is None:
-                raise Exception("Empty result")
-            # Translate to standard pytho array
-            self._cache["columns"] = pdd.columns.values
-        return self._cache["columns"]
+
+        return self.schema().names
 
     def count(self) -> int:
         """Returns the number of rows in the data frame"""
diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py
index b2509c75dd6..3f75eb1b18f 100644
--- a/python/pyspark/sql/tests/connect/test_connect_basic.py
+++ b/python/pyspark/sql/tests/connect/test_connect_basic.py
@@ -106,6 +106,11 @@ class SparkConnectTests(SparkConnectSQLTestCase):
         # Check that the limit is applied
         self.assertEqual(len(data.index), 10)
 
+    def test_columns(self):
+        # SPARK-41036: test `columns` API for python client.
+        columns = self.connect.read.table(self.tbl_name).columns
+        self.assertEqual(["id", "name"], columns)
+
     def test_collect(self):
         df = self.connect.read.table(self.tbl_name)
         data = df.limit(10).collect()
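For readers skimming the diff above, here is a minimal, self-contained sketch of the trade-off the commit describes. It is not the Spark Connect client code: `MiniDataFrame`, `_run_job`, and `_analyze_plan` are hypothetical stand-ins for executing a `limit(0)` query versus asking the server for schema-only plan analysis.

    # Hypothetical mini-model of the change; all names here are illustrative,
    # not Spark Connect APIs.

    class MiniDataFrame:
        def __init__(self, names):
            self._names = names
            self.jobs_run = 0  # counts simulated data-fetching jobs

        def _run_job(self):
            # Stands in for limit(0).toPandas(): executes the plan and moves
            # (zero rows of) data just to learn the column names.
            self.jobs_run += 1
            return {"columns": list(self._names)}

        def _analyze_plan(self):
            # Stands in for the `schema` call: the plan is analyzed and the
            # schema returned without executing anything.
            return list(self._names)

        def columns_via_collect(self):
            # Pre-patch behavior: collect an empty result to get the columns.
            return self._run_job()["columns"]

        def columns_via_schema(self):
            # Post-patch behavior: plan analysis only, no job.
            return self._analyze_plan()


    df = MiniDataFrame(["id", "name"])
    print(df.columns_via_collect(), df.jobs_run)  # ['id', 'name'] 1
    print(df.columns_via_schema(), df.jobs_run)   # ['id', 'name'] 1 (no new job)

The design point is the one the commit message makes: column names are metadata, so an analysis-only round trip is strictly cheaper than executing even a `limit(0)` query, which still plans and launches a job before any schema can be read.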