This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch fix/geoseries-empty-check-perf in repository https://gitbox.apache.org/repos/asf/sedona.git
commit b55c8e8e643aa353e5dcc60e761a48eacbde73b1 Author: Jia Yu <[email protected]> AuthorDate: Fri Mar 20 01:20:37 2026 -0700 [GH-2768] Replace len(self)==0 with cheaper _is_empty() check in GeoSeries Replace all 6 occurrences of len(self) == 0 in GeoSeries with a new _is_empty() helper that uses spark_frame.take(1) instead of DataFrame.count(). This avoids triggering a full Spark scan just to check if the series is empty. Affected methods: crs (getter), build_area(), polygonize(), union_all(), intersection_all(), total_bounds. --- python/sedona/spark/geopandas/geoseries.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/sedona/spark/geopandas/geoseries.py b/python/sedona/spark/geopandas/geoseries.py index 00ca985a64..d38d141580 100644 --- a/python/sedona/spark/geopandas/geoseries.py +++ b/python/sedona/spark/geopandas/geoseries.py @@ -341,6 +341,10 @@ class GeoSeries(GeoFrame, pspd.Series): if crs: self.set_crs(crs, inplace=True) + def _is_empty(self) -> bool: + """Check if this GeoSeries has no rows without triggering a full Spark scan.""" + return not self._internal.spark_frame.take(1) + # ============================================================================ # COORDINATE REFERENCE SYSTEM (CRS) OPERATIONS # ============================================================================ @@ -382,7 +386,7 @@ class GeoSeries(GeoFrame, pspd.Series): """ from pyproj import CRS - if len(self) == 0: + if self._is_empty(): return None # F.first is non-deterministic, but it doesn't matter because all non-null values should be the same. @@ -1152,7 +1156,7 @@ class GeoSeries(GeoFrame, pspd.Series): ) def build_area(self, node=True): - if len(self) == 0: + if self._is_empty(): return GeoSeries([], name="polygons", crs=self.crs) if node: @@ -1189,7 +1193,7 @@ class GeoSeries(GeoFrame, pspd.Series): "Sedona does not support full=True for polygonize." ) - if len(self) == 0: + if self._is_empty(): return GeoSeries([], name="polygons", crs=self.crs) if node: @@ -1245,7 +1249,7 @@ class GeoSeries(GeoFrame, pspd.Series): f"Sedona does not support manually specifying different union methods. Ignoring non-default method argument of {method}" ) - if len(self) == 0: + if self._is_empty(): # While it's not explicitly defined in GeoPandas docs, this is what GeoPandas returns for empty GeoSeries. # If it ever changes for some reason, we'll catch that with the test from shapely.geometry import GeometryCollection @@ -1260,7 +1264,7 @@ class GeoSeries(GeoFrame, pspd.Series): return geom def intersection_all(self) -> BaseGeometry: - if len(self) == 0: + if self._is_empty(): from shapely.geometry import GeometryCollection return GeometryCollection() @@ -2645,7 +2649,7 @@ class GeoSeries(GeoFrame, pspd.Series): def total_bounds(self): import warnings - if len(self) == 0: + if self._is_empty(): # numpy 'min' cannot handle empty arrays # TODO with numpy >= 1.15, the 'initial' argument can be used return np.array([np.nan, np.nan, np.nan, np.nan])
