This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch geopandas-tier2-batch-e in repository https://gitbox.apache.org/repos/asf/sedona.git
commit dab88ed230cf6de11df7f5b6fc8af2a70431154d Author: Jia Yu <[email protected]> AuthorDate: Wed Mar 11 23:17:51 2026 -0700 Add frechet_distance, hausdorff_distance, geom_equals, interpolate, project to geopandas GeoSeries Implement 5 new GeoSeries methods matching the geopandas API: - frechet_distance: via ST_FrechetDistance (densify not supported, raises NotImplementedError) - hausdorff_distance: via ST_HausdorffDistance (supports densify parameter) - geom_equals: via ST_Equals (binary predicate) - interpolate: via ST_LineInterpolatePoint (supports normalized parameter) - project: via ST_LineLocatePoint (supports normalized parameter) All functions support align parameter and work through GeoDataFrame delegation. --- python/sedona/spark/geopandas/base.py | 257 +++++++++++++++++++++ python/sedona/spark/geopandas/geoseries.py | 83 +++++++ python/tests/geopandas/test_geoseries.py | 171 ++++++++++++++ .../tests/geopandas/test_match_geopandas_series.py | 98 ++++++++ 4 files changed, 609 insertions(+) diff --git a/python/sedona/spark/geopandas/base.py b/python/sedona/spark/geopandas/base.py index ac6827e097..ee3e3c6e1c 100644 --- a/python/sedona/spark/geopandas/base.py +++ b/python/sedona/spark/geopandas/base.py @@ -2363,6 +2363,263 @@ class GeoFrame(metaclass=ABCMeta): """ return _delegate_to_geometry_column("distance", self, other, align) + def frechet_distance(self, other, align=None, densify=None): + """Returns a ``Series`` containing the discrete Fréchet distance to aligned `other`. + + The Fréchet distance is a measure of similarity: it is the greatest distance + between any point in A and the closest point in B. The discrete distance is an + approximation of this metric: only vertices are considered. The parameter + ``densify`` makes this approximation less coarse by splitting the line segments + between vertices before computing the distance. + + The operation works on a 1-to-1 row-wise manner. 
+ + Parameters + ---------- + other : GeoSeries or geometric object + The GeoSeries (elementwise) or geometric object to find the + distance to. + align : bool | None (default None) + If True, automatically aligns GeoSeries based on their indices. None defaults to True. + If False, the order of elements is preserved. + densify : float, optional + The densify parameter is not supported by Sedona. + Passing a value will raise a ``NotImplementedError``. + + Returns + ------- + Series (float) + + Examples + -------- + >>> from sedona.spark.geopandas import GeoSeries + >>> from shapely.geometry import LineString + >>> s1 = GeoSeries( + ... [ + ... LineString([(0, 0), (1, 0), (2, 0)]), + ... LineString([(0, 0), (1, 1)]), + ... ] + ... ) + >>> s2 = GeoSeries( + ... [ + ... LineString([(0, 1), (1, 2), (2, 1)]), + ... LineString([(1, 0), (2, 1)]), + ... ] + ... ) + + >>> s1.frechet_distance(s2) + 0 2.0 + 1 1.0 + dtype: float64 + + See also + -------- + GeoSeries.hausdorff_distance + """ + return _delegate_to_geometry_column( + "frechet_distance", self, other, align, densify + ) + + def hausdorff_distance(self, other, align=None, densify=None): + """Returns a ``Series`` containing the Hausdorff distance to aligned `other`. + + The Hausdorff distance is the largest distance consisting of any point in `self` + with the nearest point in `other`. + + The operation works on a 1-to-1 row-wise manner. + + Parameters + ---------- + other : GeoSeries or geometric object + The GeoSeries (elementwise) or geometric object to find the + distance to. + align : bool | None (default None) + If True, automatically aligns GeoSeries based on their indices. None defaults to True. + If False, the order of elements is preserved. + densify : float, optional + The fraction by which to densify each segment. Each segment will be + split into a number of equal-length subsegments whose fraction of + the segment length is closest to the given fraction. 
+ + Returns + ------- + Series (float) + + Examples + -------- + >>> from sedona.spark.geopandas import GeoSeries + >>> from shapely.geometry import LineString + >>> s1 = GeoSeries( + ... [ + ... LineString([(0, 0), (1, 0), (2, 0)]), + ... LineString([(0, 0), (1, 1)]), + ... ] + ... ) + >>> s2 = GeoSeries( + ... [ + ... LineString([(0, 1), (1, 2), (2, 1)]), + ... LineString([(1, 0), (2, 1)]), + ... ] + ... ) + + >>> s1.hausdorff_distance(s2) + 0 2.0 + 1 1.0 + dtype: float64 + + See also + -------- + GeoSeries.frechet_distance + """ + return _delegate_to_geometry_column( + "hausdorff_distance", self, other, align, densify + ) + + def geom_equals(self, other, align=None): + """Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for + each aligned geometry equal to `other`. + + An object is said to be equal to `other` if its set-theoretic + boundary, interior, and exterior coincide with those of the other. + + The operation works on a 1-to-1 row-wise manner. + + Parameters + ---------- + other : GeoSeries or geometric object + The GeoSeries (elementwise) or geometric object to test for + equality. + align : bool | None (default None) + If True, automatically aligns GeoSeries based on their indices. None defaults to True. + If False, the order of elements is preserved. + + Returns + ------- + Series (bool) + + Examples + -------- + >>> from sedona.spark.geopandas import GeoSeries + >>> from shapely.geometry import Point + >>> s1 = GeoSeries( + ... [ + ... Point(0, 0), + ... Point(1, 1), + ... Point(2, 2), + ... ] + ... ) + >>> s2 = GeoSeries( + ... [ + ... Point(0, 0), + ... Point(2, 2), + ... Point(2, 2), + ... ] + ... ) + + >>> s1.geom_equals(s2) + 0 True + 1 False + 2 True + dtype: bool + + See also + -------- + GeoSeries.geom_equals_exact + """ + return _delegate_to_geometry_column("geom_equals", self, other, align) + + def interpolate(self, distance, normalized=False): + """Return a point at the specified distance along each geometry. 
+ + Parameters + ---------- + distance : float or Series of floats + Distance(s) along the geometries at which a point should be + returned. If np.array or pd.Series are used then it must have + same length as the GeoSeries. + normalized : bool (default False) + If True, ``distance`` will be interpreted as a fraction + of the geometric object's length. + + Returns + ------- + GeoSeries + + Examples + -------- + >>> from sedona.spark.geopandas import GeoSeries + >>> from shapely.geometry import LineString + >>> s = GeoSeries( + ... [ + ... LineString([(0, 0), (2, 0), (0, 2)]), + ... LineString([(0, 0), (2, 2)]), + ... LineString([(2, 0), (0, 2)]), + ... ], + ... ) + + >>> s.interpolate(1) + 0 POINT (1 0) + 1 POINT (0.70711 0.70711) + 2 POINT (1.29289 0.70711) + dtype: geometry + + See also + -------- + GeoSeries.project + """ + return _delegate_to_geometry_column("interpolate", self, distance, normalized) + + def project(self, other, normalized=False, align=None): + """Return the distance along each geometry nearest to `other`. + + The operation works on a 1-to-1 row-wise manner. + + The project method is the inverse of interpolate. + + Parameters + ---------- + other : GeoSeries or geometric object + The *other* geometry to compute the projected point from. + normalized : bool (default False) + If True, return the distance normalized to the length of the object. + align : bool | None (default None) + If True, automatically aligns GeoSeries based on their indices. None defaults to True. + If False, the order of elements is preserved. + + Returns + ------- + Series (float) + + Examples + -------- + >>> from sedona.spark.geopandas import GeoSeries + >>> from shapely.geometry import LineString, Point + >>> s = GeoSeries( + ... [ + ... LineString([(0, 0), (2, 0), (0, 2)]), + ... LineString([(0, 0), (2, 2)]), + ... LineString([(2, 0), (0, 2)]), + ... ], + ... 
) + + >>> s.project(Point(1, 0)) + 0 1.000000 + 1 0.707107 + 2 0.707107 + dtype: float64 + + >>> s.project(Point(1, 0), normalized=True) + 0 0.207107 + 1 0.250000 + 2 0.250000 + dtype: float64 + + See also + -------- + GeoSeries.interpolate + """ + return _delegate_to_geometry_column("project", self, other, normalized, align) + def intersection(self, other, align=None): """Returns a ``GeoSeries`` of the intersection of points in each aligned geometry with `other`. diff --git a/python/sedona/spark/geopandas/geoseries.py b/python/sedona/spark/geopandas/geoseries.py index 96631f3679..3734441fd0 100644 --- a/python/sedona/spark/geopandas/geoseries.py +++ b/python/sedona/spark/geopandas/geoseries.py @@ -1330,6 +1330,89 @@ class GeoSeries(GeoFrame, pspd.Series): ) return result + def frechet_distance(self, other, align=None, densify=None) -> pspd.Series: + if densify is not None: + raise NotImplementedError( + "Sedona does not support the densify parameter for frechet_distance." + ) + + other_series, extended = self._make_series_of_val(other) + align = False if extended else align + + spark_expr = stf.ST_FrechetDistance(F.col("L"), F.col("R")) + result = self._row_wise_operation( + spark_expr, + other_series, + align, + default_val=None, + ) + return result + + def hausdorff_distance(self, other, align=None, densify=None) -> pspd.Series: + other_series, extended = self._make_series_of_val(other) + align = False if extended else align + + if densify is not None: + spark_expr = stf.ST_HausdorffDistance(F.col("L"), F.col("R"), densify) + else: + spark_expr = stf.ST_HausdorffDistance(F.col("L"), F.col("R")) + result = self._row_wise_operation( + spark_expr, + other_series, + align, + default_val=None, + ) + return result + + def geom_equals(self, other, align=None) -> pspd.Series: + other_series, extended = self._make_series_of_val(other) + align = False if extended else align + + spark_expr = stp.ST_Equals(F.col("L"), F.col("R")) + result = self._row_wise_operation( + 
spark_expr, + other_series, + align, + returns_geom=False, + default_val=False, + ) + return _to_bool(result) + + def interpolate(self, distance, normalized=False) -> "GeoSeries": + other_series, extended = self._make_series_of_val(distance) + align = not extended + + if normalized: + spark_expr = stf.ST_LineInterpolatePoint(F.col("L"), F.col("R")) + else: + spark_expr = stf.ST_LineInterpolatePoint( + F.col("L"), F.col("R") / stf.ST_Length(F.col("L")) + ) + return self._row_wise_operation( + spark_expr, + other_series, + align=align, + returns_geom=True, + ) + + def project(self, other, normalized=False, align=None) -> pspd.Series: + other_series, extended = self._make_series_of_val(other) + align = False if extended else align + + if normalized: + spark_expr = stf.ST_LineLocatePoint(F.col("L"), F.col("R")) + else: + spark_expr = stf.ST_LineLocatePoint(F.col("L"), F.col("R")) * stf.ST_Length( + F.col("L") + ) + result = self._row_wise_operation( + spark_expr, + other_series, + align, + default_val=None, + ) + return result + def intersection( self, other: Union["GeoSeries", BaseGeometry], align: Union[bool, None] = None ) -> "GeoSeries": diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py index 9bc572a151..e929e3e43d 100644 --- a/python/tests/geopandas/test_geoseries.py +++ b/python/tests/geopandas/test_geoseries.py @@ -2635,6 +2635,177 @@ e": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [3 expected = pd.Series(["FF2F11212", "212101212"]) self.check_pd_series_equal(result, expected) + def test_frechet_distance(self): + s1 = GeoSeries( + [ + LineString([(0, 0), (1, 0), (2, 0)]), + LineString([(0, 0), (1, 1)]), + ] + ) + s2 = GeoSeries( + [ + LineString([(0, 1), (1, 2), (2, 1)]), + LineString([(1, 0), (2, 1)]), + ] + ) + + result = s1.frechet_distance(s2, align=False) + expected = pd.Series([2.0, 1.0]) + self.check_pd_series_equal(result, expected) + + # Test with single geometry + line = 
LineString([(0, 1), (1, 2), (2, 1)]) + result = s1.frechet_distance(line) + expected = pd.Series([2.0, 1.0]) + self.check_pd_series_equal(result, expected) + + # Test that GeoDataFrame works too + df_result = s1.to_geoframe().frechet_distance(s2, align=False) + expected = pd.Series([2.0, 1.0]) + self.check_pd_series_equal(df_result, expected) + + # Test that densify raises NotImplementedError + with pytest.raises(NotImplementedError): + s1.frechet_distance(s2, densify=0.5) + + def test_hausdorff_distance(self): + s1 = GeoSeries( + [ + LineString([(0, 0), (1, 0), (2, 0)]), + LineString([(0, 0), (1, 1)]), + ] + ) + s2 = GeoSeries( + [ + LineString([(0, 1), (1, 2), (2, 1)]), + LineString([(1, 0), (2, 1)]), + ] + ) + + result = s1.hausdorff_distance(s2, align=False) + expected = pd.Series([2.0, 1.0]) + self.check_pd_series_equal(result, expected) + + # Test with single geometry + line = LineString([(0, 1), (1, 2), (2, 1)]) + result = s1.hausdorff_distance(line) + expected = pd.Series([2.0, 1.0]) + self.check_pd_series_equal(result, expected) + + # Test that GeoDataFrame works too + df_result = s1.to_geoframe().hausdorff_distance(s2, align=False) + expected = pd.Series([2.0, 1.0]) + self.check_pd_series_equal(df_result, expected) + + # Test with densify parameter + result = s1.hausdorff_distance(s2, densify=0.5, align=False) + expected = pd.Series([2.0, 1.0]) + self.check_pd_series_equal(result, expected) + + def test_geom_equals(self): + s1 = GeoSeries( + [ + Point(0, 0), + Point(1, 1), + Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + ] + ) + s2 = GeoSeries( + [ + Point(0, 0), + Point(2, 2), + Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + ] + ) + + result = s1.geom_equals(s2, align=False) + expected = pd.Series([True, False, True]) + self.check_pd_series_equal(result, expected) + + # Test with single geometry + result = s1.geom_equals(Point(0, 0)) + expected = pd.Series([True, False, False]) + self.check_pd_series_equal(result, expected) + + # Test that GeoDataFrame works 
too + df_result = s1.to_geoframe().geom_equals(s2, align=False) + expected = pd.Series([True, False, True]) + self.check_pd_series_equal(df_result, expected) + + def test_interpolate(self): + s = GeoSeries( + [ + LineString([(0, 0), (2, 0), (0, 2)]), + LineString([(0, 0), (2, 2)]), + LineString([(2, 0), (0, 2)]), + ] + ) + + # Test with absolute distance + result = s.interpolate(1) + expected = gpd.GeoSeries( + [ + Point(1, 0), + Point(0.7071067811865476, 0.7071067811865476), + Point(1.2928932188134524, 0.7071067811865476), + ] + ) + self.check_sgpd_equals_gpd(result, expected) + + # Test with normalized distance + result = s.interpolate(0.5, normalized=True) + expected = gpd.GeoSeries(s.to_geopandas().interpolate(0.5, normalized=True)) + self.check_sgpd_equals_gpd(result, expected) + + # Test that GeoDataFrame works too + df_result = s.to_geoframe().interpolate(1) + expected = gpd.GeoSeries( + [ + Point(1, 0), + Point(0.7071067811865476, 0.7071067811865476), + Point(1.2928932188134524, 0.7071067811865476), + ] + ) + self.check_sgpd_equals_gpd(df_result, expected) + + def test_project(self): + s = GeoSeries( + [ + LineString([(0, 0), (2, 0), (0, 2)]), + LineString([(0, 0), (2, 2)]), + LineString([(2, 0), (0, 2)]), + ] + ) + + # Test with a single point + result = s.project(Point(1, 0)) + expected = pd.Series([1.0, 0.7071067811865476, 0.7071067811865476]) + self.check_pd_series_equal(result, expected) + + # Test with normalized=True + result = s.project(Point(1, 0), normalized=True) + expected = pd.Series(s.to_geopandas().project(Point(1, 0), normalized=True)) + self.check_pd_series_equal(result, expected) + + # Test with two GeoSeries + s2 = GeoSeries( + [ + Point(1, 0), + Point(1, 0), + Point(2, 1), + ] + ) + result = s.project(s2, align=False) + expected = pd.Series( + s.to_geopandas().project(gpd.GeoSeries(s2.to_geopandas()), align=False) + ) + self.check_pd_series_equal(result, expected) + + # Test that GeoDataFrame works too + df_result = 
s.to_geoframe().project(Point(1, 0)) + expected = pd.Series([1.0, 0.7071067811865476, 0.7071067811865476]) + self.check_pd_series_equal(df_result, expected) + def test_set_crs(self): geo_series = sgpd.GeoSeries([Point(0, 0), Point(1, 1)], name="geometry") assert geo_series.crs == None diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py index 6dd5af9dff..2a92585385 100644 --- a/python/tests/geopandas/test_match_geopandas_series.py +++ b/python/tests/geopandas/test_match_geopandas_series.py @@ -1285,6 +1285,104 @@ class TestMatchGeopandasSeries(TestGeopandasBase): ) self.check_pd_series_equal(sgpd_result, gpd_result) + def test_frechet_distance(self): + line_pairs = [ + (self.linestrings, self.linestrings), + (self.linearrings, self.linearrings), + (self.linestrings, self.linearrings), + ] + for geom, geom2 in line_pairs: + # Skip pairs containing empty geometries + if any(g.is_empty for g in geom) or any(g.is_empty for g in geom2): + continue + + sgpd_result = GeoSeries(geom).frechet_distance(GeoSeries(geom2), align=True) + gpd_result = gpd.GeoSeries(geom).frechet_distance( + gpd.GeoSeries(geom2), align=True + ) + self.check_pd_series_equal(sgpd_result, gpd_result) + + if len(geom) == len(geom2): + sgpd_result = GeoSeries(geom).frechet_distance( + GeoSeries(geom2), align=False + ) + gpd_result = gpd.GeoSeries(geom).frechet_distance( + gpd.GeoSeries(geom2), align=False + ) + self.check_pd_series_equal(sgpd_result, gpd_result) + + def test_hausdorff_distance(self): + for geom, geom2 in self.pairs: + # Skip pairs with empty geometries — Sedona returns 0.0 instead of NaN + if any(g.is_empty for g in geom) or any(g.is_empty for g in geom2): + continue + + sgpd_result = GeoSeries(geom).hausdorff_distance( + GeoSeries(geom2), align=True + ) + gpd_result = gpd.GeoSeries(geom).hausdorff_distance( + gpd.GeoSeries(geom2), align=True + ) + self.check_pd_series_equal(sgpd_result, gpd_result) + + if len(geom) 
== len(geom2): + sgpd_result = GeoSeries(geom).hausdorff_distance( + GeoSeries(geom2), align=False + ) + gpd_result = gpd.GeoSeries(geom).hausdorff_distance( + gpd.GeoSeries(geom2), align=False + ) + self.check_pd_series_equal(sgpd_result, gpd_result) + + def test_geom_equals(self): + for geom, geom2 in self.pairs: + if self.contains_any_geom_collection(geom, geom2): + continue + + sgpd_result = GeoSeries(geom).geom_equals(GeoSeries(geom2), align=True) + gpd_result = gpd.GeoSeries(geom).geom_equals( + gpd.GeoSeries(geom2), align=True + ) + self.check_pd_series_equal(sgpd_result, gpd_result) + + if len(geom) == len(geom2): + sgpd_result = GeoSeries(geom).geom_equals(GeoSeries(geom2), align=False) + gpd_result = gpd.GeoSeries(geom).geom_equals( + gpd.GeoSeries(geom2), align=False + ) + self.check_pd_series_equal(sgpd_result, gpd_result) + + def test_interpolate(self): + for geom in [self.linestrings, self.linearrings]: + # Skip empty geometries + non_empty = [g for g in geom if not g.is_empty] + if not non_empty: + continue + + sgpd_result = GeoSeries(non_empty).interpolate(1.0) + gpd_result = gpd.GeoSeries(non_empty).interpolate(1.0) + self.check_sgpd_equals_gpd(sgpd_result, gpd_result) + + sgpd_result = GeoSeries(non_empty).interpolate(0.5, normalized=True) + gpd_result = gpd.GeoSeries(non_empty).interpolate(0.5, normalized=True) + self.check_sgpd_equals_gpd(sgpd_result, gpd_result) + + def test_project(self): + for geom in [self.linestrings, self.linearrings]: + # Skip empty geometries + non_empty = [g for g in geom if not g.is_empty] + if not non_empty: + continue + + point = Point(1, 1) + sgpd_result = GeoSeries(non_empty).project(point) + gpd_result = gpd.GeoSeries(non_empty).project(point) + self.check_pd_series_equal(sgpd_result, gpd_result) + + sgpd_result = GeoSeries(non_empty).project(point, normalized=True) + gpd_result = gpd.GeoSeries(non_empty).project(point, normalized=True) + self.check_pd_series_equal(sgpd_result, gpd_result) + def 
test_set_crs(self): for geom in self.geoms: if isinstance(geom[0], Polygon) and geom[0] == Polygon():
