Copilot commented on code in PR #642:
URL: https://github.com/apache/sedona-db/pull/642#discussion_r2829094804
##########
python/sedonadb/src/dataframe.rs:
##########
@@ -189,13 +189,51 @@ impl InternalDataFrame {
) -> Result<(), PySedonaError> {
// sort_by needs to be SortExpr. A Vec<String> can unambiguously be
interpreted as
// field names (ascending), but other types of expressions aren't
supported here yet.
+ // We need to special-case geometry columns until we have a logical
optimizer rule or
+ // sorting for user-defined types is supported.
+ let geometry_column_indices =
self.inner.schema().geometry_column_indices()?;
+ let geometry_column_names = geometry_column_indices
+ .iter()
+ .map(|i| self.inner.schema().field(*i).name().as_str())
+ .collect::<HashSet<&str>>();
let sort_by_expr = sort_by
.into_iter()
.map(|name| {
- let column = Expr::Column(Column::new_unqualified(name));
- SortExpr::new(column, true, false)
+ let column =
Expr::Column(Column::new_unqualified(name.clone()));
+ if geometry_column_names.contains(name.as_str()) {
+ // Create the call sd_order(column). If we're ordering by
geometry but don't have
+ // the required feature for high quality sort output, give
an error. This is mostly
+ // an issue when using maturin develop because geography
is not a default feature.
+ #[cfg(feature = "s2geography")]
+ let has_geography = true;
+ #[cfg(not(feature = "s2geography"))]
+ let has_geography = false;
+
+ if has_geography {
+ let order_udf_opt: Option<ScalarUDF> = ctx
+ .inner
+ .functions
+ .scalar_udf("sd_order")
+ .map(|udf_opt| udf_opt.clone().into());
+ if let Some(order_udf) = order_udf_opt {
+ Ok(SortExpr::new(order_udf.call(vec![column]), true,
false))
+ } else {
+ Err(PySedonaError::SedonaPython(
+ "Can't order by geometry field when sd_order() is
not available"
+ .to_string(),
+ ))
+ }
+ } else {
+ Err(PySedonaError::SedonaPython(
+ "Use maturin develop --features
's2geography,pyo3/extension-module' for dev geography support"
+ .to_string(),
+ ))
+ }
+ } else {
Review Comment:
Incorrect indentation: Line 212 is missing proper indentation. The `if
has_geography {` statement should be indented 4 more spaces to align with the
code block it belongs to (specifically, to match the indentation of line 204's
comment). While the logic and brace matching appear correct, this indentation
issue significantly reduces code readability and makes the control flow harder
to follow.
```suggestion
let order_udf_opt: Option<ScalarUDF> = ctx
.inner
.functions
.scalar_udf("sd_order")
.map(|udf_opt| udf_opt.clone().into());
if let Some(order_udf) = order_udf_opt {
Ok(SortExpr::new(order_udf.call(vec![column]),
true, false))
} else {
Err(PySedonaError::SedonaPython(
"Can't order by geometry field when
sd_order() is not available"
.to_string(),
))
}
} else {
Err(PySedonaError::SedonaPython(
"Use maturin develop --features
's2geography,pyo3/extension-module' for dev geography support"
.to_string(),
))
}
} else {
```
##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -356,6 +356,35 @@ def test_write_geoparquet_options(geoarrow_data):
assert tmp_parquet.stat().st_size > (size_with_default_compression * 2)
+def test_write_sort_by_geometry(con):
+ if "s2geography" not in sedonadb.__features__:
+ pytest.skip("Ordering currently requires build with feature
s2geography")
+
+ con.funcs.table.sd_random_geometry(
+ "Point", 10000, seed=948, bounds=[-50, -50, 50, 50]
+ ).to_view("pts", overwrite=True)
+ df = con.sql("SELECT id, ST_SetSRID(geometry, 4326) AS geometry FROM pts")
+
+ # Write sorted and unsorted output and ensure
+ with tempfile.TemporaryDirectory() as td:
+ df.to_parquet(Path(td) / "unsorted.parquet")
+ df.to_parquet(Path(td) / "sorted.parquet", sort_by="geometry")
+
+ gdf_unsorted = geopandas.read_parquet(Path(td) /
"unsorted.parquet").to_crs(
+ 3857
+ )
+ gdf_sorted = geopandas.read_parquet(Path(td) /
"sorted.parquet").to_crs(3857)
+
+ neighbour_distance_unsorted = gdf_unsorted.geometry.distance(
+ gdf_unsorted.geometry.shift(1)
+ ).median()
+ neighbour_distance_sorted = gdf_sorted.geometry.distance(
+ gdf_sorted.geometry.shift(1)
+ ).median()
+
+ assert neighbour_distance_sorted < (neighbour_distance_unsorted / 20)
+
+
Review Comment:
Missing test coverage: The implementation supports sorting by multiple
columns (both geometry and non-geometry columns), but there's no test case that
verifies this scenario works correctly. Consider adding a test that sorts by
multiple columns, such as sorting by a geometry column and a non-geometry
column together, to ensure the logic correctly handles mixed column types.
```suggestion
def test_write_sort_by_geometry_and_id(con):
if "s2geography" not in sedonadb.__features__:
pytest.skip("Ordering currently requires build with feature
s2geography")
# Construct a small deterministic dataset with duplicate geometries
df = con.sql(
"""
SELECT id, ST_SetSRID(geom, 4326) AS geometry
FROM (
VALUES
(1, ST_GeomFromText('POINT (0 0)')),
(3, ST_GeomFromText('POINT (0 0)')),
(4, ST_GeomFromText('POINT (1 1)')),
(2, ST_GeomFromText('POINT (1 1)'))
) AS t(id, geom)
"""
)
with tempfile.TemporaryDirectory() as td:
out_path = Path(td) / "sorted_multi.parquet"
# Sort by geometry as primary key and id as secondary key
df.to_parquet(out_path, sort_by=["geometry", "id"])
gdf = geopandas.read_parquet(out_path)
# For each distinct geometry, ids should be in ascending order
geom_wkts = gdf.geometry.to_wkt()
for wkt in geom_wkts.unique():
ids = gdf.loc[geom_wkts == wkt, "id"].tolist()
assert ids == sorted(ids)
```
##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -356,6 +356,35 @@ def test_write_geoparquet_options(geoarrow_data):
assert tmp_parquet.stat().st_size > (size_with_default_compression * 2)
+def test_write_sort_by_geometry(con):
+ if "s2geography" not in sedonadb.__features__:
+ pytest.skip("Ordering currently requires build with feature
s2geography")
+
+ con.funcs.table.sd_random_geometry(
+ "Point", 10000, seed=948, bounds=[-50, -50, 50, 50]
+ ).to_view("pts", overwrite=True)
+ df = con.sql("SELECT id, ST_SetSRID(geometry, 4326) AS geometry FROM pts")
+
+ # Write sorted and unsorted output and ensure
Review Comment:
Incomplete comment: The comment on line 368 says "Write sorted and unsorted
output and ensure" but doesn't complete the thought. It should explain what is
being ensured, for example: "Write sorted and unsorted output and ensure sorted
output has better spatial locality".
```suggestion
# Write sorted and unsorted output and ensure sorting by geometry
improves spatial locality
```
##########
python/sedonadb/src/dataframe.rs:
##########
@@ -189,13 +189,51 @@ impl InternalDataFrame {
) -> Result<(), PySedonaError> {
// sort_by needs to be SortExpr. A Vec<String> can unambiguously be
interpreted as
// field names (ascending), but other types of expressions aren't
supported here yet.
+ // We need to special-case geometry columns until we have a logical
optimizer rule or
+ // sorting for user-defined types is supported.
+ let geometry_column_indices =
self.inner.schema().geometry_column_indices()?;
+ let geometry_column_names = geometry_column_indices
+ .iter()
+ .map(|i| self.inner.schema().field(*i).name().as_str())
+ .collect::<HashSet<&str>>();
let sort_by_expr = sort_by
.into_iter()
.map(|name| {
- let column = Expr::Column(Column::new_unqualified(name));
- SortExpr::new(column, true, false)
+ let column =
Expr::Column(Column::new_unqualified(name.clone()));
+ if geometry_column_names.contains(name.as_str()) {
+ // Create the call sd_order(column). If we're ordering by
geometry but don't have
+ // the required feature for high quality sort output, give
an error. This is mostly
+ // an issue when using maturin develop because geography
is not a default feature.
+ #[cfg(feature = "s2geography")]
+ let has_geography = true;
+ #[cfg(not(feature = "s2geography"))]
+ let has_geography = false;
+
+ if has_geography {
+ let order_udf_opt: Option<ScalarUDF> = ctx
+ .inner
+ .functions
+ .scalar_udf("sd_order")
+ .map(|udf_opt| udf_opt.clone().into());
+ if let Some(order_udf) = order_udf_opt {
+ Ok(SortExpr::new(order_udf.call(vec![column]), true,
false))
+ } else {
+ Err(PySedonaError::SedonaPython(
+ "Can't order by geometry field when sd_order() is
not available"
+ .to_string(),
+ ))
+ }
+ } else {
+ Err(PySedonaError::SedonaPython(
+ "Use maturin develop --features
's2geography,pyo3/extension-module' for dev geography support"
+ .to_string(),
+ ))
+ }
+ } else {
Review Comment:
Incorrect indentation: Lines 213-231 should be indented 4 more spaces to
properly reflect their nesting level within the `if has_geography` block that
starts on line 212. The current indentation makes it difficult to understand
the code structure and control flow.
```suggestion
let order_udf_opt: Option<ScalarUDF> = ctx
.inner
.functions
.scalar_udf("sd_order")
.map(|udf_opt| udf_opt.clone().into());
if let Some(order_udf) = order_udf_opt {
Ok(SortExpr::new(order_udf.call(vec![column]),
true, false))
} else {
Err(PySedonaError::SedonaPython(
"Can't order by geometry field when
sd_order() is not available"
.to_string(),
))
}
} else {
Err(PySedonaError::SedonaPython(
"Use maturin develop --features
's2geography,pyo3/extension-module' for dev geography support"
.to_string(),
))
}
} else {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]