Copilot commented on code in PR #599:
URL: https://github.com/apache/sedona-db/pull/599#discussion_r2794415318
##########
sedona-cli/src/main.rs:
##########
@@ -133,7 +162,26 @@ async fn main_inner() -> Result<()> {
env::set_current_dir(p).unwrap();
};
- let ctx = SedonaContext::new_local_interactive().await?;
+ let mut rt_builder = RuntimeEnvBuilder::new();
+ // set memory pool size
+ if let Some(memory_limit) = args.memory_limit {
+ // set memory pool type
+ let pool: Arc<dyn MemoryPool> = match args.mem_pool_type {
+ PoolType::Fair => Arc::new(TrackConsumersPool::new(
+ SedonaFairSpillPool::new(memory_limit,
args.unspillable_reserve_ratio),
+ NonZeroUsize::new(10).unwrap(),
+ )),
+ PoolType::Greedy => Arc::new(TrackConsumersPool::new(
+ GreedyMemoryPool::new(memory_limit),
+ NonZeroUsize::new(10).unwrap(),
Review Comment:
The `NonZeroUsize::new(10).unwrap()` value is duplicated and the `unwrap()`
is avoidable for a constant. Consider creating a single local variable (e.g.,
`let track_capacity = NonZeroUsize::new(10).expect(\"...\");`) and reusing it
for both branches, or otherwise centralizing this value to avoid drift.
```suggestion
let track_capacity =
NonZeroUsize::new(10).expect("track capacity must be non-zero");
let pool: Arc<dyn MemoryPool> = match args.mem_pool_type {
PoolType::Fair => Arc::new(TrackConsumersPool::new(
SedonaFairSpillPool::new(memory_limit,
args.unspillable_reserve_ratio),
track_capacity,
)),
PoolType::Greedy => Arc::new(TrackConsumersPool::new(
GreedyMemoryPool::new(memory_limit),
track_capacity,
```
##########
rust/sedona/src/memory_pool.rs:
##########
@@ -0,0 +1,279 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion::execution::memory_pool::{
+ MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation,
+};
+use datafusion_common::{resources_datafusion_err, DataFusionError, Result};
+use parking_lot::Mutex;
+
+pub const DEFAULT_UNSPILLABLE_RESERVE_RATIO: f64 = 0.2;
+
+/// A [`MemoryPool`] implementation similar to DataFusion's
[`datafusion::execution::memory_pool::FairSpillPool`],
+/// but with the following changes:
+///
+/// It implements a reservation mechanism for unspillable memory consumers.
This addresses an issue
+/// where spillable consumers could potentially exhaust all available memory,
preventing unspillable
+/// operations from acquiring necessary resources. This behavior is tracked in
DataFusion issue
+/// https://github.com/apache/datafusion/issues/17334. In the context of
Sedona, a typical example
+/// is a [`sedona_spatial_join::exec::SpatialJoinExec`] operator with an auto
inserted
+/// [`datafusion::physical_plan::repartition::RepartitionExec`] for the probe
side. The Merge
+/// consumer of [`datafusion::physical_plan::repartition::RepartitionExec`] is
unspillable, while
+/// the [`sedona_spatial_join::exec::SpatialJoinExec`] is spillable.
+/// [`sedona_spatial_join::exec::SpatialJoinExec`] could consume all memory,
resulting a reservation
Review Comment:
Grammar fix in doc comment: change 'resulting a reservation' to 'resulting
in a reservation'.
```suggestion
/// [`sedona_spatial_join::exec::SpatialJoinExec`] could consume all memory,
resulting in a reservation
```
##########
sedona-cli/src/main.rs:
##########
@@ -187,3 +235,68 @@ fn parse_command(command: &str) -> Result<String, String> {
Err("-c flag expects only non empty commands".to_string())
}
}
+
+#[derive(Debug, Clone, Copy)]
+enum ByteUnit {
+ Byte,
+ KiB,
+ MiB,
+ GiB,
+ TiB,
+}
+
+impl ByteUnit {
+ fn multiplier(&self) -> u64 {
+ match self {
+ ByteUnit::Byte => 1,
+ ByteUnit::KiB => 1 << 10,
+ ByteUnit::MiB => 1 << 20,
+ ByteUnit::GiB => 1 << 30,
+ ByteUnit::TiB => 1 << 40,
+ }
+ }
+}
+
+fn parse_size_string(size: &str, label: &str) -> Result<usize, String> {
+ static BYTE_SUFFIXES: LazyLock<HashMap<&'static str, ByteUnit>> =
LazyLock::new(|| {
+ let mut m = HashMap::new();
+ m.insert("b", ByteUnit::Byte);
+ m.insert("k", ByteUnit::KiB);
+ m.insert("kb", ByteUnit::KiB);
+ m.insert("m", ByteUnit::MiB);
+ m.insert("mb", ByteUnit::MiB);
+ m.insert("g", ByteUnit::GiB);
+ m.insert("gb", ByteUnit::GiB);
+ m.insert("t", ByteUnit::TiB);
+ m.insert("tb", ByteUnit::TiB);
+ m
+ });
+
+ static SUFFIX_REGEX: LazyLock<regex::Regex> =
+ LazyLock::new(|| regex::Regex::new(r"^(-?[0-9]+)([a-z]+)?$").unwrap());
+
+ let lower = size.to_lowercase();
+ if let Some(caps) = SUFFIX_REGEX.captures(&lower) {
+ let num_str = caps.get(1).unwrap().as_str();
+ let num = num_str
+ .parse::<usize>()
+ .map_err(|_| format!("Invalid numeric value in {label}
'{size}'"))?;
Review Comment:
The regex allows negative values (`-?`), but the parse target is `usize`, so
negative inputs will always fail at parsing and produce a numeric parse error
rather than a more direct validation error. Consider removing the optional
minus from the regex so negatives fall into the general \"Invalid {label}\"
path, or explicitly detect a leading '-' and return a clearer message.
##########
sedona-cli/src/main.rs:
##########
@@ -57,6 +64,28 @@ struct Args {
)]
command: Vec<String>,
+ #[clap(
+ short = 'm',
+ long,
+ help = "The memory pool limitation (e.g. '10g'), default to None (no
limit)",
+ value_parser(extract_memory_pool_size)
+ )]
+ memory_limit: Option<usize>,
+
+ #[clap(
+ long,
+ help = "Specify the memory pool type 'greedy' or 'fair'",
+ default_value_t = PoolType::Greedy
+ )]
+ mem_pool_type: PoolType,
+
+ #[clap(
+ long,
+ help = "The fraction of memory reserved for unspillable consumers (0.0
- 1.0)",
+ default_value_t = DEFAULT_UNSPILLABLE_RESERVE_RATIO
+ )]
+ unspillable_reserve_ratio: f64,
Review Comment:
The help text documents `unspillable_reserve_ratio` as (0.0 - 1.0), but the
CLI currently accepts any `f64` value. This can lead to surprising behavior
(e.g., values > 1.0 effectively reserve all memory for unspillable; NaN
silently becomes 0 when cast later). Add a clap value parser / validator to
enforce an inclusive range (0.0..=1.0) and return a clear error when out of
range.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]