Copilot commented on code in PR #608:
URL: https://github.com/apache/sedona-db/pull/608#discussion_r2816935451
##########
python/sedonadb/python/sedonadb/_options.py:
##########
@@ -52,3 +108,93 @@ def width(self) -> Optional[int]:
@width.setter
def width(self, value: Optional[int]):
self._width = value
+
+ # --- Runtime options (must be set before first query) ---
+
+ @property
+ def memory_limit(self) -> Union[int, str, None]:
+ """Maximum memory for query execution.
+
+ Accepts an integer (bytes) or a human-readable string such as
+ `"4gb"`, `"512m"`, or `"1.5g"`. When set, a bounded memory pool is
+ created to enforce this limit. Without a memory limit, DataFusion's
+ default unbounded pool is used.
+
+ Must be set before the first query is executed.
+
+ Examples:
+
+ >>> sd = sedona.db.connect()
+ >>> sd.options.memory_limit = "4gb"
+ >>> sd.options.memory_limit = 4 * 1024 * 1024 * 1024 # equivalent
+ """
+ return self._memory_limit
+
+ @memory_limit.setter
+ def memory_limit(self, value: Union[int, str, None]) -> None:
+ self._check_runtime_mutable("memory_limit")
+ if value is not None and not isinstance(value, (int, str)):
+ raise TypeError(
+ f"memory_limit must be an int, str, or None, got
{type(value).__name__}"
+ )
+ self._memory_limit = value
+
+ @property
+ def temp_dir(self) -> Optional[str]:
+ """Directory for temporary/spill files.
+
+ When set, disk-based spilling will use this directory. When `None`,
+ DataFusion's default temporary directory is used.
+
+ Must be set before the first query is executed.
+ """
+ return self._temp_dir
+
+ @temp_dir.setter
+ def temp_dir(self, value: Optional[str]) -> None:
+ self._check_runtime_mutable("temp_dir")
+ self._temp_dir = value
+
+ @property
+ def memory_pool_type(self) -> str:
+ """Memory pool type: `"greedy"` or `"fair"`.
+
+ - `"greedy"`: A simple pool that grants reservations on a
+ first-come-first-served basis. This is the default.
+ - `"fair"`: A pool that fairly distributes memory among spillable
+ consumers and reserves a fraction for unspillable consumers
+ (configured via `unspillable_reserve_ratio`).
+
+ Only takes effect when `memory_limit` is set.
+ Must be set before the first query is executed.
+ """
+ return self._memory_pool_type
+
+ @memory_pool_type.setter
+ def memory_pool_type(self, value: Literal["greedy", "fair"]) -> None:
+ self._check_runtime_mutable("memory_pool_type")
+ if value not in ("greedy", "fair"):
+ raise ValueError(
+ f"memory_pool_type must be 'greedy' or 'fair', got '{value}'"
+ )
+ self._memory_pool_type = value
+
+ @property
+ def unspillable_reserve_ratio(self) -> Optional[float]:
+ """Fraction of memory reserved for unspillable consumers (0.0 - 1.0).
+
+ Only applies when `memory_pool_type` is `"fair"` and
+ `memory_limit` is set. Defaults to 0.2 when not explicitly set.
+
+ Must be set before the first query is executed.
+ """
+ return self._unspillable_reserve_ratio
+
+ @unspillable_reserve_ratio.setter
+ def unspillable_reserve_ratio(self, value: Optional[float]) -> None:
+ self._check_runtime_mutable("unspillable_reserve_ratio")
+ if value is not None and not (0.0 <= value <= 1.0):
+ raise ValueError(
+ f"unspillable_reserve_ratio must be between 0.0 and 1.0, got
{value}"
+ )
+ self._unspillable_reserve_ratio = value
Review Comment:
`unspillable_reserve_ratio` validates the numeric range but not the type. If
a user sets it to a string (e.g., "0.2"), the comparison will raise a
`TypeError` from Python rather than a clear validation error. Consider checking
`isinstance(value, (int, float))` (and casting to `float`) when `value` is not
`None`, and raising a `TypeError` with a helpful message otherwise.
```suggestion
if value is None:
self._unspillable_reserve_ratio = None
return
if not isinstance(value, (int, float)):
raise TypeError(
"unspillable_reserve_ratio must be a number between 0.0 and
1.0 "
f"or None, got {type(value).__name__}"
)
value_float = float(value)
if not (0.0 <= value_float <= 1.0):
raise ValueError(
f"unspillable_reserve_ratio must be between 0.0 and 1.0, got
{value_float}"
)
self._unspillable_reserve_ratio = value_float
```
##########
python/sedonadb/python/sedonadb/context.py:
##########
@@ -47,12 +52,46 @@ class SedonaContext:
╞═══════╡
│ 1 │
└───────┘
+
+ Configuring memory limits:
+
+ >>> sd = sedona.db.connect()
+ >>> sd.options.memory_limit = "4gb"
+ >>> sd.options.memory_pool_type = "fair"
+ >>> sd.options.temp_dir = "/tmp/sedona-spill"
"""
def __init__(self):
- self._impl = InternalContext()
+ self.__impl = None
self.options = Options()
+ @property
+ def _impl(self):
+ """Lazily initialize the internal Rust context on first use.
+
+ This allows runtime options (memory_limit, temp_dir, etc.) to be
+ configured via `self.options` before the context is created.
+ Once created, runtime options are frozen.
+ """
+ if self.__impl is None:
+ self.options._runtime_frozen = True
+
+ # Build a dict[str, str] of non-None runtime options
+ opts = {}
+ if self.options.memory_limit is not None:
+ opts["memory_limit"] = str(self.options.memory_limit)
+ if self.options.temp_dir is not None:
+ opts["temp_dir"] = self.options.temp_dir
+ if self.options.memory_pool_type is not None:
+ opts["memory_pool_type"] = self.options.memory_pool_type
+ if self.options.unspillable_reserve_ratio is not None:
+ opts["unspillable_reserve_ratio"] = str(
+ self.options.unspillable_reserve_ratio
+ )
+
+ self.__impl = InternalContext(opts)
+ return self.__impl
Review Comment:
`self.options._runtime_frozen` is set to `True` before
`InternalContext(opts)` is created. If context initialization fails (e.g.,
invalid memory_limit string), the exception will leave `__impl` as `None` but
runtime options permanently frozen, preventing the user from correcting options
and retrying. Freeze options only after successful `InternalContext` creation
(or unfreeze in an exception handler).
##########
python/sedonadb/python/sedonadb/context.py:
##########
@@ -381,7 +420,17 @@ def funcs(self) -> Functions:
def connect() -> SedonaContext:
- """Create a new [SedonaContext][sedonadb.context.SedonaContext]"""
+ """Create a new [SedonaContext][sedonadb.context.SedonaContext]
+
+ Runtime configuration (memory limits, spill directory, pool type)
+ can be set via `options` on the returned context before executing
+ the first query::
+
+ sd = sedona.db.connect()
+ sd.options.memory_limit = "4gb"
+ sd.options.memory_pool_type = "fair"
+ sd.options.temp_dir = "/tmp/sedona-spill"
+ """
return SedonaContext()
Review Comment:
The PR description/example shows `connect(...)` /
`SedonaContext.__init__(...)` accepting runtime configuration parameters, but
this `connect()` function still takes no arguments and configuration is only
possible via `sd.options` before first use. Either update the public API to
match the described signature or update the PR description/docs to reflect the
options-based configuration approach.
##########
python/sedonadb/python/sedonadb/_options.py:
##########
@@ -52,3 +108,93 @@ def width(self) -> Optional[int]:
@width.setter
def width(self, value: Optional[int]):
self._width = value
+
+ # --- Runtime options (must be set before first query) ---
+
+ @property
+ def memory_limit(self) -> Union[int, str, None]:
+ """Maximum memory for query execution.
+
+ Accepts an integer (bytes) or a human-readable string such as
+ `"4gb"`, `"512m"`, or `"1.5g"`. When set, a bounded memory pool is
+ created to enforce this limit. Without a memory limit, DataFusion's
+ default unbounded pool is used.
+
+ Must be set before the first query is executed.
+
+ Examples:
+
+ >>> sd = sedona.db.connect()
+ >>> sd.options.memory_limit = "4gb"
+ >>> sd.options.memory_limit = 4 * 1024 * 1024 * 1024 # equivalent
+ """
+ return self._memory_limit
+
+ @memory_limit.setter
+ def memory_limit(self, value: Union[int, str, None]) -> None:
+ self._check_runtime_mutable("memory_limit")
+ if value is not None and not isinstance(value, (int, str)):
+ raise TypeError(
+ f"memory_limit must be an int, str, or None, got
{type(value).__name__}"
+ )
+ self._memory_limit = value
+
+ @property
+ def temp_dir(self) -> Optional[str]:
+ """Directory for temporary/spill files.
+
+ When set, disk-based spilling will use this directory. When `None`,
+ DataFusion's default temporary directory is used.
+
+ Must be set before the first query is executed.
+ """
+ return self._temp_dir
+
+ @temp_dir.setter
+ def temp_dir(self, value: Optional[str]) -> None:
+ self._check_runtime_mutable("temp_dir")
+ self._temp_dir = value
Review Comment:
`Options.temp_dir` setter doesn't validate the input type. Passing a
non-string value (e.g., a `Path` or an int) will be accepted here but will
later fail (or behave unexpectedly) when building the `dict[str, str]` and
converting it to the Rust `HashMap<String, String>`. Consider validating
`value` is `str`/`None` (and optionally accepting `PathLike` and converting to
`str`) to raise a clearer error at the point of assignment.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]