codeant-ai-for-open-source[bot] commented on code in PR #36538:
URL: https://github.com/apache/superset/pull/36538#discussion_r2706279927
##########
superset/examples/data_loading.py:
##########
@@ -14,44 +14,169 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-from .bart_lines import load_bart_lines
-from .big_data import load_big_data
-from .birth_names import load_birth_names
-from .country_map import load_country_map_data
+"""Auto-discover and load example datasets from Parquet files."""
+
+import logging
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional
+
+import yaml
+
+# Import loaders that have custom logic (dashboards, CSS, etc.)
+from superset.cli.test_loaders import load_big_data
+
from .css_templates import load_css_templates
-from .deck import load_deck_dash
-from .energy import load_energy
-from .flights import load_flights
-from .international_sales import load_international_sales
-from .long_lat import load_long_lat_data
-from .misc_dashboard import load_misc_dashboard
-from .multiformat_time_series import load_multiformat_time_series
-from .paris import load_paris_iris_geojson
-from .random_time_series import load_random_time_series_data
-from .sf_population_polygons import load_sf_population_polygons
-from .supported_charts_dashboard import load_supported_charts_dashboard
-from .tabbed_dashboard import load_tabbed_dashboard
+
+# Import generic loader for Parquet datasets
+from .generic_loader import create_generic_loader
from .utils import load_examples_from_configs
-from .world_bank import load_world_bank_health_n_pop
+logger = logging.getLogger(__name__)
+
+
+def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
+    """Read table_name, schema, and data_file from dataset.yaml if it exists."""
+ result: Dict[str, Optional[str]] = {
+ "table_name": None,
+ "schema": None,
+ "data_file": None,
+ }
+ dataset_yaml = example_dir / "dataset.yaml"
+ if dataset_yaml.exists():
+ try:
+ with open(dataset_yaml) as f:
+ config = yaml.safe_load(f)
+ result["table_name"] = config.get("table_name")
+ result["data_file"] = config.get("data_file")
+ schema = config.get("schema")
+            # Treat SQLite's 'main' schema as null (use target database default)
+ result["schema"] = None if schema == "main" else schema
+ except Exception:
+ logger.debug("Could not read dataset.yaml from %s", example_dir)
+ return result
+
+
+def get_examples_directory() -> Path:
+ """Get the path to the examples directory."""
+ from .helpers import get_examples_folder
+
+ return Path(get_examples_folder())
+
+
+def _get_multi_dataset_config(
+ example_dir: Path, dataset_name: str, data_file: Path
+) -> Dict[str, Any]:
+ """Read config for a multi-dataset example from datasets/{name}.yaml."""
+ datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
+ result: Dict[str, Any] = {
+ "table_name": dataset_name,
+ "schema": None,
+ "data_file": data_file,
+ }
+
+ if not datasets_yaml.exists():
+ return result
+
+ try:
+ with open(datasets_yaml) as f:
+ yaml_config = yaml.safe_load(f)
+            result["table_name"] = yaml_config.get("table_name") or dataset_name
+ raw_schema = yaml_config.get("schema")
+ result["schema"] = None if raw_schema == "main" else raw_schema
+
+ # Use explicit data_file from YAML if specified
+ explicit_data_file = yaml_config.get("data_file")
+ if explicit_data_file:
+ candidate = example_dir / "data" / explicit_data_file
+ if candidate.exists():
+ result["data_file"] = candidate
+ else:
+ logger.warning(
+ "data_file '%s' specified in YAML does not exist",
+ explicit_data_file,
+ )
+ except Exception:
+ logger.debug("Could not read datasets yaml from %s", datasets_yaml)
+
+ return result
+
+
+def discover_datasets() -> Dict[str, Callable[..., None]]:
+ """Auto-discover all example datasets and create loaders for them.
+
+ Examples are organized as:
+        superset/examples/{example_name}/data.parquet         # Single dataset
+        superset/examples/{example_name}/data/{name}.parquet  # Multiple datasets
+
+    Table names and data file references are read from dataset.yaml/datasets/*.yaml
+    if present, otherwise derived from the folder/file name.
+ """
+ loaders: Dict[str, Callable[..., None]] = {}
+ examples_dir = get_examples_directory()
+
+ if not examples_dir.exists():
+ return loaders
+
+ # Discover single data.parquet files (simple examples)
+ for data_file in sorted(examples_dir.glob("*/data.parquet")):
+ example_dir = data_file.parent
+ dataset_name = example_dir.name
+
+ if dataset_name.startswith("_"):
+ continue
+
+ config = get_dataset_config_from_yaml(example_dir)
+ table_name = config["table_name"] or dataset_name
+ explicit_data_file = config.get("data_file")
+ if explicit_data_file:
+ resolved_file = example_dir / explicit_data_file
+ else:
+ resolved_file = data_file
+ if explicit_data_file and not resolved_file.exists():
+ logger.warning("data_file '%s' does not exist", explicit_data_file)
+ resolved_file = data_file
+
+ loader_name = f"load_{dataset_name}"
+ loaders[loader_name] = create_generic_loader(
+ dataset_name,
+ table_name=table_name,
+ schema=config["schema"],
+ data_file=resolved_file,
+ )
+
+ # Discover multiple parquet files in data/ folders (complex examples)
+ for data_file in sorted(examples_dir.glob("*/data/*.parquet")):
+ dataset_name = data_file.stem
+ example_dir = data_file.parent.parent
+
+ if example_dir.name.startswith("_"):
+ continue
+
+        config = _get_multi_dataset_config(example_dir, dataset_name, data_file)
+ loader_name = f"load_{dataset_name}"
+ if loader_name not in loaders:
+ loaders[loader_name] = create_generic_loader(
+ dataset_name,
+ table_name=config["table_name"],
+ schema=config["schema"],
+ data_file=config["data_file"],
+ )
+
+ return loaders
+
+
+# Auto-discover and create all dataset loaders
+_auto_loaders = discover_datasets()
+
+# Add auto-discovered loaders to module namespace
Review Comment:
**Suggestion:** The module eagerly calls `discover_datasets()` at import
time, which internally uses `get_examples_folder()` and Flask's `current_app`;
importing this module outside of an active Flask application context (e.g., in
tests or tooling) will raise a `RuntimeError` and prevent the module from
importing at all, so the discovery should be wrapped to fail gracefully instead
of crashing on import. [logic error]
<details>
<summary><b>Severity Level:</b> Critical 🚨</summary>
```mdx
- ❌ Importing module raises RuntimeError blocking imports.
- ❌ Tests importing examples module fail immediately.
- ⚠️ CLI tooling performing static imports is broken.
- ⚠️ Static analysis / type-checkers error on import.
```
</details>
```suggestion
try:
_auto_loaders = discover_datasets()
except Exception:
    logger.debug(
        "Skipping auto-discovery of example datasets (likely no app context)",
        exc_info=True,
    )
_auto_loaders = {}
# Add auto-discovered loaders to module namespace
if _auto_loaders:
```
<details>
<summary><b>Steps of Reproduction ✅ </b></summary>
```mdx
1. In a fresh Python process (no Flask app context), import the module:
`import
superset.examples.data_loading`.
- This triggers module import of file at
`superset/examples/data_loading.py`.
- The module-level statement at `data_loading.py:168` executes
`_auto_loaders =
discover_datasets()`.
2. `discover_datasets()` at `data_loading.py:104` calls
`get_examples_directory()` at
`data_loading.py:59`.
- `get_examples_directory()` does `from .helpers import
get_examples_folder` and then
`Path(get_examples_folder())`.
3. `get_examples_folder()` is defined in `superset/examples/helpers.py:53`
and uses
`current_app.config["BASE_DIR"]` (see `helpers.py:31` import of
`current_app` and
`helpers.py:53` implementation).
- Accessing `flask.current_app` outside an active application context
raises
`RuntimeError: Working outside of application context.`
4. As a result, the import in step 1 fails with a RuntimeError coming from
`helpers.get_examples_folder()` and the process cannot import
`superset.examples.data_loading`.
- Observed call chain: data_loading.py:168 -> data_loading.py:59 ->
helpers.get_examples_folder() (helpers.py:53) -> current_app
(helpers.py:31) causing
the exception.
5. Because discovery is done eagerly at import-time (module-level) the
failure prevents
any consumer from importing this module for tests, tooling, or static
inspection without a
Flask app context.
```
</details>
<details>
<summary><b>Prompt for AI Agent 🤖 </b></summary>
```mdx
This is a comment left during a code review.
**Path:** superset/examples/data_loading.py
**Line:** 169:171
**Comment:**
*Logic Error: The module eagerly calls `discover_datasets()` at import
time, which internally uses `get_examples_folder()` and Flask's `current_app`;
importing this module outside of an active Flask application context (e.g., in
tests or tooling) will raise a `RuntimeError` and prevent the module from
importing at all, so the discovery should be wrapped to fail gracefully instead
of crashing on import.
Validate the correctness of the flagged issue. If correct, How can I resolve
this? If you propose a fix, implement it and please make it concise.
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]