codeant-ai-for-open-source[bot] commented on code in PR #36538:
URL: https://github.com/apache/superset/pull/36538#discussion_r2706279927
##########
superset/examples/data_loading.py:
##########
@@ -14,44 +14,169 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-from .bart_lines import load_bart_lines
-from .big_data import load_big_data
-from .birth_names import load_birth_names
-from .country_map import load_country_map_data
+"""Auto-discover and load example datasets from Parquet files."""
+
+import logging
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional
+
+import yaml
+
+# Import loaders that have custom logic (dashboards, CSS, etc.)
+from superset.cli.test_loaders import load_big_data
+
from .css_templates import load_css_templates
-from .deck import load_deck_dash
-from .energy import load_energy
-from .flights import load_flights
-from .international_sales import load_international_sales
-from .long_lat import load_long_lat_data
-from .misc_dashboard import load_misc_dashboard
-from .multiformat_time_series import load_multiformat_time_series
-from .paris import load_paris_iris_geojson
-from .random_time_series import load_random_time_series_data
-from .sf_population_polygons import load_sf_population_polygons
-from .supported_charts_dashboard import load_supported_charts_dashboard
-from .tabbed_dashboard import load_tabbed_dashboard
+
+# Import generic loader for Parquet datasets
+from .generic_loader import create_generic_loader
from .utils import load_examples_from_configs
-from .world_bank import load_world_bank_health_n_pop
+logger = logging.getLogger(__name__)
+
+
+def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
+    """Read table_name, schema, and data_file from dataset.yaml if it exists."""
+ result: Dict[str, Optional[str]] = {
+ "table_name": None,
+ "schema": None,
+ "data_file": None,
+ }
+ dataset_yaml = example_dir / "dataset.yaml"
+ if dataset_yaml.exists():
+ try:
+ with open(dataset_yaml) as f:
+ config = yaml.safe_load(f)
+ result["table_name"] = config.get("table_name")
+ result["data_file"] = config.get("data_file")
+ schema = config.get("schema")
+            # Treat SQLite's 'main' schema as null (use target database default)
+ result["schema"] = None if schema == "main" else schema
+ except Exception:
+ logger.debug("Could not read dataset.yaml from %s", example_dir)
+ return result
+
+
+def get_examples_directory() -> Path:
+ """Get the path to the examples directory."""
+ from .helpers import get_examples_folder
+
+ return Path(get_examples_folder())
+
+
+def _get_multi_dataset_config(
+ example_dir: Path, dataset_name: str, data_file: Path
+) -> Dict[str, Any]:
+ """Read config for a multi-dataset example from datasets/{name}.yaml."""
+ datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
+ result: Dict[str, Any] = {
+ "table_name": dataset_name,
+ "schema": None,
+ "data_file": data_file,
+ }
+
+ if not datasets_yaml.exists():
+ return result
+
+ try:
+ with open(datasets_yaml) as f:
+ yaml_config = yaml.safe_load(f)
+            result["table_name"] = yaml_config.get("table_name") or dataset_name
+ raw_schema = yaml_config.get("schema")
+ result["schema"] = None if raw_schema == "main" else raw_schema
+
+ # Use explicit data_file from YAML if specified
+ explicit_data_file = yaml_config.get("data_file")
+ if explicit_data_file:
+ candidate = example_dir / "data" / explicit_data_file
+ if candidate.exists():
+ result["data_file"] = candidate
+ else:
+ logger.warning(
+ "data_file '%s' specified in YAML does not exist",
+ explicit_data_file,
+ )
+ except Exception:
+ logger.debug("Could not read datasets yaml from %s", datasets_yaml)
+
+ return result
+
+
+def discover_datasets() -> Dict[str, Callable[..., None]]:
+ """Auto-discover all example datasets and create loaders for them.
+
+ Examples are organized as:
+        superset/examples/{example_name}/data.parquet         # Single dataset
+        superset/examples/{example_name}/data/{name}.parquet  # Multiple datasets
+
+    Table names and data file references are read from dataset.yaml/datasets/*.yaml
+    if present, otherwise derived from the folder/file name.
+ """
+ loaders: Dict[str, Callable[..., None]] = {}
+ examples_dir = get_examples_directory()
+
+ if not examples_dir.exists():
+ return loaders
+
+ # Discover single data.parquet files (simple examples)
+ for data_file in sorted(examples_dir.glob("*/data.parquet")):
+ example_dir = data_file.parent
+ dataset_name = example_dir.name
+
+ if dataset_name.startswith("_"):
+ continue
+
+ config = get_dataset_config_from_yaml(example_dir)
+ table_name = config["table_name"] or dataset_name
+ explicit_data_file = config.get("data_file")
+ if explicit_data_file:
+ resolved_file = example_dir / explicit_data_file
+ else:
+ resolved_file = data_file
+ if explicit_data_file and not resolved_file.exists():
+ logger.warning("data_file '%s' does not exist", explicit_data_file)
+ resolved_file = data_file
+
+ loader_name = f"load_{dataset_name}"
+ loaders[loader_name] = create_generic_loader(
+ dataset_name,
+ table_name=table_name,
+ schema=config["schema"],
+ data_file=resolved_file,
+ )
+
+ # Discover multiple parquet files in data/ folders (complex examples)
+ for data_file in sorted(examples_dir.glob("*/data/*.parquet")):
+ dataset_name = data_file.stem
+ example_dir = data_file.parent.parent
+
+ if example_dir.name.startswith("_"):
+ continue
+
+        config = _get_multi_dataset_config(example_dir, dataset_name, data_file)
+ loader_name = f"load_{dataset_name}"
+ if loader_name not in loaders:
+ loaders[loader_name] = create_generic_loader(
+ dataset_name,
+ table_name=config["table_name"],
+ schema=config["schema"],
+ data_file=config["data_file"],
+ )
+
+ return loaders
+
+
+# Auto-discover and create all dataset loaders
+_auto_loaders = discover_datasets()
+
+# Add auto-discovered loaders to module namespace
Review Comment:
**Suggestion:** The module eagerly calls `discover_datasets()` at import
time, which internally uses `get_examples_folder()` and Flask's `current_app`;
importing this module outside of an active Flask application context (e.g., in
tests or tooling) will raise a `RuntimeError` and prevent the module from
importing at all, so the discovery should be wrapped to fail gracefully instead
of crashing on import. [logic error]
<details>
<summary><b>Severity Level:</b> Critical 🚨</summary>
```mdx
- ❌ Importing module raises RuntimeError blocking imports.
- ❌ Tests importing examples module fail immediately.
- ⚠️ CLI tooling performing static imports is broken.
- ⚠️ Static analysis / type-checkers error on import.
```
</details>
```suggestion
try:
_auto_loaders = discover_datasets()
except Exception:
    logger.debug(
        "Skipping auto-discovery of example datasets (likely no app context)",
        exc_info=True,
    )
_auto_loaders = {}
# Add auto-discovered loaders to module namespace
if _auto_loaders:
```
<details>
<summary><b>Steps of Reproduction ✅ </b></summary>
```mdx
1. In a fresh Python process (no Flask app context), import the module:
`import
superset.examples.data_loading`.
- This triggers module import of file at
`superset/examples/data_loading.py`.
- The module-level statement at `data_loading.py:168` executes
`_auto_loaders =
discover_datasets()`.
2. `discover_datasets()` at `data_loading.py:104` calls
`get_examples_directory()` at
`data_loading.py:59`.
- `get_examples_directory()` does `from .helpers import
get_examples_folder` and then
`Path(get_examples_folder())`.
3. `get_examples_folder()` is defined in `superset/examples/helpers.py:53`
and uses
`current_app.config["BASE_DIR"]` (see `helpers.py:31` import of
`current_app` and
`helpers.py:53` implementation).
- Accessing `flask.current_app` outside an active application context
raises
`RuntimeError: Working outside of application context.`
4. As a result, the import in step 1 fails with a RuntimeError coming from
`helpers.get_examples_folder()` and the process cannot import
`superset.examples.data_loading`.
- Observed call chain: data_loading.py:168 -> data_loading.py:59 ->
helpers.get_examples_folder() (helpers.py:53) -> current_app
(helpers.py:31) causing
the exception.
5. Because discovery is done eagerly at import-time (module-level) the
failure prevents
any consumer from importing this module for tests, tooling, or static
inspection without a
Flask app context.
```
</details>
<details>
<summary><b>Prompt for AI Agent 🤖 </b></summary>
```mdx
This is a comment left during a code review.
**Path:** superset/examples/data_loading.py
**Line:** 169:171
**Comment:**
*Logic Error: The module eagerly calls `discover_datasets()` at import
time, which internally uses `get_examples_folder()` and Flask's `current_app`;
importing this module outside of an active Flask application context (e.g., in
tests or tooling) will raise a `RuntimeError` and prevent the module from
importing at all, so the discovery should be wrapped to fail gracefully instead
of crashing on import.
Validate the correctness of the flagged issue. If correct, How can I resolve
this? If you propose a fix, implement it and please make it concise.
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]