This is an automated email from the ASF dual-hosted git repository.

beto pushed a commit to branch explorable
in repository https://gitbox.apache.org/repos/asf/superset.git

commit e1899f1014f4b20464b243b04fa5290ab0a39d92
Author: Beto Dealmeida <[email protected]>
AuthorDate: Fri Oct 17 16:37:26 2025 -0400

    GroupFilter
---
 superset/semantic_layers/snowflake_.py | 393 +++++++++++++++++++++++++++++++--
 1 file changed, 378 insertions(+), 15 deletions(-)

diff --git a/superset/semantic_layers/snowflake_.py 
b/superset/semantic_layers/snowflake_.py
index 15eb8c1048..97e92944f9 100644
--- a/superset/semantic_layers/snowflake_.py
+++ b/superset/semantic_layers/snowflake_.py
@@ -674,7 +674,6 @@ class SnowflakeExplorable:
         order: list[tuple[Metric | Dimension, OrderDirection]] | None = None,
         limit: int | None = None,
         offset: int | None = None,
-        *,
         group_limit: GroupLimit | None = None,
     ) -> tuple[str, tuple[FilterValues]]:
         """
@@ -683,16 +682,6 @@ class SnowflakeExplorable:
         This also returns the parameters need to run `cursor.execute()`, passed
         separately to prevent SQL injection.
         """
-
-        def alias(element: Metric | Dimension) -> str:
-            """
-            Alias a metric or dimension.
-            """
-            return f"{element.id} AS {self._quote(element.id)}"
-
-        dimension_arguments = ", ".join(alias(dimension) for dimension in 
dimensions)
-        metric_arguments = ", ".join(alias(metric) for metric in metrics)
-
         filters = filters or set()
         where_clause, where_parameters = self._build_predicates(
             {filter_ for filter_ in filters if filter_.type == 
PredicateType.WHERE}
@@ -700,11 +689,71 @@ class SnowflakeExplorable:
         having_clause, having_parameters = self._build_predicates(
             {filter_ for filter_ in filters if filter_.type == 
PredicateType.HAVING}
         )
-        order_clause = ", ".join(
-            f"{element.id} {direction.value}" for element, direction in (order 
or [])
+
+        if group_limit:
+            query = self._build_query_with_group_limit(
+                metrics,
+                dimensions,
+                where_clause,
+                having_clause,
+                order,
+                limit,
+                offset,
+                group_limit,
+            )
+        else:
+            query = self._build_simple_query(
+                metrics,
+                dimensions,
+                where_clause,
+                having_clause,
+                order,
+                limit,
+                offset,
+            )
+
+        return query, where_parameters + having_parameters
+
+    def _alias_element(self, element: Metric | Dimension) -> str:
+        """
+        Generate an aliased column expression for a metric or dimension.
+        """
+        return f"{element.id} AS {self._quote(element.id)}"
+
+    def _build_order_clause(
+        self,
+        order: list[tuple[Metric | Dimension, OrderDirection]] | None,
+    ) -> str:
+        """
+        Build the ORDER BY clause from a list of (element, direction) tuples.
+        """
+        if not order:
+            return ""
+        return ", ".join(
+            f"{self._quote(element.id)} {direction.value}"
+            for element, direction in order
+        )
+
+    def _build_simple_query(
+        self,
+        metrics: list[Metric],
+        dimensions: list[Dimension],
+        where_clause: str,
+        having_clause: str,
+        order: list[tuple[Metric | Dimension, OrderDirection]] | None,
+        limit: int | None,
+        offset: int | None,
+    ) -> str:
+        """
+        Build a query without group limiting.
+        """
+        dimension_arguments = ", ".join(
+            self._alias_element(dimension) for dimension in dimensions
         )
+        metric_arguments = ", ".join(self._alias_element(metric) for metric in 
metrics)
+        order_clause = self._build_order_clause(order)
 
-        query = dedent(
+        return dedent(
             f"""
             SELECT * FROM SEMANTIC_VIEW(
                 {self.uid()}
@@ -719,7 +768,251 @@ class SnowflakeExplorable:
             """  # noqa: S608
         )
 
-        return query, where_parameters + having_parameters
+    def _build_top_groups_cte(
+        self,
+        group_limit: GroupLimit,
+        where_clause: str,
+        having_clause: str,
+    ) -> str:
+        """
+        Build a CTE that finds the top N combinations of limited dimensions.
+        """
+        limited_dimension_arguments = ", ".join(
+            self._alias_element(dimension) for dimension in 
group_limit.dimensions
+        )
+        limited_dimension_names = ", ".join(
+            self._quote(dimension.id) for dimension in group_limit.dimensions
+        )
+
+        return dedent(
+            f"""
+            WITH top_groups AS (
+                SELECT {limited_dimension_names}
+                FROM SEMANTIC_VIEW(
+                    {self.uid()}
+                    DIMENSIONS {limited_dimension_arguments}
+                    METRICS {group_limit.metric.id}
+                        AS {self._quote(group_limit.metric.id)}
+                    {"WHERE " + where_clause if where_clause else ""}
+                )
+                {"HAVING " + having_clause if having_clause else ""}
+                ORDER BY
+                    {self._quote(group_limit.metric.id)} 
{group_limit.direction.value}
+                LIMIT {group_limit.top}
+            )
+            """  # noqa: S608
+        )
+
+    def _build_group_filter(self, group_limit: GroupLimit) -> str:
+        """
+        Build a WHERE filter that restricts results to top N groups.
+        """
+        if len(group_limit.dimensions) == 1:
+            dimension_id = self._quote(group_limit.dimensions[0].id)
+            return f"{dimension_id} IN (SELECT {dimension_id} FROM 
top_groups)"  # noqa: S608
+
+        # Multi-column IN clause
+        dimension_tuple = ", ".join(
+            self._quote(dim.id) for dim in group_limit.dimensions
+        )
+        return f"({dimension_tuple}) IN (SELECT {dimension_tuple} FROM 
top_groups)"  # noqa: S608
+
+    def _build_case_expression(
+        self,
+        dimension: Dimension,
+        group_condition: str,
+    ) -> str:
+        """
+        Build a CASE expression that replaces non-top values with 'Other'.
+
+        Args:
+            dimension: The dimension to build the CASE for
+            group_condition: The condition to check if value is in top groups
+                            (e.g., "staff_id IN (SELECT staff_id FROM 
top_groups)")
+
+        Returns:
+            SQL CASE expression
+        """
+        dimension_id = self._quote(dimension.id)
+        return f"""CASE
+            WHEN {group_condition} THEN {dimension_id}
+            ELSE CAST('Other' AS VARCHAR)
+        END"""
+
+    def _build_query_with_others(
+        self,
+        metrics: list[Metric],
+        dimensions: list[Dimension],
+        where_clause: str,
+        having_clause: str,
+        order: list[tuple[Metric | Dimension, OrderDirection]] | None,
+        limit: int | None,
+        offset: int | None,
+        group_limit: GroupLimit,
+    ) -> str:
+        """
+        Build a query that groups non-top N values as 'Other'.
+
+        This uses a two-stage approach:
+        1. CTE to find top N groups
+        2. Subquery with CASE expressions to replace non-top values with 
'Other'
+        3. Outer query to re-aggregate with the new grouping
+        """
+        top_groups_cte = self._build_top_groups_cte(
+            group_limit,
+            where_clause,
+            having_clause,
+        )
+
+        # Determine which dimensions are limited vs non-limited
+        limited_dimension_ids = {dim.id for dim in group_limit.dimensions}
+        non_limited_dimensions = [
+            dim for dim in dimensions if dim.id not in limited_dimension_ids
+        ]
+
+        # Build the group condition for CASE expressions
+        if len(group_limit.dimensions) == 1:
+            dimension_id = self._quote(group_limit.dimensions[0].id)
+            group_condition = f"{dimension_id} IN (SELECT {dimension_id} FROM 
top_groups)"  # noqa: S608
+        else:
+            dimension_tuple = ", ".join(
+                self._quote(dim.id) for dim in group_limit.dimensions
+            )
+            group_condition = f"({dimension_tuple}) IN (SELECT 
{dimension_tuple} FROM top_groups)"  # noqa: S608
+
+        # Build CASE expressions for limited dimensions
+        case_expressions = []
+        case_expressions_for_groupby = []
+        for dim in group_limit.dimensions:
+            case_expr = self._build_case_expression(dim, group_condition)
+            alias = self._quote(dim.id)
+            case_expressions.append(f"{case_expr} AS {alias}")
+            # Store the full CASE expression for GROUP BY (not just alias)
+            case_expressions_for_groupby.append(case_expr)
+
+        # Build SELECT for non-limited dimensions (pass through)
+        non_limited_selects = [
+            f"{self._quote(dim.id)} AS {self._quote(dim.id)}"
+            for dim in non_limited_dimensions
+        ]
+
+        # Build metric aggregations
+        metric_aggregations = [
+            f"SUM({self._quote(metric.id)}) AS {self._quote(metric.id)}"
+            for metric in metrics
+        ]
+
+        # Build the subquery that gets raw data from SEMANTIC_VIEW
+        dimension_arguments = ", ".join(
+            self._alias_element(dimension) for dimension in dimensions
+        )
+        metric_arguments = ", ".join(self._alias_element(metric) for metric in 
metrics)
+
+        subquery = dedent(
+            f"""
+            raw_data AS (
+                SELECT * FROM SEMANTIC_VIEW(
+                    {self.uid()}
+                    DIMENSIONS {dimension_arguments}
+                    METRICS {metric_arguments}
+                    {"WHERE " + where_clause if where_clause else ""}
+                )
+                {"HAVING " + having_clause if having_clause else ""}
+            )
+            """  # noqa: S608
+        )
+
+        # Build GROUP BY clause (full CASE expressions + non-limited 
dimensions)
+        # We need to repeat the full CASE expressions, not use aliases, because
+        # Snowflake may interpret the alias as the original column reference
+        group_by_columns = case_expressions_for_groupby + [
+            self._quote(dim.id) for dim in non_limited_dimensions
+        ]
+        group_by_clause = ", ".join(group_by_columns)
+
+        # Build final SELECT columns
+        select_columns = case_expressions + non_limited_selects + 
metric_aggregations
+        select_clause = ",\n    ".join(select_columns)
+
+        # Build ORDER BY clause (need to reference the aliased columns)
+        order_clause = self._build_order_clause(order)
+
+        return dedent(
+            f"""
+            {top_groups_cte},
+            {subquery}
+            SELECT
+                {select_clause}
+            FROM raw_data
+            GROUP BY {group_by_clause}
+            {"ORDER BY " + order_clause if order_clause else ""}
+            {"LIMIT " + str(limit) if limit is not None else ""}
+            {"OFFSET " + str(offset) if offset is not None else ""}
+            """  # noqa: S608
+        )
+
+    def _build_query_with_group_limit(
+        self,
+        metrics: list[Metric],
+        dimensions: list[Dimension],
+        where_clause: str,
+        having_clause: str,
+        order: list[tuple[Metric | Dimension, OrderDirection]] | None,
+        limit: int | None,
+        offset: int | None,
+        group_limit: GroupLimit,
+    ) -> str:
+        """
+        Build a query with group limiting (top N groups).
+
+        If group_others is True, groups non-top values as 'Other'.
+        Otherwise, filters to show only top N groups.
+        """
+        if group_limit.group_others:
+            return self._build_query_with_others(
+                metrics,
+                dimensions,
+                where_clause,
+                having_clause,
+                order,
+                limit,
+                offset,
+                group_limit,
+            )
+
+        # Standard group limiting: just filter to top N groups
+        # We can't use CTE references inside SEMANTIC_VIEW(), so we wrap it
+        dimension_arguments = ", ".join(
+            self._alias_element(dimension) for dimension in dimensions
+        )
+        metric_arguments = ", ".join(self._alias_element(metric) for metric in 
metrics)
+        order_clause = self._build_order_clause(order)
+
+        top_groups_cte = self._build_top_groups_cte(
+            group_limit,
+            where_clause,
+            having_clause,
+        )
+        group_filter = self._build_group_filter(group_limit)
+
+        return dedent(
+            f"""
+            {top_groups_cte}
+            SELECT * FROM (
+                SELECT * FROM SEMANTIC_VIEW(
+                    {self.uid()}
+                    {"DIMENSIONS " + dimension_arguments if 
dimension_arguments else ""}
+                    {"METRICS " + metric_arguments if metric_arguments else ""}
+                    {"WHERE " + where_clause if where_clause else ""}
+                )
+                {"HAVING " + having_clause if having_clause else ""}
+            ) AS subquery
+            WHERE {group_filter}
+            {"ORDER BY " + order_clause if order_clause else ""}
+            {"LIMIT " + str(limit) if limit is not None else ""}
+            {"OFFSET " + str(offset) if offset is not None else ""}
+            """  # noqa: S608
+        )
 
     __repr__ = uid
 
@@ -825,3 +1118,73 @@ if __name__ == "__main__":
             },
         )
     )
+
+    # Example 1: Group limiting without group_others
+    print("\n" + "=" * 80)
+    print("EXAMPLE 1: Group Limiting (group_others=False)")
+    print("Top 3 categories by total sales price")
+    print("=" * 80)
+
+    year_dim = Dimension(
+        id="DATE.YEAR",
+        name="YEAR",
+        type=INTEGER,
+        description=None,
+        definition=None,
+        grain=None,
+    )
+    category_dim = Dimension(
+        id="ITEM.CATEGORY",
+        name="CATEGORY",
+        type=STRING,
+        description=None,
+        definition="I_CATEGORY",
+        grain=None,
+    )
+    sales_metric = Metric(
+        "STORESALES.TOTALSALESPRICE",
+        "TOTALSALESPRICE",
+        NUMBER,
+        None,
+        None,
+    )
+
+    query_without_others, _ = explorable._get_query(
+        metrics=[sales_metric],
+        dimensions=[year_dim, category_dim],
+        filters={
+            NativeFilter(PredicateType.WHERE, "Year = '2002'"),
+            NativeFilter(PredicateType.WHERE, "Month = '12'"),
+        },
+        group_limit=GroupLimit(
+            dimensions=[category_dim],
+            top=3,
+            metric=sales_metric,
+            direction=OrderDirection.DESC,
+            group_others=False,
+        ),
+    )
+    print(query_without_others)
+
+    # Example 2: Group limiting with group_others
+    print("\n" + "=" * 80)
+    print("EXAMPLE 2: Group Limiting (group_others=True)")
+    print("Top 3 categories by total sales price + 'Other'")
+    print("=" * 80)
+
+    query_with_others, _ = explorable._get_query(
+        metrics=[sales_metric],
+        dimensions=[year_dim, category_dim],
+        filters={
+            NativeFilter(PredicateType.WHERE, "Year = '2002'"),
+            NativeFilter(PredicateType.WHERE, "Month = '12'"),
+        },
+        group_limit=GroupLimit(
+            dimensions=[category_dim],
+            top=3,
+            metric=sales_metric,
+            direction=OrderDirection.DESC,
+            group_others=True,
+        ),
+    )
+    print(query_with_others)

Reply via email to