codeant-ai-for-open-source[bot] commented on code in PR #37950:
URL: https://github.com/apache/superset/pull/37950#discussion_r2817910818


##########
extensions/ai_assistant/backend/src/ai_assistant/tools.py:
##########
@@ -0,0 +1,532 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Vambery AI Agent Database Tools.
+
+These tools allow the AI agent to introspect database schemas and execute
+SQL queries during its reasoning process.
+Uses Superset's Database model for all database operations, respecting
+security permissions and row-level security.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from superset.extensions import db
+from superset.models.core import Database
+from superset.sql.parse import Table
+
+logger = logging.getLogger(__name__)
+
+
+def _extract_result_data(result: Any) -> tuple[list[dict[str, Any]], 
list[str]]:
+    """
+    Extract rows (as list of dicts) and column names from a QueryResult.
+
+    The QueryResult from database.execute() stores results in
+    result.statements[i].data as a Pandas DataFrame.
+    """
+    rows: list[dict[str, Any]] = []
+    columns: list[str] = []
+    try:
+        if hasattr(result, "statements") and result.statements:
+            stmt = result.statements[0]
+            df = stmt.data
+            if df is not None and hasattr(df, "to_dict"):
+                # Pandas DataFrame
+                rows = df.to_dict(orient="records")
+                columns = list(df.columns)
+            elif df is not None and hasattr(df, "keys"):
+                # dict-like
+                rows = [df] if not isinstance(df, list) else df
+                columns = list(df.keys()) if isinstance(df, dict) else []
+    except Exception as ex:
+        logger.debug("Could not extract result data: %s", ex)
+
+    # Fallback: legacy result format
+    if not rows:
+        if hasattr(result, "data") and result.data:
+            rows = result.data if isinstance(result.data, list) else []
+        elif hasattr(result, "rows") and hasattr(result, "columns"):
+            columns = result.columns
+            rows = [dict(zip(columns, row, strict=False)) for row in 
result.rows]
+
+    return rows, columns
+
+
# --------------------------------------------------------------------------
# OpenAI function/tool definitions (sent to the LLM)
# --------------------------------------------------------------------------

# JSON-schema tool declarations in the shape expected by the OpenAI
# chat-completions "tools" parameter. Each entry advertises one capability
# the agent may invoke during its reasoning loop; the "name" fields here
# must stay in sync with the backend tool dispatcher.
TOOL_DEFINITIONS: list[dict[str, Any]] = [
    # Schema discovery: no arguments; returns every schema name.
    {
        "type": "function",
        "function": {
            "name": "list_schemas",
            "description": (
                "List all available schemas in the connected database. "
                "Use this first to understand the database structure."
            ),
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        },
    },
    # Table discovery within a single schema.
    {
        "type": "function",
        "function": {
            "name": "list_tables",
            "description": (
                "List all tables in a specific schema. "
                "Use this to discover which tables are available."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "schema_name": {
                        "type": "string",
                        "description": "The schema name to list tables from",
                    },
                },
                "required": ["schema_name"],
            },
        },
    },
    # Column metadata (name/type/nullability) for one table.
    {
        "type": "function",
        "function": {
            "name": "get_table_columns",
            "description": (
                "Get the column names, data types, and nullable info for a table. "
                "Use this to understand the structure of a table before writing queries."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "table_name": {
                        "type": "string",
                        "description": "Name of the table",
                    },
                    "schema_name": {
                        "type": "string",
                        "description": "Schema the table belongs to",
                    },
                },
                "required": ["table_name", "schema_name"],
            },
        },
    },
    # Row sampling (capped at 20 rows).
    {
        "type": "function",
        "function": {
            "name": "sample_table_data",
            "description": (
                "Get a sample of rows from a table (up to 20 rows). "
                "Use this to understand what kind of data a table contains."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "table_name": {
                        "type": "string",
                        "description": "Name of the table",
                    },
                    "schema_name": {
                        "type": "string",
                        "description": "Schema the table belongs to",
                    },
                },
                "required": ["table_name", "schema_name"],
            },
        },
    },
    # Distinct values of one column, for exploring a column's domain.
    {
        "type": "function",
        "function": {
            "name": "get_distinct_values",
            "description": (
                "Get distinct/unique values for a specific column in a table. "
                "Use this to understand the domain of a column (e.g., categories, statuses)."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "table_name": {
                        "type": "string",
                        "description": "Name of the table",
                    },
                    "schema_name": {
                        "type": "string",
                        "description": "Schema the table belongs to",
                    },
                    "column_name": {
                        "type": "string",
                        "description": "Name of the column to get distinct values for",
                    },
                },
                "required": ["table_name", "schema_name", "column_name"],
            },
        },
    },
    # Free-form SELECT execution (read-only; results capped at 50 rows).
    {
        "type": "function",
        "function": {
            "name": "execute_sql",
            "description": (
                "Execute an arbitrary SQL SELECT query and return results (max 50 rows). "
                "Use this to test queries, explore data, or verify results. "
                "Only SELECT statements are allowed."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "sql": {
                        "type": "string",
                        "description": "The SQL SELECT query to execute",
                    },
                },
                "required": ["sql"],
            },
        },
    },
    # Frontend action: replace the content of the user's active editor tab.
    {
        "type": "function",
        "function": {
            "name": "set_editor_sql",
            "description": (
                "Set the SQL query in the user's SQL editor. "
                "Use this when you have the final, tested query ready for the user. "
                "This replaces the content of the active SQL editor tab."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "sql": {
                        "type": "string",
                        "description": "The SQL query to place in the editor",
                    },
                },
                "required": ["sql"],
            },
        },
    },
]
+
+
+# --------------------------------------------------------------------------
+# Tool execution functions
+# --------------------------------------------------------------------------
+
+
def _get_database(database_id: int) -> Database:
    """Fetch the Database model for ``database_id``; raise ValueError when absent."""
    found = db.session.query(Database).filter_by(id=database_id).first()
    if found is None:
        raise ValueError(f"Database with id={database_id} not found")
    return found
+
+
def tool_list_schemas(database_id: int, catalog: str | None = None) -> dict[str, Any]:
    """Return the sorted schema names of the database, or an ``error`` dict."""
    try:
        database = _get_database(database_id)
        return {"schemas": sorted(database.get_all_schema_names(catalog=catalog))}
    except Exception as ex:
        logger.error("Error listing schemas for db %s: %s", database_id, ex)
        return {"error": str(ex)}
+
+
def tool_list_tables(
    database_id: int, schema_name: str, catalog: str | None = None
) -> dict[str, Any]:
    """Return the sorted table names of ``schema_name``, or an ``error`` dict."""
    try:
        database = _get_database(database_id)
        # The model returns a set of (table_name, schema, catalog) tuples;
        # only the table names are surfaced to the agent.
        found = database.get_all_table_names_in_schema(
            catalog=catalog, schema=schema_name
        )
        return {
            "tables": sorted(name for name, *_ in found),
            "schema": schema_name,
        }
    except Exception as ex:
        logger.error(
            "Error listing tables for db %s, schema %s: %s",
            database_id,
            schema_name,
            ex,
        )
        return {"error": str(ex)}
+
+
def tool_get_table_columns(
    database_id: int,
    table_name: str,
    schema_name: str,
    catalog: str | None = None,
) -> dict[str, Any]:
    """
    Get column metadata (name, type, nullability) for a table.

    :param database_id: ID of the Superset ``Database`` to introspect.
    :param table_name: Name of the table.
    :param schema_name: Schema the table belongs to.
    :param catalog: Optional catalog qualifier.
    :returns: ``{"table", "schema", "columns"}`` on success, or
        ``{"error": ...}`` on failure (errors are logged, never raised).
    """
    try:
        database = _get_database(database_id)
        table = Table(table=table_name, schema=schema_name, catalog=catalog)
        # Normalize engine-specific column dicts into a compact, readable
        # form; engines vary between "column_name" and "name" keys.
        col_info = [
            {
                "name": col.get("column_name") or col.get("name", "unknown"),
                "type": str(col.get("type", "unknown")),
                "nullable": col.get("nullable", True),
            }
            for col in database.get_columns(table)
        ]
        return {"table": table_name, "schema": schema_name, "columns": col_info}
    except Exception as ex:
        logger.error(
            "Error getting columns for %s.%s in db %s: %s",
            schema_name,
            table_name,
            database_id,
            ex,
        )
        return {"error": str(ex)}
+
+
+def tool_sample_table_data(
+    database_id: int,
+    table_name: str,
+    schema_name: str,
+    catalog: str | None = None,
+    max_rows: int = 20,
+) -> dict[str, Any]:
+    """Get sample data from a table."""
+    try:
+        database = _get_database(database_id)
+        # Use TOP for MSSQL, LIMIT for others
+        db_engine = database.db_engine_spec.__name__ if 
database.db_engine_spec else ""
+        if "mssql" in db_engine.lower() or "MSSql" in db_engine:

Review Comment:
   **Suggestion:** In the MSSQL branch of the table sampling helper, schema and 
table names are interpolated directly into the SQL inside square brackets 
without any validation, so a crafted name containing ']' (or other special 
characters) could break out of the identifier and inject arbitrary SQL, which 
is a SQL injection vulnerability even though the function is intended only for 
safe introspection. [security]
   
   <details>
   <summary><b>Severity Level:</b> Critical 🚨</summary>
   
   ```mdx
   - ❌ AI assistant table sampling can execute injected MSSQL SQL.
   - ❌ Compromise of connected MSSQL database integrity possible.
   - ⚠️ MSSQL-backed metadata exploration becomes security-sensitive surface.
   ```
   </details>
   
   ```suggestion
           # Basic identifier safety to avoid SQL injection via schema/table 
names
           if any(ch in schema_name for ch in '[];"') or any(
               ch in table_name for ch in '[];"'
           ):
               raise ValueError("Invalid characters in schema or table name")
           if "mssql" in db_engine.lower():
   ```
   <details>
   <summary><b>Steps of Reproduction ✅ </b></summary>
   
   ```mdx
   1. Ensure Superset is configured with a MSSQL database entry so that
   `database.db_engine_spec.__name__` contains `"mssql"` (see
   `superset.models.core.Database`, and helper `_get_database()` at
   `extensions/ai_assistant/backend/src/ai_assistant/tools.py:240-245`).
   
   2. From any backend component that uses the AI assistant tools, call 
`execute_tool()` in
   `extensions/ai_assistant/backend/src/ai_assistant/tools.py:469-532` with
   `tool_name="sample_table_data"`, a valid `database_id` for that MSSQL 
database, and
   `arguments={"schema_name": "foo]; DROP TABLE sensitive_table;--", 
"table_name": "bar"}`.
   
   3. Inside `execute_tool`, the `"sample_table_data"` branch forwards the call 
to
   `tool_sample_table_data()` at
   `extensions/ai_assistant/backend/src/ai_assistant/tools.py:314-356` with the 
provided
   `schema_name` and `table_name`.
   
   4. In `tool_sample_table_data`, the MSSQL branch at lines `324-329` builds 
`sql = f"SELECT
   TOP {max_rows} * FROM [{schema_name}].[{table_name}]"`, producing SQL like 
`SELECT TOP 20
   * FROM [foo]; DROP TABLE sensitive_table;--].[bar]`, which is passed 
unmodified to
   `Database.execute()` at `superset/models/core.py:1295-1309`, allowing the 
injected `DROP
   TABLE` to execute if the underlying MSSQL driver permits multiple statements.
   ```
   </details>
   <details>
   <summary><b>Prompt for AI Agent 🤖 </b></summary>
   
   ```mdx
   This is a comment left during a code review.
   
   **Path:** extensions/ai_assistant/backend/src/ai_assistant/tools.py
   **Line:** 326:326
   **Comment:**
        *Security: In the MSSQL branch of the table sampling helper, schema and 
table names are interpolated directly into the SQL inside square brackets 
without any validation, so a crafted name containing ']' (or other special 
characters) could break out of the identifier and inject arbitrary SQL, which 
is a SQL injection vulnerability even though the function is intended only for 
safe introspection.
   
   Validate the correctness of the flagged issue. If correct, How can I resolve 
this? If you propose a fix, implement it and please make it concise.
   ```
   </details>
   <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=2f0a38a819f6fe4e41287fdcecdacebcc88e7c46427ad08c0a47cc2759ef5b13&reaction=like'>👍</a>
 | <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=2f0a38a819f6fe4e41287fdcecdacebcc88e7c46427ad08c0a47cc2759ef5b13&reaction=dislike'>👎</a>



##########
.devcontainer/start-local.sh:
##########
@@ -0,0 +1,109 @@
#!/bin/bash
# Start Superset for local dev container development (Cursor).
# Runs Flask backend + webpack dev server directly in the container.
# Postgres runs via docker-in-docker.
set -e

WORKSPACE_DIR="${WORKSPACE_DIR:-$(pwd)}"
cd "$WORKSPACE_DIR"

echo "=== Starting Superset (local dev container mode) ==="

# --------------------------------------------------------------------------
# 1. Source user's local env overrides (MSSQL creds, AI config, etc.)
# --------------------------------------------------------------------------
if [ -f "$WORKSPACE_DIR/docker/.env-local" ]; then
    echo "[env] Loading docker/.env-local ..."
    set -a
    # shellcheck disable=SC1091
    source "$WORKSPACE_DIR/docker/.env-local"
    set +a
fi

# --------------------------------------------------------------------------
# 2. Make sure Postgres is running
# --------------------------------------------------------------------------
echo "[db] Ensuring PostgreSQL is running ..."
docker compose -f "$WORKSPACE_DIR/.devcontainer/docker-compose-devcontainer.yml" up -d

echo "[db] Waiting for Postgres ..."
# Fix: track whether pg_isready ever succeeded instead of unconditionally
# reporting readiness after 30 attempts — otherwise later steps (e.g. DB
# migrations) fail with confusing connection errors.
db_ready=false
for _ in $(seq 1 30); do
    if docker compose -f "$WORKSPACE_DIR/.devcontainer/docker-compose-devcontainer.yml" exec -T db pg_isready -U superset > /dev/null 2>&1; then
        db_ready=true
        break
    fi
    sleep 1
done
if [ "$db_ready" != true ]; then
    echo "[db] ERROR: PostgreSQL did not become ready after 30 attempts." >&2
    exit 1
fi
echo "[db] PostgreSQL is ready."

Review Comment:
   **Suggestion:** The PostgreSQL readiness loop always prints that the 
database is ready after 30 iterations without checking whether `pg_isready` 
ever succeeded, so the script can continue and fail later in less obvious ways 
if the database never became available. [logic error]
   
   <details>
   <summary><b>Severity Level:</b> Major ⚠️</summary>
   
   ```mdx
   - ⚠️ Script reports ready database when PostgreSQL is down.
   - ⚠️ Migrations fail after misleading readiness message.
   ```
   </details>
   
   ```suggestion
   db_ready=false
   for i in $(seq 1 30); do
       if docker compose -f 
"$WORKSPACE_DIR/.devcontainer/docker-compose-devcontainer.yml" exec -T db 
pg_isready -U superset > /dev/null 2>&1; then
           db_ready=true
           break
       fi
       sleep 1
   done
   if [ "$db_ready" = true ]; then
       echo "[db] PostgreSQL is ready."
   else
       echo "[db] ERROR: PostgreSQL did not become ready after 30 seconds." >&2
       exit 1
   fi
   ```
   <details>
   <summary><b>Steps of Reproduction ✅ </b></summary>
   
   ```mdx
   1. Misconfigure or break the Postgres service in
   `.devcontainer/docker-compose-devcontainer.yml` so container `db` never 
becomes ready
   (e.g. wrong image or bad config) while keeping the script 
`.devcontainer/start-local.sh`
   unchanged.
   
   2. Run `.devcontainer/start-local.sh` from the repository root; it starts 
Docker with
   `docker compose ... up -d` at line 27.
   
   3. The readiness loop at `.devcontainer/start-local.sh:29-33` calls 
`pg_isready` 30 times,
   each attempt failing because PostgreSQL is not accepting connections, but 
the loop still
   exits after the last iteration.
   
   4. At `.devcontainer/start-local.sh:34`, the script unconditionally prints 
"[db]
   PostgreSQL is ready." and then proceeds to run `superset db upgrade` at line 
52, which
   fails with connection errors despite the prior success message.
   ```
   </details>
   <details>
   <summary><b>Prompt for AI Agent 🤖 </b></summary>
   
   ```mdx
   This is a comment left during a code review.
   
   **Path:** .devcontainer/start-local.sh
   **Line:** 30:34
   **Comment:**
        *Logic Error: The PostgreSQL readiness loop always prints that the 
database is ready after 30 iterations without checking whether `pg_isready` 
ever succeeded, so the script can continue and fail later in less obvious ways 
if the database never became available.
   
   Validate the correctness of the flagged issue. If correct, How can I resolve 
this? If you propose a fix, implement it and please make it concise.
   ```
   </details>
   <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=778daeb1cad443120b680880907742095ee62b81c8345c9fdc59584adc30c600&reaction=like'>👍</a>
 | <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=778daeb1cad443120b680880907742095ee62b81c8345c9fdc59584adc30c600&reaction=dislike'>👎</a>



##########
extensions/ai_assistant/frontend/src/ChatPanel.tsx:
##########
@@ -0,0 +1,695 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Vambery AI Agent Chat Panel
+ *
+ * Renders in the SQL Lab right sidebar. Provides a chat interface
+ * for users to interact with the AI agent, which can inspect
+ * database schemas, write SQL queries, and modify the SQL editor.
+ */
+
+import React, { useState, useRef, useEffect, useCallback, useMemo } from 
"react";
+import { sqlLab, authentication, useTheme, SupersetTheme } from 
"@apache-superset/core";
+
+// --------------------------------------------------------------------------
+// Types
+// --------------------------------------------------------------------------
+
// A single message in the chat transcript rendered by the panel.
interface ChatMessage {
  role: "user" | "assistant" | "system";
  content: string;
  // Creation timestamp (presumably epoch ms from Date.now() — confirm at call sites).
  timestamp: number;
  // Tool-invocation steps the agent reported while producing this reply.
  steps?: AgentStep[];
  // Editor actions emitted with this reply (e.g. setting the editor SQL).
  actions?: EditorAction[];
  // True when this message represents an error response.
  error?: boolean;
  // True when an applied action produced SQL that can be re-run from the UI.
  hasRunnable?: boolean;
}

// One tool call performed by the agent during its reasoning loop.
interface AgentStep {
  type: string;
  tool: string;
  args: Record<string, unknown>;
  // Human-readable summary of the tool result, shown in the step list.
  result_summary: string;
}

// An action the agent asks the frontend to apply to the SQL editor.
interface EditorAction {
  type: string;
  sql?: string;
}

// Editor/database context sent to the backend with every chat request.
interface ChatContext {
  database_id: number | undefined;
  database_name: string | undefined;
  schema: string | null;
  catalog: string | null;
  current_sql: string;
}

// --------------------------------------------------------------------------
// SSE Streaming API Helper
// --------------------------------------------------------------------------

// A parsed server-sent event received from the chat stream endpoint.
interface StreamEvent {
  event: "step" | "action" | "response" | "error";
  data: Record<string, unknown>;
}
+
+/**
+ * Post a chat message and stream SSE events back.
+ * Calls onEvent for each parsed SSE event as it arrives.
+ */
+async function postChatStream(
+  messages: { role: string; content: string }[],
+  context: ChatContext,
+  onEvent: (evt: StreamEvent) => void
+): Promise<void> {
+  const csrfToken = await authentication.getCSRFToken();
+
+  const response = await fetch("/api/v1/ai_assistant/chat/stream", {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+      ...(csrfToken ? { "X-CSRFToken": csrfToken } : {}),
+    },
+    credentials: "same-origin",
+    body: JSON.stringify({ messages, context }),
+  });
+
+  if (!response.ok) {
+    const errData = await response.json().catch(() => ({}));
+    throw new Error(
+      (errData as Record<string, string>).error ||
+        `HTTP ${response.status}: ${response.statusText}`
+    );
+  }
+
+  const reader = response.body?.getReader();
+  if (!reader) throw new Error("ReadableStream not supported");
+
+  const decoder = new TextDecoder();
+  let buffer = "";
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    buffer += decoder.decode(value, { stream: true });
+
+    // SSE format: "event: <type>\ndata: <json>\n\n"
+    // Split on double newline to get complete events
+    const parts = buffer.split("\n\n");
+    // Last part may be incomplete — keep it in the buffer
+    buffer = parts.pop() || "";
+
+    for (const part of parts) {
+      if (!part.trim()) continue;
+      let eventType = "";
+      let eventData = "";
+
+      for (const line of part.split("\n")) {
+        if (line.startsWith("event: ")) {
+          eventType = line.slice(7).trim();
+        } else if (line.startsWith("data: ")) {
+          eventData = line.slice(6);
+        }
+      }
+
+      if (eventType && eventData) {
+        try {
+          onEvent({

Review Comment:
   **Suggestion:** The streaming helper calls an async event handler without 
awaiting it, so when the stream finishes, any pending async work (like applying 
editor actions and setting the runnable flag) may not have completed, causing 
the final assistant message to be built with an outdated `hasRunnable` value 
and potentially hiding the "Re-run query" button even when actions ran. [logic 
error]
   
   <details>
   <summary><b>Severity Level:</b> Major ⚠️</summary>
   
   ```mdx
   - ⚠️ "Re-run query" button sometimes missing after auto-run actions.
   - ⚠️ Assistant message state can be inconsistent with editor actions.
   ```
   </details>
   
   ```suggestion
     onEvent: (evt: StreamEvent) => Promise<void> | void
   ): Promise<void> {
     const csrfToken = await authentication.getCSRFToken();
   
     const response = await fetch("/api/v1/ai_assistant/chat/stream", {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
         ...(csrfToken ? { "X-CSRFToken": csrfToken } : {}),
       },
       credentials: "same-origin",
       body: JSON.stringify({ messages, context }),
     });
   
     if (!response.ok) {
       const errData = await response.json().catch(() => ({}));
       throw new Error(
         (errData as Record<string, string>).error ||
           `HTTP ${response.status}: ${response.statusText}`
       );
     }
   
     const reader = response.body?.getReader();
     if (!reader) throw new Error("ReadableStream not supported");
   
     const decoder = new TextDecoder();
     let buffer = "";
   
     while (true) {
       const { done, value } = await reader.read();
       if (done) break;
   
       buffer += decoder.decode(value, { stream: true });
   
       // SSE format: "event: <type>\ndata: <json>\n\n"
       // Split on double newline to get complete events
       const parts = buffer.split("\n\n");
       // Last part may be incomplete — keep it in the buffer
       buffer = parts.pop() || "";
   
       for (const part of parts) {
         if (!part.trim()) continue;
         let eventType = "";
         let eventData = "";
   
         for (const line of part.split("\n")) {
           if (line.startsWith("event: ")) {
             eventType = line.slice(7).trim();
           } else if (line.startsWith("data: ")) {
             eventData = line.slice(6);
           }
         }
   
         if (eventType && eventData) {
           try {
             await onEvent({
   ```
   <details>
   <summary><b>Steps of Reproduction ✅ </b></summary>
   
   ```mdx
   1. Open SQL Lab and ensure the AI assistant panel mounts, rendering 
`ChatPanel` from
   `extensions/ai_assistant/frontend/src/ChatPanel.tsx`.
   
   2. In `ChatPanel.handleSend` (around lines 436–528 in the same file), send a 
message while
   a database is selected so that `handleSend` calls `await 
postChatStream(apiMessages,
   context, async (evt) => { ... })`.
   
   3. When the backend `/api/v1/ai_assistant/chat/stream` endpoint emits an 
`"action"` SSE
   event, the async callback in `handleSend` awaits `applyAction(action)` and 
sets
   `hasRunnable = true` (see `applyAction` in `ChatPanel.tsx` around lines 
394–423), but
   `postChatStream` (lines 78–146) invokes `onEvent(...)` without `await`, so 
it does not
   wait for that Promise to resolve.
   
   4. As soon as the stream ends, `postChatStream` resolves and `handleSend` 
immediately
   builds the assistant message using the current `hasRunnable` value (still 
`false` if the
   async handler hasn't finished), so the `"Re-run query"` button, which 
depends on
   `msg.hasRunnable`, is omitted even though the action was applied and the 
query was
   auto-run.
   ```
   </details>
   <details>
   <summary><b>Prompt for AI Agent 🤖 </b></summary>
   
   ```mdx
   This is a comment left during a code review.
   
   **Path:** extensions/ai_assistant/frontend/src/ChatPanel.tsx
   **Line:** 81:136
   **Comment:**
        *Logic Error: The streaming helper calls an async event handler without 
awaiting it, so when the stream finishes, any pending async work (like applying 
editor actions and setting the runnable flag) may not have completed, causing 
the final assistant message to be built with an outdated `hasRunnable` value 
and potentially hiding the "Re-run query" button even when actions ran.
   
   Validate the correctness of the flagged issue. If correct, How can I resolve 
this? If you propose a fix, implement it and please make it concise.
   ```
   </details>
   <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=74b11b692658bb4e68e50e9861d44cb9ddfb3362188f6df00ddb3a2348ac8f26&reaction=like'>👍</a>
 | <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=74b11b692658bb4e68e50e9861d44cb9ddfb3362188f6df00ddb3a2348ac8f26&reaction=dislike'>👎</a>



##########
extensions/ai_assistant/backend/src/ai_assistant/tools.py:
##########
@@ -0,0 +1,532 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Vambery AI Agent Database Tools.
+
+These tools allow the AI agent to introspect database schemas and execute
+SQL queries during its reasoning process.
+Uses Superset's Database model for all database operations, respecting
+security permissions and row-level security.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from superset.extensions import db
+from superset.models.core import Database
+from superset.sql.parse import Table
+
+logger = logging.getLogger(__name__)
+
+
+def _extract_result_data(result: Any) -> tuple[list[dict[str, Any]], 
list[str]]:
+    """
+    Extract rows (as list of dicts) and column names from a QueryResult.
+
+    The QueryResult from database.execute() stores results in
+    result.statements[i].data as a Pandas DataFrame.
+    """
+    rows: list[dict[str, Any]] = []
+    columns: list[str] = []
+    try:
+        if hasattr(result, "statements") and result.statements:
+            stmt = result.statements[0]
+            df = stmt.data
+            if df is not None and hasattr(df, "to_dict"):
+                # Pandas DataFrame
+                rows = df.to_dict(orient="records")
+                columns = list(df.columns)
+            elif df is not None and hasattr(df, "keys"):
+                # dict-like
+                rows = [df] if not isinstance(df, list) else df
+                columns = list(df.keys()) if isinstance(df, dict) else []
+    except Exception as ex:
+        logger.debug("Could not extract result data: %s", ex)
+
+    # Fallback: legacy result format
+    if not rows:
+        if hasattr(result, "data") and result.data:
+            rows = result.data if isinstance(result.data, list) else []
+        elif hasattr(result, "rows") and hasattr(result, "columns"):
+            columns = result.columns
+            rows = [dict(zip(columns, row, strict=False)) for row in 
result.rows]
+
+    return rows, columns
+
+
+# --------------------------------------------------------------------------
+# OpenAI function/tool definitions (sent to the LLM)
+# --------------------------------------------------------------------------
+
+TOOL_DEFINITIONS: list[dict[str, Any]] = [
+    {
+        "type": "function",
+        "function": {
+            "name": "list_schemas",
+            "description": (
+                "List all available schemas in the connected database. "
+                "Use this first to understand the database structure."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "list_tables",
+            "description": (
+                "List all tables in a specific schema. "
+                "Use this to discover which tables are available."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "schema_name": {
+                        "type": "string",
+                        "description": "The schema name to list tables from",
+                    },
+                },
+                "required": ["schema_name"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_table_columns",
+            "description": (
+                "Get the column names, data types, and nullable info for a 
table. "
+                "Use this to understand the structure of a table before 
writing queries."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "table_name": {
+                        "type": "string",
+                        "description": "Name of the table",
+                    },
+                    "schema_name": {
+                        "type": "string",
+                        "description": "Schema the table belongs to",
+                    },
+                },
+                "required": ["table_name", "schema_name"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "sample_table_data",
+            "description": (
+                "Get a sample of rows from a table (up to 20 rows). "
+                "Use this to understand what kind of data a table contains."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "table_name": {
+                        "type": "string",
+                        "description": "Name of the table",
+                    },
+                    "schema_name": {
+                        "type": "string",
+                        "description": "Schema the table belongs to",
+                    },
+                },
+                "required": ["table_name", "schema_name"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_distinct_values",
+            "description": (
+                "Get distinct/unique values for a specific column in a table. "
+                "Use this to understand the domain of a column (e.g., 
categories, statuses)."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "table_name": {
+                        "type": "string",
+                        "description": "Name of the table",
+                    },
+                    "schema_name": {
+                        "type": "string",
+                        "description": "Schema the table belongs to",
+                    },
+                    "column_name": {
+                        "type": "string",
+                        "description": "Name of the column to get distinct 
values for",
+                    },
+                },
+                "required": ["table_name", "schema_name", "column_name"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "execute_sql",
+            "description": (
+                "Execute an arbitrary SQL SELECT query and return results (max 
50 rows). "
+                "Use this to test queries, explore data, or verify results. "
+                "Only SELECT statements are allowed."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "sql": {
+                        "type": "string",
+                        "description": "The SQL SELECT query to execute",
+                    },
+                },
+                "required": ["sql"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "set_editor_sql",
+            "description": (
+                "Set the SQL query in the user's SQL editor. "
+                "Use this when you have the final, tested query ready for the 
user. "
+                "This replaces the content of the active SQL editor tab."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "sql": {
+                        "type": "string",
+                        "description": "The SQL query to place in the editor",
+                    },
+                },
+                "required": ["sql"],
+            },
+        },
+    },
+]
+
+
+# --------------------------------------------------------------------------
+# Tool execution functions
+# --------------------------------------------------------------------------
+
+
+def _get_database(database_id: int) -> Database:
+    """Load a Database model by ID, raising ValueError if not found."""
+    database = db.session.query(Database).filter_by(id=database_id).first()
+    if not database:
+        raise ValueError(f"Database with id={database_id} not found")
+    return database
+
+
+def tool_list_schemas(database_id: int, catalog: str | None = None) -> 
dict[str, Any]:
+    """List all schemas in the database."""
+    try:
+        database = _get_database(database_id)
+        schemas = database.get_all_schema_names(catalog=catalog)
+        return {"schemas": sorted(schemas)}
+    except Exception as ex:
+        logger.error("Error listing schemas for db %s: %s", database_id, ex)
+        return {"error": str(ex)}
+
+
+def tool_list_tables(
+    database_id: int, schema_name: str, catalog: str | None = None
+) -> dict[str, Any]:
+    """List all tables in a specific schema."""
+    try:
+        database = _get_database(database_id)
+        tables = database.get_all_table_names_in_schema(
+            catalog=catalog, schema=schema_name
+        )
+        # tables is a set of (table_name, schema, catalog) tuples
+        table_names = sorted(t[0] for t in tables)
+        return {"tables": table_names, "schema": schema_name}
+    except Exception as ex:
+        logger.error(
+            "Error listing tables for db %s, schema %s: %s",
+            database_id,
+            schema_name,
+            ex,
+        )
+        return {"error": str(ex)}
+
+
+def tool_get_table_columns(
+    database_id: int,
+    table_name: str,
+    schema_name: str,
+    catalog: str | None = None,
+) -> dict[str, Any]:
+    """Get column metadata for a table."""
+    try:
+        database = _get_database(database_id)
+        table = Table(table=table_name, schema=schema_name, catalog=catalog)
+        columns = database.get_columns(table)
+        # Format columns for readability
+        col_info = []
+        for col in columns:
+            col_info.append(
+                {
+                    "name": col.get("column_name") or col.get("name", 
"unknown"),
+                    "type": str(col.get("type", "unknown")),
+                    "nullable": col.get("nullable", True),
+                }
+            )
+        return {"table": table_name, "schema": schema_name, "columns": 
col_info}
+    except Exception as ex:
+        logger.error(
+            "Error getting columns for %s.%s in db %s: %s",
+            schema_name,
+            table_name,
+            database_id,
+            ex,
+        )
+        return {"error": str(ex)}
+
+
+def tool_sample_table_data(
+    database_id: int,
+    table_name: str,
+    schema_name: str,
+    catalog: str | None = None,
+    max_rows: int = 20,
+) -> dict[str, Any]:
+    """Get sample data from a table."""
+    try:
+        database = _get_database(database_id)
+        # Use TOP for MSSQL, LIMIT for others
+        db_engine = database.db_engine_spec.__name__ if 
database.db_engine_spec else ""
+        if "mssql" in db_engine.lower() or "MSSql" in db_engine:
+            sql = f"SELECT TOP {max_rows} * FROM 
[{schema_name}].[{table_name}]"
+        else:
+            sql = f'SELECT * FROM "{schema_name}"."{table_name}" LIMIT 
{max_rows}'
+
+        from superset_core.api.types import QueryOptions
+
+        options = QueryOptions(
+            catalog=catalog,
+            schema=schema_name,
+            limit=max_rows,
+        )
+        result = database.execute(sql, options)
+        rows, _ = _extract_result_data(result)
+        rows = rows[:max_rows]
+
+        return {
+            "table": table_name,
+            "schema": schema_name,
+            "row_count": len(rows),
+            "data": rows,
+        }
+    except Exception as ex:
+        logger.error(
+            "Error sampling data from %s.%s in db %s: %s",
+            schema_name,
+            table_name,
+            database_id,
+            ex,
+        )
+        return {"error": str(ex)}
+
+
+def tool_get_distinct_values(
+    database_id: int,
+    table_name: str,
+    schema_name: str,
+    column_name: str,
+    catalog: str | None = None,
+    max_values: int = 50,
+) -> dict[str, Any]:
+    """Get distinct values for a specific column."""
+    try:
+        database = _get_database(database_id)
+        db_engine = database.db_engine_spec.__name__ if 
database.db_engine_spec else ""
+        if "mssql" in db_engine.lower() or "MSSql" in db_engine:

Review Comment:
   **Suggestion:** In the MSSQL branch of the distinct-values helper, schema, 
table, and column names are concatenated directly into the SQL inside square 
brackets without any validation, so a maliciously crafted identifier containing 
']' (or similar) could terminate the identifier and inject arbitrary SQL 
despite this function being intended for safe metadata access. [security]
   
   <details>
   <summary><b>Severity Level:</b> Critical 🚨</summary>
   
   ```mdx
   - ❌ AI assistant distinct-value lookup can run injected MSSQL SQL.
   - ❌ Attackers may alter or drop MSSQL tables via metadata tool.
   - ⚠️ Compromises trust in AI-driven schema exploration features.
   ```
   </details>
   
   ```suggestion
           # Basic identifier safety to avoid SQL injection via 
schema/table/column names
           if any(ch in schema_name for ch in '[];"') or any(
               ch in table_name for ch in '[];"'
           ) or any(ch in column_name for ch in '[];"'):
               raise ValueError("Invalid characters in schema, table, or column 
name")
           if "mssql" in db_engine.lower():
   ```
   <details>
   <summary><b>Steps of Reproduction ✅ </b></summary>
   
   ```mdx
   1. Configure a MSSQL-backed `Database` in Superset and ensure it is 
accessible via
   `_get_database()` at 
`extensions/ai_assistant/backend/src/ai_assistant/tools.py:240-245`.
   
   2. From code using the AI assistant tools, invoke `execute_tool()` at
   `extensions/ai_assistant/backend/src/ai_assistant/tools.py:469-532` with
   `tool_name="get_distinct_values"`, the MSSQL `database_id`, and 
`arguments={"schema_name":
   "public", "table_name": "users]; DROP TABLE sensitive_table;--", 
"column_name": "status"}`
   or similar crafted identifiers.
   
   3. `execute_tool` routes this call to `tool_get_distinct_values()` at
   `extensions/ai_assistant/backend/src/ai_assistant/tools.py:359-416`, passing 
through the
   attacker-controlled `schema_name`, `table_name`, and `column_name`.
   
   4. In `tool_get_distinct_values`, the MSSQL branch at lines `369-377` 
constructs `sql` by
   interpolating the identifiers inside `[...]` without validation, yielding a 
query such as
   `SELECT DISTINCT TOP 50 [status] FROM [public].[users]; DROP TABLE 
sensitive_table;--]
   WHERE [status] IS NOT NULL ORDER BY [status]`, which is then executed via
   `Database.execute()` at `superset/models/core.py:1295-1309`, allowing the 
injected
   statement to run if the MSSQL driver permits multiple statements.
   ```
   </details>
   <details>
   <summary><b>Prompt for AI Agent 🤖 </b></summary>
   
   ```mdx
   This is a comment left during a code review.
   
   **Path:** extensions/ai_assistant/backend/src/ai_assistant/tools.py
   **Line:** 371:371
   **Comment:**
        *Security: In the MSSQL branch of the distinct-values helper, schema, 
table, and column names are concatenated directly into the SQL inside square 
brackets without any validation, so a maliciously crafted identifier containing 
']' (or similar) could terminate the identifier and inject arbitrary SQL 
despite this function being intended for safe metadata access.
   
    Validate the correctness of the flagged issue. If correct, how can I resolve
    this? If you propose a fix, please implement it and keep it concise.
   ```
   </details>
   <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=0aeea65a62d6ddef4d8d0cbdc49499141df7a7b04da1ae1c9b848fddc9352847&reaction=like'>👍</a>
 | <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=0aeea65a62d6ddef4d8d0cbdc49499141df7a7b04da1ae1c9b848fddc9352847&reaction=dislike'>👎</a>



##########
extensions/ai_assistant/backend/src/ai_assistant/tools.py:
##########
@@ -0,0 +1,532 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Vambery AI Agent Database Tools.
+
+These tools allow the AI agent to introspect database schemas and execute
+SQL queries during its reasoning process.
+Uses Superset's Database model for all database operations, respecting
+security permissions and row-level security.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from superset.extensions import db
+from superset.models.core import Database
+from superset.sql.parse import Table
+
+logger = logging.getLogger(__name__)
+
+
+def _extract_result_data(result: Any) -> tuple[list[dict[str, Any]], 
list[str]]:
+    """
+    Extract rows (as list of dicts) and column names from a QueryResult.
+
+    The QueryResult from database.execute() stores results in
+    result.statements[i].data as a Pandas DataFrame.
+    """
+    rows: list[dict[str, Any]] = []
+    columns: list[str] = []
+    try:
+        if hasattr(result, "statements") and result.statements:
+            stmt = result.statements[0]
+            df = stmt.data
+            if df is not None and hasattr(df, "to_dict"):
+                # Pandas DataFrame
+                rows = df.to_dict(orient="records")
+                columns = list(df.columns)
+            elif df is not None and hasattr(df, "keys"):
+                # dict-like
+                rows = [df] if not isinstance(df, list) else df
+                columns = list(df.keys()) if isinstance(df, dict) else []
+    except Exception as ex:
+        logger.debug("Could not extract result data: %s", ex)
+
+    # Fallback: legacy result format
+    if not rows:
+        if hasattr(result, "data") and result.data:
+            rows = result.data if isinstance(result.data, list) else []
+        elif hasattr(result, "rows") and hasattr(result, "columns"):
+            columns = result.columns
+            rows = [dict(zip(columns, row, strict=False)) for row in 
result.rows]
+
+    return rows, columns
+
+
+# --------------------------------------------------------------------------
+# OpenAI function/tool definitions (sent to the LLM)
+# --------------------------------------------------------------------------
+
+TOOL_DEFINITIONS: list[dict[str, Any]] = [
+    {
+        "type": "function",
+        "function": {
+            "name": "list_schemas",
+            "description": (
+                "List all available schemas in the connected database. "
+                "Use this first to understand the database structure."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "list_tables",
+            "description": (
+                "List all tables in a specific schema. "
+                "Use this to discover which tables are available."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "schema_name": {
+                        "type": "string",
+                        "description": "The schema name to list tables from",
+                    },
+                },
+                "required": ["schema_name"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_table_columns",
+            "description": (
+                "Get the column names, data types, and nullable info for a 
table. "
+                "Use this to understand the structure of a table before 
writing queries."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "table_name": {
+                        "type": "string",
+                        "description": "Name of the table",
+                    },
+                    "schema_name": {
+                        "type": "string",
+                        "description": "Schema the table belongs to",
+                    },
+                },
+                "required": ["table_name", "schema_name"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "sample_table_data",
+            "description": (
+                "Get a sample of rows from a table (up to 20 rows). "
+                "Use this to understand what kind of data a table contains."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "table_name": {
+                        "type": "string",
+                        "description": "Name of the table",
+                    },
+                    "schema_name": {
+                        "type": "string",
+                        "description": "Schema the table belongs to",
+                    },
+                },
+                "required": ["table_name", "schema_name"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_distinct_values",
+            "description": (
+                "Get distinct/unique values for a specific column in a table. "
+                "Use this to understand the domain of a column (e.g., 
categories, statuses)."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "table_name": {
+                        "type": "string",
+                        "description": "Name of the table",
+                    },
+                    "schema_name": {
+                        "type": "string",
+                        "description": "Schema the table belongs to",
+                    },
+                    "column_name": {
+                        "type": "string",
+                        "description": "Name of the column to get distinct 
values for",
+                    },
+                },
+                "required": ["table_name", "schema_name", "column_name"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "execute_sql",
+            "description": (
+                "Execute an arbitrary SQL SELECT query and return results (max 
50 rows). "
+                "Use this to test queries, explore data, or verify results. "
+                "Only SELECT statements are allowed."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "sql": {
+                        "type": "string",
+                        "description": "The SQL SELECT query to execute",
+                    },
+                },
+                "required": ["sql"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "set_editor_sql",
+            "description": (
+                "Set the SQL query in the user's SQL editor. "
+                "Use this when you have the final, tested query ready for the 
user. "
+                "This replaces the content of the active SQL editor tab."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "sql": {
+                        "type": "string",
+                        "description": "The SQL query to place in the editor",
+                    },
+                },
+                "required": ["sql"],
+            },
+        },
+    },
+]
+
+
+# --------------------------------------------------------------------------
+# Tool execution functions
+# --------------------------------------------------------------------------
+
+
+def _get_database(database_id: int) -> Database:
+    """Load a Database model by ID, raising ValueError if not found."""
+    database = db.session.query(Database).filter_by(id=database_id).first()
+    if not database:
+        raise ValueError(f"Database with id={database_id} not found")
+    return database
+
+
+def tool_list_schemas(database_id: int, catalog: str | None = None) -> 
dict[str, Any]:
+    """List all schemas in the database."""
+    try:
+        database = _get_database(database_id)
+        schemas = database.get_all_schema_names(catalog=catalog)
+        return {"schemas": sorted(schemas)}
+    except Exception as ex:
+        logger.error("Error listing schemas for db %s: %s", database_id, ex)
+        return {"error": str(ex)}
+
+
+def tool_list_tables(
+    database_id: int, schema_name: str, catalog: str | None = None
+) -> dict[str, Any]:
+    """List all tables in a specific schema."""
+    try:
+        database = _get_database(database_id)
+        tables = database.get_all_table_names_in_schema(
+            catalog=catalog, schema=schema_name
+        )
+        # tables is a set of (table_name, schema, catalog) tuples
+        table_names = sorted(t[0] for t in tables)
+        return {"tables": table_names, "schema": schema_name}
+    except Exception as ex:
+        logger.error(
+            "Error listing tables for db %s, schema %s: %s",
+            database_id,
+            schema_name,
+            ex,
+        )
+        return {"error": str(ex)}
+
+
+def tool_get_table_columns(
+    database_id: int,
+    table_name: str,
+    schema_name: str,
+    catalog: str | None = None,
+) -> dict[str, Any]:
+    """Get column metadata for a table."""
+    try:
+        database = _get_database(database_id)
+        table = Table(table=table_name, schema=schema_name, catalog=catalog)
+        columns = database.get_columns(table)
+        # Format columns for readability
+        col_info = []
+        for col in columns:
+            col_info.append(
+                {
+                    "name": col.get("column_name") or col.get("name", 
"unknown"),
+                    "type": str(col.get("type", "unknown")),
+                    "nullable": col.get("nullable", True),
+                }
+            )
+        return {"table": table_name, "schema": schema_name, "columns": 
col_info}
+    except Exception as ex:
+        logger.error(
+            "Error getting columns for %s.%s in db %s: %s",
+            schema_name,
+            table_name,
+            database_id,
+            ex,
+        )
+        return {"error": str(ex)}
+
+
+def tool_sample_table_data(
+    database_id: int,
+    table_name: str,
+    schema_name: str,
+    catalog: str | None = None,
+    max_rows: int = 20,
+) -> dict[str, Any]:
+    """Get sample data from a table."""
+    try:
+        database = _get_database(database_id)
+        # Use TOP for MSSQL, LIMIT for others
+        db_engine = database.db_engine_spec.__name__ if 
database.db_engine_spec else ""
+        if "mssql" in db_engine.lower() or "MSSql" in db_engine:
+            sql = f"SELECT TOP {max_rows} * FROM 
[{schema_name}].[{table_name}]"
+        else:
+            sql = f'SELECT * FROM "{schema_name}"."{table_name}" LIMIT 
{max_rows}'
+
+        from superset_core.api.types import QueryOptions
+
+        options = QueryOptions(
+            catalog=catalog,
+            schema=schema_name,
+            limit=max_rows,
+        )
+        result = database.execute(sql, options)
+        rows, _ = _extract_result_data(result)
+        rows = rows[:max_rows]
+
+        return {
+            "table": table_name,
+            "schema": schema_name,
+            "row_count": len(rows),
+            "data": rows,
+        }
+    except Exception as ex:
+        logger.error(
+            "Error sampling data from %s.%s in db %s: %s",
+            schema_name,
+            table_name,
+            database_id,
+            ex,
+        )
+        return {"error": str(ex)}
+
+
+def tool_get_distinct_values(
+    database_id: int,
+    table_name: str,
+    schema_name: str,
+    column_name: str,
+    catalog: str | None = None,
+    max_values: int = 50,
+) -> dict[str, Any]:
+    """Get distinct values for a specific column."""
+    try:
+        database = _get_database(database_id)
+        db_engine = database.db_engine_spec.__name__ if 
database.db_engine_spec else ""
+        if "mssql" in db_engine.lower() or "MSSql" in db_engine:
+            sql = (
+                f"SELECT DISTINCT TOP {max_values} [{column_name}] "
+                f"FROM [{schema_name}].[{table_name}] "
+                f"WHERE [{column_name}] IS NOT NULL "
+                f"ORDER BY [{column_name}]"
+            )
+        else:
+            sql = (
+                f'SELECT DISTINCT "{column_name}" '
+                f'FROM "{schema_name}"."{table_name}" '
+                f'WHERE "{column_name}" IS NOT NULL '
+                f'ORDER BY "{column_name}" LIMIT {max_values}'
+            )
+
+        from superset_core.api.types import QueryOptions
+
+        options = QueryOptions(
+            catalog=catalog,
+            schema=schema_name,
+            limit=max_values,
+        )
+        result = database.execute(sql, options)
+
+        rows, _ = _extract_result_data(result)
+        values = []
+        for row in rows[:max_values]:
+            val = list(row.values())[0] if isinstance(row, dict) else row
+            values.append(str(val) if val is not None else None)
+
+        return {
+            "table": table_name,
+            "column": column_name,
+            "distinct_count": len(values),
+            "values": values,
+        }
+    except Exception as ex:
+        logger.error(
+            "Error getting distinct values for %s.%s.%s in db %s: %s",
+            schema_name,
+            table_name,
+            column_name,
+            database_id,
+            ex,
+        )
+        return {"error": str(ex)}
+
+
+def tool_execute_sql(
+    database_id: int,
+    sql: str,
+    schema_name: str | None = None,
+    catalog: str | None = None,
+    max_rows: int = 50,
+) -> dict[str, Any]:
+    """Execute an arbitrary SQL query."""
+    try:
+        # Basic safety check - only allow SELECT
+        sql_stripped = sql.strip().upper()
+        if not sql_stripped.startswith("SELECT") and not 
sql_stripped.startswith(
+            "WITH"
+        ):
+            return {
+                "error": "Only SELECT and WITH (CTE) queries are allowed for 
safety"
+            }
+
+        database = _get_database(database_id)
+
+        from superset_core.api.types import QueryOptions

Review Comment:
   **Suggestion:** The safety check in the generic SQL execution helper only 
verifies that the query string starts with SELECT or WITH, but does not prevent 
multiple statements (e.g., `WITH ...; DELETE ...`), so a user-supplied query 
could still execute non-SELECT statements and violate the "read-only" 
guarantee. [security]
   
   <details>
   <summary><b>Severity Level:</b> Critical 🚨</summary>
   
   ```mdx
   - ❌ AI assistant `execute_sql` tool is not truly read-only.
   - ❌ Malicious multi-statement queries can modify or delete data.
   - ⚠️ Breaks safety expectations for AI-generated exploratory queries.
   ```
   </details>
   
   ```suggestion
           # Basic safety check - only allow a single SELECT/CTE statement
           sql_stripped = sql.strip()
           sql_upper = sql_stripped.upper()
           if not sql_upper.startswith("SELECT") and not 
sql_upper.startswith("WITH"):
               return {
                   "error": "Only SELECT and WITH (CTE) queries are allowed for 
safety"
               }
           # Disallow multiple statements separated by semicolons
           if ";" in sql_stripped.rstrip(";"):
               return {"error": "Multiple SQL statements are not allowed"}
   ```
   <details>
   <summary><b>Steps of Reproduction ✅ </b></summary>
   
   ```mdx
   1. Use any backend path that leverages `execute_tool()` at
   `extensions/ai_assistant/backend/src/ai_assistant/tools.py:469-532` for 
executing SQL via
   the AI assistant, passing `tool_name="execute_sql"` and a valid 
`database_id`.
   
   2. Supply an attacker-controlled SQL string in `arguments["sql"]`, such as 
`\"SELECT 1;
   DELETE FROM sensitive_table\"`, which begins with `SELECT` but includes a 
second statement
   after a semicolon.
   
   3. `execute_tool` dispatches this request to `tool_execute_sql()` at
   `extensions/ai_assistant/backend/src/ai_assistant/tools.py:419-457` with the 
provided
   `sql`.
   
   4. Inside `tool_execute_sql`, the safety check at lines `432-439` converts 
the SQL to
   upper case, verifies it starts with `SELECT`, and does not inspect for 
additional
   statements; it then calls `Database.execute(sql, options)` at
   `superset/models/core.py:1295-1309`, allowing any subsequent non-SELECT 
statement in the
   same string to execute if the database/driver accepts multi-statement 
execution.
   ```
   </details>
   <details>
   <summary><b>Prompt for AI Agent 🤖 </b></summary>
   
   ```mdx
   This is a comment left during a code review.
   
   **Path:** extensions/ai_assistant/backend/src/ai_assistant/tools.py
   **Line:** 432:439
   **Comment:**
        *Security: The safety check in the generic SQL execution helper only 
verifies that the query string starts with SELECT or WITH, but does not prevent 
multiple statements (e.g., `WITH ...; DELETE ...`), so a user-supplied query 
could still execute non-SELECT statements and violate the "read-only" guarantee.
   
   Validate the correctness of the flagged issue. If correct, How can I resolve 
this? If you propose a fix, implement it and please make it concise.
   ```
   </details>
   <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=3efe2cb4539af93c794fb6ef2f014fe87bc4207fcae23037885c07b4bd332af2&reaction=like'>👍</a>
 | <a 
href='https://app.codeant.ai/feedback?pr_url=https%3A%2F%2Fgithub.com%2Fapache%2Fsuperset%2Fpull%2F37950&comment_hash=3efe2cb4539af93c794fb6ef2f014fe87bc4207fcae23037885c07b4bd332af2&reaction=dislike'>👎</a>



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to