(doris-mcp-server) branch master updated: Fix SELECT detection and max_rows bypass for comment-prefixed SQL (#87)

freeoneplus Thu, 21 May 2026 04:42:12 -0700

This is an automated email from the ASF dual-hosted git repository.

FreeOnePlus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris-mcp-server.git



The following commit(s) were added to refs/heads/master by this push:
     new 3fe6811  Fix SELECT detection and max_rows bypass for comment-prefixed 
SQL (#87)
3fe6811 is described below

commit 3fe6811632e60c103b7b1bf98c502967f89dc53f
Author: Bibo <[email protected]>
AuthorDate: Thu May 21 19:41:56 2026 +0800

    Fix SELECT detection and max_rows bypass for comment-prefixed SQL (#87)
    
    * [fix] Use cursor.description to detect result-returning queries
    
    Replace brittle sql_upper.startswith() keyword check with
    cursor.description, which reliably detects any statement that
    returns a result set (SELECT, SHOW, DESCRIBE, EXPLAIN, CTEs, etc.)
    without maintaining a hardcoded list of SQL keywords.
    
    * Fix max_rows bypass for comment-prefixed SELECT in execute_sql_for_mcp
    
    Same root cause as #75: sql.upper().startswith("SELECT") at
    query_executor.py:689 returns False when SQL begins with a leading
    '--' or '/* */' comment, so the auto-injected LIMIT {max_rows} is
    silently skipped and large queries can be dispatched unbounded.
    
    This callsite runs before cursor.execute, so cursor.description is not
    available. Use a small helper that strips leading SQL comments before
    extracting the first keyword.
    
    * Add regression tests for result-set detection contract
    
    These tests pin the user-facing contract of DorisConnection.execute():
    any SQL the driver reports as producing a result set must return its
    rows, regardless of how the statement is phrased.
    
    Guards against regression of:
    - Issue #62 Bug 5 (CTE / WITH returning empty data)
    - The leading-comment bug fixed in #75 (data: [] with row_count > 0
      for SELECT prefixed by '--' or '/* */')
    - The same-root-cause LIMIT bypass fixed in the previous commit
    
    Also adds unit tests for the new get_first_sql_keyword helper.
    
    ---------
    
    Co-authored-by: jonas.brami <[email protected]>
---
 doris_mcp_server/utils/db.py             |  29 ++++--
 doris_mcp_server/utils/query_executor.py |   9 +-
 test/utils/test_db.py                    | 161 ++++++++++++++++++++++++++++++-
 3 files changed, 185 insertions(+), 14 deletions(-)

diff --git a/doris_mcp_server/utils/db.py b/doris_mcp_server/utils/db.py
index cd7b0ce..a48811f 100644
--- a/doris_mcp_server/utils/db.py
+++ b/doris_mcp_server/utils/db.py
@@ -24,6 +24,7 @@ Supports asynchronous operations and concurrent connection 
management, ensuring
 
 import asyncio
 import logging
+import re
 import time
 from contextlib import asynccontextmanager
 from dataclasses import dataclass
@@ -36,6 +37,22 @@ from aiomysql import Connection, Pool
 from .logger import get_logger
 
 
+_SQL_COMMENT_RE = re.compile(r"/\*.*?\*/|--[^\n]*", re.DOTALL)
+
+
+def get_first_sql_keyword(sql: str) -> str:
+    """Return the first SQL keyword (uppercase), ignoring leading 
comments/whitespace.
+
+    Strips `--` line comments and `/* */` block comments before extracting
+    the first token. A leading comment must not change how a statement is
+    classified (e.g. `-- note\\nSELECT 1` is still a SELECT).
+    """
+    if not sql:
+        return ""
+    stripped = _SQL_COMMENT_RE.sub("", sql).strip()
+    if not stripped:
+        return ""
+    return stripped.split(None, 1)[0].upper()
 
 
 @dataclass
@@ -95,15 +112,9 @@ class DorisConnection:
             async with self.connection.cursor(aiomysql.DictCursor) as cursor:
                 await cursor.execute(sql, params)
 
-                # Check if it's a query statement (statement that returns 
result set)
-                # FIX for Issue #62 Bug 5: Added WITH support for Common Table 
Expressions (CTE)
-                sql_upper = sql.strip().upper()
-                if (sql_upper.startswith("SELECT") or
-                    sql_upper.startswith("SHOW") or
-                    sql_upper.startswith("DESCRIBE") or
-                    sql_upper.startswith("DESC") or
-                    sql_upper.startswith("EXPLAIN") or
-                    sql_upper.startswith("WITH")):  # FIX: Support CTE queries
+                # cursor.description is set by the DB driver for any statement 
that returns rows,
+                # avoiding a brittle hardcoded keyword list (e.g. missing 
WITH/CTE, comments before keywords).
+                if cursor.description:
                     data = await cursor.fetchall()
                     row_count = len(data)
                 else:
diff --git a/doris_mcp_server/utils/query_executor.py 
b/doris_mcp_server/utils/query_executor.py
index 75abf34..b919cad 100644
--- a/doris_mcp_server/utils/query_executor.py
+++ b/doris_mcp_server/utils/query_executor.py
@@ -35,7 +35,7 @@ from decimal import Decimal
 
 import sqlparse
 
-from .db import DorisConnectionManager, QueryResult
+from .db import DorisConnectionManager, QueryResult, get_first_sql_keyword
 from .logger import get_logger
 from .sql_security_utils import get_auth_context
 
@@ -685,8 +685,11 @@ class DorisQueryExecutor:
                 else:
                     self.logger.warning("Security configuration not found, 
proceeding without validation")
 
-                # Add LIMIT if not present and it's a SELECT query
-                if sql.upper().startswith("SELECT") and "LIMIT" not in 
sql.upper():
+                # Add LIMIT if not present and it's a SELECT query.
+                # get_first_sql_keyword skips leading comments so `-- 
note\nSELECT ...`
+                # still gets the LIMIT cap (sql.startswith would silently 
bypass it).
+                sql_upper = sql.upper()
+                if get_first_sql_keyword(sql) == "SELECT" and "LIMIT" not in 
sql_upper:
                     if sql.endswith(";"):
                         sql = sql[:-1]
                     sql = f"{sql} LIMIT {limit}"
diff --git a/test/utils/test_db.py b/test/utils/test_db.py
index 4ad721a..e12d507 100644
--- a/test/utils/test_db.py
+++ b/test/utils/test_db.py
@@ -1,7 +1,11 @@
-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock
 import pytest
 
-from doris_mcp_server.utils.db import DorisConnection, DorisSessionCache
+from doris_mcp_server.utils.db import (
+    DorisConnection,
+    DorisSessionCache,
+    get_first_sql_keyword,
+)
 
 
 @pytest.fixture
@@ -76,3 +80,156 @@ class TestDorisSessionCache:
         connection_manager.release_connection.assert_any_call("query", 
mock_conn1)
         connection_manager.release_connection.assert_any_call("system", 
mock_conn2)
         assert connection_manager.release_connection.call_count == 2
+
+
+class TestGetFirstSqlKeyword:
+    """Unit tests for get_first_sql_keyword.
+
+    Used by query_executor.py:689 to detect SELECT before cursor.execute
+    (where cursor.description is not yet available), so the auto-injected
+    LIMIT {max_rows} cap also works when the SQL is comment-prefixed.
+    """
+
+    def test_plain_select(self):
+        assert get_first_sql_keyword("SELECT 1") == "SELECT"
+
+    def test_leading_whitespace(self):
+        assert get_first_sql_keyword("   \n\t SELECT 1") == "SELECT"
+
+    def test_lowercase(self):
+        assert get_first_sql_keyword("select 1") == "SELECT"
+
+    def test_line_comment_then_select(self):
+        sql = "-- a leading note\nSELECT 1"
+        assert get_first_sql_keyword(sql) == "SELECT"
+
+    def test_block_comment_then_select(self):
+        sql = "/* note */ SELECT 1"
+        assert get_first_sql_keyword(sql) == "SELECT"
+
+    def test_multiline_block_comment_then_select(self):
+        sql = "/*\n multi\n line\n*/\nSELECT 1"
+        assert get_first_sql_keyword(sql) == "SELECT"
+
+    def test_mixed_whitespace_and_comments(self):
+        sql = "  -- one\n  /* two */ \n  SELECT 1"
+        assert get_first_sql_keyword(sql) == "SELECT"
+
+    def test_comment_then_with_cte(self):
+        sql = "-- note\nWITH x AS (SELECT 1) SELECT * FROM x"
+        assert get_first_sql_keyword(sql) == "WITH"
+
+    def test_non_select_unaffected(self):
+        assert get_first_sql_keyword("INSERT INTO t VALUES (1)") == "INSERT"
+        assert get_first_sql_keyword("-- c\nINSERT INTO t VALUES (1)") == 
"INSERT"
+
+    def test_empty_and_only_comments(self):
+        assert get_first_sql_keyword("") == ""
+        assert get_first_sql_keyword("   ") == ""
+        assert get_first_sql_keyword("-- only a comment") == ""
+        assert get_first_sql_keyword("/* only */") == ""
+
+
+def _make_doris_connection(cursor_description, fetchall_rows, rowcount=0):
+    """Build a DorisConnection whose underlying cursor returns the given 
values.
+
+    The driver-level cursor is fully mocked: only `description`, `fetchall()`
+    and `rowcount` matter for the result-set-detection branch we want to test.
+    """
+    cursor = MagicMock()
+    cursor.execute = AsyncMock(return_value=None)
+    cursor.fetchall = AsyncMock(return_value=fetchall_rows)
+    cursor.description = cursor_description
+    cursor.rowcount = rowcount
+
+    cursor_ctx = MagicMock()
+    cursor_ctx.__aenter__ = AsyncMock(return_value=cursor)
+    cursor_ctx.__aexit__ = AsyncMock(return_value=None)
+
+    raw_connection = MagicMock()
+    raw_connection.cursor = MagicMock(return_value=cursor_ctx)
+
+    return DorisConnection(connection=raw_connection, session_id="test")
+
+
+class TestExecuteResultSetDetection:
+    """Behavior contract for DorisConnection.execute().
+
+    These tests pin the user-facing contract: any statement the driver
+    reports as producing a result set must have its rows returned, and any
+    statement that does not produce a result set must report rowcount.
+
+    Guards against regression of:
+    - Issue #62 Bug 5 (CTE / WITH returning empty data)
+    - The leading-comment bug (SELECT prefixed by `--` or `/* */` returning
+      empty data while row_count was non-zero)
+    - Future "missing keyword in the whitelist" bugs of the same class
+
+    The tests deliberately do not assert anything about how the SQL text is
+    parsed — they only assert that when `cursor.description` is populated,
+    rows are fetched, regardless of the SQL phrasing.
+    """
+
+    @pytest.mark.parametrize(
+        "sql",
+        [
+            "SELECT 1",
+            "   SELECT 1",
+            "-- leading line comment\nSELECT 1",
+            "/* leading block comment */ SELECT 1",
+            "/*\n multi\n line\n*/\nSELECT 1",
+            "  -- one\n  /* two */ \n  SELECT 1",
+            "(SELECT 1)",
+            "WITH t AS (SELECT 1) SELECT * FROM t",
+            "-- comment\nWITH t AS (SELECT 1) SELECT * FROM t",
+            "SHOW TABLES",
+            "DESC some_table",
+            "EXPLAIN SELECT 1",
+        ],
+        ids=[
+            "plain_select",
+            "leading_whitespace",
+            "line_comment_then_select",
+            "block_comment_then_select",
+            "multiline_block_comment",
+            "mixed_whitespace_and_comments",
+            "parenthesized_select",
+            "with_cte",
+            "comment_then_with_cte",
+            "show",
+            "desc",
+            "explain",
+        ],
+    )
+    async def test_returns_rows_when_driver_reports_result_set(self, sql):
+        rows = [{"col": 1}]
+        conn = _make_doris_connection(
+            cursor_description=[("col", None, None, None, None, None, None)],
+            fetchall_rows=rows,
+        )
+
+        result = await conn.execute(sql)
+
+        assert result.data == rows
+        assert result.row_count == len(rows)
+
+    @pytest.mark.parametrize(
+        "sql, affected",
+        [
+            ("INSERT INTO t VALUES (1)", 1),
+            ("UPDATE t SET x = 1", 5),
+            ("DELETE FROM t WHERE x = 1", 3),
+            ("CREATE TABLE t (x INT)", 0),
+        ],
+    )
+    async def test_no_fetch_when_driver_reports_no_result_set(self, sql, 
affected):
+        conn = _make_doris_connection(
+            cursor_description=None,
+            fetchall_rows=[],
+            rowcount=affected,
+        )
+
+        result = await conn.execute(sql)
+
+        assert result.data == []
+        assert result.row_count == affected


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris-mcp-server) branch master updated: Fix SELECT detection and max_rows bypass for comment-prefixed SQL (#87)

Reply via email to