codeant-ai-for-open-source[bot] commented on code in PR #37677:
URL: https://github.com/apache/superset/pull/37677#discussion_r2765001048
########## superset/db_engine_specs/datastore.py: ########## @@ -0,0 +1,611 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import logging +import re +from datetime import datetime +from re import Pattern +from typing import Any, TYPE_CHECKING, TypedDict +from urllib import parse + +from apispec import APISpec +from apispec.ext.marshmallow import MarshmallowPlugin +from flask_babel import gettext as __ +from marshmallow import fields, Schema +from marshmallow.exceptions import ValidationError +from sqlalchemy import column, types +from sqlalchemy.engine.base import Engine +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.engine.url import URL +from sqlalchemy.sql import sqltypes + +from superset.constants import TimeGrain +from superset.databases.schemas import encrypted_field_properties, EncryptedString +from superset.databases.utils import make_url_safe +from superset.db_engine_specs.base import ( + BaseEngineSpec, + BasicPropertiesType, + DatabaseCategory, +) +from superset.db_engine_specs.exceptions import SupersetDBAPIConnectionError +from superset.errors import SupersetError, SupersetErrorType +from superset.exceptions import SupersetException +from superset.models.core import Database +from superset.sql.parse import LimitMethod, Table +from superset.superset_typing import ResultSetColumnType +from superset.utils import json +from superset.utils.hashing import hash_from_str + +logger = logging.getLogger(__name__) + +try: + import google.auth + from google.cloud import datastore + from google.oauth2 import service_account + + dependencies_installed = True +except ImportError: + dependencies_installed = False + +if TYPE_CHECKING: + from superset.models.core import Database # pragma: no cover + from superset.models.sql_lab import Query # pragma: no cover + +logger = logging.getLogger() + +CONNECTION_DATABASE_PERMISSIONS_REGEX = re.compile( + "Access Denied: Project (?P<project_name>.+?): User does not have " + + "datastore.databases.create permission in project (?P<project>.+?)" +) + +TABLE_DOES_NOT_EXIST_REGEX = re.compile( + 'Table name "(?P<table>.*?)" missing dataset while no default ' + "dataset is set in the request" +) + +COLUMN_DOES_NOT_EXIST_REGEX = re.compile( + r"Unrecognized name: (?P<column>.*?) at \[(?P<location>.+?)\]" +) + +SCHEMA_DOES_NOT_EXIST_REGEX = re.compile( + r"datastore error: 404 Not found: Dataset (?P<dataset>.*?):" + r"(?P<schema>.*?) 
was not found in location" +) + +SYNTAX_ERROR_REGEX = re.compile( + 'Syntax error: Expected end of input but got identifier "(?P<syntax_error>.+?)"' +) + +ma_plugin = MarshmallowPlugin() + + +class DatastoreParametersSchema(Schema): + credentials_info = EncryptedString( + required=False, + metadata={"description": "Contents of Datastore JSON credentials."}, + ) + query = fields.Dict(required=False) + + +class DatastoreParametersType(TypedDict): + credentials_info: dict[str, Any] + query: dict[str, Any] + + +class DatastoreEngineSpec(BaseEngineSpec): # pylint: disable=too-many-public-methods + """Engine spec for Google's Datastore + + As contributed by @hychang.1997.tw""" + + engine = "datastore" + engine_name = "Google Datastore" + max_column_name_length = 128 + disable_ssh_tunneling = True + + parameters_schema = DatastoreParametersSchema() + default_driver = "datastore" + sqlalchemy_uri_placeholder = "datastore://{project_id}" + + # Use FETCH_MANY to prevent Superset from injecting LIMIT via sqlglot AST + # manipulation. GQL queries should not be modified by sqlglot since it + # uses BigQuery dialect which transforms GQL-incompatible syntax. + limit_method = LimitMethod.FETCH_MANY + + metadata = { + "description": ( + "Google Cloud Datastore is a highly scalable NoSQL database " + "for your applications." + ), + "logo": "google-biquery.png", + "homepage_url": "https://cloud.google.com/datastore/", + "categories": [ + DatabaseCategory.CLOUD_GCP, + DatabaseCategory.PROPRIETARY, + ], + "pypi_packages": ["python-datastore-sqlalchemy"], + "connection_string": "datastore://{project_id}", + "authentication_methods": [ + { + "name": "Service Account JSON", + "description": ( + "Upload service account credentials JSON or paste in Secure Extra" + ), + "secure_extra": { + "credentials_info": { + "type": "service_account", + "project_id": "...", + "private_key_id": "...", + "private_key": "...", + "client_email": "...", + "client_id": "...", + "auth_uri": "...", + "token_uri": "...", + } + }, + }, + ], + "notes": ( + "Create a Service Account via GCP console with access to " + "datastore datasets." + ), + "docs_url": "https://github.com/splasky/Python-datastore-sqlalchemy", + } + + # Datastore doesn't maintain context when running multiple statements in the + # same cursor, so we need to run all statements at once + run_multiple_statements_as_one = True + + allows_hidden_cc_in_orderby = True + + supports_dynamic_schema = True + supports_catalog = supports_dynamic_catalog = supports_cross_catalog_queries = True + + # when editing the database, mask this field in `encrypted_extra` + # pylint: disable=invalid-name + encrypted_extra_sensitive_fields = {"$.credentials_info.private_key"} + + """ + https://www.python.org/dev/peps/pep-0249/#arraysize + raw_connections bypass the sqlalchemy-datastore query execution context and deal + with raw dbapi connection directly. + If this value is not set, the default value is set to 1. 
+ """ + arraysize = 5000 + + _date_trunc_functions = { + "DATE": "DATE_TRUNC", + "DATETIME": "DATETIME_TRUNC", + "TIME": "TIME_TRUNC", + "TIMESTAMP": "TIMESTAMP_TRUNC", + } + + _time_grain_expressions = { + None: "{col}", + TimeGrain.SECOND: "CAST(TIMESTAMP_SECONDS(" + "UNIX_SECONDS(CAST({col} AS TIMESTAMP))" + ") AS {type})", + TimeGrain.MINUTE: "CAST(TIMESTAMP_SECONDS(" + "60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 60)" + ") AS {type})", + TimeGrain.FIVE_MINUTES: "CAST(TIMESTAMP_SECONDS(" + "5*60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 5*60)" + ") AS {type})", + TimeGrain.TEN_MINUTES: "CAST(TIMESTAMP_SECONDS(" + "10*60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 10*60)" + ") AS {type})", + TimeGrain.FIFTEEN_MINUTES: "CAST(TIMESTAMP_SECONDS(" + "15*60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 15*60)" + ") AS {type})", + TimeGrain.THIRTY_MINUTES: "CAST(TIMESTAMP_SECONDS(" + "30*60 * DIV(UNIX_SECONDS(CAST({col} AS TIMESTAMP)), 30*60)" + ") AS {type})", + TimeGrain.HOUR: "{func}({col}, HOUR)", + TimeGrain.DAY: "{func}({col}, DAY)", + TimeGrain.WEEK: "{func}({col}, WEEK)", + TimeGrain.WEEK_STARTING_MONDAY: "{func}({col}, ISOWEEK)", + TimeGrain.MONTH: "{func}({col}, MONTH)", + TimeGrain.QUARTER: "{func}({col}, QUARTER)", + TimeGrain.YEAR: "{func}({col}, YEAR)", + } + + custom_errors: dict[Pattern[str], tuple[str, SupersetErrorType, dict[str, Any]]] = { + CONNECTION_DATABASE_PERMISSIONS_REGEX: ( + __( + "Unable to connect. Verify that the following roles are set " + 'on the service account: "Cloud Datastore Viewer", ' + '"Cloud Datastore User", "Cloud Datastore Creator"' + ), + SupersetErrorType.CONNECTION_DATABASE_PERMISSIONS_ERROR, + {}, + ), + TABLE_DOES_NOT_EXIST_REGEX: ( + __( + 'The table "%(table)s" does not exist. ' + "A valid table must be used to run this query.", + ), + SupersetErrorType.TABLE_DOES_NOT_EXIST_ERROR, + {}, + ), + COLUMN_DOES_NOT_EXIST_REGEX: ( + __('We can\'t seem to resolve column "%(column)s" at line %(location)s.'), + SupersetErrorType.COLUMN_DOES_NOT_EXIST_ERROR, + {}, + ), + SCHEMA_DOES_NOT_EXIST_REGEX: ( + __( + 'The schema "%(schema)s" does not exist. ' + "A valid schema must be used to run this query." + ), + SupersetErrorType.SCHEMA_DOES_NOT_EXIST_ERROR, + {}, + ), + SYNTAX_ERROR_REGEX: ( + __( + "Please check your query for syntax errors at or near " + '"%(syntax_error)s". Then, try running your query again.' + ), + SupersetErrorType.SYNTAX_ERROR, + {}, + ), + } + + @staticmethod + def _mutate_label(label: str) -> str: + """ + Datastore field_name should start with a letter or underscore and contain + only alphanumeric characters. Labels that start with a number are prefixed + with an underscore. Any unsupported characters are replaced with underscores + and an md5 hash is added to the end of the label to avoid possible + collisions. + + :param label: Expected expression label + :return: Conditionally mutated label + """ + label_hashed = "_" + hash_from_str(label) + + # if label starts with number, add underscore as first character + label_mutated = "_" + label if re.match(r"^\d", label) else label + + # replace non-alphanumeric characters with underscores + label_mutated = re.sub(r"[^\w]+", "_", label_mutated) + if label_mutated != label: + # add first 5 chars from md5 hash to label to avoid possible collisions + label_mutated += label_hashed[:6] + + return label_mutated + + @classmethod + def _truncate_label(cls, label: str) -> str: + """Datastore requires column names start with either a letter or + underscore. 
To make sure this is always the case, an underscore is prefixed + to the md5 hash of the original label. + + :param label: expected expression label + :return: truncated label + """ + return "_" + hash_from_str(label) + + @classmethod + def convert_dttm( + cls, target_type: str, dttm: datetime, db_extra: dict[str, Any] | None = None + ) -> str | None: + sqla_type = cls.get_sqla_column_type(target_type) + if isinstance(sqla_type, types.Date): + return f"CAST('{dttm.date().isoformat()}' AS DATE)" + if isinstance(sqla_type, types.TIMESTAMP): + return f"""CAST('{dttm.isoformat(timespec="microseconds")}' AS TIMESTAMP)""" + if isinstance(sqla_type, types.DateTime): + return f"""CAST('{dttm.isoformat(timespec="microseconds")}' AS DATETIME)""" + if isinstance(sqla_type, types.Time): + return f"""CAST('{dttm.strftime("%H:%M:%S.%f")}' AS TIME)""" + return None + + @classmethod + def fetch_data(cls, cursor: Any, limit: int | None = None) -> list[tuple[Any, ...]]: + data = super().fetch_data(cursor, limit) + # Support type Datastore Row, introduced here PR #4071 + # google.cloud.datastore.table.Row + if data and type(data[0]).__name__ == "Row": + data = [r.values() for r in data] # type: ignore + return data + + @classmethod + def _get_client(cls, engine: Engine, database: Database) -> datastore.Client: + """ + Return the Datastore client associated with an engine. + """ + if not dependencies_installed: + raise SupersetException( + "Could not import libraries needed to connect to Datastore." + ) + + if credentials_info := engine.dialect.credentials_info: + credentials = service_account.Credentials.from_service_account_info( + credentials_info + ) + return datastore.Client(credentials=credentials) + + try: + credentials = google.auth.default()[0] + return datastore.Client(credentials=credentials, database=database) Review Comment: **Suggestion:** The Datastore client is instantiated with an unsupported keyword argument `database`, which will raise a TypeError at runtime when default Google credentials are used, preventing connections from being established. [type error] <details> <summary><b>Severity Level:</b> Critical 🚨</summary> ```mdx - ❌ Datastore DB connections fail during client instantiation. - ❌ Default catalog resolution (`get_default_catalog`) breaks. - ⚠️ Any metadata calls relying on `_get_client` error. ``` </details> ```suggestion return datastore.Client(credentials=credentials) ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Import the module `superset/db_engine_specs/datastore.py` (occurs on any process that loads DB engine specs). The function `_get_client` is defined starting at line 329 and executed when a Datastore client is needed. 2. Trigger a code path that calls `DatastoreEngineSpec.get_default_catalog(database)` (file: superset/db_engine_specs/datastore.py, lines 353-368). At line 366 the code executes `client = cls._get_client(engine, database)`. 3. In `_get_client` (file: superset/db_engine_specs/datastore.py, lines 329-346), if `engine.dialect.credentials_info` is falsy (no explicit service-account provided) the code reaches the `try:` block at line 344 and executes `credentials = google.auth.default()[0]` (line 345). 4. Immediately after, the code executes the existing call `return datastore.Client(credentials=credentials, database=database)` (line 346). 
The google Cloud Datastore client does not accept a `database` keyword in its constructor in supported client versions, causing Python to raise TypeError like: "TypeError: __init__() got an unexpected keyword argument 'database'". This prevents `get_default_catalog()` from returning `client.project` and causes connection/metadata lookup to fail. (Concrete evidence: lines 345-346 in the file show the exact offending call.) ``` </details> <details> <summary><b>Prompt for AI Agent 🤖 </b></summary> ```mdx This is a comment left during a code review. **Path:** superset/db_engine_specs/datastore.py **Line:** 346:346 **Comment:** *Type Error: The Datastore client is instantiated with an unsupported keyword argument `database`, which will raise a TypeError at runtime when default Google credentials are used, preventing connections from being established. Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise. ``` </details> ########## tests/integration_tests/db_engine_specs/datastore_tests.py: ########## @@ -0,0 +1,516 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import unittest.mock as mock +from datetime import datetime +from typing import Any + +import pytest +from marshmallow.exceptions import ValidationError +from sqlalchemy import column +from sqlalchemy.engine.url import make_url + +from superset.connectors.sqla.models import TableColumn +from superset.db_engine_specs.base import BaseEngineSpec +from superset.db_engine_specs.datastore import DatastoreEngineSpec +from superset.errors import ErrorLevel, SupersetError, SupersetErrorType +from superset.superset_typing import ResultSetColumnType +from tests.integration_tests.base_tests import SupersetTestCase +from tests.integration_tests.fixtures.birth_names_dashboard import ( + load_birth_names_dashboard_with_slices, # noqa: F401 + load_birth_names_data, # noqa: F401 +) + + +class TestDatastoreDbEngineSpec(SupersetTestCase): + def test_datastore_sqla_column_label(self): + """ + DB Eng Specs (datastore): Test column label + """ + # Expected labels with SHA-256 hash suffix (first 5 chars prefixed with _) + test_cases = { + "Col": "Col", + "SUM(x)": "SUM_x__b681e", + "SUM[x]": "SUM_x__ceaf6", + "12345_col": "_12345_col_b1415", + } + for original, expected in test_cases.items(): + actual = DatastoreEngineSpec.make_label_compatible(column(original).name) + assert actual == expected + + def test_timegrain_expressions(self): + """ + DB Eng Specs (datastore): Test time grain expressions + """ + col = column("temporal") + test_cases = { + "DATE": "DATE_TRUNC(temporal, HOUR)", + "TIME": "TIME_TRUNC(temporal, HOUR)", + "DATETIME": "DATETIME_TRUNC(temporal, HOUR)", + "TIMESTAMP": "TIMESTAMP_TRUNC(temporal, HOUR)", + } + for type_, expected in test_cases.items(): + col.type = type_ + actual = DatastoreEngineSpec.get_timestamp_expr( + col=col, pdf=None, time_grain="PT1H" + ) + assert str(actual) == expected + + def test_custom_minute_timegrain_expressions(self): + """ + DB Eng Specs (datastore): Test time grain expressions + """ + col = column("temporal") + test_cases = { + "DATE": "CAST(TIMESTAMP_SECONDS(" + "5*60 * DIV(UNIX_SECONDS(CAST(temporal AS TIMESTAMP)), 5*60)" + ") AS DATE)", + "DATETIME": "CAST(TIMESTAMP_SECONDS(" + "5*60 * DIV(UNIX_SECONDS(CAST(temporal AS TIMESTAMP)), 5*60)" + ") AS DATETIME)", + "TIMESTAMP": "CAST(TIMESTAMP_SECONDS(" + "5*60 * DIV(UNIX_SECONDS(CAST(temporal AS TIMESTAMP)), 5*60)" + ") AS TIMESTAMP)", + } + for type_, expected in test_cases.items(): + col.type = type_ + actual = DatastoreEngineSpec.get_timestamp_expr( + col=col, pdf=None, time_grain="PT5M" + ) + assert str(actual) == expected + + def test_fetch_data(self): + """ + DB Eng Specs (datastore): Test fetch data + """ + + # Mock a google.cloud.datastore.table.Row + class Row: + def __init__(self, value): + self._value = value + + def values(self): + return self._value + + data1 = [(1, "foo")] + with mock.patch.object(BaseEngineSpec, "fetch_data", return_value=data1): + result = DatastoreEngineSpec.fetch_data(None, 0) + assert result == data1 + + data2 = [Row(1), Row(2)] + with mock.patch.object(BaseEngineSpec, "fetch_data", return_value=data2): + result = DatastoreEngineSpec.fetch_data(None, 0) + assert result == [1, 2] Review Comment: **Suggestion:** The `test_fetch_data` stub for the Datastore `Row` type returns a bare scalar from `values()` and asserts a list of scalars, which contradicts the `fetch_data` contract of returning a list of tuples and can hide bugs with multi-column rows; the stub should return a tuple (or sequence) and the assertion should expect a list of tuples. 
[logic error] <details> <summary><b>Severity Level:</b> Major ⚠️</summary> ```mdx - ❌ Test may miss multi-column fetch_data regressions. - ⚠️ Datastore fetch_data integration test weaker than intended. ``` </details> ```suggestion return (self._value,) data1 = [(1, "foo")] with mock.patch.object(BaseEngineSpec, "fetch_data", return_value=data1): result = DatastoreEngineSpec.fetch_data(None, 0) assert result == data1 data2 = [Row(1), Row(2)] with mock.patch.object(BaseEngineSpec, "fetch_data", return_value=data2): result = DatastoreEngineSpec.fetch_data(None, 0) assert result == [(1,), (2,)] ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Run the single integration test: pytest tests/integration_tests/db_engine_specs/datastore_tests.py::TestDatastoreDbEngineSpec::test_fetch_data -q. 2. The test defines a local Row stub at tests/integration_tests/db_engine_specs/datastore_tests.py:101-106 whose values() returns a bare scalar (the code visible in the test file). 3. The test patches BaseEngineSpec.fetch_data to return a list of those Row objects at tests/integration_tests/db_engine_specs/datastore_tests.py:114-115. 4. DatastoreEngineSpec.fetch_data (superset/db_engine_specs/datastore.py:320-326) calls r.values() for each Row; because the test stub returns a scalar, the resulting list contains scalars. The test asserts a list of scalars at tests/...:116, so it passes, but this contradicts the declared fetch_data contract (list[tuple]) and would not catch regressions where real Row.values() returns a sequence/tuple. Changing the stub to return a tuple and asserting [(1,), (2,)] makes the test verify the expected shape. ``` </details> <details> <summary><b>Prompt for AI Agent 🤖 </b></summary> ```mdx This is a comment left during a code review. **Path:** tests/integration_tests/db_engine_specs/datastore_tests.py **Line:** 106:116 **Comment:** *Logic Error: The `test_fetch_data` stub for the Datastore `Row` type returns a bare scalar from `values()` and asserts a list of scalars, which contradicts the `fetch_data` contract of returning a list of tuples and can hide bugs with multi-column rows; the stub should return a tuple (or sequence) and the assertion should expect a list of tuples. Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise. ``` </details> ########## tests/integration_tests/db_engine_specs/datastore_tests.py: ########## @@ -0,0 +1,516 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import unittest.mock as mock +from datetime import datetime +from typing import Any + +import pytest +from marshmallow.exceptions import ValidationError +from sqlalchemy import column Review Comment: **Suggestion:** This integration test module patches `sqlalchemy_datastore.*` in a decorator, which causes the entire test module import to fail with `ModuleNotFoundError` when the optional `sqlalchemy_datastore` package is not installed; using `pytest.importorskip("sqlalchemy_datastore")` at module import time will skip the whole test module instead of crashing when the dependency is missing. [possible bug] <details> <summary><b>Severity Level:</b> Major ⚠️</summary> ```mdx - ❌ Integration test module import fails during collection. - ⚠️ CI or local test runs may error without optional dependency. ``` </details> ```suggestion pytest.importorskip("sqlalchemy_datastore") ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Ensure the optional package is not installed: uninstall sqlalchemy_datastore from the test environment (pip uninstall sqlalchemy-datastore). 2. Run pytest collection for the integration tests: pytest tests/integration_tests/db_engine_specs/datastore_tests.py -q. Pytest will import the module at tests/integration_tests/db_engine_specs/datastore_tests.py (file top-level imports and decorators live in this file). 3. During module import, the decorators at tests/integration_tests/db_engine_specs/datastore_tests.py:228-236 (the @mock.patch("sqlalchemy_datastore.base.create_datastore_client", ...) and @mock.patch("sqlalchemy_datastore._helpers.create_datastore_client", ...)) are applied; mock.patch with a string target attempts to resolve/import the target module, which raises ModuleNotFoundError when sqlalchemy_datastore is absent. 4. Observe ModuleNotFoundError preventing test collection and causing the test run to fail. Using pytest.importorskip("sqlalchemy_datastore") at module import (as suggested) will skip the whole test module on missing optional dependency instead of crashing. ``` </details> <details> <summary><b>Prompt for AI Agent 🤖 </b></summary> ```mdx This is a comment left during a code review. **Path:** tests/integration_tests/db_engine_specs/datastore_tests.py **Line:** 23:23 **Comment:** *Possible Bug: This integration test module patches `sqlalchemy_datastore.*` in a decorator, which causes the entire test module import to fail with `ModuleNotFoundError` when the optional `sqlalchemy_datastore` package is not installed; using `pytest.importorskip("sqlalchemy_datastore")` at module import time will skip the whole test module instead of crashing when the dependency is missing. Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise. ``` </details> ########## tests/integration_tests/db_engine_specs/datastore_tests.py: ########## @@ -0,0 +1,516 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import unittest.mock as mock +from datetime import datetime +from typing import Any + +import pytest +from marshmallow.exceptions import ValidationError +from sqlalchemy import column +from sqlalchemy.engine.url import make_url + +from superset.connectors.sqla.models import TableColumn +from superset.db_engine_specs.base import BaseEngineSpec +from superset.db_engine_specs.datastore import DatastoreEngineSpec +from superset.errors import ErrorLevel, SupersetError, SupersetErrorType +from superset.superset_typing import ResultSetColumnType +from tests.integration_tests.base_tests import SupersetTestCase +from tests.integration_tests.fixtures.birth_names_dashboard import ( + load_birth_names_dashboard_with_slices, # noqa: F401 + load_birth_names_data, # noqa: F401 +) + + +class TestDatastoreDbEngineSpec(SupersetTestCase): + def test_datastore_sqla_column_label(self): + """ + DB Eng Specs (datastore): Test column label + """ + # Expected labels with SHA-256 hash suffix (first 5 chars prefixed with _) + test_cases = { + "Col": "Col", + "SUM(x)": "SUM_x__b681e", + "SUM[x]": "SUM_x__ceaf6", + "12345_col": "_12345_col_b1415", + } + for original, expected in test_cases.items(): + actual = DatastoreEngineSpec.make_label_compatible(column(original).name) + assert actual == expected + + def test_timegrain_expressions(self): + """ + DB Eng Specs (datastore): Test time grain expressions + """ + col = column("temporal") + test_cases = { + "DATE": "DATE_TRUNC(temporal, HOUR)", + "TIME": "TIME_TRUNC(temporal, HOUR)", + "DATETIME": "DATETIME_TRUNC(temporal, HOUR)", + "TIMESTAMP": "TIMESTAMP_TRUNC(temporal, HOUR)", + } + for type_, expected in test_cases.items(): + col.type = type_ + actual = DatastoreEngineSpec.get_timestamp_expr( + col=col, pdf=None, time_grain="PT1H" + ) + assert str(actual) == expected + + def test_custom_minute_timegrain_expressions(self): + """ + DB Eng Specs (datastore): Test time grain expressions + """ + col = column("temporal") + test_cases = { + "DATE": "CAST(TIMESTAMP_SECONDS(" + "5*60 * DIV(UNIX_SECONDS(CAST(temporal AS TIMESTAMP)), 5*60)" + ") AS DATE)", + "DATETIME": "CAST(TIMESTAMP_SECONDS(" + "5*60 * DIV(UNIX_SECONDS(CAST(temporal AS TIMESTAMP)), 5*60)" + ") AS DATETIME)", + "TIMESTAMP": "CAST(TIMESTAMP_SECONDS(" + "5*60 * DIV(UNIX_SECONDS(CAST(temporal AS TIMESTAMP)), 5*60)" + ") AS TIMESTAMP)", + } + for type_, expected in test_cases.items(): + col.type = type_ + actual = DatastoreEngineSpec.get_timestamp_expr( + col=col, pdf=None, time_grain="PT5M" + ) + assert str(actual) == expected + + def test_fetch_data(self): + """ + DB Eng Specs (datastore): Test fetch data + """ + + # Mock a google.cloud.datastore.table.Row + class Row: + def __init__(self, value): + self._value = value + + def values(self): + return self._value + + data1 = [(1, "foo")] + with mock.patch.object(BaseEngineSpec, "fetch_data", return_value=data1): + result = DatastoreEngineSpec.fetch_data(None, 0) + assert result == data1 + + data2 = [Row(1), Row(2)] + with mock.patch.object(BaseEngineSpec, "fetch_data", return_value=data2): + result = DatastoreEngineSpec.fetch_data(None, 0) + assert result 
== [1, 2] + + def test_extract_errors(self): + msg = "403 POST https://datastore.googleapis.com/: Access Denied: Project my-project: User does not have datastore.databases.create permission in project my-project" # noqa: E501 + result = DatastoreEngineSpec.extract_errors(Exception(msg)) + assert result == [ + SupersetError( + message="Unable to connect. Verify that the following roles are set " + 'on the service account: "Cloud Datastore Viewer", ' + '"Cloud Datastore User", "Cloud Datastore Creator"', + error_type=SupersetErrorType.CONNECTION_DATABASE_PERMISSIONS_ERROR, + level=ErrorLevel.ERROR, + extra={ + "engine_name": "Google Datastore", + "issue_codes": [ + { + "code": 1017, + "message": "", + } + ], + }, + ) + ] + + msg = "datastore error: 404 Not found: Dataset fakeDataset:bogusSchema was not found in location" # noqa: E501 + result = DatastoreEngineSpec.extract_errors(Exception(msg)) + assert result == [ + SupersetError( + message='The schema "bogusSchema" does not exist. A valid schema must be used to run this query.', # noqa: E501 + error_type=SupersetErrorType.SCHEMA_DOES_NOT_EXIST_ERROR, + level=ErrorLevel.ERROR, + extra={ + "engine_name": "Google Datastore", + "issue_codes": [ + { + "code": 1003, + "message": "Issue 1003 - There is a syntax error in the SQL query. Perhaps there was a misspelling or a typo.", # noqa: E501 + }, + { + "code": 1004, + "message": "Issue 1004 - The column was deleted or renamed in the database.", # noqa: E501 + }, + ], + }, + ) + ] + + msg = 'Table name "badtable" missing dataset while no default dataset is set in the request' # noqa: E501 + result = DatastoreEngineSpec.extract_errors(Exception(msg)) + assert result == [ + SupersetError( + message='The table "badtable" does not exist. A valid table must be used to run this query.', # noqa: E501 + error_type=SupersetErrorType.TABLE_DOES_NOT_EXIST_ERROR, + level=ErrorLevel.ERROR, + extra={ + "engine_name": "Google Datastore", + "issue_codes": [ + { + "code": 1003, + "message": "Issue 1003 - There is a syntax error in the SQL query. Perhaps there was a misspelling or a typo.", # noqa: E501 + }, + { + "code": 1005, + "message": "Issue 1005 - The table was deleted or renamed in the database.", # noqa: E501 + }, + ], + }, + ) + ] + + msg = "Unrecognized name: badColumn at [1:8]" + result = DatastoreEngineSpec.extract_errors(Exception(msg)) + assert result == [ + SupersetError( + message='We can\'t seem to resolve column "badColumn" at line 1:8.', + error_type=SupersetErrorType.COLUMN_DOES_NOT_EXIST_ERROR, + level=ErrorLevel.ERROR, + extra={ + "engine_name": "Google Datastore", + "issue_codes": [ + { + "code": 1003, + "message": "Issue 1003 - There is a syntax error in the SQL query. Perhaps there was a misspelling or a typo.", # noqa: E501 + }, + { + "code": 1004, + "message": "Issue 1004 - The column was deleted or renamed in the database.", # noqa: E501 + }, + ], + }, + ) + ] + + msg = 'Syntax error: Expected end of input but got identifier "from_"' + result = DatastoreEngineSpec.extract_errors(Exception(msg)) + assert result == [ + SupersetError( + message='Please check your query for syntax errors at or near "from_". 
Then, try running your query again.', # noqa: E501 + error_type=SupersetErrorType.SYNTAX_ERROR, + level=ErrorLevel.ERROR, + extra={ + "engine_name": "Google Datastore", + "issue_codes": [ + { + "code": 1030, + "message": "Issue 1030 - The query has a syntax error.", + } + ], + }, + ) + ] + + @mock.patch("superset.models.core.Database.db_engine_spec", DatastoreEngineSpec) + @mock.patch( + "sqlalchemy_datastore.base.create_datastore_client", + mock.Mock(return_value=(mock.Mock(), mock.Mock())), + ) + @mock.patch( + "sqlalchemy_datastore._helpers.create_datastore_client", + mock.Mock(return_value=(mock.Mock(), mock.Mock())), + ) + @pytest.mark.usefixtures("load_birth_names_dashboard_with_slices") + def test_calculated_column_in_order_by(self): + table = self.get_table(name="birth_names") + TableColumn( + column_name="gender_cc", + type="VARCHAR(255)", + table=table, + expression=""" + case + when gender='boy' then 'male' + else 'female' + end + """, + ) + + table.database.sqlalchemy_uri = "datastore://" + query_obj = { + "groupby": ["gender_cc"], + "is_timeseries": False, + "filter": [], + "orderby": [["gender_cc", True]], + } + sql = table.get_query_str(query_obj) + assert "ORDER BY gender_cc ASC" in sql + + def test_build_sqlalchemy_uri(self): + """ + DB Eng Specs (datastore): Test building SQLAlchemy URI from parameters + """ + parameters: dict[str, Any] = {"query": {}} + encrypted_extra = { + "credentials_info": { + "project_id": "my-project", + "private_key": "SECRET", + } + } + result = DatastoreEngineSpec.build_sqlalchemy_uri(parameters, encrypted_extra) + assert result == "datastore://my-project/?" + + # Test with query parameters + parameters_with_query: dict[str, Any] = {"query": {"location": "US"}} + result = DatastoreEngineSpec.build_sqlalchemy_uri( + parameters_with_query, encrypted_extra + ) + assert result == "datastore://my-project/?location=US" + + # Test missing encrypted_extra raises ValidationError + with pytest.raises(ValidationError, match="Missing service credentials"): + DatastoreEngineSpec.build_sqlalchemy_uri(parameters, None) + + # Test missing project_id raises ValidationError + bad_extra = {"credentials_info": {"private_key": "SECRET"}} + with pytest.raises(ValidationError, match="Invalid service credentials"): + DatastoreEngineSpec.build_sqlalchemy_uri(parameters, bad_extra) + + def test_get_function_names(self): + """ + DB Eng Specs (datastore): Test retrieving function names for autocomplete + """ + database = mock.MagicMock() + result = DatastoreEngineSpec.get_function_names(database) + assert result == ["sum", "avg", "count"] + + def test_get_view_names(self): + """ + DB Eng Specs (datastore): Test that Datastore returns no view names + """ + database = mock.MagicMock() + inspector = mock.MagicMock() + result = DatastoreEngineSpec.get_view_names(database, inspector, "some_schema") + assert result == set() + + def test_validate_parameters(self): + """ + DB Eng Specs (datastore): Test parameter validation returns no errors + """ + result = DatastoreEngineSpec.validate_parameters( + { + "host": "localhost", + "port": 5432, + "username": "", + "password": "", + "database": "", + "query": {}, + } + ) + assert result == [] + + def test_get_allow_cost_estimate(self): + """ + DB Eng Specs (datastore): Test that cost estimate is not supported + """ + assert DatastoreEngineSpec.get_allow_cost_estimate({}) is False + + def test_parse_error_exception(self): + """ + DB Eng Specs (datastore): Test error message parsing extracts first line + """ + multiline_msg = ( + 
'datastore error: 400 Syntax error: Table "t" must be qualified.\n' + "\n" + "(job ID: abc-123)\n" + "\n" + " -----Query Job SQL Follows-----\n" + " 1:select * from t\n" + ) + result = DatastoreEngineSpec.parse_error_exception(Exception(multiline_msg)) + assert str(result) == ( + 'datastore error: 400 Syntax error: Table "t" must be qualified.' + ) + + # Simple single-line messages pass through unchanged + simple_msg = "Some simple error" + result = DatastoreEngineSpec.parse_error_exception(Exception(simple_msg)) + assert str(result) == simple_msg + + def test_convert_dttm(self): + """ + DB Eng Specs (datastore): Test datetime conversion for all supported types + """ + dttm = datetime(2019, 1, 2, 3, 4, 5, 678900) + + assert ( + DatastoreEngineSpec.convert_dttm("DATE", dttm) + == "CAST('2019-01-02' AS DATE)" + ) + assert ( + DatastoreEngineSpec.convert_dttm("DATETIME", dttm) + == "CAST('2019-01-02T03:04:05.678900' AS DATETIME)" + ) + assert ( + DatastoreEngineSpec.convert_dttm("TIMESTAMP", dttm) + == "CAST('2019-01-02T03:04:05.678900' AS TIMESTAMP)" + ) + assert ( + DatastoreEngineSpec.convert_dttm("TIME", dttm) + == "CAST('03:04:05.678900' AS TIME)" + ) + assert DatastoreEngineSpec.convert_dttm("UnknownType", dttm) is None + + def test_get_parameters_from_uri(self): + """ + DB Eng Specs (datastore): Test extracting parameters from URI + """ + encrypted_extra = { + "credentials_info": { + "project_id": "my-project", + "private_key": "SECRET", + } + } + result = DatastoreEngineSpec.get_parameters_from_uri( + "datastore://my-project/", encrypted_extra + ) + assert result == { + "credentials_info": { + "project_id": "my-project", + "private_key": "SECRET", + }, + "query": {}, + } + + # URI with query parameters + result = DatastoreEngineSpec.get_parameters_from_uri( + "datastore://my-project/?location=US", encrypted_extra + ) + assert result["query"] == {"location": "US"} + + # Missing encrypted_extra raises ValidationError + with pytest.raises(ValidationError, match="Invalid service credentials"): + DatastoreEngineSpec.get_parameters_from_uri("datastore://my-project/", None) + + def test_get_dbapi_exception_mapping(self): + """ + DB Eng Specs (datastore): Test DBAPI exception mapping includes + DefaultCredentialsError + """ + from superset.db_engine_specs.exceptions import SupersetDBAPIConnectionError Review Comment: **Suggestion:** The `test_get_dbapi_exception_mapping` test assumes that the `google.auth` package is installed and will fail when it is not, even though the engine spec can legitimately return an empty mapping in that case; adding `pytest.importorskip("google.auth")` inside the test ensures it is skipped rather than failing when the optional dependency is missing. [possible bug] <details> <summary><b>Severity Level:</b> Major ⚠️</summary> ```mdx - ❌ test_get_dbapi_exception_mapping fails without google.auth. - ⚠️ Test suite unstable across developer environments. ``` </details> ```suggestion pytest.importorskip("google.auth") ``` <details> <summary><b>Steps of Reproduction ✅ </b></summary> ```mdx 1. Remove google-auth from the test environment (pip uninstall google-auth) so the optional google.auth package is unavailable. 2. Run just this test: pytest tests/integration_tests/db_engine_specs/datastore_tests.py::TestDatastoreDbEngineSpec::test_get_dbapi_exception_mapping -q. 3. 
The test calls DatastoreEngineSpec.get_dbapi_exception_mapping (superset/db_engine_specs/datastore.py) and asserts len(mapping) == 1; when google.auth is missing the engine spec can legitimately return an empty mapping and the assertion fails, causing the test to fail.
4. Adding pytest.importorskip("google.auth") at the top of the test (as suggested) will skip the test in environments missing the optional dependency instead of failing.
```
</details>

<details>
<summary><b>Prompt for AI Agent 🤖 </b></summary>

```mdx
This is a comment left during a code review.
**Path:** tests/integration_tests/db_engine_specs/datastore_tests.py
**Line:** 413:413
**Comment:** *Possible Bug: The `test_get_dbapi_exception_mapping` test assumes that the `google.auth` package is installed and will fail when it is not, even though the engine spec can legitimately return an empty mapping in that case; adding `pytest.importorskip("google.auth")` inside the test ensures it is skipped rather than failing when the optional dependency is missing.

Validate the correctness of the flagged issue. If correct, How can I resolve this?
If you propose a fix, implement it and please make it concise.
```

</details>
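For reference, a minimal sketch of the `pytest.importorskip` pattern proposed in the two optional-dependency comments above: a module-level skip so the `mock.patch("sqlalchemy_datastore.*")` decorators never trigger a `ModuleNotFoundError` during collection, plus an in-test skip for the `google.auth`-dependent case. The test body and its assertion are illustrative only and do not reproduce the exact test in this PR.

```python
import pytest

# Skip collecting this whole module when the optional dialect is missing,
# instead of crashing at import time inside the mock.patch decorators.
pytest.importorskip("sqlalchemy_datastore")


def test_get_dbapi_exception_mapping() -> None:
    # Skip only this test when google-auth is absent; in that environment the
    # engine spec legitimately returns an empty exception mapping.
    pytest.importorskip("google.auth")

    from superset.db_engine_specs.datastore import DatastoreEngineSpec

    mapping = DatastoreEngineSpec.get_dbapi_exception_mapping()
    # With google.auth installed, DefaultCredentialsError is expected to be mapped.
    assert mapping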
