Copilot commented on code in PR #36777:
URL: https://github.com/apache/superset/pull/36777#discussion_r2673545500
##########
tests/integration_tests/db_engine_specs/databricks_tests.py:
##########
@@ -59,3 +61,44 @@ def test_extras_with_ssl_custom(self):
extras = DatabricksNativeEngineSpec.get_extra_params(database)
connect_args = extras["engine_params"]["connect_args"]
assert connect_args["ssl"] == "1"
+
+ def test_handle_boolean_in_clause(self):
+ """
+ Test that boolean IN clauses use boolean literals instead of integers.
+
+ Databricks requires boolean literals (True/False) in IN clauses,
+ not integers (0/1). This test verifies that handle_boolean_in_clause
+ converts values properly and uses OR conditions with equality checks.
+ """
+ # Create a mock table with a boolean column
+ test_table = Table(
+ "test_table",
+ mock.MagicMock(),
+ Column("is_test_user", Boolean),
+ )
+ sqla_col = test_table.c.is_test_user
+
+ # Test with boolean values
+ result = DatabricksNativeEngineSpec.handle_boolean_in_clause(
+ sqla_col, [False]
+ )
+ # Verify the result is an OR condition (not an IN clause)
+ assert result is not None
+
+ # Test with integer values (should be converted to booleans)
+ result = DatabricksNativeEngineSpec.handle_boolean_in_clause(
+ sqla_col, [0, 1]
+ )
+ assert result is not None
+
+ # Test with string values
+ result = DatabricksNativeEngineSpec.handle_boolean_in_clause(
+ sqla_col, ["false", "true"]
+ )
+ assert result is not None
+
+ # Test with empty list (should return false condition)
+ from sqlalchemy import false
+ result = DatabricksNativeEngineSpec.handle_boolean_in_clause(sqla_col,
[])
+ # The result should be a false condition when all values are filtered
out
+ assert result is not None
Review Comment:
The test does not cover important edge cases such as: invalid numeric values
(e.g., 2, -1, 0.5), unrecognized string values (e.g., "yes", "no"), mixed
valid/invalid inputs, None values mixed with valid booleans, and duplicate
values that map to the same boolean. These edge cases should be tested to
ensure the implementation handles them correctly and to prevent regressions.
##########
superset/models/helpers.py:
##########
@@ -3079,19 +3079,45 @@ def get_sqla_query( # pylint:
disable=too-many-arguments,too-many-locals,too-ma
raise QueryObjectValidationError(
_("Filter value list cannot be empty")
)
- if len(eq) > len(
- eq_without_none := [x for x in eq if x is not None]
+ # Handle boolean columns specially for databases that
require
+ # boolean literals in IN clauses (e.g., Databricks)
+ if (
+ target_generic_type == utils.GenericDataType.BOOLEAN
+ and hasattr(db_engine_spec, "handle_boolean_in_clause")
):
- is_null_cond = sqla_col.is_(None)
- if eq:
- cond = or_(is_null_cond,
sqla_col.in_(eq_without_none))
+ if len(eq) > len(
+ eq_without_none := [x for x in eq if x is not None]
+ ):
+ is_null_cond = sqla_col.is_(None)
+ if eq_without_none:
+ boolean_cond =
db_engine_spec.handle_boolean_in_clause(
+ sqla_col, eq_without_none
+ )
+ cond = or_(is_null_cond, boolean_cond)
+ else:
+ cond = is_null_cond
else:
- cond = is_null_cond
+ cond = db_engine_spec.handle_boolean_in_clause(
+ sqla_col, list(eq)
Review Comment:
The conversion list(eq) is redundant since eq is already asserted to be a
tuple or list at line 3077. This creates an unnecessary copy of the data.
Consider passing eq directly to the method instead.
```suggestion
sqla_col, eq
```
##########
superset/db_engine_specs/databricks.py:
##########
@@ -242,6 +242,54 @@ def convert_dttm(
def epoch_to_dttm(cls) -> str:
return HiveEngineSpec.epoch_to_dttm()
+ @classmethod
+ def handle_boolean_in_clause(
+ cls, sqla_col: Any, values: list[Any]
+ ) -> Any:
+ """
+ Handle boolean values in IN clauses for Databricks.
+
+ Databricks requires boolean literals (True/False) in IN clauses,
+ not integers (0/1). This method uses OR conditions with equality
+ checks to ensure boolean literals are used instead of integers.
+
+ :param sqla_col: SQLAlchemy column element
+ :param values: List of values for the IN clause
+ :return: SQLAlchemy expression for the boolean IN clause
+ """
+ from sqlalchemy import or_
+
+ # Convert values to proper boolean types
+ boolean_values = []
+ for val in values:
+ if val is None:
+ continue
+ # Ensure boolean values are properly typed
+ if isinstance(val, bool):
+ boolean_values.append(val)
+ elif isinstance(val, (int, float)):
+ # Convert 0/1 to False/True
+ boolean_values.append(bool(val))
+ elif isinstance(val, str):
+ # Handle string representations
+ if val.lower() in ("true", "1"):
+ boolean_values.append(True)
+ elif val.lower() in ("false", "0"):
+ boolean_values.append(False)
+ else:
+ # For other types, try to convert to boolean
+ boolean_values.append(bool(val))
Review Comment:
The boolean conversion logic for integers and floats is incorrect. Using
bool() on any non-zero number will return True, which means values like 2, 3,
-1, 0.5, etc. will all be converted to True when they should likely be rejected
or handled differently. For a boolean column filter, only 0 and 1 should be
accepted as valid integer inputs. Consider checking if the value is
specifically 0 or 1 before converting, and either skip or raise an error for
other numeric values.
```suggestion
# Only accept canonical integer/float encodings 0 and 1
if val in (0, 1):
boolean_values.append(bool(val))
elif isinstance(val, str):
# Handle string representations
lower_val = val.lower()
if lower_val in ("true", "1"):
boolean_values.append(True)
elif lower_val in ("false", "0"):
boolean_values.append(False)
# Ignore all other types and non-canonical numeric/string values
```
##########
tests/integration_tests/db_engine_specs/databricks_tests.py:
##########
@@ -59,3 +61,44 @@ def test_extras_with_ssl_custom(self):
extras = DatabricksNativeEngineSpec.get_extra_params(database)
connect_args = extras["engine_params"]["connect_args"]
assert connect_args["ssl"] == "1"
+
+ def test_handle_boolean_in_clause(self):
+ """
+ Test that boolean IN clauses use boolean literals instead of integers.
+
+ Databricks requires boolean literals (True/False) in IN clauses,
+ not integers (0/1). This test verifies that handle_boolean_in_clause
+ converts values properly and uses OR conditions with equality checks.
+ """
+ # Create a mock table with a boolean column
+ test_table = Table(
+ "test_table",
+ mock.MagicMock(),
+ Column("is_test_user", Boolean),
+ )
+ sqla_col = test_table.c.is_test_user
+
+ # Test with boolean values
+ result = DatabricksNativeEngineSpec.handle_boolean_in_clause(
+ sqla_col, [False]
+ )
+ # Verify the result is an OR condition (not an IN clause)
+ assert result is not None
+
+ # Test with integer values (should be converted to booleans)
+ result = DatabricksNativeEngineSpec.handle_boolean_in_clause(
+ sqla_col, [0, 1]
+ )
+ assert result is not None
+
+ # Test with string values
+ result = DatabricksNativeEngineSpec.handle_boolean_in_clause(
+ sqla_col, ["false", "true"]
+ )
+ assert result is not None
+
+ # Test with empty list (should return false condition)
+ from sqlalchemy import false
Review Comment:
The import statement for sqlalchemy.false at line 101 should be moved to the
top of the file with the other imports from sqlalchemy at line 19. Importing
modules in the middle of functions reduces code readability and goes against
PEP 8 style guidelines which recommend placing all imports at the top of the
file.
##########
superset/db_engine_specs/databricks.py:
##########
@@ -242,6 +242,54 @@ def convert_dttm(
def epoch_to_dttm(cls) -> str:
return HiveEngineSpec.epoch_to_dttm()
+ @classmethod
+ def handle_boolean_in_clause(
+ cls, sqla_col: Any, values: list[Any]
+ ) -> Any:
+ """
+ Handle boolean values in IN clauses for Databricks.
+
+ Databricks requires boolean literals (True/False) in IN clauses,
+ not integers (0/1). This method uses OR conditions with equality
+ checks to ensure boolean literals are used instead of integers.
+
+ :param sqla_col: SQLAlchemy column element
+ :param values: List of values for the IN clause
+ :return: SQLAlchemy expression for the boolean IN clause
+ """
+ from sqlalchemy import or_
+
+ # Convert values to proper boolean types
+ boolean_values = []
+ for val in values:
+ if val is None:
+ continue
+ # Ensure boolean values are properly typed
+ if isinstance(val, bool):
+ boolean_values.append(val)
+ elif isinstance(val, (int, float)):
+ # Convert 0/1 to False/True
+ boolean_values.append(bool(val))
+ elif isinstance(val, str):
+ # Handle string representations
+ if val.lower() in ("true", "1"):
+ boolean_values.append(True)
+ elif val.lower() in ("false", "0"):
+ boolean_values.append(False)
+ else:
+ # For other types, try to convert to boolean
+ boolean_values.append(bool(val))
+
+ if not boolean_values:
+ # If all values are None, return a condition that's always false
+ from sqlalchemy import false
+ return false()
+
+ # Use OR conditions with equality checks for boolean values
+ # This ensures boolean literals (True/False) are used instead of
integers
+ conditions = [sqla_col == val for val in boolean_values]
Review Comment:
The boolean_values list can contain duplicate entries when multiple input
values map to the same boolean. For example, [0, False, "false", "0"] would all
convert to False, creating duplicate conditions in the OR clause. Consider
using a set to deduplicate the boolean values before creating conditions. This
would generate more efficient SQL and avoid redundant equality checks.
```suggestion
# Deduplicate boolean values to avoid redundant equality checks
conditions = [sqla_col == val for val in set(boolean_values)]
```
##########
superset/db_engine_specs/databricks.py:
##########
@@ -242,6 +242,54 @@ def convert_dttm(
def epoch_to_dttm(cls) -> str:
return HiveEngineSpec.epoch_to_dttm()
+ @classmethod
+ def handle_boolean_in_clause(
+ cls, sqla_col: Any, values: list[Any]
+ ) -> Any:
+ """
+ Handle boolean values in IN clauses for Databricks.
+
+ Databricks requires boolean literals (True/False) in IN clauses,
+ not integers (0/1). This method uses OR conditions with equality
+ checks to ensure boolean literals are used instead of integers.
+
+ :param sqla_col: SQLAlchemy column element
+ :param values: List of values for the IN clause
+ :return: SQLAlchemy expression for the boolean IN clause
+ """
+ from sqlalchemy import or_
+
+ # Convert values to proper boolean types
+ boolean_values = []
+ for val in values:
+ if val is None:
+ continue
+ # Ensure boolean values are properly typed
+ if isinstance(val, bool):
+ boolean_values.append(val)
+ elif isinstance(val, (int, float)):
+ # Convert 0/1 to False/True
+ boolean_values.append(bool(val))
+ elif isinstance(val, str):
+ # Handle string representations
+ if val.lower() in ("true", "1"):
+ boolean_values.append(True)
+ elif val.lower() in ("false", "0"):
+ boolean_values.append(False)
Review Comment:
The handling of string values that don't match the expected patterns is
inconsistent. If a string value like "yes", "no", "maybe", or any other
non-boolean string is passed, it silently falls through without being converted
to a boolean. This could lead to unexpected behavior where invalid string
values are simply ignored. Consider explicitly handling unrecognized string
values, either by skipping them with a log warning or by raising a validation
error to provide clearer feedback.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]