Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
findepi commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005577033
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -215,6 +230,69 @@ fn is_supported_dictionary_type(data_type: &DataType) ->
bool {
DataType::Dictionary(_, inner) if is_supported_type(inner))
}
+/// Try to move a cast from a column to the other side of a `=` / `!=` operator
+///
+/// Specifically, rewrites
+/// ```sql
+/// cast(col)
+/// ```
+///
+/// To
+///
+/// ```sql
+/// col cast()
+/// col
+/// ```
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types
+// like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type,
&opts).ok()?;
Review Comment:
```suggestion
let cast = lit_value.cast_to(target_type).ok()?;
```
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
let (left_type, right_type) = BinaryTypeCoercer::new(
&left.get_type(left_schema)?,
&op,
&right.get_type(right_schema)?,
)
.get_input_types()?;
+
Ok((
left.cast_to(&left_type, left_schema)?,
right.cast_to(&right_type, right_schema)?,
))
}
}
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types
like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type,
&opts).ok()?;
Review Comment:
```suggestion
let cast = lit_value.cast_to(target_type).ok()?;
```
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
Review Comment:
We swapped left and right, but did nothing to `op`. This is correct only
when op is symmetric (eg `=`, `<>` but not `>` or `<`)
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -215,6 +230,69 @@ fn is_supported_dictionary_type(data_type: &DataType) ->
bool {
DataType::Dictionary(_, inner) if i
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
findepi commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005680477
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
let (left_type, right_type) = BinaryTypeCoercer::new(
&left.get_type(left_schema)?,
&op,
&right.get_type(right_schema)?,
)
.get_input_types()?;
+
Ok((
left.cast_to(&left_type, left_schema)?,
right.cast_to(&right_type, right_schema)?,
))
}
}
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types
like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type,
&opts).ok()?;
+ScalarValue::try_from_array(&casted, 0)
Review Comment:
IMO it should be enough to add logic in the other file.
What coercions get applied is the SQL dialect-specific, query
planner-dependent.
What optimizations get applied -- is the common optimizer brain all
datafusion users end up using.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2011082531
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
let (left_type, right_type) = BinaryTypeCoercer::new(
&left.get_type(left_schema)?,
&op,
&right.get_type(right_schema)?,
)
.get_input_types()?;
+
Ok((
left.cast_to(&left_type, left_schema)?,
right.cast_to(&right_type, right_schema)?,
))
}
}
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types
like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type,
&opts).ok()?;
+ScalarValue::try_from_array(&casted, 0)
Review Comment:
So you mean the "prevent cast" part should be removed, is that correct?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb merged PR #15110: URL: https://github.com/apache/datafusion/pull/15110 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2759304080 :rockeet -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2757628083 @alamb @findepi Thanks for your reviews and helps! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2753032125 > Perhaps you can revert the changes to type_coercsion.rs in this PR so we can get it merged and then make a follow on PR to address #15161 ? OK! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005677168
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
let (left_type, right_type) = BinaryTypeCoercer::new(
&left.get_type(left_schema)?,
&op,
&right.get_type(right_schema)?,
)
.get_input_types()?;
+
Ok((
left.cast_to(&left_type, left_schema)?,
right.cast_to(&right_type, right_schema)?,
))
}
}
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types
like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type,
&opts).ok()?;
+ScalarValue::try_from_array(&casted, 0)
Review Comment:
@findepi thank you for the review. This file is focused on avoiding casts
being applied to the columns, and I was trying to emulate the behavior of
duckdb and postgres, where they always cast the literal and even throw an error
if the literal cannot be casted. So I was wondering whether we should even take
the operator into account here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005666997
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
Review Comment:
@findepi thank you for the review. This file is for avoiding the cast to be
applied to the column, and I was trying to emulate the behavior of duckdb and
postgres. So I was wondering should we even consider the operator here, as
duckdb and postgres both cast the literal anyways, even throwing error if the
literal can not be casted.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005666997
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
Review Comment:
@findepi thank you for the review. This file is for avoiding the cast to be
applied to the column, and I was trying to emulate the behavior of duckdb and
postgres. So I was wondering should we even consider the operator here, as
duckdb and postgres both cast the literal anyways, even throwing error if the
literal can not be casted.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2727773494 > Does this make sense @alan910127 ? So we'll keep both the coercion and cast unwrapping optimizations in this PR, is that correct? I'm unsure about "support other types," do you mean in the unwrapping step? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1997943659
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
let (left_type, right_type) = BinaryTypeCoercer::new(
&left.get_type(left_schema)?,
&op,
&right.get_type(right_schema)?,
)
.get_input_types()?;
+
Ok((
left.cast_to(&left_type, left_schema)?,
right.cast_to(&right_type, right_schema)?,
))
}
}
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
Review Comment:
this seems duplicated with the function in simplify_expressions, should I
extract them into a utility function?
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
let (left_type, right_type) = BinaryTypeCoercer::new(
&left.get_type(left_schema)?,
&op,
&right.get_type(right_schema)?,
)
.get_input_types()?;
+
Ok((
left.cast_to(&left_type, left_schema)?,
right.cast_to(&right_type, right_schema)?,
))
}
}
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
Review Comment:
although I was wondering If we should care about the operator, since both
postgres and duckdb cast the literal regardless of the operator (they even emit
an error when the literal cannot be casted)
we do not perform the round-trip casting verification here is also a concern
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1997945198
##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
right: Expr,
right_schema: &DFSchema,
) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op,
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
let (left_type, right_type) = BinaryTypeCoercer::new(
&left.get_type(left_schema)?,
&op,
&right.get_type(right_schema)?,
)
.get_input_types()?;
+
Ok((
left.cast_to(&left_type, left_schema)?,
right.cast_to(&right_type, right_schema)?,
))
}
}
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
Review Comment:
tho I was wondering If we should care about the operator, since both
postgres and duckdb cast the literal regardless of the operator (they even emit
an error when the literal cannot be casted)
we do not perform the round-trip casting verification here is also a concern
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2725533031 > Actually I'm quite curious is string literal really an issue? If we want string, we can have query with quote `select * from t1 where column1 < '10';`, while if we want numeric, we can have query without quote `select * from t1 where column1 < 10;`. In practice, when is it beneficial to convert string literal into numeric type? π€ I think the core usecase is the one given on https://github.com/apache/datafusion/issues/15035 Specifcally when the user is comparing a numeric column to a string literal (date = '202401'). The `date` column and `'202401'` need to be converted to a common type, and converting the string to numeric is faster (and can be used to prune parquet predicates, for example) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1994246538
##
datafusion/core/tests/expr_api/mod.rs:
##
@@ -330,12 +330,12 @@ async fn test_create_physical_expr_coercion() {
create_expr_test(lit(1i32).eq(col("id")), "CAST(1 AS Utf8) = id@0");
// compare int col to string literal `i = '202410'`
// Note this casts the column (not the field)
-create_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410");
-create_expr_test(lit("202410").eq(col("i")), "202410 = CAST(i@1 AS Utf8)");
+create_expr_test(col("i").eq(lit("202410")), "i@1 = 202410");
Review Comment:
π
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
jayzhan211 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2721231187 Actually I'm quite curious is string literal really an issue? If we want string, we can have query with quote `select * from t1 where column1 < '10';`, while if we want numeric, we can have query without quote `select * from t1 where column1 < 10;`. In practice, when is it beneficial to convert string literal into numeric type? π€ -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
jayzhan211 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2721079153 I try a bit and I think there are still some works to be done in how we parse the sql. https://github.com/apache/datafusion/pull/15202. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
jayzhan211 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2716395289
> I don't believe manipulating casts after-the-fact is the right approach.
> Given the query select * from t1 where column1 < '10'; where column1 is an
Int32, the literal '10' should be treated as unknown, and the preference should
be to cast that. The generated predicate should then be column1 < cast('10' as
int), and whatever const folding happens will then take care of parsing '10' as
an int during planning.
> Without a proper "unknown" type, I think it should be sufficient to rank
implicitly casting to utf8 last compared to any other implicit cast.
Maybe we can figure out if it is possible to parse to the idea type for
scalar, so '123' can be casted to i64 not str and '1.2' can be casted to float
not str.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987052567
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);
Review Comment:
I think we should probably remove `dbg!` before merge
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2711540590 @findepi I wonder if you have some time to double check the correctness of this optimization (distributing a cast) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2716046223 > I don't think the source of the cast matters (for this particular optimization) Ah, yes. Since the string -> int -> string is checked and we're only checking equality, then they'll be the same. But I' m not sure if this unwrapping would happen after the string literal coercion. > That is my understanding and also what I think should happen Got it, should I revert this PR to the unwrap cast optimization or directly combine both the optimization and the literal coercion in this PR? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2715747690
> I don't believe manipulating casts after-the-fact is the right approach.
>
> Given the query select * from t1 where column1 < '10'; where column1 is an
Int32, the literal '10' should be treated as unknown, and the preference should
be to cast that. The generated predicate should then be column1 < cast('10' as
int), and whatever const folding happens will then take care of parsing '10' as
an int during planning.
I agree @scsmithr -- For the case in
https://github.com/apache/datafusion/issues/15161 I agree that changing the
coercion rules as you suggest is a better plan than trying to fix it up
afterwards
However, I think the optimization in this PR is valuable for cases when the
user has ended up with the casts some other way and DataFusion is now
optimizing the expression
> @findepi sorry I didn't notice your comment and I just pushed a new
version with the unwrapping logic deleted.
>
> So you think the two optimizations **should** coexist?
That is my understanding and also what I think should happen
> > And it looks feasible -- the plan in #15110 (comment) IMO should work
>
> But we have no way to tell if the cast is set by the user or not in the
unwrap cast function. Doesn't this matter?
I don't think the source of the cast matters (for this particular
optimization)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987850535
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {
Review Comment:
Since `ScalarValue` is already dynamic, I don't think we need a macro to run
on each type
I played around a little locally and this is what I came up with. What do
you think?
```rust
/// Try to move a cast from a literal to the other side of a `=` / `!=`
operator
///
/// Specifically, rewrites
/// ```sql
/// cast(col)
/// ```
///
/// To
///
/// ```sql
/// col cast()
/// col
/// ```
fn cast_literal_to_type_with_op(
lit_value: &ScalarValue,
target_type: &DataType,
op: Operator,
) -> Option {
let (
Operator::Eq | Operator::NotEq,
ScalarValue::Utf8(Some(_))
| ScalarValue::Utf8View(Some(_))
| ScalarValue::LargeUtf8(Some(_)),
) = (op, lit_value)
else {
return None;
};
// Only try for integer types (TODO can we do this for other types
// like timestamps)?
use DataType::*;
if matches!(
target_type,
Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
) {
let opts = arrow::compute::CastOptions {
safe: false,
format_options: Default::default(),
};
let array = ScalarValue::to_array(lit_value).ok()?;
let casted =
arrow::compute::cast_with_options(&array, target_type,
&opts).ok()?;
ScalarValue::try_from_array(&casted, 0).ok()
} else {
None
}
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2715045737 @findepi sorry I didn't notice your comment and I just pushed a new version with the unwrapping logic deleted. So you think the two optimizations **should** coexist? > And it looks feasible -- the plan in #15110 (comment) IMO should work But we have no way to tell if the cast is set by the user or not in the unwrap cast function. Doesn't this matter? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
scsmithr commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2715007343
> > I'm not sure if it's only possible to handle it when the cast is created
(type coercion?)
>
> That may well be the right thing to do π€
>
> It is funny that @scsmithr filed a ticket this morning with a similar
symptom:
>
> * [Incorrect cast of integer columns to utf8 when comparing with utf8
constant #15161](https://github.com/apache/datafusion/issues/15161)
>
>
> π€
For context, I was running some implicit cast queries on a few different
systems to compare outputs, and hit the aforementioned issue with datafusion.
"Unexpected" might be a better word than "incorrect", but datafusion is the
outlier here.
My take on this:
I don't believe manipulating casts after-the-fact is the right approach.
Given the query `select * from t1 where column1 < '10';` where `column1 is
an Int32, the literal `'10'` should be treated as unknown, and the preference
should be to cast that. The generated predicate should then be `column1 <
cast('10' as int)`, and whatever const folding happens will then take care of
parsing `'10'` as an int during planning.
Without a proper "unknown" type, I think it should be sufficient to rank
implicitly casting to utf8 last compared to any other implicit cast.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
findepi commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2715005910 This cast unwrapping is valuable orthogonally to what coercions get applied by the frontend. And it looks feasible -- the plan in https://github.com/apache/datafusion/pull/15110#issuecomment-2714474220 IMO should work (for integer and string type pairs). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714782607 > I'm not sure if it's only possible to handle it when the cast is created (type coercion?) That may well be the right thing to do π€ It is funny that @scsmithr filed a ticket this morning with a similar symptom: - https://github.com/apache/datafusion/issues/15161 π€ -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714725299 > As in only do the rewrite if the sequence > > - cast to int > - cast (back) to string > > results in the exact same string as went int @alamb I found that this approach is different from how postgres and duckdb handle this situation: - `SELECT * FROM t WHERE a = '0123'` - they will cast the '0123' to an integer and it will match, so 1 row is returned. - `SELECT * FROM t WHERE cast(a AS string) = '0123'` - they will not cast the '0123', and this is a string comparison, so no rows are returned. postgres ```sql db=# create table t as select 123 a; SELECT 1 db=# select * from t where a = '0123'; a - 123 (1 row) db=# select * from t where cast(a AS text) = '0123'; a --- (0 rows) ``` duckdb ```sql D create table t as select 123 a; D select * from t where a = '0123'; βββββ β a β β int32 β βββββ€ β 123 β βββββ D select * from t where cast(a AS string) = '0123'; ββ β aβ β int32 β ββ€ β 0 rows β ββ ``` I'm not sure if it's only possible to handle it when the `cast` is created (type coercion?) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
findepi commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1989510726
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {
+($value:ident, $ty:expr) => {{
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array($value).ok()?;
+let casted = arrow::compute::cast_with_options(&array, &$ty,
&opts).ok()?;
+let scalar = ScalarValue::try_from_array(&casted, 0).ok()?;
+Some(scalar)
+}};
+}
+
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => match target_type {
+DataType::Int8 => cast_or_else_return_none!(lit_value,
DataType::Int8),
+DataType::Int16 => cast_or_else_return_none!(lit_value,
DataType::Int16),
+DataType::Int32 => cast_or_else_return_none!(lit_value,
DataType::Int32),
+DataType::Int64 => cast_or_else_return_none!(lit_value,
DataType::Int64),
+DataType::UInt8 => cast_or_else_return_none!(lit_value,
DataType::UInt8),
+DataType::UInt16 => cast_or_else_return_none!(lit_value,
DataType::UInt16),
+DataType::UInt32 => cast_or_else_return_none!(lit_value,
DataType::UInt32),
+DataType::UInt64 => cast_or_else_return_none!(lit_value,
DataType::UInt64),
+_ => None,
+},
+_ => None,
Review Comment:
As it's written, and as the function is named, this looks like generic
approach that will work many different source/target type pairs, we just need
to add logic.
However, the exact checks necessary for soundness will likely differ.
For casts between exact numbers (ints and decimals) and string types, the
https://github.com/apache/datafusion/pull/15110#issuecomment-2714474220 looks
like a sound plan.
Plus, we could also simplify impossible target literals like this
- `cast(an_int as string) = 'abc'` or `cast(an_int as string) = '007'` to
`if(an_int is null, null, false)`
For casts between numbers and numbers, we likely need to recognize max
addressible values, and cast impreciseness, especially when we want to handle
`<`, `<=`, `>`, `>=` operators.
https://github.com/trinodb/trino/blob/a870656f406b0f76e97c740659a58554d472994d/core/trino-main/src/main/java/io/trino/sql/planner/iterative/rule/UnwrapCastInComparison.java#L199-L354
might serve as a good reference. Note how it handles NaN, max values, lossy
round-tripping, sometimes converting `<` to `<=` as necessary.
For e.g cast from timestamp to date, the logic will likely be different again
https://github.com/trinodb/trino/blob/a870656f406b0f76e97c740659a58554d472994d/core/trino-main/src/main/java/io/trino/sql/planner/iterative/rule/UnwrapCastInComparison.java#L357-L389.
With all this in mind, I'd suggest structuring this code so it's clear it
addresses only the exact number and string case, by checking expr and literal
types and delegating to a function that has "int" and "utf8" or "string" in the
name.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
findepi commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714661450 > Maybe we can add a check that the string would be the same when we round tripped it > > As in only do the rewrite if the sequence > > * cast to int > * cast (back) to string > > results in the exact same string as went int neat -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714477098 (thank you for checking this @findepi -- π -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714474220 > @findepi so the test cases in `unwrap_cast` are not correct, and they're showing that I'm over-unwrapping. Am I understanding this correctly? I think @findepi is saying that the mapping proposed in the PR is more subtle casting an integer `123` to a string will result in `'123'`. It would not match the string `'0123'` However, as written this PR will also match `'0123'` as we will cast `0123` to an integer which results in `123` So in other words, we should not do the conversion from `'0123'` (string) to `123` (int) Maybe we can add a check that the string would be the same when we round tripped it As in only do the rewrite if the sequence * cast to int * cast (back) to string results in the exact same string as went int -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on PR #15110: URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714179107 @findepi so the test cases in `unwrap_cast` are not correct, and they're showing that I'm over-unwrapping. Am I understanding this correctly? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] - To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
findepi commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2713266973
> @findepi I wonder if you have some time to double check the correctness of
this optimization (distributing a cast)
unwrapping cast is not an easy feat. The current implementation isn't
exactly correct, as it conflates different strings which cast back to the same
number:
```sql
CREATE OR REPLACE TABLE t AS SELECT arrow_cast('123', 'Int64') a;
-- correctly finds the row
SELECT * FROM t WHERE cast(a AS string) = '123';
-- incorrectly also finds the row
SELECT * FROM t WHERE cast(a AS string) = '0123';
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
findepi commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1988740063
##
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##
@@ -1758,7 +1758,7 @@ impl TreeNodeRewriter for Simplifier<'_,
S> {
// try_cast/cast(expr as data_type) op literal
Expr::BinaryExpr(BinaryExpr { left, op, right })
if
is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary(
-info, &left, &right,
+info, &left, &right, op,
Review Comment:
add `op` between left and right args -- this helps understand the role of
the two `&Expr` args to the function
(or maybe even pass the whole `&BinaryExpr` into the function)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1988197176
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {
Review Comment:
> if they treat this behavior as an "optimization" or if it's simply an
expected behavior to cast the literal to the column's type
It looks like my guess might be correctβthey always cast the literal to the
column's type, regardless of the operator.
```sql
D create table t as values (1), (2), (3);
D select * from t where col0 < '10';
βββββ
β col0 β
β int32 β
βββββ€
β 1 β
β 2 β
β 3 β
βββββ
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1988192389
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {
Review Comment:
> instead of doing another round of PR reviews I figured I would just push
the commits directly to save time.
Thank you! Iβm just curiousβis storing a parquet file in slt tests common?
The tests Iβve worked with so far use tables created in the same test file.
> Strangely they don't seem to care about errors:
Interesting! This makes me wonder if they treat this behavior as an
"optimization" or if it's simply an expected behavior to cast the literal to
the column's type.
> I played around a little locally and this is what I came up with. What do
you think?
Oh, this looks much better! However, Iβd prefer leaving the outer match as
is, maybe thereβs potential for another optimization in the future?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1988192389
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {
Review Comment:
> instead of doing another round of PR reviews I figured I would just push
the commits directly to save time.
Thank you! Iβm just curiousβis storing a parquet file in slt tests common?
The tests Iβve worked with so far use tables created in the same test file.
> Strangely they don't seem to care about errors:
Interesting! This makes me wonder if they treat this behavior as an
"optimization" or if it's simply an expected behavior that both sides of a
comparison operator should be of the same type.
> I played around a little locally and this is what I came up with. What do
you think?
Oh, this looks much better! However, Iβd prefer leaving the outer match as
is, maybe thereβs potential for another optimization in the future?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987355491
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(ref str))
+| ScalarValue::Utf8View(Some(ref str))
+| ScalarValue::LargeUtf8(Some(ref str)),
+) => match target_type {
+DataType::Int8 => str.parse::().ok().map(ScalarValue::from),
Review Comment:
Also 2 sqllogictest test cases are added (optimized/number not in range)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987353874
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(ref str))
+| ScalarValue::Utf8View(Some(ref str))
+| ScalarValue::LargeUtf8(Some(ref str)),
+) => match target_type {
+DataType::Int8 => str.parse::().ok().map(ScalarValue::from),
Review Comment:
I tried to use your suggested casting implementation, could you help me
check if I'm implementing it correctly?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987350998
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);
Review Comment:
thanks -- I forgot to remove it :sweat_smile:
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]
alamb commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987055078
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -468,6 +510,10 @@ mod tests {
// the 999 is not within the range of MAX(int32) and
MIN(int32), we don't cast the lit(999) to int32 type
let expr_lt = cast(col("c1"), DataType::Int64).lt(lit(999i64));
assert_eq!(optimize_test(expr_lt.clone(), &schema), expr_lt);
+
+// cast(c1, UTF8) < 123, only eq/not_eq should be optimized
Review Comment:
π
##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
true
}
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(ref str))
+| ScalarValue::Utf8View(Some(ref str))
+| ScalarValue::LargeUtf8(Some(ref str)),
+) => match target_type {
+DataType::Int8 => str.parse::().ok().map(ScalarValue::from),
Review Comment:
by calling `ok()` here I think this converts a predicate like `cast(col,
utf8) = 'foo'` into `col = 'null'`
Which is not quite the same thing.
Also, `str.parse()` likely has subltely different semantics than the arrow
cast kernels.
This, can can you please make this function:
1. Return `None` if the conversion fails (following the convention of other
functions in this module like `try_cast_dictionary`)
2. Use the cast kernel to parse the values?
The `cast` kernel can be called by:
1. Converting ScalarValue to an array with
[`ScalarValue::to_array`](https://docs.rs/datafusion/latest/datafusion/common/enum.ScalarValue.html#method.to_array)
2. Call `cast`: https://docs.rs/arrow/latest/arrow/array/cast/index.html
3. Calling
[`ScalarValue::try_from_array`](https://docs.rs/datafusion/latest/datafusion/common/enum.ScalarValue.html#method.try_from_array)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
