Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-04-05 Thread via GitHub


findepi commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005577033


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -215,6 +230,69 @@ fn is_supported_dictionary_type(data_type: &DataType) -> 
bool {
 DataType::Dictionary(_, inner) if is_supported_type(inner))
 }
 
+/// Try to move a cast from a column to the other side of a `=` / `!=` operator
+///
+/// Specifically, rewrites
+/// ```sql
+/// cast(col)  
+/// ```
+///
+/// To
+///
+/// ```sql
+/// col  cast()
+/// col  
+/// ```
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types
+// like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type, 
&opts).ok()?;

Review Comment:
   ```suggestion
   let cast = lit_value.cast_to(target_type).ok()?;
   ```



##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
 let (left_type, right_type) = BinaryTypeCoercer::new(
 &left.get_type(left_schema)?,
 &op,
 &right.get_type(right_schema)?,
 )
 .get_input_types()?;
+
 Ok((
 left.cast_to(&left_type, left_schema)?,
 right.cast_to(&right_type, right_schema)?,
 ))
 }
 }
 
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types 
like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type, 
&opts).ok()?;

Review Comment:
   ```suggestion
   let cast = lit_value.cast_to(target_type).ok()?;
   ```



##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)

Review Comment:
   We swapped left and right, but did nothing to `op`. This is correct only 
when op is symmetric (eg `=`, `<>` but not `>` or `<`)



##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -215,6 +230,69 @@ fn is_supported_dictionary_type(data_type: &DataType) -> 
bool {
 DataType::Dictionary(_, inner) if i

Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-04-05 Thread via GitHub


findepi commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005680477


##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
 let (left_type, right_type) = BinaryTypeCoercer::new(
 &left.get_type(left_schema)?,
 &op,
 &right.get_type(right_schema)?,
 )
 .get_input_types()?;
+
 Ok((
 left.cast_to(&left_type, left_schema)?,
 right.cast_to(&right_type, right_schema)?,
 ))
 }
 }
 
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types 
like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type, 
&opts).ok()?;
+ScalarValue::try_from_array(&casted, 0)

Review Comment:
   IMO it should be enough to add logic in the other file.
   What coercions get applied is the SQL dialect-specific, query 
planner-dependent.
   What optimizations get applied -- is the common optimizer brain all 
datafusion users end up using.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-27 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2011082531


##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
 let (left_type, right_type) = BinaryTypeCoercer::new(
 &left.get_type(left_schema)?,
 &op,
 &right.get_type(right_schema)?,
 )
 .get_input_types()?;
+
 Ok((
 left.cast_to(&left_type, left_schema)?,
 right.cast_to(&right_type, right_schema)?,
 ))
 }
 }
 
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types 
like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type, 
&opts).ok()?;
+ScalarValue::try_from_array(&casted, 0)

Review Comment:
   So you mean the "prevent cast" part should be removed, is that correct?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-27 Thread via GitHub


alamb merged PR #15110:
URL: https://github.com/apache/datafusion/pull/15110


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-27 Thread via GitHub


alamb commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2759304080

   :rockeet
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-27 Thread via GitHub


alan910127 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2757628083

   @alamb @findepi Thanks for your reviews and helps!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-25 Thread via GitHub


alan910127 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2753032125

   > Perhaps you can revert the changes to type_coercsion.rs in this PR so we 
can get it merged and then make a follow on PR to address #15161 ?
   
   OK!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-20 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005677168


##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
 let (left_type, right_type) = BinaryTypeCoercer::new(
 &left.get_type(left_schema)?,
 &op,
 &right.get_type(right_schema)?,
 )
 .get_input_types()?;
+
 Ok((
 left.cast_to(&left_type, left_schema)?,
 right.cast_to(&right_type, right_schema)?,
 ))
 }
 }
 
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => {
+// Only try for integer types (TODO can we do this for other types 
like timestamps)?
+use DataType::*;
+if matches!(
+target_type,
+Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
+) {
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array(lit_value).ok()?;
+let casted =
+arrow::compute::cast_with_options(&array, target_type, 
&opts).ok()?;
+ScalarValue::try_from_array(&casted, 0)

Review Comment:
   @findepi thank you for the review. This file is focused on avoiding casts 
being applied to the columns, and I was trying to emulate the behavior of 
duckdb and postgres, where they always cast the literal and even throw an error 
if the literal cannot be casted. So I was wondering whether we should even take 
the operator into account here.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-20 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005666997


##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)

Review Comment:
   @findepi thank you for the review. This file is for avoiding the cast to be 
applied to the column, and I was trying to emulate the behavior of duckdb and 
postgres. So I was wondering should we even consider the operator here, as 
duckdb and postgres both cast the literal anyways, even throwing error if the 
literal can not be casted.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-20 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r2005666997


##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)

Review Comment:
   @findepi thank you for the review. This file is for avoiding the cast to be 
applied to the column, and I was trying to emulate the behavior of duckdb and 
postgres. So I was wondering should we even consider the operator here, as 
duckdb and postgres both cast the literal anyways, even throwing error if the 
literal can not be casted.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-17 Thread via GitHub


alan910127 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2727773494

   > Does this make sense @alan910127 ?
   
   So we'll keep both the coercion and cast unwrapping optimizations in this 
PR, is that correct? I'm unsure about "support other types," do you mean in the 
unwrapping step?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-17 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1997943659


##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
 let (left_type, right_type) = BinaryTypeCoercer::new(
 &left.get_type(left_schema)?,
 &op,
 &right.get_type(right_schema)?,
 )
 .get_input_types()?;
+
 Ok((
 left.cast_to(&left_type, left_schema)?,
 right.cast_to(&right_type, right_schema)?,
 ))
 }
 }
 
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),

Review Comment:
   this seems duplicated with the function in simplify_expressions, should I 
extract them into a utility function?



##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
 let (left_type, right_type) = BinaryTypeCoercer::new(
 &left.get_type(left_schema)?,
 &op,
 &right.get_type(right_schema)?,
 )
 .get_input_types()?;
+
 Ok((
 left.cast_to(&left_type, left_schema)?,
 right.cast_to(&right_type, right_schema)?,
 ))
 }
 }
 
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),

Review Comment:
   although I was wondering If we should care about the operator, since both 
postgres and duckdb cast the literal regardless of the operator (they even emit 
an error when the literal cannot be casted)
   
   we do not perform the round-trip casting verification here is also a concern



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-16 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1997945198


##
datafusion/optimizer/src/analyzer/type_coercion.rs:
##
@@ -290,19 +290,72 @@ impl<'a> TypeCoercionRewriter<'a> {
 right: Expr,
 right_schema: &DFSchema,
 ) -> Result<(Expr, Expr)> {
+if let Expr::Literal(ref lit_value) = left {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&right.get_type(right_schema)?)
+{
+return Ok((casted, right));
+};
+}
+
+if let Expr::Literal(ref lit_value) = right {
+if let Some(casted) =
+try_cast_literal_to_type(lit_value, op, 
&left.get_type(left_schema)?)
+{
+return Ok((left, casted));
+};
+}
+
 let (left_type, right_type) = BinaryTypeCoercer::new(
 &left.get_type(left_schema)?,
 &op,
 &right.get_type(right_schema)?,
 )
 .get_input_types()?;
+
 Ok((
 left.cast_to(&left_type, left_schema)?,
 right.cast_to(&right_type, right_schema)?,
 ))
 }
 }
 
+fn try_cast_literal_to_type(
+lit_value: &ScalarValue,
+op: Operator,
+target_type: &DataType,
+) -> Option {
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),

Review Comment:
   tho I was wondering If we should care about the operator, since both 
postgres and duckdb cast the literal regardless of the operator (they even emit 
an error when the literal cannot be casted)
   
   we do not perform the round-trip casting verification here is also a concern



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-14 Thread via GitHub


alamb commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2725533031

   > Actually I'm quite curious is string literal really an issue? If we want 
string, we can have query with quote `select * from t1 where column1 < '10';`, 
while if we want numeric, we can have query without quote `select * from t1 
where column1 < 10;`. In practice, when is it beneficial to convert string 
literal into numeric type? πŸ€”
   
   I think the core usecase is the one given on 
https://github.com/apache/datafusion/issues/15035
   
   Specifcally when the user is comparing a numeric column to a string literal 
(date = '202401'). 
   
   The `date` column and `'202401'` need to be converted to a common type, and 
converting the string to numeric is faster (and can be used to prune parquet 
predicates, for example)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-13 Thread via GitHub


alamb commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1994246538


##
datafusion/core/tests/expr_api/mod.rs:
##
@@ -330,12 +330,12 @@ async fn test_create_physical_expr_coercion() {
 create_expr_test(lit(1i32).eq(col("id")), "CAST(1 AS Utf8) = id@0");
 // compare int col to string literal `i = '202410'`
 // Note this casts the column (not the field)
-create_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410");
-create_expr_test(lit("202410").eq(col("i")), "202410 = CAST(i@1 AS Utf8)");
+create_expr_test(col("i").eq(lit("202410")), "i@1 = 202410");

Review Comment:
   πŸŽ‰ 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-13 Thread via GitHub


jayzhan211 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2721231187

   Actually I'm quite curious is string literal really an issue? If we want 
string, we can have query with quote `select * from t1 where column1 < '10';`, 
while if we want numeric, we can have query without quote `select * from t1 
where column1 < 10;`. In practice, when is it beneficial to convert string 
literal into numeric type? πŸ€” 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-13 Thread via GitHub


jayzhan211 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2721079153

   I try a bit and I think there are still some works to be done in how we 
parse the sql. https://github.com/apache/datafusion/pull/15202.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


jayzhan211 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2716395289

   > I don't believe manipulating casts after-the-fact is the right approach.
   
   > Given the query select * from t1 where column1 < '10'; where column1 is an 
Int32, the literal '10' should be treated as unknown, and the preference should 
be to cast that. The generated predicate should then be column1 < cast('10' as 
int), and whatever const folding happens will then take care of parsing '10' as 
an int during planning.
   
   > Without a proper "unknown" type, I think it should be sufficient to rank 
implicitly casting to utf8 last compared to any other implicit cast.
   
   Maybe we can figure out if it is possible to parse to the idea type for 
scalar, so '123' can be casted to i64 not str and '1.2' can be casted to float 
not str.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alamb commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987052567


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);

Review Comment:
   I think we should probably remove `dbg!`  before merge



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alamb commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2711540590

   @findepi  I wonder if you have some time to double check the correctness of 
this optimization (distributing a cast)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alan910127 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2716046223

   > I don't think the source of the cast matters (for this particular 
optimization)
   
   Ah, yes. Since the string -> int -> string is checked and we're only 
checking equality, then they'll be the same. But I' m not sure if this 
unwrapping would happen after the string literal coercion.
   
   > That is my understanding and also what I think should happen
   
   Got it, should I revert this PR to the unwrap cast optimization or directly 
combine both the optimization and the literal coercion in this PR?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alamb commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2715747690

   > I don't believe manipulating casts after-the-fact is the right approach.
   > 
   > Given the query select * from t1 where column1 < '10'; where column1 is an 
Int32, the literal '10' should be treated as unknown, and the preference should 
be to cast that. The generated predicate should then be column1 < cast('10' as 
int), and whatever const folding happens will then take care of parsing '10' as 
an int during planning.
   
   I agree @scsmithr  -- For the case in 
https://github.com/apache/datafusion/issues/15161  I agree that changing the 
coercion rules as you suggest is a better plan than trying to fix it up 
afterwards
   
   However, I think the optimization in this PR is valuable for cases when the 
user has ended up with the casts some other way and DataFusion is now 
optimizing the expression
   
   > @findepi sorry I didn't notice your comment and I just pushed a new 
version with the unwrapping logic deleted.
   > 
   > So you think the two optimizations **should** coexist?
   
   That is my understanding and also what I think should happen
   
   
   > > And it looks feasible -- the plan in #15110 (comment) IMO should work
   > 
   > But we have no way to tell if the cast is set by the user or not in the 
unwrap cast function. Doesn't this matter?
   
   I don't think the source of the cast matters (for this particular 
optimization)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alamb commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987850535


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {

Review Comment:
   Since `ScalarValue` is already dynamic, I don't think we need a macro to run 
on each type
   
   I played around a little locally and this is what I came up with. What do 
you think?
   
   ```rust
   /// Try to move a cast from a literal to the other side of a `=` / `!=` 
operator
   ///
   /// Specifically, rewrites
   /// ```sql
   /// cast(col)  
   /// ```
   ///
   /// To
   ///
   /// ```sql
   /// col  cast()
   /// col  
   /// ```
   fn cast_literal_to_type_with_op(
   lit_value: &ScalarValue,
   target_type: &DataType,
   op: Operator,
   ) -> Option {
   let (
   Operator::Eq | Operator::NotEq,
   ScalarValue::Utf8(Some(_))
   | ScalarValue::Utf8View(Some(_))
   | ScalarValue::LargeUtf8(Some(_)),
   ) = (op, lit_value)
   else {
   return None;
   };
   
   // Only try for integer types (TODO can we do this for other types
   // like timestamps)?
   use DataType::*;
   if matches!(
   target_type,
   Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
   ) {
   let opts = arrow::compute::CastOptions {
   safe: false,
   format_options: Default::default(),
   };
   
   let array = ScalarValue::to_array(lit_value).ok()?;
   let casted =
   arrow::compute::cast_with_options(&array, target_type, 
&opts).ok()?;
   ScalarValue::try_from_array(&casted, 0).ok()
   } else {
   None
   }
   }
   ```
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alan910127 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2715045737

   @findepi sorry I didn't notice your comment and I just pushed a new version 
with the unwrapping logic deleted.
   
   So you think the two optimizations **should** coexist?
   
   > And it looks feasible -- the plan in #15110 (comment) IMO should work
   
   But we have no way to tell if the cast is set by the user or not in the 
unwrap cast function. Doesn't this matter?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


scsmithr commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2715007343

   > > I'm not sure if it's only possible to handle it when the cast is created 
(type coercion?)
   > 
   > That may well be the right thing to do πŸ€”
   > 
   > It is funny that @scsmithr filed a ticket this morning with a similar 
symptom:
   > 
   > * [Incorrect cast of integer columns to utf8 when comparing with utf8 
constant #15161](https://github.com/apache/datafusion/issues/15161)
   > 
   > 
   > πŸ€”
   
   For context, I was running some implicit cast queries on a few different 
systems to compare outputs, and hit the aforementioned issue with datafusion. 
"Unexpected" might be a better word than "incorrect", but datafusion is the 
outlier here.
   
   My take on this:
   
   I don't believe manipulating casts after-the-fact is the right approach. 
   
   Given the query `select * from t1 where column1 < '10';` where `column1 is 
an Int32, the literal `'10'` should be treated as unknown, and the preference 
should be to cast that. The generated predicate should then be `column1 < 
cast('10' as int)`, and whatever const folding happens will then take care of 
parsing `'10'` as an int during planning.
   
   Without a proper "unknown" type, I think it should be sufficient to rank 
implicitly casting to utf8 last compared to any other implicit cast.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


findepi commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2715005910

   This cast unwrapping is valuable orthogonally to what coercions get applied 
by the frontend.
   
   
   And it looks feasible -- the plan in 
https://github.com/apache/datafusion/pull/15110#issuecomment-2714474220 IMO 
should work (for integer and string type pairs).
   
   
   
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alamb commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714782607

   > I'm not sure if it's only possible to handle it when the cast is created 
(type coercion?)
   
   That may well be the right thing to do πŸ€” 
   
   It is funny that @scsmithr  filed a ticket this morning with a similar 
symptom:
   - https://github.com/apache/datafusion/issues/15161
   
   πŸ€” 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alan910127 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714725299

   > As in only do the rewrite if the sequence
   > 
   > - cast to int
   > - cast (back) to string
   > 
   > results in the exact same string as went int
   
   @alamb I found that this approach is different from how postgres and duckdb 
handle this situation:
   
   - `SELECT * FROM t WHERE a = '0123'` - they will cast the '0123' to an 
integer and it will match, so 1 row is returned.
   - `SELECT * FROM t WHERE cast(a AS string) = '0123'` - they will not cast 
the '0123', and this is a string comparison, so no rows are returned.
   
   
   postgres
   
   ```sql
   db=# create table t as select 123 a;
   SELECT 1
   db=# select * from t where a = '0123';
 a
   -
123
   (1 row)
   
   db=# select * from t where cast(a AS text) = '0123';
a
   ---
   (0 rows)
   
   ```
   
   
   
   duckdb
   
   ```sql
   D create table t as select 123 a;
   D select * from t where a = '0123';
   β”Œβ”€β”€β”€β”
   β”‚   a   β”‚
   β”‚ int32 β”‚
   β”œβ”€β”€β”€β”€
   β”‚  123  β”‚
   β””β”€β”€β”€β”˜
   D select * from t where cast(a AS string) = '0123';
   β”Œβ”
   β”‚   aβ”‚
   β”‚ int32  β”‚
   β”œβ”€
   β”‚ 0 rows β”‚
   β””β”˜
   ```
   
   
   I'm not sure if it's only possible to handle it when the `cast` is created 
(type coercion?)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


findepi commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1989510726


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {
+($value:ident, $ty:expr) => {{
+let opts = arrow::compute::CastOptions {
+safe: false,
+format_options: Default::default(),
+};
+let array = ScalarValue::to_array($value).ok()?;
+let casted = arrow::compute::cast_with_options(&array, &$ty, 
&opts).ok()?;
+let scalar = ScalarValue::try_from_array(&casted, 0).ok()?;
+Some(scalar)
+}};
+}
+
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(_))
+| ScalarValue::Utf8View(Some(_))
+| ScalarValue::LargeUtf8(Some(_)),
+) => match target_type {
+DataType::Int8 => cast_or_else_return_none!(lit_value, 
DataType::Int8),
+DataType::Int16 => cast_or_else_return_none!(lit_value, 
DataType::Int16),
+DataType::Int32 => cast_or_else_return_none!(lit_value, 
DataType::Int32),
+DataType::Int64 => cast_or_else_return_none!(lit_value, 
DataType::Int64),
+DataType::UInt8 => cast_or_else_return_none!(lit_value, 
DataType::UInt8),
+DataType::UInt16 => cast_or_else_return_none!(lit_value, 
DataType::UInt16),
+DataType::UInt32 => cast_or_else_return_none!(lit_value, 
DataType::UInt32),
+DataType::UInt64 => cast_or_else_return_none!(lit_value, 
DataType::UInt64),
+_ => None,
+},
+_ => None,

Review Comment:
   As it's written, and as the function is named, this looks like generic 
approach that will work many different source/target type pairs, we just need 
to add logic.
   However, the exact checks necessary for soundness will likely differ.
   
   For casts between exact numbers (ints and decimals) and string types, the 
https://github.com/apache/datafusion/pull/15110#issuecomment-2714474220 looks 
like a sound plan.
   Plus, we could also simplify impossible target literals like this
   - `cast(an_int as string) = 'abc'` or `cast(an_int as string) = '007'` to 
`if(an_int is null, null, false)`
   
   For casts between numbers and numbers, we likely need to recognize max 
addressible values, and cast impreciseness, especially when we want to handle 
`<`, `<=`, `>`, `>=` operators.
   
https://github.com/trinodb/trino/blob/a870656f406b0f76e97c740659a58554d472994d/core/trino-main/src/main/java/io/trino/sql/planner/iterative/rule/UnwrapCastInComparison.java#L199-L354
 might serve as a good reference. Note how it handles NaN, max values, lossy 
round-tripping, sometimes converting `<` to `<=` as necessary.
   
   For e.g cast from timestamp to date, the logic will likely be different again
   
https://github.com/trinodb/trino/blob/a870656f406b0f76e97c740659a58554d472994d/core/trino-main/src/main/java/io/trino/sql/planner/iterative/rule/UnwrapCastInComparison.java#L357-L389.
   
   With all this in mind, I'd suggest structuring this code so it's clear it 
addresses only the exact number and string case, by checking expr and literal 
types and delegating to a function that has "int" and "utf8" or "string" in the 
name.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


findepi commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714661450

   > Maybe we can add a check that the string would be the same when we round 
tripped it
   > 
   > As in only do the rewrite if the sequence
   > 
   > * cast to int
   > * cast (back) to string
   > 
   > results in the exact same string as went int
   
   neat


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alamb commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714477098

   (thank you for checking this @findepi  -- πŸ™ 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alamb commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714474220

   > @findepi so the test cases in `unwrap_cast` are not correct, and they're 
showing that I'm over-unwrapping. Am I understanding this correctly?
   
   I think @findepi  is saying that the mapping proposed in the PR is more 
subtle
   
   casting an integer `123` to a string will result in `'123'`.
   
   It would not match the string `'0123'`
   
   However, as written this PR will also match `'0123'` as we will cast `0123` 
to an integer which results in `123`
   
   So in other words, we should not do the conversion from `'0123'` (string) to 
`123` (int)
   
   Maybe we can add a check that the string would be the same when we round 
tripped it
   
   As in  only do the rewrite if the sequence
   * cast to int
   * cast (back) to string
   
   results in the exact same string as went int
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


alan910127 commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2714179107

   @findepi so the test cases in `unwrap_cast` are not correct, and they're 
showing that I'm over-unwrapping. Am I understanding this correctly?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


findepi commented on PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#issuecomment-2713266973

   > @findepi I wonder if you have some time to double check the correctness of 
this optimization (distributing a cast)
   
   unwrapping cast is not an easy feat. The current implementation isn't 
exactly correct, as it conflates different strings which cast back to the same 
number:
   
   ```sql
   CREATE OR REPLACE TABLE t AS SELECT arrow_cast('123', 'Int64') a;
   
   -- correctly finds the row
   SELECT * FROM t WHERE cast(a AS string) = '123';
   
   -- incorrectly also finds the row
   SELECT * FROM t WHERE cast(a AS string) = '0123';
   ```
   
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-11 Thread via GitHub


findepi commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1988740063


##
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##
@@ -1758,7 +1758,7 @@ impl TreeNodeRewriter for Simplifier<'_, 
S> {
 // try_cast/cast(expr as data_type) op literal
 Expr::BinaryExpr(BinaryExpr { left, op, right })
 if 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary(
-info, &left, &right,
+info, &left, &right, op,

Review Comment:
   add `op` between left and right args -- this helps understand the role of 
the two `&Expr` args to the function
   (or maybe even pass the whole `&BinaryExpr` into the function)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-10 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1988197176


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {

Review Comment:
   > if they treat this behavior as an "optimization" or if it's simply an 
expected behavior to cast the literal to the column's type
   
   It looks like my guess might be correctβ€”they always cast the literal to the 
column's type, regardless of the operator.
   
   ```sql
   D create table t as values (1), (2), (3);
   D select * from t where col0 < '10';
   β”Œβ”€β”€β”€β”
   β”‚ col0  β”‚
   β”‚ int32 β”‚
   β”œβ”€β”€β”€β”€
   β”‚ 1 β”‚
   β”‚ 2 β”‚
   β”‚ 3 β”‚
   β””β”€β”€β”€β”˜
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-10 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1988192389


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {

Review Comment:
   > instead of doing another round of PR reviews I figured I would just push 
the commits directly to save time.
   
   Thank you! I’m just curiousβ€”is storing a parquet file in slt tests common? 
The tests I’ve worked with so far use tables created in the same test file.
   
   > Strangely they don't seem to care about errors:
   
   Interesting! This makes me wonder if they treat this behavior as an 
"optimization" or if it's simply an expected behavior to cast the literal to 
the column's type.
   
   > I played around a little locally and this is what I came up with. What do 
you think?
   
   Oh, this looks much better! However, I’d prefer leaving the outer match as 
is, maybe there’s potential for another optimization in the future?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-10 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1988192389


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,45 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+macro_rules! cast_or_else_return_none {

Review Comment:
   > instead of doing another round of PR reviews I figured I would just push 
the commits directly to save time.
   
   Thank you! I’m just curiousβ€”is storing a parquet file in slt tests common? 
The tests I’ve worked with so far use tables created in the same test file.
   
   > Strangely they don't seem to care about errors:
   
   Interesting! This makes me wonder if they treat this behavior as an 
"optimization" or if it's simply an expected behavior that both sides of a 
comparison operator should be of the same type.
   
   > I played around a little locally and this is what I came up with. What do 
you think?
   
   Oh, this looks much better! However, I’d prefer leaving the outer match as 
is, maybe there’s potential for another optimization in the future?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-10 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987355491


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(ref str))
+| ScalarValue::Utf8View(Some(ref str))
+| ScalarValue::LargeUtf8(Some(ref str)),
+) => match target_type {
+DataType::Int8 => str.parse::().ok().map(ScalarValue::from),

Review Comment:
   Also 2 sqllogictest test cases are added (optimized/number not in range)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-10 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987353874


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(ref str))
+| ScalarValue::Utf8View(Some(ref str))
+| ScalarValue::LargeUtf8(Some(ref str)),
+) => match target_type {
+DataType::Int8 => str.parse::().ok().map(ScalarValue::from),

Review Comment:
   I tried to use your suggested casting implementation, could you help me 
check if I'm implementing it correctly?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-10 Thread via GitHub


alan910127 commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987350998


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);

Review Comment:
   thanks -- I forgot to remove it :sweat_smile: 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



Re: [PR] perf: unwrap cast for comparing ints =/!= strings [datafusion]

2025-03-10 Thread via GitHub


alamb commented on code in PR #15110:
URL: https://github.com/apache/datafusion/pull/15110#discussion_r1987055078


##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -468,6 +510,10 @@ mod tests {
 // the 999 is not within the range of MAX(int32) and 
MIN(int32), we don't cast the lit(999) to int32 type
 let expr_lt = cast(col("c1"), DataType::Int64).lt(lit(999i64));
 assert_eq!(optimize_test(expr_lt.clone(), &schema), expr_lt);
+
+// cast(c1, UTF8) < 123, only eq/not_eq should be optimized

Review Comment:
   πŸ‘ 



##
datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs:
##
@@ -177,6 +192,33 @@ pub(super) fn 
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
 true
 }
 
+fn cast_literal_to_type_with_op(
+lit_value: &ScalarValue,
+target_type: &DataType,
+op: Operator,
+) -> Option {
+dbg!(lit_value, target_type, op);
+match (op, lit_value) {
+(
+Operator::Eq | Operator::NotEq,
+ScalarValue::Utf8(Some(ref str))
+| ScalarValue::Utf8View(Some(ref str))
+| ScalarValue::LargeUtf8(Some(ref str)),
+) => match target_type {
+DataType::Int8 => str.parse::().ok().map(ScalarValue::from),

Review Comment:
   by calling `ok()` here I think this converts a predicate like `cast(col, 
utf8) = 'foo'` into `col = 'null'`
   
   Which is not quite the same thing. 
   
   Also, `str.parse()` likely has subltely different semantics than the arrow 
cast kernels. 
   
   This, can can you please make this function:
   1. Return `None` if the conversion fails (following the convention of other 
functions in this module like `try_cast_dictionary`)
   2. Use the cast kernel to parse the values?
   
   
   
   The `cast` kernel can be called by:
   1. Converting ScalarValue to an array with 
[`ScalarValue::to_array`](https://docs.rs/datafusion/latest/datafusion/common/enum.ScalarValue.html#method.to_array)
   2. Call `cast`: https://docs.rs/arrow/latest/arrow/array/cast/index.html
   3. Calling 
[`ScalarValue::try_from_array`](https://docs.rs/datafusion/latest/datafusion/common/enum.ScalarValue.html#method.try_from_array)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


-
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]