liukun4515 commented on a change in pull request #1607:
URL: https://github.com/apache/arrow-datafusion/pull/1607#discussion_r788294882
##########
File path: datafusion/src/physical_plan/coercion_rule/binary_rule.rs
##########
@@ -293,14 +289,192 @@ fn coercion_decimal_mathematics_type(
}
}
+/// Determine if a DataType is signed numeric or not
+pub fn is_signed_numeric(dt: &DataType) -> bool {
+ matches!(
+ dt,
+ DataType::Int8
+ | DataType::Int16
+ | DataType::Int32
+ | DataType::Int64
+ | DataType::Float16
+ | DataType::Float32
+ | DataType::Float64
+ | DataType::Decimal(_, _)
+ )
+}
+
+/// Determine if a DataType is numeric or not
+pub fn is_numeric(dt: &DataType) -> bool {
+ is_signed_numeric(dt)
+ || match dt {
+ DataType::UInt8 | DataType::UInt16 | DataType::UInt32 |
DataType::UInt64 => {
+ true
+ }
+ _ => false,
+ }
+}
+
+/// Coercion rules for dictionary values (aka the type of the dictionary
itself)
+fn dictionary_value_coercion(
+ lhs_type: &DataType,
+ rhs_type: &DataType,
+) -> Option<DataType> {
+ numerical_coercion(lhs_type, rhs_type).or_else(||
string_coercion(lhs_type, rhs_type))
+}
+
+/// Coercion rules for Dictionaries: the type that both lhs and rhs
+/// can be casted to for the purpose of a computation.
+///
+/// It would likely be preferable to cast primitive values to
+/// dictionaries, and thus avoid unpacking dictionary as well as doing
+/// faster comparisons. However, the arrow compute kernels (e.g. eq)
+/// don't have DictionaryArray support yet, so fall back to unpacking
+/// the dictionaries
+fn dictionary_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
+ match (lhs_type, rhs_type) {
+ (
+ DataType::Dictionary(_lhs_index_type, lhs_value_type),
+ DataType::Dictionary(_rhs_index_type, rhs_value_type),
+ ) => dictionary_value_coercion(lhs_value_type, rhs_value_type),
+ (DataType::Dictionary(_index_type, value_type), _) => {
+ dictionary_value_coercion(value_type, rhs_type)
+ }
+ (_, DataType::Dictionary(_index_type, value_type)) => {
+ dictionary_value_coercion(lhs_type, value_type)
+ }
+ _ => None,
+ }
+}
+
+/// Coercion rules for Strings: the type that both lhs and rhs can be
+/// casted to for the purpose of a string computation
+fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
+ use arrow::datatypes::DataType::*;
+ match (lhs_type, rhs_type) {
+ (Utf8, Utf8) => Some(Utf8),
+ (LargeUtf8, Utf8) => Some(LargeUtf8),
+ (Utf8, LargeUtf8) => Some(LargeUtf8),
+ (LargeUtf8, LargeUtf8) => Some(LargeUtf8),
+ _ => None,
+ }
+}
+
+/// coercion rules for like operations.
+/// This is a union of string coercion rules and dictionary coercion rules
+fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
{
+ string_coercion(lhs_type, rhs_type)
+ .or_else(|| dictionary_coercion(lhs_type, rhs_type))
+}
+
+/// Coercion rules for Temporal columns: the type that both lhs and rhs can be
+/// casted to for the purpose of a date computation
+fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
+ use arrow::datatypes::DataType::*;
+ use arrow::datatypes::TimeUnit;
+ match (lhs_type, rhs_type) {
+ (Utf8, Date32) => Some(Date32),
+ (Date32, Utf8) => Some(Date32),
+ (Utf8, Date64) => Some(Date64),
+ (Date64, Utf8) => Some(Date64),
+ (Timestamp(lhs_unit, lhs_tz), Timestamp(rhs_unit, rhs_tz)) => {
+ let tz = match (lhs_tz, rhs_tz) {
+ // can't cast across timezones
+ (Some(lhs_tz), Some(rhs_tz)) => {
+ if lhs_tz != rhs_tz {
+ return None;
+ } else {
+ Some(lhs_tz.clone())
+ }
+ }
+ (Some(lhs_tz), None) => Some(lhs_tz.clone()),
+ (None, Some(rhs_tz)) => Some(rhs_tz.clone()),
+ (None, None) => None,
+ };
+
+ let unit = match (lhs_unit, rhs_unit) {
+ (TimeUnit::Second, TimeUnit::Millisecond) => TimeUnit::Second,
+ (TimeUnit::Second, TimeUnit::Microsecond) => TimeUnit::Second,
+ (TimeUnit::Second, TimeUnit::Nanosecond) => TimeUnit::Second,
+ (TimeUnit::Millisecond, TimeUnit::Second) => TimeUnit::Second,
+ (TimeUnit::Millisecond, TimeUnit::Microsecond) =>
TimeUnit::Millisecond,
+ (TimeUnit::Millisecond, TimeUnit::Nanosecond) =>
TimeUnit::Millisecond,
+ (TimeUnit::Microsecond, TimeUnit::Second) => TimeUnit::Second,
+ (TimeUnit::Microsecond, TimeUnit::Millisecond) =>
TimeUnit::Millisecond,
+ (TimeUnit::Microsecond, TimeUnit::Nanosecond) =>
TimeUnit::Microsecond,
+ (TimeUnit::Nanosecond, TimeUnit::Second) => TimeUnit::Second,
+ (TimeUnit::Nanosecond, TimeUnit::Millisecond) =>
TimeUnit::Millisecond,
+ (TimeUnit::Nanosecond, TimeUnit::Microsecond) =>
TimeUnit::Microsecond,
+ (l, r) => {
+ assert_eq!(l, r);
+ l.clone()
+ }
+ };
+
+ Some(Timestamp(unit, tz))
+ }
+ _ => None,
+ }
+}
+
+pub(crate) fn is_dictionary(t: &DataType) -> bool {
Review comment:
https://github.com/apache/arrow-datafusion/issues/1613
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]