[GitHub] [arrow] bkietz commented on a diff in pull request #15180: ARROW-11402: [C++] prefer casting literal over casting field ref

via GitHub Fri, 27 Jan 2023 14:33:24 -0800


bkietz commented on code in PR #15180:
URL: https://github.com/apache/arrow/pull/15180#discussion_r1089482446



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -368,6 +368,153 @@ bool Expression::IsSatisfiable() const {
 
 namespace {
 
+TypeHolder SmallestTypeFor(const arrow::Datum& value) {
+  switch (value.type()->id()) {
+    case Type::INT8:
+      return int8();
+    case Type::UINT8:
+      return uint8();
+    case Type::INT16: {
+      int16_t i16 = value.scalar_as<Int16Scalar>().value;
+      if (i16 <= std::numeric_limits<int8_t>::max() &&
+          i16 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      return int16();
+    }
+    case Type::UINT16: {
+      uint16_t ui16 = value.scalar_as<UInt16Scalar>().value;
+      if (ui16 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      return uint16();
+    }
+    case Type::INT32: {
+      int32_t i32 = value.scalar_as<Int32Scalar>().value;
+      if (i32 <= std::numeric_limits<int8_t>::max() &&
+          i32 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      if (i32 <= std::numeric_limits<int16_t>::max() &&
+          i32 >= std::numeric_limits<int16_t>::min()) {
+        return int16();
+      }
+      return int32();
+    }
+    case Type::UINT32: {
+      uint32_t ui32 = value.scalar_as<UInt32Scalar>().value;
+      if (ui32 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      if (ui32 <= std::numeric_limits<uint16_t>::max()) {
+        return uint16();
+      }
+      return uint32();
+    }
+    case Type::INT64: {
+      int64_t i64 = value.scalar_as<Int64Scalar>().value;
+      if (i64 <= std::numeric_limits<int8_t>::max() &&
+          i64 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      if (i64 <= std::numeric_limits<int16_t>::max() &&
+          i64 >= std::numeric_limits<int16_t>::min()) {
+        return int16();
+      }
+      if (i64 <= std::numeric_limits<int32_t>::max() &&
+          i64 >= std::numeric_limits<int32_t>::min()) {
+        return int32();
+      }
+      return int64();
+    }
+    case Type::UINT64: {
+      uint64_t ui64 = value.scalar_as<UInt64Scalar>().value;
+      if (ui64 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      if (ui64 <= std::numeric_limits<uint16_t>::max()) {
+        return uint16();
+      }
+      if (ui64 <= std::numeric_limits<uint32_t>::max()) {
+        return uint32();
+      }
+      return uint64();
+    }
+    case Type::DOUBLE: {
+      double doub = value.scalar_as<DoubleScalar>().value;
+      if (double(float(doub)) == doub) {

Review Comment:
   This doesn't catch `nan`s. I'd recommend including an explicit clause for 
`nan`s and `inf`s. For the value itself, though, I think this roundtrip is 
reasonable.



##########
cpp/src/arrow/compute/exec/expression_test.cc:
##########
@@ -593,8 +594,23 @@ TEST(Expression, BindWithImplicitCasts) {
     ExpectBindsTo(cmp(field_ref("dict_str"), field_ref("str")),
                   cmp(cast(field_ref("dict_str"), utf8()), field_ref("str")));
 
+    // Should prefer the literal
     ExpectBindsTo(cmp(field_ref("dict_i32"), literal(int64_t(4))),
-                  cmp(cast(field_ref("dict_i32"), int64()), 
literal(int64_t(4))));
+                  cmp(field_ref("dict_i32"), literal(int32_t(4))));
+    ExpectBindsTo(cmp(field_ref("dict_i32"), literal(int64_t(4))),

Review Comment:
   Please add a case which demonstrates the behavior in the presence of 
unsigned integers. `SmallestTypeFor` preserves signedness information, which 
can produce some odd edge cases which we should be explicit about. For example:
   
   ```suggestion
       ExpectBindsTo(cmp(field_ref("i8"), literal(uint8_t(4))),
                     cmp(cast(field_ref("i8"), int16()), literal(int16_t(4))));
       ExpectBindsTo(cmp(field_ref("dict_i32"), literal(int64_t(4))),
   ```



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -368,6 +368,153 @@ bool Expression::IsSatisfiable() const {
 
 namespace {
 
+TypeHolder SmallestTypeFor(const arrow::Datum& value) {
+  switch (value.type()->id()) {
+    case Type::INT8:
+      return int8();
+    case Type::UINT8:
+      return uint8();
+    case Type::INT16: {
+      int16_t i16 = value.scalar_as<Int16Scalar>().value;
+      if (i16 <= std::numeric_limits<int8_t>::max() &&
+          i16 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      return int16();
+    }
+    case Type::UINT16: {
+      uint16_t ui16 = value.scalar_as<UInt16Scalar>().value;
+      if (ui16 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      return uint16();
+    }
+    case Type::INT32: {
+      int32_t i32 = value.scalar_as<Int32Scalar>().value;
+      if (i32 <= std::numeric_limits<int8_t>::max() &&
+          i32 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      if (i32 <= std::numeric_limits<int16_t>::max() &&
+          i32 >= std::numeric_limits<int16_t>::min()) {
+        return int16();
+      }
+      return int32();
+    }
+    case Type::UINT32: {
+      uint32_t ui32 = value.scalar_as<UInt32Scalar>().value;
+      if (ui32 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      if (ui32 <= std::numeric_limits<uint16_t>::max()) {
+        return uint16();
+      }
+      return uint32();
+    }
+    case Type::INT64: {
+      int64_t i64 = value.scalar_as<Int64Scalar>().value;
+      if (i64 <= std::numeric_limits<int8_t>::max() &&
+          i64 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      if (i64 <= std::numeric_limits<int16_t>::max() &&
+          i64 >= std::numeric_limits<int16_t>::min()) {
+        return int16();
+      }
+      if (i64 <= std::numeric_limits<int32_t>::max() &&
+          i64 >= std::numeric_limits<int32_t>::min()) {
+        return int32();
+      }
+      return int64();
+    }
+    case Type::UINT64: {
+      uint64_t ui64 = value.scalar_as<UInt64Scalar>().value;
+      if (ui64 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      if (ui64 <= std::numeric_limits<uint16_t>::max()) {
+        return uint16();
+      }
+      if (ui64 <= std::numeric_limits<uint32_t>::max()) {
+        return uint32();
+      }
+      return uint64();
+    }
+    case Type::DOUBLE: {
+      double doub = value.scalar_as<DoubleScalar>().value;
+      if (double(float(doub)) == doub) {

Review Comment:
   What do you think of checking `fmod(doub) == 0` and demoting to integer? 
This would simplify expressions like `i32 > 0.0` to an integer comparison. 
Perhaps not worth it?
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] bkietz commented on a diff in pull request #15180: ARROW-11402: [C++] prefer casting literal over casting field ref

Reply via email to