[GitHub] [arrow] pitrou commented on a change in pull request #11322: ARROW-13558: [C++] Validate decimal arrays/scalars

GitBox Wed, 03 Nov 2021 18:37:24 -0700


pitrou commented on a change in pull request #11322:
URL: https://github.com/apache/arrow/pull/11322#discussion_r742190118




##########
File path: cpp/src/arrow/testing/json_integration_test.cc
##########
@@ -54,6 +54,9 @@ DEFINE_string(
     "Mode of integration testing tool (ARROW_TO_JSON, JSON_TO_ARROW, VALIDATE)");
 DEFINE_bool(integration, false, "Run in integration test mode");
 DEFINE_bool(verbose, true, "Verbose output");
+DEFINE_bool(no_decimal_validate, false,

Review comment:
       It's generally nicer to avoid double negatives, so I'd prefer 
`DEFINE_bool(validate_decimals, true, "...")`

##########
File path: cpp/src/arrow/testing/random_test.cc
##########
@@ -240,6 +240,7 @@ class RandomDecimalArrayTest : public ::testing::Test {
     ASSERT_LE(over_half, non_nulls * 0.7);
     ASSERT_GE(negative, non_nulls * 0.3);
     ASSERT_LE(negative, non_nulls * 0.7);
+    ASSERT_OK(values.ValidateFull());

Review comment:
       There's already an `ASSERT_OK(array.ValidateFull());` at the beginning, 
so this doesn't seem to add anything?

##########
File path: cpp/src/arrow/testing/util.cc
##########
@@ -83,22 +83,17 @@ std::string random_string(int64_t n, uint32_t seed) {
 }
 
 void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
-  pcg32_fast gen(seed);
-  std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max());
-  const int32_t required_bytes = DecimalType::DecimalSize(precision);
-  constexpr int32_t byte_width = 16;
-  std::fill(out, out + byte_width * n, '\0');
-
-  for (int64_t i = 0; i < n; ++i, out += byte_width) {
-    std::generate(out, out + required_bytes,
-                  [&d, &gen] { return static_cast<uint8_t>(d(gen)); });
-
-    // sign extend if the sign bit is set for the last byte generated
-    // 0b10000000 == 0x80 == 128
-    if ((out[required_bytes - 1] & '\x80') != 0) {
-      std::fill(out + required_bytes, out + byte_width, '\xFF');
-    }
+  auto gen = random::RandomArrayGenerator(seed);
+  std::shared_ptr<Array> decimals;
+  int32_t byte_width = 0;
+  if (precision <= Decimal128Type::kMaxPrecision) {
+    decimals = gen.Decimal128(decimal128(precision, 0), n);
+    byte_width = Decimal128Type::kByteWidth;
+  } else {
+    decimals = gen.Decimal256(decimal256(precision, 0), n);
+    byte_width = Decimal256Type::kByteWidth;

Review comment:
       It seems `byte_width` is always 16 in the original version. Do we need 
the decimal256 support? Going forward, it seems we should use 
`RandomArrayGenerator` directly.

##########
File path: cpp/src/parquet/arrow/test_util.h
##########
@@ -130,22 +130,17 @@ ::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
 }
 
 static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
-  std::default_random_engine gen(seed);
-  std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max());
-  const int32_t required_bytes = ::arrow::DecimalType::DecimalSize(precision);
-  int32_t byte_width = precision <= 38 ? 16 : 32;
-  std::fill(out, out + byte_width * n, '\0');
-
-  for (int64_t i = 0; i < n; ++i, out += byte_width) {
-    std::generate(out, out + required_bytes,
-                  [&d, &gen] { return static_cast<uint8_t>(d(gen)); });
-
-    // sign extend if the sign bit is set for the last byte generated
-    // 0b10000000 == 0x80 == 128
-    if ((out[required_bytes - 1] & '\x80') != 0) {
-      std::fill(out + required_bytes, out + byte_width, '\xFF');
-    }
+  auto gen = ::arrow::random::RandomArrayGenerator(seed);
+  std::shared_ptr<Array> decimals;
+  int32_t byte_width = 0;
+  if (precision <= ::arrow::Decimal128Type::kMaxPrecision) {
+    decimals = gen.Decimal128(::arrow::decimal128(precision, 0), n);
+    byte_width = ::arrow::Decimal128Type::kByteWidth;
+  } else {
+    decimals = gen.Decimal256(::arrow::decimal256(precision, 0), n);
+    byte_width = ::arrow::Decimal256Type::kByteWidth;

Review comment:
       Same remark here.

##########
File path: dev/archery/archery/integration/datagen.py
##########
@@ -431,23 +431,10 @@ def generate_column(self, size, name=None):
         return PrimitiveColumn(name, size, is_valid, values)
 
 
-DECIMAL_PRECISION_TO_VALUE = {
-    key: (1 << (8 * i - 1)) - 1 for i, key in enumerate(
-        [1, 3, 5, 7, 10, 12, 15, 17, 19, 22, 24, 27, 29, 32, 34, 36,
-         40, 42, 44, 50, 60, 70],
-        start=1,
-    )
-}
-
-
 def decimal_range_from_precision(precision):
     assert 1 <= precision <= 76
-    try:
-        max_value = DECIMAL_PRECISION_TO_VALUE[precision]
-    except KeyError:
-        return decimal_range_from_precision(precision - 1)
-    else:
-        return ~max_value, max_value
+    max_value = (10 ** precision) - 1
+    return ~max_value, max_value

Review comment:
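       Shouldn't it actually be `-max_value, max_value`? `~max_value` equals 
`-max_value - 1`, i.e. `-(10 ** precision)`, which needs `precision + 1` 
digits and so falls outside the given precision.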

##########
File path: dev/archery/archery/integration/runner.py
##########
@@ -147,7 +147,13 @@ def _gold_tests(self, gold_dir):
                 skip.add("C#")
                 skip.add("Go")
 
-            yield datagen.File(name, None, None, skip=skip, path=out_path)
+            quirks = set()
+            if prefix in {'0.14.1', '0.17.1',

Review comment:
       Add reference to JIRA issue here?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #11322: ARROW-13558: [C++] Validate decimal arrays/scalars

Reply via email to