pitrou commented on code in PR #47427:
URL: https://github.com/apache/arrow/pull/47427#discussion_r2306767901


##########
cpp/src/parquet/properties.h:
##########
@@ -1126,6 +1127,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
   /// Return whether loading statistics as much as possible.
   bool should_load_statistics() const { return should_load_statistics_; }
 
+  /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical 
types.
+  ///
+  /// Default is false.

Review Comment:
   ```suggestion
     /// Default is false for compatibility, meaning that only Decimal128 and 
Decimal256
     /// can be inferred.
   ```



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1076,10 +1078,16 @@ Result<bool> ApplyOriginalStorageMetadata(const Field& 
origin_field,
     modified = true;
   }
 
-  if (origin_type->id() == ::arrow::Type::DECIMAL256 &&
-      inferred_type->id() == ::arrow::Type::DECIMAL128) {
-    inferred->field = inferred->field->WithType(origin_type);
-    modified = true;
+  if (::arrow::is_decimal(origin_type->id()) &&
+      ::arrow::is_decimal(inferred_type->id())) {
+    auto& origin_decimal = checked_cast<const 
::arrow::DecimalType&>(*origin_type);
+    auto& inferred_decimal = checked_cast<const 
::arrow::DecimalType&>(*inferred_type);
+    ARROW_DCHECK_EQ(origin_decimal.scale(), inferred_decimal.scale());
+    ARROW_DCHECK_LE(origin_decimal.precision(), inferred_decimal.precision());

Review Comment:
   I'd rather we error out cleanly (or simply do not apply the original type), 
rather than have a runtime assertion, so that users do not encounter a crash if 
a Parquet file has incorrect decimal data/metadata.



##########
cpp/src/parquet/arrow/reader_internal.cc:
##########
@@ -802,6 +796,33 @@ Status TransferDecimal(RecordReader* reader, MemoryPool* 
pool,
   return Status::OK();
 }
 
+template <typename DecimalArrayType, typename... Args>
+Status TransferDecimalTo(Type::type physical_type, Args... args) {

Review Comment:
   Nit: perhaps use universal forwarding?
   ```suggestion
   Status TransferDecimalTo(Type::type physical_type, Args&&... args) {
   ```
   
   and then:
   ```c++
         RETURN_NOT_OK(fn(std::forward<Args>(args)...));
   ```



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3044,6 +3082,32 @@ TEST(ArrowReadWrite, NestedRequiredField) {
                        /*row_group_size=*/8);
 }
 
+TEST(ArrowReadWrite, Decimal32) {

Review Comment:
   1. The various decimal types use the exact same test code here, can this be 
factored out? (or just write a single test and loop over decimal types)
   2. Should we also test decimal128? It seems to be missing here.



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5468,6 +5532,86 @@ TYPED_TEST(TestIntegerAnnotateDecimalTypeParquetIO, 
SingleNullableDecimalColumn)
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleDecimalColumnFile(*values));
 }
 
+template <typename TestType>
+class TestIntegerAnnotateSmallestDecimalTypeParquetIO
+    : public TestIntegerAnnotateDecimalTypeParquetIO<TestType> {
+ public:
+  void ReadAndCheckSingleDecimalColumnFile(const Array& values) {
+    ArrowReaderProperties properties = default_arrow_reader_properties();
+    properties.set_smallest_decimal_enabled(true);
+
+    std::shared_ptr<Array> out;
+    std::unique_ptr<FileReader> reader;
+    this->ReaderFromSink(&reader, properties);
+    this->ReadSingleColumnFile(std::move(reader), &out);
+
+    if (values.type()->id() == out->type()->id()) {
+      AssertArraysEqual(values, *out);
+    } else {
+      auto& expected_values = values;
+      auto& read_values = *out;

Review Comment:
   Not sure why we're introducing these, it just complicated reading the code 
for no good reason.



##########
cpp/src/parquet/properties.h:
##########
@@ -1126,6 +1127,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
   /// Return whether loading statistics as much as possible.
   bool should_load_statistics() const { return should_load_statistics_; }
 
+  /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical 
types.
+  ///
+  /// Default is false.
+  void set_smallest_decimal_enabled(bool smallest_decimal_enable) {
+    smallest_decimal_enabled_ = smallest_decimal_enable;
+  }
+  /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical 
types.
+  ///
+  /// When enabled, decimal type will be inferred as the smallest DecimalType 
which is
+  /// able to represent that precision; otherwise always inferred as 
Decimal128.

Review Comment:
   ```suggestion
     /// When enabled, Parquet decimal columns will be inferred as the smallest 
possible
     /// Arrow Decimal type.
     /// When disabled, Parquet decimal columns will be inferred as either 
Decimal128 or
     /// Decimal256, but not Decimal32/64.
     ///
     /// Note: if an Arrow schema is found in the Parquet metadata, it will 
take priority and
     /// this setting will be ignored.
   ```



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5468,6 +5532,86 @@ TYPED_TEST(TestIntegerAnnotateDecimalTypeParquetIO, 
SingleNullableDecimalColumn)
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleDecimalColumnFile(*values));
 }
 
+template <typename TestType>
+class TestIntegerAnnotateSmallestDecimalTypeParquetIO
+    : public TestIntegerAnnotateDecimalTypeParquetIO<TestType> {
+ public:
+  void ReadAndCheckSingleDecimalColumnFile(const Array& values) {
+    ArrowReaderProperties properties = default_arrow_reader_properties();
+    properties.set_smallest_decimal_enabled(true);
+
+    std::shared_ptr<Array> out;
+    std::unique_ptr<FileReader> reader;
+    this->ReaderFromSink(&reader, properties);
+    this->ReadSingleColumnFile(std::move(reader), &out);
+
+    if (values.type()->id() == out->type()->id()) {
+      AssertArraysEqual(values, *out);
+    } else {
+      auto& expected_values = values;
+      auto& read_values = *out;
+      ASSERT_EQ(expected_values.length(), read_values.length());
+      ASSERT_EQ(expected_values.null_count(), read_values.null_count());
+
+      auto format_decimal = [](const Array& values, int64_t index) {

Review Comment:
   This is a bit cumbersome, we could probably just use `arrow::Compute::Cast` 
to convert both arrays to Decimal256 before comparing?



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1076,10 +1078,16 @@ Result<bool> ApplyOriginalStorageMetadata(const Field& 
origin_field,
     modified = true;
   }
 
-  if (origin_type->id() == ::arrow::Type::DECIMAL256 &&
-      inferred_type->id() == ::arrow::Type::DECIMAL128) {
-    inferred->field = inferred->field->WithType(origin_type);
-    modified = true;
+  if (::arrow::is_decimal(origin_type->id()) &&
+      ::arrow::is_decimal(inferred_type->id())) {
+    auto& origin_decimal = checked_cast<const 
::arrow::DecimalType&>(*origin_type);
+    auto& inferred_decimal = checked_cast<const 
::arrow::DecimalType&>(*inferred_type);
+    ARROW_DCHECK_EQ(origin_decimal.scale(), inferred_decimal.scale());
+    ARROW_DCHECK_LE(origin_decimal.precision(), inferred_decimal.precision());

Review Comment:
   I'm also not sure why `ARROW_DCHECK_LE` on the precision. Why not check it's 
equal instead?



##########
cpp/src/parquet/properties.h:
##########
@@ -1126,6 +1127,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
   /// Return whether loading statistics as much as possible.
   bool should_load_statistics() const { return should_load_statistics_; }
 
+  /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical 
types.
+  ///
+  /// Default is false.
+  void set_smallest_decimal_enabled(bool smallest_decimal_enable) {
+    smallest_decimal_enabled_ = smallest_decimal_enable;
+  }
+  /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical 
types.

Review Comment:
   ```suggestion
     /// \brief Whether to infer Decimal32/64 from Parquet decimal logical 
types.
   ```



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5468,6 +5532,86 @@ TYPED_TEST(TestIntegerAnnotateDecimalTypeParquetIO, 
SingleNullableDecimalColumn)
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleDecimalColumnFile(*values));
 }
 
+template <typename TestType>
+class TestIntegerAnnotateSmallestDecimalTypeParquetIO
+    : public TestIntegerAnnotateDecimalTypeParquetIO<TestType> {
+ public:
+  void ReadAndCheckSingleDecimalColumnFile(const Array& values) {
+    ArrowReaderProperties properties = default_arrow_reader_properties();
+    properties.set_smallest_decimal_enabled(true);
+
+    std::shared_ptr<Array> out;
+    std::unique_ptr<FileReader> reader;
+    this->ReaderFromSink(&reader, properties);
+    this->ReadSingleColumnFile(std::move(reader), &out);
+
+    if (values.type()->id() == out->type()->id()) {
+      AssertArraysEqual(values, *out);
+    } else {
+      auto& expected_values = values;
+      auto& read_values = *out;
+      ASSERT_EQ(expected_values.length(), read_values.length());
+      ASSERT_EQ(expected_values.null_count(), read_values.null_count());
+
+      auto format_decimal = [](const Array& values, int64_t index) {
+        switch (values.type()->id()) {
+          case ::arrow::Type::DECIMAL32:
+            return static_cast<const 
::arrow::Decimal32Array&>(values).FormatValue(index);
+          case ::arrow::Type::DECIMAL64:
+            return static_cast<const 
::arrow::Decimal64Array&>(values).FormatValue(index);
+          case ::arrow::Type::DECIMAL128:
+            return static_cast<const 
::arrow::Decimal128Array&>(values).FormatValue(
+                index);
+          case ::arrow::Type::DECIMAL256:
+            return static_cast<const 
::arrow::Decimal256Array&>(values).FormatValue(
+                index);
+          default:
+            std::string err("Unexpected decimal type: " + 
values.type()->ToString());
+            ADD_FAILURE() << err;
+            return err;
+        }
+      };
+
+      for (int64_t i = 0; i < expected_values.length(); ++i) {
+        ASSERT_EQ(expected_values.IsNull(i), read_values.IsNull(i));
+        if (!expected_values.IsNull(i)) {
+          std::string expected_str = format_decimal(expected_values, i);
+          std::string read_str = format_decimal(read_values, i);
+          ASSERT_EQ(expected_str, read_str);
+        }
+      }
+    }
+  }
+};
+
+using SmallestDecimalTestTypes = ::testing::Types<

Review Comment:
   I think we can cut down on the number of test combinations, this is 
compiling and running many test instances for no good reason IMHO.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to