pitrou commented on code in PR #48546:
URL: https://github.com/apache/arrow/pull/48546#discussion_r2630389052


##########
cpp/src/parquet/arrow/generate_fuzz_corpus.cc:
##########
@@ -369,6 +378,100 @@ Result<std::vector<Column>> ExampleColumns(int32_t length, double null_probabili
   return columns;
 }
 
+template <typename T>
+constexpr auto kMin = std::numeric_limits<T>::lowest();
+template <typename T>
+constexpr auto kMax = std::numeric_limits<T>::max();
+
+// Generate columns for physical types along with their supported encodings
+Result<std::vector<ColumnWithEncodings>> AllColumnsWithEncodings(
+    int32_t length, double null_probability = 0.2) {
+  const EncodingVector kIntEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
+                                        Encoding::DELTA_BINARY_PACKED,
+                                        Encoding::BYTE_STREAM_SPLIT};
+  const EncodingVector kFloatEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
+                                          Encoding::BYTE_STREAM_SPLIT};
+  const EncodingVector kBooleanEncodings = {Encoding::PLAIN, Encoding::RLE};
+  const EncodingVector kByteArrayEncodings = {Encoding::PLAIN, Encoding::RLE_DICTIONARY,
+                                              Encoding::DELTA_LENGTH_BYTE_ARRAY,
+                                              Encoding::DELTA_BYTE_ARRAY};
+  const EncodingVector kFixedLenByteArrayEncodings = {
+      Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::DELTA_BYTE_ARRAY,
+      Encoding::BYTE_STREAM_SPLIT};
+
+  std::vector<ColumnWithEncodings> columns;
+
+  random::RandomArrayGenerator gen(42);
+  auto name_gen = Column::NameGenerator();
+
+  for (const double true_probability : {0.0, 0.001, 0.01, 0.5, 0.999}) {
+    columns.push_back(
+        {{name_gen(), gen.Boolean(length, true_probability, null_probability)},
+         kBooleanEncodings});
+  }
+
+  columns.push_back(
+      {{name_gen(), gen.Int32(length, -100, 100, null_probability)}, kIntEncodings});
+  columns.push_back(
+      {{name_gen(), gen.Int32(length, kMin<int32_t>, kMax<int32_t>, null_probability)},
+       kIntEncodings});
+  columns.push_back({{name_gen(), gen.Int64(length, -100'000, 100'000, null_probability)},
+                     kIntEncodings});
+  columns.push_back(
+      {{name_gen(), gen.Int64(length, kMin<int64_t>, kMax<int64_t>, null_probability)},
+       kIntEncodings});
+
+  // XXX should we add INT96? It's deprecated, only supports PLAIN and is featured in
+  // the parquet-testing files.

Review Comment:
   That's a good point, I'll try to add INT96 into the mix.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to