pitrou commented on code in PR #47621:
URL: https://github.com/apache/arrow/pull/47621#discussion_r2371377492


##########
cpp/src/parquet/arrow/generate_fuzz_corpus.cc:
##########
@@ -41,47 +44,162 @@ namespace arrow {
 
 using ::arrow::internal::CreateDir;
 using ::arrow::internal::PlatformFilename;
+using ::arrow::util::Float16;
+using ::parquet::ArrowWriterProperties;
 using ::parquet::WriterProperties;
 
 static constexpr int32_t kBatchSize = 1000;
+// This will emit several row groups
 static constexpr int32_t kChunkSize = kBatchSize * 3 / 8;
 
-std::shared_ptr<WriterProperties> GetWriterProperties() {
-  WriterProperties::Builder builder{};
-  builder.disable_dictionary("no_dict");
-  builder.compression("compressed", Compression::BROTLI);
-  return builder.build();
+struct WriteConfig {
+  std::shared_ptr<WriterProperties> writer_properties;
+  std::shared_ptr<ArrowWriterProperties> arrow_writer_properties;
+};
+
+struct Column {
+  std::string name;
+  std::shared_ptr<Array> array;
+
+  static std::function<std::string()> NameGenerator() {
+    struct Gen {
+      int num_col = 1;
+
+      std::string operator()() {
+        std::stringstream ss;
+        ss << "col_" << num_col++;
+        return std::move(ss).str();
+      }
+    };
+    return Gen{};
+  }
+};
+
+std::vector<WriteConfig> GetWriteConfigurations() {
+  // clang-format off
+  auto w_brotli = WriterProperties::Builder()
+      .disable_dictionary("no_dict")
+      ->compression("compressed", Compression::BROTLI)
+      // Override current default of 1MB
+      ->data_pagesize(20'000)
+      // Reduce max dictionary page size so that fewer columns are
+      // dict-encoded (XXX: this does not seem to have an effect?)

Review Comment:
   Thanks for the explanation. I guess it makes sense, though it's a bit 
surprising at first. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to