adamreeve commented on code in PR #47621:
URL: https://github.com/apache/arrow/pull/47621#discussion_r2370538012
##########
cpp/src/parquet/arrow/generate_fuzz_corpus.cc:
##########
@@ -41,47 +44,162 @@ namespace arrow {
using ::arrow::internal::CreateDir;
using ::arrow::internal::PlatformFilename;
+using ::arrow::util::Float16;
+using ::parquet::ArrowWriterProperties;
using ::parquet::WriterProperties;
static constexpr int32_t kBatchSize = 1000;
+// This will emit several row groups
static constexpr int32_t kChunkSize = kBatchSize * 3 / 8;
-std::shared_ptr<WriterProperties> GetWriterProperties() {
- WriterProperties::Builder builder{};
- builder.disable_dictionary("no_dict");
- builder.compression("compressed", Compression::BROTLI);
- return builder.build();
+struct WriteConfig {
+ std::shared_ptr<WriterProperties> writer_properties;
+ std::shared_ptr<ArrowWriterProperties> arrow_writer_properties;
+};
+
+struct Column {
+ std::string name;
+ std::shared_ptr<Array> array;
+
+ static std::function<std::string()> NameGenerator() {
+ struct Gen {
+ int num_col = 1;
+
+ std::string operator()() {
+ std::stringstream ss;
+ ss << "col_" << num_col++;
+ return std::move(ss).str();
+ }
+ };
+ return Gen{};
+ }
+};
+
+std::vector<WriteConfig> GetWriteConfigurations() {
+ // clang-format off
+ auto w_brotli = WriterProperties::Builder()
+ .disable_dictionary("no_dict")
+ ->compression("compressed", Compression::BROTLI)
+ // Override current default of 1MB
+ ->data_pagesize(20'000)
+ // Reduce max dictionary page size so that less columns are
+ // dict-encoded (XXX: this does not seem to have an effect?)
Review Comment:
The number of rows per row group is pretty low (375), so maybe the dictionary
page size limit is just too big to have any effect?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]