mapleFU commented on code in PR #47090: URL: https://github.com/apache/arrow/pull/47090#discussion_r2206693730
########## cpp/src/parquet/properties.h: ########## @@ -293,6 +295,7 @@ class PARQUET_EXPORT WriterProperties { write_batch_size_(DEFAULT_WRITE_BATCH_SIZE), max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH), pagesize_(kDefaultDataPageSize), + max_rows_per_page_(kDefaultMaxRowsPerPage), Review Comment: So if we don't want to limit this, is setting it to the `int64_t` max OK? ########## cpp/src/parquet/column_writer.cc: ########## @@ -1150,61 +1150,62 @@ void ColumnWriterImpl::FlushBufferedDataPages() { // ---------------------------------------------------------------------- // TypedColumnWriter -template <typename Action> -inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) { - int64_t num_batches = static_cast<int>(total / batch_size); - for (int round = 0; round < num_batches; round++) { - action(round * batch_size, batch_size, /*check_page_size=*/true); - } - // Write the remaining values - if (total % batch_size > 0) { - action(num_batches * batch_size, total % batch_size, /*check_page_size=*/true); - } -} - template <typename Action> inline void DoInBatches(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, int64_t batch_size, Action&& action, - bool pages_change_on_record_boundaries) { - if (!pages_change_on_record_boundaries || !rep_levels) { - // If rep_levels is null, then we are writing a non-repeated column. - // In this case, every record contains only one level. - return DoInBatches(num_levels, batch_size, std::forward<Action>(action)); - } - + bool pages_change_on_record_boundaries, int64_t max_rows_per_page, + const std::function<int64_t()>& curr_page_buffered_rows) { Review Comment: As with `Action`, can we use a template parameter rather than `std::function<>`? `std::function` is hard for the compiler to optimize, and it might be empty (`nullptr`). 
########## cpp/src/parquet/properties.h: ########## @@ -155,6 +155,8 @@ class PARQUET_EXPORT ReaderProperties { ReaderProperties PARQUET_EXPORT default_reader_properties(); static constexpr int64_t kDefaultDataPageSize = 1024 * 1024; +/// FIXME: Switch the default value to 20000 will break UTs. +static constexpr int64_t kDefaultMaxRowsPerPage = 1000000; Review Comment: So this doesn't follow the same style as the other `DEFAULT_...` constants? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org