pitrou commented on code in PR #36377:
URL: https://github.com/apache/arrow/pull/36377#discussion_r1268131813


##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5057,23 +5092,123 @@ class TestBufferedParquetIO : public 
TestParquetIO<TestType> {
 TYPED_TEST_SUITE(TestBufferedParquetIO, TestTypes);
 
 TYPED_TEST(TestBufferedParquetIO, SingleColumnOptionalBufferedWriteSmall) {
-  constexpr int64_t batch_size = SMALL_SIZE / 4;
+  constexpr size_t NUM_BATCHES = 4;
+  constexpr int64_t batch_size = SMALL_SIZE / NUM_BATCHES;
   std::shared_ptr<Array> values;
-  ASSERT_OK(NullableArray<TypeParam>(SMALL_SIZE, 10, kDefaultSeed, &values));
+  ASSERT_OK(
+      NullableArray<TypeParam>(SMALL_SIZE, /*num_nulls=*/10, kDefaultSeed, 
&values));
   int num_row_groups = 0;
   this->WriteBufferedFile(values, batch_size, &num_row_groups);
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values, 
num_row_groups));
 }
 
 TYPED_TEST(TestBufferedParquetIO, SingleColumnOptionalBufferedWriteLarge) {
-  constexpr int64_t batch_size = LARGE_SIZE / 4;
+  constexpr size_t NUM_BATCHES = 4;
+  constexpr int64_t batch_size = LARGE_SIZE / NUM_BATCHES;
   std::shared_ptr<Array> values;
-  ASSERT_OK(NullableArray<TypeParam>(LARGE_SIZE, 100, kDefaultSeed, &values));
+  ASSERT_OK(
+      NullableArray<TypeParam>(LARGE_SIZE, /*num_nulls=*/100, kDefaultSeed, 
&values));
   int num_row_groups = 0;
   this->WriteBufferedFile(values, batch_size, &num_row_groups);
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnTable(values, 
num_row_groups));
 }
 
+TYPED_TEST(TestBufferedParquetIO, WriteTableSmall) {
+  std::shared_ptr<Array> values;
+  ASSERT_OK(
+      NullableArray<TypeParam>(SMALL_SIZE, /*num_nulls=*/10, kDefaultSeed, 
&values));
+  int num_row_groups = 0;
+  // Write all table with one batch.
+  int64_t write_table_batch_size = SMALL_SIZE;
+  int64_t write_table_max_row_group_size = DEFAULT_MAX_ROW_GROUP_LENGTH;
+  int64_t write_max_row_group_size = DEFAULT_MAX_ROW_GROUP_LENGTH;
+  this->WriteBufferedTable(values, write_table_batch_size, 
write_table_max_row_group_size,
+                           write_max_row_group_size, &num_row_groups);
+  EXPECT_EQ(1, num_row_groups);
+  ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values, 
num_row_groups));
+}
+
+TYPED_TEST(TestBufferedParquetIO, WriteTableLarge) {
+  std::shared_ptr<Array> values;

Review Comment:
   Can you also avoid repeating yourself in the tests below?



##########
cpp/src/parquet/arrow/writer.cc:
##########
@@ -488,6 +458,59 @@ class FileWriterImpl : public FileWriter {
   std::vector<ArrowWriteContext> parallel_column_write_contexts_;
 };
 
+template <typename T>
+Status FileWriterImpl::WriteBuffered(const T& batch, int64_t 
max_row_group_length) {
+  if (row_group_writer_ == nullptr || !row_group_writer_->buffered() ||
+      row_group_writer_->num_rows() >= max_row_group_length) {
+    RETURN_NOT_OK(NewBufferedRowGroup());
+  }
+
+  auto WriteBatch = [&](int64_t offset, int64_t size) {
+    std::vector<std::unique_ptr<ArrowColumnWriterV2>> writers;
+    int column_index_start = 0;
+
+    for (int i = 0; i < batch.num_columns(); i++) {
+      std::shared_ptr<ChunkedArray> chunked_array = 
GetColumnChunkedArray(batch, i);

Review Comment:
   It is a bit wasteful to call this for each chunk. Why not use a 
`RecordBatchReader` instead?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5057,23 +5092,123 @@ class TestBufferedParquetIO : public 
TestParquetIO<TestType> {
 TYPED_TEST_SUITE(TestBufferedParquetIO, TestTypes);
 
 TYPED_TEST(TestBufferedParquetIO, SingleColumnOptionalBufferedWriteSmall) {
-  constexpr int64_t batch_size = SMALL_SIZE / 4;
+  constexpr size_t NUM_BATCHES = 4;
+  constexpr int64_t batch_size = SMALL_SIZE / NUM_BATCHES;
   std::shared_ptr<Array> values;
-  ASSERT_OK(NullableArray<TypeParam>(SMALL_SIZE, 10, kDefaultSeed, &values));
+  ASSERT_OK(
+      NullableArray<TypeParam>(SMALL_SIZE, /*num_nulls=*/10, kDefaultSeed, 
&values));
   int num_row_groups = 0;
   this->WriteBufferedFile(values, batch_size, &num_row_groups);
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values, 
num_row_groups));
 }
 
 TYPED_TEST(TestBufferedParquetIO, SingleColumnOptionalBufferedWriteLarge) {
-  constexpr int64_t batch_size = LARGE_SIZE / 4;
+  constexpr size_t NUM_BATCHES = 4;
+  constexpr int64_t batch_size = LARGE_SIZE / NUM_BATCHES;
   std::shared_ptr<Array> values;
-  ASSERT_OK(NullableArray<TypeParam>(LARGE_SIZE, 100, kDefaultSeed, &values));
+  ASSERT_OK(
+      NullableArray<TypeParam>(LARGE_SIZE, /*num_nulls=*/100, kDefaultSeed, 
&values));
   int num_row_groups = 0;
   this->WriteBufferedFile(values, batch_size, &num_row_groups);
   ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnTable(values, 
num_row_groups));
 }
 
+TYPED_TEST(TestBufferedParquetIO, WriteTableSmall) {
+  std::shared_ptr<Array> values;
+  ASSERT_OK(
+      NullableArray<TypeParam>(SMALL_SIZE, /*num_nulls=*/10, kDefaultSeed, 
&values));
+  int num_row_groups = 0;
+  // Write all table with one batch.
+  int64_t write_table_batch_size = SMALL_SIZE;
+  int64_t write_table_max_row_group_size = DEFAULT_MAX_ROW_GROUP_LENGTH;
+  int64_t write_max_row_group_size = DEFAULT_MAX_ROW_GROUP_LENGTH;
+  this->WriteBufferedTable(values, write_table_batch_size, 
write_table_max_row_group_size,
+                           write_max_row_group_size, &num_row_groups);
+  EXPECT_EQ(1, num_row_groups);
+  ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values, 
num_row_groups));

Review Comment:
   Are there tests with more than one column somewhere?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5024,6 +5024,41 @@ class TestBufferedParquetIO : public 
TestParquetIO<TestType> {
     ASSERT_OK_NO_THROW(writer->Close());
   }
 
+  void WriteBufferedTable(const std::shared_ptr<Array>& values,

Review Comment:
   Can you please refactor this with `WriteBufferedFile` to avoid copy-pasting 
entire chunks of code?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to