pitrou commented on a change in pull request #11872:
URL: https://github.com/apache/arrow/pull/11872#discussion_r763975968



##########
File path: cpp/src/arrow/ipc/read_write_test.cc
##########
@@ -1727,6 +1727,61 @@ TEST(TestIpcFileFormat, FooterMetaData) {
   ASSERT_TRUE(out_metadata->Equals(*metadata));
 }
 
+TEST_F(TestWriteRecordBatch, CompressionRatio) {
+  // ARROW-8823: Calculating the compression ratio
+  FileWriterHelper helper;
+  IpcWriteOptions options_uncompressed = IpcWriteOptions::Defaults();
+
+  std::vector<std::shared_ptr<RecordBatch>> batches(3);
+  // empty record batch
+  ASSERT_OK(MakeIntBatchSized(0, &batches[0]));
+  // record batch with int values
+  ASSERT_OK(MakeIntBatchSized(2000, &batches[1], 100));
+
+  // record batch with DictionaryArray
+  random::RandomArrayGenerator rg(/*seed=*/0);
+  int64_t length = 500;
+  int dict_size = 50;
+  std::shared_ptr<Array> dict =
+      rg.String(dict_size, /*min_length=*/5, /*max_length=*/5, /*null_probability=*/0);
+  std::shared_ptr<Array> indices =
+      rg.Int32(length, /*min=*/0, /*max=*/dict_size - 1, /*null_probability=*/0.1);
+  auto dict_type = dictionary(int32(), utf8());
+  auto dict_field = field("f1", dict_type);
+  ASSERT_OK_AND_ASSIGN(auto dict_array,
+                       DictionaryArray::FromArrays(dict_type, indices, dict));
+
+  auto schema = ::arrow::schema({field("f0", utf8()), dict_field});
+  batches[2] =
+      RecordBatch::Make(schema, length, {rg.String(500, 0, 10, 0.1), dict_array});
+
+  for (size_t i = 0; i < batches.size(); ++i) {
+    // without compression
+    ASSERT_OK(helper.Init(batches[i]->schema(), options_uncompressed));
+    ASSERT_OK(helper.WriteBatch(batches[i]));
+    ASSERT_OK(helper.Finish());
+    // padding can make serialized data slightly larger than the raw data size
+    // when no compression is used
+    ASSERT_LE(helper.writer_->stats().total_raw_body_size,
+              helper.writer_->stats().total_serialized_body_size);
+
+    if (!util::Codec::IsAvailable(Compression::LZ4_FRAME)) {
+      continue;
+    }
+
+    IpcWriteOptions options_compressed = IpcWriteOptions::Defaults();
+    ASSERT_OK_AND_ASSIGN(options_compressed.codec,
+                         util::Codec::Create(Compression::LZ4_FRAME));
+
+    // with compression
+    ASSERT_OK(helper.Init(batches[i]->schema(), options_compressed));
+    ASSERT_OK(helper.WriteBatch(batches[i]));
+    ASSERT_OK(helper.Finish());
+    ASSERT_GE(helper.writer_->stats().total_raw_body_size,
+              helper.writer_->stats().total_serialized_body_size);

Review comment:
       Can this be `ASSERT_GT` instead or will it fail the test?
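
       For context, a minimal sketch of what a stricter check could look like, assuming the empty batch (index 0) yields a zero raw body size, in which case a strict inequality cannot hold; whether LZ4 always shrinks the non-empty batches here is a separate question:

```cpp
    // Sketch only: strict check guarded for batches with no body data,
    // where both counters are expected to be zero.
    if (helper.writer_->stats().total_raw_body_size > 0) {
      ASSERT_GT(helper.writer_->stats().total_raw_body_size,
                helper.writer_->stats().total_serialized_body_size);
    } else {
      ASSERT_EQ(helper.writer_->stats().total_raw_body_size,
                helper.writer_->stats().total_serialized_body_size);
    }
```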

##########
File path: cpp/src/arrow/ipc/read_write_test.cc
##########
@@ -1727,6 +1727,61 @@ TEST(TestIpcFileFormat, FooterMetaData) {
   ASSERT_TRUE(out_metadata->Equals(*metadata));
 }
 
+TEST_F(TestWriteRecordBatch, CompressionRatio) {
+  // ARROW-8823: Calculating the compression ratio

Review comment:
       Nit: rename this test and update the comment now that we don't compute a ratio anymore?

##########
File path: cpp/src/arrow/ipc/writer.h
##########
@@ -57,7 +57,8 @@ struct IpcPayload {
   MessageType type = MessageType::NONE;
   std::shared_ptr<Buffer> metadata;
   std::vector<std::shared_ptr<Buffer>> body_buffers;
-  int64_t body_length = 0;
+  int64_t body_length = 0;      // serialized body length (maybe compressed)
+  int64_t raw_body_length = 0;  // initial uncompressed body_length

Review comment:
       ```suggestion
    int64_t body_length = 0;      // serialized body length (padded, maybe compressed)
     int64_t raw_body_length = 0;  // initial uncompressed body length
   ```

##########
File path: cpp/src/arrow/ipc/writer.h
##########
@@ -75,6 +76,11 @@ struct WriteStats {
   /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
   /// an existing dictionary with an unrelated new dictionary).
   int64_t num_replaced_dictionaries = 0;
+
+  /// initial and serialized (may be compressed) body sizes in bytes for record batches
+  /// these values show the total sizes of all record batch body lengths

Review comment:
       ```suggestion
     /// Total size in bytes of record batches emitted.
    /// The "raw" size counts the original buffer sizes, while the "serialized" size
     /// includes padding and (optionally) compression.
   ```
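
       For reference, a short usage sketch showing how a caller could derive a compression ratio from these two counters (the `stats()` accessor on `RecordBatchWriter` and the field names are as added in this patch; `writer` is a hypothetical, already-started writer):

```cpp
    // Sketch: ratio of raw to serialized body bytes; > 1 means the serialized
    // body ended up smaller than the raw buffers. Guard against an empty
    // stream where total_serialized_body_size could be zero.
    const arrow::ipc::WriteStats stats = writer->stats();
    const double compression_ratio =
        stats.total_serialized_body_size > 0
            ? static_cast<double>(stats.total_raw_body_size) /
                  static_cast<double>(stats.total_serialized_body_size)
            : 1.0;
```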



