pitrou commented on a change in pull request #11872:
URL: https://github.com/apache/arrow/pull/11872#discussion_r763142640



##########
File path: cpp/src/arrow/ipc/writer.h
##########
@@ -75,6 +76,19 @@ struct WriteStats {
   /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
   /// an existing dictionary with an unrelated new dictionary).
   int64_t num_replaced_dictionaries = 0;
+
+  /// initial and serialized (may be compressed) body lengths for record 
batches
+  /// these values show the total sizes of all record batch body lengths
+  int64_t raw_body_length = 0;
+  int64_t serialized_body_length = 0;

Review comment:
       In the Arrow C++ APIs, "length" generally refers to the logical number 
of elements (see e.g. `Array::length()`), while "size" refers to the physical 
size in bytes (as in `Buffer::size()`). So I think this should be:
   ```c++
   int64_t total_raw_body_size = 0;
   int64_t total_compressed_body_size = 0;
   ```

##########
File path: cpp/src/arrow/ipc/read_write_test.cc
##########
@@ -1727,6 +1727,52 @@ TEST(TestIpcFileFormat, FooterMetaData) {
   ASSERT_TRUE(out_metadata->Equals(*metadata));
 }
 
+TEST_F(TestWriteRecordBatch, CompressionRatio) {
+  // ARROW-8823: Calculating the compression ratio
+  FileWriterHelper helper;
+  IpcWriteOptions write_options1 = IpcWriteOptions::Defaults();
+  IpcWriteOptions write_options2 = IpcWriteOptions::Defaults();

Review comment:
       Can you give these clearer names, e.g. `options_uncompressed` and 
`options_compressed`?

##########
File path: cpp/src/arrow/ipc/writer.h
##########
@@ -75,6 +76,19 @@ struct WriteStats {
   /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
   /// an existing dictionary with an unrelated new dictionary).
   int64_t num_replaced_dictionaries = 0;
+
+  /// initial and serialized (may be compressed) body lengths for record 
batches
+  /// these values show the total sizes of all record batch body lengths
+  int64_t raw_body_length = 0;
+  int64_t serialized_body_length = 0;
+
+  /// compression ratio for the body of all record batches serialized
+  /// this is equivalent to:
+  ///    serialized_body_length / raw_body_length

Review comment:
       Since this is equivalent, I don't think there is any value in exposing 
it. People can trivially calculate it themselves.
   

##########
File path: cpp/src/arrow/ipc/read_write_test.cc
##########
@@ -1727,6 +1727,52 @@ TEST(TestIpcFileFormat, FooterMetaData) {
   ASSERT_TRUE(out_metadata->Equals(*metadata));
 }
 
+TEST_F(TestWriteRecordBatch, CompressionRatio) {
+  // ARROW-8823: Calculating the compression ratio
+  FileWriterHelper helper;
+  IpcWriteOptions write_options1 = IpcWriteOptions::Defaults();
+  IpcWriteOptions write_options2 = IpcWriteOptions::Defaults();
+  ASSERT_OK_AND_ASSIGN(write_options2.codec, 
util::Codec::Create(Compression::LZ4_FRAME));
+
+  // pre-computed compression ratios for record batches with 
Compression::LZ4_FRAME
+  std::vector<float> comp_ratios{1.0f, 0.64f, 0.79924363f};
+
+  std::vector<std::shared_ptr<RecordBatch>> batches(3);
+  // empty record batch
+  ASSERT_OK(MakeIntBatchSized(0, &batches[0]));
+  // record batch with int values
+  ASSERT_OK(MakeIntBatchSized(2000, &batches[1], 100));
+
+  // record batch with DictionaryArray
+  random::RandomArrayGenerator rg(/*seed=*/0);
+  int64_t length = 500;
+  int dict_size = 50;
+  std::shared_ptr<Array> dict = rg.String(dict_size, /*min_length=*/5, 
/*max_length=*/5, /*null_probability=*/0);
+  std::shared_ptr<Array> indices = rg.Int32(length, /*min=*/0, 
/*max=*/dict_size - 1, /*null_probability=*/0.1);
+  auto dict_type = dictionary(int32(), utf8());
+  auto dict_field = field("f1", dict_type);
+  ASSERT_OK_AND_ASSIGN(auto dict_array,
+                       DictionaryArray::FromArrays(dict_type, indices, dict));
+
+  auto schema = ::arrow::schema({field("f0", utf8()), dict_field});
+  batches[2] =
+    RecordBatch::Make(schema, length, {rg.String(500, 0, 10, 0.1), 
dict_array});
+
+  for(size_t i = 0; i < batches.size(); ++i) {

Review comment:
       As a suggestion for these tests, it would be nice to check that:
   * the raw body size is accurate (it can be hard-coded, since it should be 
stable and deterministic)
   * the compressed body size is equal to the raw body size when compression 
is disabled, and strictly smaller when compression is enabled
   

##########
File path: cpp/src/arrow/ipc/writer.h
##########
@@ -75,6 +76,19 @@ struct WriteStats {
   /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
   /// an existing dictionary with an unrelated new dictionary).
   int64_t num_replaced_dictionaries = 0;
+
+  /// initial and serialized (may be compressed) body lengths for record 
batches
+  /// these values show the total sizes of all record batch body lengths

Review comment:
       Will this also include the dictionary batches?

##########
File path: cpp/src/arrow/ipc/read_write_test.cc
##########
@@ -1727,6 +1727,52 @@ TEST(TestIpcFileFormat, FooterMetaData) {
   ASSERT_TRUE(out_metadata->Equals(*metadata));
 }
 
+TEST_F(TestWriteRecordBatch, CompressionRatio) {
+  // ARROW-8823: Calculating the compression ratio
+  FileWriterHelper helper;
+  IpcWriteOptions write_options1 = IpcWriteOptions::Defaults();
+  IpcWriteOptions write_options2 = IpcWriteOptions::Defaults();
+  ASSERT_OK_AND_ASSIGN(write_options2.codec, 
util::Codec::Create(Compression::LZ4_FRAME));
+
+  // pre-computed compression ratios for record batches with 
Compression::LZ4_FRAME
+  std::vector<float> comp_ratios{1.0f, 0.64f, 0.79924363f};

Review comment:
       I don't think we want to hard-code this. The values can vary depending 
on the version of the lz4 library, or internal details of how we initialize the 
compressor. Just testing that _some_ compression happens should be sufficient.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to