pitrou commented on code in PR #39705:
URL: https://github.com/apache/arrow/pull/39705#discussion_r1461915113


##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -219,5 +219,87 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>& input_levels) {
+  // Generate random levels
+  std::default_random_engine gen(/*seed=*/1943);
+  std::uniform_int_distribution<int16_t> d(0, max_level);
+  for (int i = 0; i < num_levels;) {
+    int16_t current_level = d(gen);  // level repeat `level_repeats` times
+    for (int j = 0; j < level_repeats; ++j) {
+      input_levels.push_back(current_level);
+      ++i;

Review Comment:
   What if `i >= num_levels`?



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -219,5 +219,87 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>& input_levels) {
+  // Generate random levels
+  std::default_random_engine gen(/*seed=*/1943);
+  std::uniform_int_distribution<int16_t> d(0, max_level);
+  for (int i = 0; i < num_levels;) {
+    int16_t current_level = d(gen);  // level repeat `level_repeats` times
+    for (int j = 0; j < level_repeats; ++j) {
+      input_levels.push_back(current_level);
+      ++i;
+    }
+  }
+}
+
+void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
+                  const int16_t* input_levels, std::vector<uint8_t>& bytes) {

Review Comment:
   Can we please avoid mutable refs?



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -219,5 +219,87 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>& input_levels) {

Review Comment:
   Let's please avoid mutable references. Also, why is it named `input_levels`?



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -219,5 +219,87 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>& input_levels) {
+  // Generate random levels
+  std::default_random_engine gen(/*seed=*/1943);
+  std::uniform_int_distribution<int16_t> d(0, max_level);
+  for (int i = 0; i < num_levels;) {
+    int16_t current_level = d(gen);  // level repeat `level_repeats` times
+    for (int j = 0; j < level_repeats; ++j) {
+      input_levels.push_back(current_level);
+      ++i;
+    }
+  }
+}
+
+void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
+                  const int16_t* input_levels, std::vector<uint8_t>& bytes) {
+  LevelEncoder encoder;
+  int levels_count = 0;
+  bytes.resize(2 * num_levels);
+  ASSERT_EQ(2 * num_levels, static_cast<int>(bytes.size()));
+  // encode levels
+  if (encoding == Encoding::RLE) {
+    // leave space to write the rle length value
+    encoder.Init(encoding, max_level, num_levels, bytes.data() + 
sizeof(int32_t),
+                 static_cast<int>(bytes.size()));
+
+    levels_count = encoder.Encode(num_levels, input_levels);
+    (reinterpret_cast<int32_t*>(bytes.data()))[0] = encoder.len();
+  } else {
+    encoder.Init(encoding, max_level, num_levels, bytes.data(),
+                 static_cast<int>(bytes.size()));
+    levels_count = encoder.Encode(num_levels, input_levels);
+  }
+  ASSERT_EQ(num_levels, levels_count);
+}
+
+static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int 
num_levels,
+                         int batch_size, int level_repeat_count,
+                         ::benchmark::State& state) {
+  std::vector<uint8_t> bytes;
+  {
+    std::vector<int16_t> input_levels;
+    GenerateLevels(/*level_repeats=*/level_repeat_count, 
/*max_repeat_factor=*/max_level,
+                   num_levels, input_levels);
+    // Print generated levels

Review Comment:
   This comment doesn't seem accurate?



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -219,5 +219,87 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>& input_levels) {
+  // Generate random levels
+  std::default_random_engine gen(/*seed=*/1943);
+  std::uniform_int_distribution<int16_t> d(0, max_level);
+  for (int i = 0; i < num_levels;) {
+    int16_t current_level = d(gen);  // level repeat `level_repeats` times
+    for (int j = 0; j < level_repeats; ++j) {
+      input_levels.push_back(current_level);
+      ++i;
+    }
+  }
+}
+
+void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
+                  const int16_t* input_levels, std::vector<uint8_t>& bytes) {
+  LevelEncoder encoder;
+  int levels_count = 0;
+  bytes.resize(2 * num_levels);
+  ASSERT_EQ(2 * num_levels, static_cast<int>(bytes.size()));
+  // encode levels
+  if (encoding == Encoding::RLE) {
+    // leave space to write the rle length value
+    encoder.Init(encoding, max_level, num_levels, bytes.data() + 
sizeof(int32_t),
+                 static_cast<int>(bytes.size()));
+
+    levels_count = encoder.Encode(num_levels, input_levels);
+    (reinterpret_cast<int32_t*>(bytes.data()))[0] = encoder.len();
+  } else {
+    encoder.Init(encoding, max_level, num_levels, bytes.data(),
+                 static_cast<int>(bytes.size()));
+    levels_count = encoder.Encode(num_levels, input_levels);
+  }
+  ASSERT_EQ(num_levels, levels_count);
+}
+
+static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int 
num_levels,
+                         int batch_size, int level_repeat_count,
+                         ::benchmark::State& state) {
+  std::vector<uint8_t> bytes;
+  {
+    std::vector<int16_t> input_levels;
+    GenerateLevels(/*level_repeats=*/level_repeat_count, 
/*max_repeat_factor=*/max_level,
+                   num_levels, input_levels);
+    // Print generated levels
+    EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(), 
bytes);
+  }
+
+  LevelDecoder decoder;
+  std::vector<int16_t> output_levels(num_levels);
+  for (auto _ : state) {
+    decoder.SetData(level_encoding, max_level, num_levels, bytes.data(),
+                    static_cast<int>(bytes.size()));
+    // Decode multiple times with batch_size
+    while (true) {
+      int levels_decoded = decoder.Decode(batch_size, output_levels.data());
+      if (levels_decoded == 0) {
+        break;
+      }
+    }
+  }

Review Comment:
   Can you call `SetItemsProcessed` at the end? We should make sure benchmark 
numbers are intelligible.



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -219,5 +219,87 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>& input_levels) {
+  // Generate random levels
+  std::default_random_engine gen(/*seed=*/1943);
+  std::uniform_int_distribution<int16_t> d(0, max_level);
+  for (int i = 0; i < num_levels;) {
+    int16_t current_level = d(gen);  // level repeat `level_repeats` times
+    for (int j = 0; j < level_repeats; ++j) {
+      input_levels.push_back(current_level);
+      ++i;
+    }
+  }
+}
+
+void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
+                  const int16_t* input_levels, std::vector<uint8_t>& bytes) {
+  LevelEncoder encoder;
+  int levels_count = 0;
+  bytes.resize(2 * num_levels);
+  ASSERT_EQ(2 * num_levels, static_cast<int>(bytes.size()));

Review Comment:
   Is this useful?



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -219,5 +219,87 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>& input_levels) {
+  // Generate random levels
+  std::default_random_engine gen(/*seed=*/1943);
+  std::uniform_int_distribution<int16_t> d(0, max_level);
+  for (int i = 0; i < num_levels;) {
+    int16_t current_level = d(gen);  // level repeat `level_repeats` times
+    for (int j = 0; j < level_repeats; ++j) {
+      input_levels.push_back(current_level);
+      ++i;
+    }
+  }
+}
+
+void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
+                  const int16_t* input_levels, std::vector<uint8_t>& bytes) {
+  LevelEncoder encoder;
+  int levels_count = 0;
+  bytes.resize(2 * num_levels);
+  ASSERT_EQ(2 * num_levels, static_cast<int>(bytes.size()));
+  // encode levels
+  if (encoding == Encoding::RLE) {

Review Comment:
   Is there a case where we don't use RLE for levels?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to