This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-1.1-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.1-lts by this push:
new 48f70cf54c [improvement](scanner) using avg rowset to calculate batch
size instead of using total_bytes since it costs a lot of cpu (#14273)
48f70cf54c is described below
commit 48f70cf54c1e43e593249d23413ccaf874ae3cbe
Author: yiguolei <[email protected]>
AuthorDate: Thu Nov 17 09:23:54 2022 +0800
[improvement](scanner) using avg rowset to calculate batch size instead of
using total_bytes since it costs a lot of cpu (#14273)
---
be/src/exec/olap_scanner.cpp | 19 +++++++++++++++----
be/src/exec/olap_scanner.h | 5 ++++-
2 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp
index 413ebee301..5fe1d551d3 100644
--- a/be/src/exec/olap_scanner.cpp
+++ b/be/src/exec/olap_scanner.cpp
@@ -210,14 +210,14 @@ Status OlapScanner::_init_tablet_reader_params(
bool has_replace_col = false;
for (auto col : _return_columns) {
if (_tablet->tablet_schema().column(col).aggregation() ==
-
FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE) {
+ FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE) {
has_replace_col = true;
break;
}
}
if (auto sequence_col_idx =
_tablet->tablet_schema().sequence_col_idx();
- has_replace_col && std::find(_return_columns.begin(),
_return_columns.end(),
- sequence_col_idx) == _return_columns.end()) {
+ has_replace_col && std::find(_return_columns.begin(),
_return_columns.end(),
+ sequence_col_idx) ==
_return_columns.end()) {
_tablet_reader_params.return_columns.push_back(sequence_col_idx);
}
}
@@ -292,7 +292,18 @@ Status OlapScanner::get_batch(RuntimeState* state,
RowBatch* batch, bool* eof) {
// Use total_byte_size here, not tuple_pool's allocated bytes,
because we preallocated the tuple pool at the beginning;
// its size may be larger than the threshold, so the scanner would
break here and might loop forever.
// No need to check num_rows > 0, because total_byte_size() == 0 if
num_rows == 0.
- if (batch->is_full() || batch->total_byte_size() >=
raw_bytes_threshold ||
+ if (_avg_row_size == 0 && batch->num_rows() > 0) {
+ // total_byte_size() costs a lot of CPU time, so compute the
average row size here.
+ _first_batch_row_num += batch->num_rows();
+ _first_batch_size += batch->total_byte_size();
+ // Accumulate several batches before calculating the avg row size so
that it is not based on only a small number of rows.
+ if (_first_batch_size > raw_bytes_threshold) {
+ _avg_row_size = _first_batch_size / _first_batch_row_num;
+ }
+ }
+ int64_t batch_total_bytes = _avg_row_size > 0 ? _avg_row_size *
batch->num_rows()
+ :
batch->total_byte_size();
+ if (batch->is_full() || batch_total_bytes >= raw_bytes_threshold ||
raw_rows_read() >= raw_rows_threshold) {
_update_realtime_counter();
break;
diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h
index ce758365da..fbfccc8bcd 100644
--- a/be/src/exec/olap_scanner.h
+++ b/be/src/exec/olap_scanner.h
@@ -33,9 +33,9 @@
#include "gen_cpp/PlanNodes_types.h"
#include "olap/delete_handler.h"
#include "olap/olap_cond.h"
-#include "olap/tuple_reader.h"
#include "olap/rowset/column_data.h"
#include "olap/storage_engine.h"
+#include "olap/tuple_reader.h"
#include "runtime/descriptors.h"
#include "runtime/tuple.h"
#include "runtime/vectorized_row_batch.h"
@@ -142,6 +142,9 @@ protected:
int64_t _num_rows_read = 0;
int64_t _raw_rows_read = 0;
int64_t _compressed_bytes_read = 0;
+ int64_t _avg_row_size = 0;
+ int64_t _first_batch_row_num = 0;
+ int64_t _first_batch_size = 0;
size_t _batch_size = 0;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]