This is an automated email from the ASF dual-hosted git repository.
liaoxin01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 4dda859061c [opt](csv reader) optimize nullable string deserialization
in CSV/text load hot path (#64476)
4dda859061c is described below
commit 4dda859061c373da96d94c914181f2a6e25212b0
Author: Xin Liao <[email protected]>
AuthorDate: Sat Jun 13 11:01:54 2026 +0800
[opt](csv reader) optimize nullable string deserialization in CSV/text load
hot path (#64476)
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #60920 (previous attempt, superseded by this stateless
implementation)
Problem Summary:
When loading CSV data, every column is read as a nullable string, so
`_deserialize_nullable_string` is the per-row per-column hot path
(ClickBench: 105 columns x 100M rows = ~10.5 billion cells). Flame graph
shows two major per-cell overheads:
1. `assert_cast<ColumnNullable&>` performs a typeid comparison per cell
in release builds.
2. `DataTypeStringSerDe::deserialize_one_cell_from_csv` adds a call
layer with another per-cell `assert_cast<ColumnString&>` inside, plus
Status plumbing. Its fill-null-on-failure branch is dead code since the
method never fails.
### Changes
1. Use `assert_cast<..., TypeCheckOnRelease::DISABLE>` in
`CsvReader::_deserialize_nullable_string` and
`TextReader::_deserialize_nullable_string`, which compiles to a plain
`static_cast` in release builds. Debug builds still verify the cast.
2. Write the string column and null map directly instead of going
through the SerDe layer (semantically identical, verified against
`ColumnNullable::insert_data` / `DataTypeStringSerDe` implementations).
The virtual `_deserialize_nullable_string` dispatch is kept, so
TextReader's hive-text semantics (different escape handling and null
detection) remain intact.
3. Add `_reserve_nullable_string_columns`, called once per batch. It
performs checked `assert_cast`s, so the unchecked per-row casts are backed
by a real type validation once per batch (a mismatch throws instead of
causing undefined behavior), and it reserves offsets/null_map capacity to
avoid incremental PODArray growth in the row loop.
The implementation is stateless: no cached column pointers, no per-batch
member state to initialize/clear.
### Performance
A/B test on full ClickBench dataset (73GB / 100M rows / 105 columns),
identical deployment and config, only the BE binary differs:
| Metric | Before | After | Improvement |
|---|---|---|---|
| Total load time (BE LoadTime) | 636.6s | 530.9s | -16.6% (1.20x) |
| CSV parse (ReadDataTime) | 590.6s | 484.5s | -18.0% |
| Avg throughput | 115 MB/s | 138 MB/s | +20% |
All 10 splits (10M rows each) improved consistently by 14-18% with small
variance. Loaded row counts are identical between the two runs
(99,997,497 rows).
---
be/src/format/csv/csv_reader.cpp | 56 ++++++++++++++++++++++++++++----------
be/src/format/csv/csv_reader.h | 5 ++++
be/src/format/text/text_reader.cpp | 26 +++++++++++-------
3 files changed, 62 insertions(+), 25 deletions(-)
diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp
index 731f9e61049..c4837d65fb3 100644
--- a/be/src/format/csv/csv_reader.cpp
+++ b/be/src/format/csv/csv_reader.cpp
@@ -37,7 +37,10 @@
#include "common/status.h"
#include "core/block/block.h"
#include "core/block/column_with_type_and_name.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_string.h"
#include "core/data_type/data_type_factory.hpp"
+#include "core/data_type_serde/data_type_string_serde.h"
#include "exec/scan/scanner.h"
#include "format/file_reader/new_plain_binary_line_reader.h"
#include "format/file_reader/new_plain_text_line_reader.h"
@@ -451,6 +454,7 @@ Status CsvReader::_do_get_next_block(Block* block, size_t*
read_rows, bool* eof)
} else {
auto columns_guard = block->mutate_columns_scoped();
auto& columns = columns_guard.mutable_columns();
+ _reserve_nullable_string_columns(columns, batch_size);
while (rows < batch_size && !_line_reader_eof &&
(columns_byte_size(columns) < max_block_bytes)) {
const uint8_t* ptr = nullptr;
@@ -559,28 +563,31 @@ Status
CsvReader::get_parsed_schema(std::vector<std::string>* col_names,
}
Status CsvReader::_deserialize_nullable_string(IColumn& column, Slice& slice) {
- auto& null_column = assert_cast<ColumnNullable&>(column);
- if (_empty_field_as_null) {
- if (slice.size == 0) {
- null_column.insert_data(nullptr, 0);
- return Status::OK();
- }
+ // This is the per-row per-column hot path of CSV load (load reads every
column as
+ // nullable string). The column type was already verified by the checked
assert_cast
+ // in _reserve_nullable_string_columns at the beginning of the batch, so
the casts
+ // here can skip the release-build typeid check.
+ auto& null_column = assert_cast<ColumnNullable&,
TypeCheckOnRelease::DISABLE>(column);
+ auto& string_column = assert_cast<ColumnString&,
TypeCheckOnRelease::DISABLE>(
+ null_column.get_nested_column());
+ if (_empty_field_as_null && slice.size == 0) {
+ string_column.insert_default();
+ null_column.get_null_map_data().push_back(1);
+ return Status::OK();
}
if (_options.null_len > 0 && !(_options.converted_from_string &&
slice.trim_double_quotes())) {
if (slice.compare(Slice(_options.null_format, _options.null_len)) ==
0) {
- null_column.insert_data(nullptr, 0);
+ string_column.insert_default();
+ null_column.get_null_map_data().push_back(1);
return Status::OK();
}
}
- static DataTypeStringSerDe stringSerDe(TYPE_STRING);
- auto st =
stringSerDe.deserialize_one_cell_from_csv(null_column.get_nested_column(),
slice,
- _options);
- if (!st.ok()) {
- // fill null if fail
- null_column.insert_data(nullptr, 0); // 0 is meaningless here
- return Status::OK();
+ // Same as DataTypeStringSerDe::deserialize_one_cell_from_csv (which never
fails),
+ // written out here to skip the SerDe layer and its per-cell assert_cast.
+ if (_options.escape_char != 0) {
+ escape_string_for_csv(slice.data, &slice.size, _options.escape_char,
_options.quote_char);
}
- // fill not null if success
+ string_column.insert_data(slice.data, slice.size);
null_column.get_null_map_data().push_back(0);
return Status::OK();
}
@@ -770,6 +777,25 @@ Status CsvReader::_fill_dest_columns(const Slice& line,
std::vector<MutableColum
return Status::OK();
}
+void
CsvReader::_reserve_nullable_string_columns(std::vector<MutableColumnPtr>&
columns,
+ size_t batch_size) {
+ for (int i = 0; i < _file_slot_descs.size(); ++i) {
+ if (!_use_nullable_string_opt[i]) {
+ continue;
+ }
+ IColumn* col_ptr = _is_load ? columns[i].get() :
columns[_file_slot_idx_map[i]].get();
+ // The checked casts here (once per batch) guarantee the column types
for the
+ // unchecked per-row casts in _deserialize_nullable_string.
+ auto& null_column = assert_cast<ColumnNullable&>(*col_ptr);
+ auto& string_column =
assert_cast<ColumnString&>(null_column.get_nested_column());
+ // Reserve up front so the per-row loop does not pay for incremental
growth.
+ // The string chars are not reserved because their total size is
unpredictable.
+ string_column.get_offsets().reserve(string_column.size() + batch_size);
+
null_column.get_null_map_data().reserve(null_column.get_null_map_data().size() +
+ batch_size);
+ }
+}
+
Status CsvReader::_fill_empty_line(std::vector<MutableColumnPtr>& columns,
size_t* rows) {
for (int i = 0; i < _file_slot_descs.size(); ++i) {
IColumn* col_ptr = columns[i].get();
diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h
index f619ce4d4a8..80938abd271 100644
--- a/be/src/format/csv/csv_reader.h
+++ b/be/src/format/csv/csv_reader.h
@@ -235,6 +235,11 @@ private:
Status _fill_dest_columns(const Slice& line,
std::vector<MutableColumnPtr>& columns,
size_t* rows);
Status _fill_empty_line(std::vector<MutableColumnPtr>& columns, size_t*
rows);
+ // Called once per batch: verifies the nullable string columns' concrete
types (so the
+ // per-row casts in _deserialize_nullable_string can skip the
release-build type check)
+ // and reserves their offsets/null_map capacity to avoid incremental
growth per row.
+ void _reserve_nullable_string_columns(std::vector<MutableColumnPtr>&
columns,
+ size_t batch_size);
Status _line_split_to_values(const Slice& line, bool* success);
void _split_line(const Slice& line);
void _init_system_properties();
diff --git a/be/src/format/text/text_reader.cpp
b/be/src/format/text/text_reader.cpp
index 0e6a4f89d27..c118c21adda 100644
--- a/be/src/format/text/text_reader.cpp
+++ b/be/src/format/text/text_reader.cpp
@@ -28,6 +28,9 @@
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/status.h"
#include "core/block/block.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_string.h"
+#include "core/data_type_serde/data_type_string_serde.h"
#include "exec/scan/scanner.h"
#include "format/csv/csv_reader.h"
#include "format/file_reader/new_plain_text_line_reader.h"
@@ -166,20 +169,23 @@ Status TextReader::_validate_line(const Slice& line,
bool* success) {
}
Status TextReader::_deserialize_nullable_string(IColumn& column, Slice& slice)
{
- auto& null_column = assert_cast<ColumnNullable&>(column);
+ // Hot path of hive text load, see
CsvReader::_deserialize_nullable_string. The
+ // column type was verified by the checked assert_cast in
+ // _reserve_nullable_string_columns at the beginning of the batch.
+ auto& null_column = assert_cast<ColumnNullable&,
TypeCheckOnRelease::DISABLE>(column);
+ auto& string_column = assert_cast<ColumnString&,
TypeCheckOnRelease::DISABLE>(
+ null_column.get_nested_column());
if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) {
- null_column.insert_data(nullptr, 0);
+ string_column.insert_default();
+ null_column.get_null_map_data().push_back(1);
return Status::OK();
}
- static DataTypeStringSerDe stringSerDe(TYPE_STRING);
- auto st =
stringSerDe.deserialize_one_cell_from_hive_text(null_column.get_nested_column(),
- slice, _options);
- if (!st.ok()) {
- // fill null if fail
- null_column.insert_data(nullptr, 0); // 0 is meaningless here
- return Status::OK();
+ // Same as DataTypeStringSerDe::deserialize_one_cell_from_hive_text (which
never
+ // fails), written out here to skip the SerDe layer and its per-cell
assert_cast.
+ if (_options.escape_char != 0) {
+ escape_string(slice.data, &slice.size, _options.escape_char);
}
- // fill not null if success
+ string_column.insert_data(slice.data, slice.size);
null_column.get_null_map_data().push_back(0);
return Status::OK();
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]