AshinGau commented on code in PR #13867:
URL: https://github.com/apache/doris/pull/13867#discussion_r1052221400
##########
be/src/vec/exec/format/parquet/vparquet_reader.cpp:
##########
@@ -372,20 +447,24 @@ bool ParquetReader::_next_row_group_reader() {
return true;
}
-Status ParquetReader::_init_row_group_readers() {
- RETURN_IF_ERROR(_filter_row_groups());
+Status ParquetReader::_init_row_group_readers(const bool& filter_groups) {
+ std::vector<RowGroupIndex> group_indexes;
+ RETURN_IF_ERROR(_filter_row_groups(filter_groups, group_indexes));
+ DCHECK_EQ(group_indexes.size(), _read_row_groups.size());
+ auto group_index = group_indexes.begin();
for (auto row_group_id : _read_row_groups) {
auto& row_group = _t_metadata->row_groups[row_group_id];
std::shared_ptr<RowGroupReader> row_group_reader;
- row_group_reader.reset(new RowGroupReader(_file_reader.get(),
_read_columns, row_group_id,
+ row_group_reader.reset(new RowGroupReader(_file_reader.get(),
_read_columns, *group_index,
row_group, _ctz,
_lazy_read_ctx));
- std::vector<RowRange> candidate_row_ranges;
- RETURN_IF_ERROR(_process_page_index(row_group, candidate_row_ranges));
- if (candidate_row_ranges.empty()) {
+ group_index++;
+ RETURN_IF_ERROR(_process_page_index(row_group));
+ if (_row_ranges.empty()) {
+ _row_ranges.emplace_back(0, row_group.num_rows);
_statistics.read_rows += row_group.num_rows;
}
- RETURN_IF_ERROR(row_group_reader->init(_file_metadata->schema(),
candidate_row_ranges,
- _col_offsets));
+ RETURN_IF_ERROR(row_group_reader->init(_file_metadata->schema(),
_col_offsets));
+ row_group_reader->set_row_ranges(_row_ranges);
Review Comment:
这里是不是有正确性问题?所有的 RowGroupReader 使用的都是同一个 _row_ranges。
##########
be/src/vec/exec/format/parquet/vparquet_reader.cpp:
##########
@@ -372,20 +447,24 @@ bool ParquetReader::_next_row_group_reader() {
return true;
}
-Status ParquetReader::_init_row_group_readers() {
- RETURN_IF_ERROR(_filter_row_groups());
+Status ParquetReader::_init_row_group_readers(const bool& filter_groups) {
+ std::vector<RowGroupIndex> group_indexes;
+ RETURN_IF_ERROR(_filter_row_groups(filter_groups, group_indexes));
+ DCHECK_EQ(group_indexes.size(), _read_row_groups.size());
+ auto group_index = group_indexes.begin();
for (auto row_group_id : _read_row_groups) {
auto& row_group = _t_metadata->row_groups[row_group_id];
std::shared_ptr<RowGroupReader> row_group_reader;
- row_group_reader.reset(new RowGroupReader(_file_reader.get(),
_read_columns, row_group_id,
+ row_group_reader.reset(new RowGroupReader(_file_reader.get(),
_read_columns, *group_index,
row_group, _ctz,
_lazy_read_ctx));
- std::vector<RowRange> candidate_row_ranges;
- RETURN_IF_ERROR(_process_page_index(row_group, candidate_row_ranges));
- if (candidate_row_ranges.empty()) {
+ group_index++;
+ RETURN_IF_ERROR(_process_page_index(row_group));
+ if (_row_ranges.empty()) {
+ _row_ranges.emplace_back(0, row_group.num_rows);
_statistics.read_rows += row_group.num_rows;
}
- RETURN_IF_ERROR(row_group_reader->init(_file_metadata->schema(),
candidate_row_ranges,
- _col_offsets));
+ RETURN_IF_ERROR(row_group_reader->init(_file_metadata->schema(),
_col_offsets));
+ row_group_reader->set_row_ranges(_row_ranges);
Review Comment:
底层的 ColumnReader 需要的 row range 是相对于它自己的行号而言的,每个 ColumnReader 的行号都是从 0 开始的。
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]