Gabriel39 commented on code in PR #8438:
URL: https://github.com/apache/incubator-doris/pull/8438#discussion_r849433803
##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -88,6 +88,75 @@ OLAPStatus BetaRowsetWriter::init(const RowsetWriterContext&
rowset_writer_conte
return OLAP_SUCCESS;
}
+OLAPStatus BetaRowsetWriter::add_block(const vectorized::Block* block) {
+ if (UNLIKELY(_segment_writer == nullptr)) {
+ RETURN_NOT_OK(_create_segment_writer(&_segment_writer));
+ }
+ size_t block_size_in_bytes = block->bytes();
+ size_t block_row_num = block->rows();
+ if (UNLIKELY(block_row_num == 0)) {
+ return OLAP_SUCCESS;
+ }
+ size_t row_avg_size_in_bytes = std::max((size_t)1, block_size_in_bytes /
block_row_num);
+ size_t row_offset = 0;
+ int64_t segment_capacity_in_bytes = 0;
+ int64_t segment_capacity_in_rows = 0;
+ auto refresh_segment_capacity = [&]() {
+ segment_capacity_in_bytes =
+ (int64_t)MAX_SEGMENT_SIZE -
(int64_t)_segment_writer->estimate_segment_size();
+ segment_capacity_in_rows = (int64_t)_context.max_rows_per_segment -
+
(int64_t)_segment_writer->num_rows_written();
+ };
+
+ refresh_segment_capacity();
+ if (UNLIKELY(segment_capacity_in_bytes < row_avg_size_in_bytes ||
+ segment_capacity_in_rows <= 0)) {
+ // no space for another signle row, need flush now
+ RETURN_NOT_OK(_flush_segment_writer(&_segment_writer));
+ RETURN_NOT_OK(_create_segment_writer(&_segment_writer));
+ refresh_segment_capacity();
+ }
+
+ assert(segment_capacity_in_bytes > row_avg_size_in_bytes &&
segment_capacity_in_rows > 0);
Review Comment:
Btw, with the old approach the size of a segment file was at least
MAX_SEGMENT_SIZE, whereas with this new approach it is at most
MAX_SEGMENT_SIZE. So I wonder: should we keep the original behavior?
##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -88,6 +88,75 @@ OLAPStatus BetaRowsetWriter::init(const RowsetWriterContext&
rowset_writer_conte
return OLAP_SUCCESS;
}
+OLAPStatus BetaRowsetWriter::add_block(const vectorized::Block* block) {
+ if (UNLIKELY(_segment_writer == nullptr)) {
+ RETURN_NOT_OK(_create_segment_writer(&_segment_writer));
+ }
+ size_t block_size_in_bytes = block->bytes();
+ size_t block_row_num = block->rows();
+ if (UNLIKELY(block_row_num == 0)) {
+ return OLAP_SUCCESS;
+ }
+ size_t row_avg_size_in_bytes = std::max((size_t)1, block_size_in_bytes /
block_row_num);
+ size_t row_offset = 0;
+ int64_t segment_capacity_in_bytes = 0;
+ int64_t segment_capacity_in_rows = 0;
+ auto refresh_segment_capacity = [&]() {
+ segment_capacity_in_bytes =
+ (int64_t)MAX_SEGMENT_SIZE -
(int64_t)_segment_writer->estimate_segment_size();
+ segment_capacity_in_rows = (int64_t)_context.max_rows_per_segment -
+
(int64_t)_segment_writer->num_rows_written();
+ };
+
+ refresh_segment_capacity();
+ if (UNLIKELY(segment_capacity_in_bytes < row_avg_size_in_bytes ||
+ segment_capacity_in_rows <= 0)) {
+ // no space for another signle row, need flush now
+ RETURN_NOT_OK(_flush_segment_writer(&_segment_writer));
+ RETURN_NOT_OK(_create_segment_writer(&_segment_writer));
+ refresh_segment_capacity();
+ }
+
+ assert(segment_capacity_in_bytes > row_avg_size_in_bytes &&
segment_capacity_in_rows > 0);
Review Comment:
In extreme cases, `row_avg_size_in_bytes` could be larger than
`segment_capacity_in_bytes`. In the old `add_row` function, such a row would
be written to a segment of its own that contains only that one row. But in
this new function, it seems that the row will never be processed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]