freemandealer commented on code in PR #16731:
URL: https://github.com/apache/doris/pull/16731#discussion_r1112710335
##########
run-be-ut.sh:
##########
@@ -269,7 +269,7 @@ export UBSAN_OPTIONS=print_stacktrace=1
test="${DORIS_TEST_BINARY_DIR}/doris_be_test"
file_name="${test##*/}"
if [[ -f "${test}" ]]; then
- "${test}" --gtest_output="xml:${GTEST_OUTPUT_DIR}/${file_name}.xml"
--gtest_print_time=true "${FILTER}"
+ gdb --args "${test}"
--gtest_output="xml:${GTEST_OUTPUT_DIR}/${file_name}.xml"
--gtest_print_time=true "${FILTER}"
Review Comment:
Unnecessary — this looks like a leftover debugging change; please revert to running the test binary directly instead of under gdb.
##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -267,49 +274,69 @@ Status
BetaRowsetWriter::_do_compact_segments(SegCompactionCandidatesSharedPtr s
LOG(WARNING) << "skip segcompaction due to memory shortage";
return Status::Error<FETCH_MEMORY_EXCEEDED>();
}
+
uint64_t begin = (*(segments->begin()))->id();
uint64_t end = (*(segments->end() - 1))->id();
uint64_t begin_time = GetCurrentTimeMicros();
- auto schema = std::make_shared<Schema>(_context.tablet_schema->columns(),
-
_context.tablet_schema->columns().size());
std::unique_ptr<OlapReaderStatistics> stat(new OlapReaderStatistics());
uint64_t merged_row_stat = 0;
- auto reader_ptr = _get_segcompaction_reader(segments, schema, stat.get(),
&merged_row_stat);
- if (UNLIKELY(reader_ptr == nullptr)) {
- LOG(WARNING) << "failed to get segcompaction reader";
- return Status::Error<SEGCOMPACTION_INIT_READER>();
- }
+ uint64_t index_size = 0;
+ uint64_t total_index_size = 0;
+
+ // ================ begin vcompaction ==================
auto writer = _create_segcompaction_writer(begin, end);
if (UNLIKELY(writer == nullptr)) {
LOG(WARNING) << "failed to get segcompaction writer";
return Status::Error<SEGCOMPACTION_INIT_WRITER>();
}
- uint64_t row_count = 0;
- vectorized::Block block = _context.tablet_schema->create_block();
- while (true) {
- auto status = reader_ptr->next_batch(&block);
- row_count += block.rows();
- if (status != Status::OK()) {
- if (LIKELY(status.is<END_OF_FILE>())) {
- RETURN_NOT_OK_LOG(_add_block_for_segcompaction(&block,
&writer),
- "write block failed");
- break;
- } else {
- LOG(WARNING) << "read block failed: " << status.to_string();
- return status;
- }
+
+ DCHECK(_context.tablet);
+ auto tablet = _context.tablet;
+
+ std::vector<std::vector<uint32_t>> column_groups;
+ Merger::vertical_split_columns(_context.tablet_schema, &column_groups);
+ vectorized::RowSourcesBuffer row_sources_buf(tablet->tablet_id(),
tablet->tablet_path(),
+ READER_SEGMENT_COMPACTION);
+
+ KeyBoundsPB key_bounds;
+ // compact group one by one
+ for (auto i = 0; i < column_groups.size(); ++i) {
+ VLOG_NOTICE << "row source size: " << row_sources_buf.total_size();
+ bool is_key = (i == 0);
+ std::vector<uint32_t> column_ids = column_groups[i];
+
+ writer->clear();
+ writer->init(column_ids, is_key);
+ auto schema =
std::make_shared<Schema>(_context.tablet_schema->columns(), column_ids);
+ auto reader =
+ _get_segcompaction_reader(segments, tablet, schema,
stat.get(), &merged_row_stat,
+ row_sources_buf, is_key, column_ids);
+ if (UNLIKELY(reader == nullptr)) {
+ LOG(WARNING) << "failed to get segcompaction reader";
+ return Status::Error<SEGCOMPACTION_INIT_READER>();
+ }
+
+ // ========= Merger Compaction
+ Merger::Statistics stats;
+
+ RETURN_IF_ERROR(Merger::vertical_compact_one_group(
+ tablet, READER_SEGMENT_COMPACTION, _context.tablet_schema,
is_key, column_ids,
+ &row_sources_buf, *reader, *writer, INT_MAX, &stats,
&index_size, key_bounds));
Review Comment:
Segcompaction is meant to reduce the number of segment files, so I think it
is controversial to split files during segcompaction. Besides, the maximum
number of segments participating in one segcompaction is limited by
segcompaction_threshold_segment_num, which ensures the output segment size
will not grow uncontrolled.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]