This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git

commit b61e44b308d32ad258282748c0d85c5711d6c703
Author: zhannngchen <[email protected]>
AuthorDate: Mon May 15 17:16:31 2023 +0800

    [fix](storage) consider file size on page cache key (#19619)
    
    The core dump is caused by a DCHECK failure:
    
    F0513 22:48:56.059758 3996895 tablet.cpp:2690] Check failed: num_to_read == 
num_read
    Finally, we found that the DCHECK failure is due to page cache:
    
    1. At first we have 20 segments, whose ids are 0-19.
    2. For a MoW table, the memtable flush process will calculate the delete 
bitmap. In this procedure, the index pages and data pages of the PrimaryKeyIndex 
are loaded into the cache.
    3. Segment compaction compacts all these segments into 2 segments, and 
renames them to ids 0 and 1.
    4. Finally, before the load commit, we'll calculate the delete bitmap between 
segments in the current rowset. This procedure needs to iterate the primary key 
index of each segment, but when we access data of the newly compacted segments, 
we read data of the old segments from the page cache.
    To fix this issue, the best policy is:
    
    1. Add a crc32 or last modified time to CacheKey.
    2. Or invalidate the related cache keys after segment compaction.
    For policy 1, we don't have a crc32 in the segment footer, and getting the 
    last-modified-time requires 1 additional disk IO.
    For policy 2, we need to add additional page cache invalidation methods, 
    which may make the page cache unstable.
    
    So I think we can simply add the file size to the cache key to identify 
    whether the file has changed.
    In an LSM-Tree, all modifications generate new files; such file-name reuse 
    is not a normal case (as far as I know, only segment compaction does it), so 
    the file size is enough to identify a file change.
---
 be/src/olap/page_cache.h                         |  5 ++-
 be/src/olap/rowset/segment_v2/page_io.cpp        |  2 +-
 be/src/olap/rowset/segment_v2/segment_writer.cpp |  5 +++
 be/src/olap/tablet.cpp                           |  7 +++-
 be/test/olap/page_cache_test.cpp                 | 52 ++++++++++++++++--------
 5 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/be/src/olap/page_cache.h b/be/src/olap/page_cache.h
index 6313e931e0..860110f14e 100644
--- a/be/src/olap/page_cache.h
+++ b/be/src/olap/page_cache.h
@@ -42,13 +42,16 @@ public:
     // TODO(zc): Now we use file name(std::string) as a part of
     // key, which is not efficient. We should make it better later
     struct CacheKey {
-        CacheKey(std::string fname_, int64_t offset_) : 
fname(std::move(fname_)), offset(offset_) {}
+        CacheKey(std::string fname_, size_t fsize_, int64_t offset_)
+                : fname(std::move(fname_)), fsize(fsize_), offset(offset_) {}
         std::string fname;
+        size_t fsize;
         int64_t offset;
 
         // Encode to a flat binary which can be used as LRUCache's key
         std::string encode() const {
             std::string key_buf(fname);
+            key_buf.append((char*)&fsize, sizeof(fsize));
             key_buf.append((char*)&offset, sizeof(offset));
             return key_buf;
         }
diff --git a/be/src/olap/rowset/segment_v2/page_io.cpp 
b/be/src/olap/rowset/segment_v2/page_io.cpp
index e44bdcd0df..cdcb7a6fa8 100644
--- a/be/src/olap/rowset/segment_v2/page_io.cpp
+++ b/be/src/olap/rowset/segment_v2/page_io.cpp
@@ -108,7 +108,7 @@ Status PageIO::read_and_decompress_page(const 
PageReadOptions& opts, PageHandle*
     auto cache = StoragePageCache::instance();
     PageCacheHandle cache_handle;
     StoragePageCache::CacheKey cache_key(opts.file_reader->path().native(),
-                                         opts.page_pointer.offset);
+                                         opts.file_reader->size(), 
opts.page_pointer.offset);
     if (opts.use_page_cache && cache->is_cache_available(opts.type) &&
         cache->lookup(cache_key, &cache_handle, opts.type)) {
         // we find page in cache, use it
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index df0f044396..2cc6f0a361 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -206,6 +206,7 @@ Status SegmentWriter::append_block(const vectorized::Block* 
block, size_t row_po
     if (_has_key) {
         if (_tablet_schema->keys_type() == UNIQUE_KEYS && 
_opts.enable_unique_key_merge_on_write) {
             // create primary indexes
+            std::string last_key;
             for (size_t pos = 0; pos < num_rows; pos++) {
                 std::string key = _full_encode_keys(key_columns, pos);
 #ifndef NDEBUG
@@ -215,7 +216,11 @@ Status SegmentWriter::append_block(const 
vectorized::Block* block, size_t row_po
                 if (_tablet_schema->has_sequence_col()) {
                     _encode_seq_column(seq_column, pos, &key);
                 }
+                DCHECK(key.compare(last_key) > 0)
+                        << "found duplicate key or key is not sorted! current 
key: " << key
+                        << ", last key" << last_key;
                 RETURN_IF_ERROR(_primary_key_index_builder->add_item(key));
+                last_key = std::move(key);
             }
         } else {
             // create short key indexes'
diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index b8dffef4be..b39d284ced 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -2074,10 +2074,15 @@ Status Tablet::calc_delete_bitmap(RowsetId rowset_id,
             auto index_column = index_type->create_column();
             Slice last_key_slice(last_key);
             RETURN_IF_ERROR(iter->seek_at_or_after(&last_key_slice, 
&exact_match));
+            auto current_ordinal = iter->get_current_ordinal();
+            DCHECK(total == remaining + current_ordinal)
+                    << "total: " << total << ", remaining: " << remaining
+                    << ", current_ordinal: " << current_ordinal;
 
             size_t num_read = num_to_read;
             RETURN_IF_ERROR(iter->next_batch(&num_read, index_column));
-            DCHECK(num_to_read == num_read);
+            DCHECK(num_to_read == num_read)
+                    << "num_to_read: " << num_to_read << ", num_read: " << 
num_read;
             last_key = index_column->get_data_at(num_read - 1).to_string();
 
             // exclude last_key, last_key will be read in next batch.
diff --git a/be/test/olap/page_cache_test.cpp b/be/test/olap/page_cache_test.cpp
index 73c11c39b8..38297d93c2 100644
--- a/be/test/olap/page_cache_test.cpp
+++ b/be/test/olap/page_cache_test.cpp
@@ -33,8 +33,8 @@ public:
 TEST(StoragePageCacheTest, data_page_only) {
     StoragePageCache cache(kNumShards * 2048, 0, kNumShards);
 
-    StoragePageCache::CacheKey key("abc", 0);
-    StoragePageCache::CacheKey memory_key("mem", 0);
+    StoragePageCache::CacheKey key("abc", 0, 0);
+    StoragePageCache::CacheKey memory_key("mem", 0, 0);
 
     segment_v2::PageTypePB page_type = segment_v2::DATA_PAGE;
 
@@ -67,16 +67,24 @@ TEST(StoragePageCacheTest, data_page_only) {
 
     // put too many page to eliminate first page
     for (int i = 0; i < 10 * kNumShards; ++i) {
-        StoragePageCache::CacheKey key("bcd", i);
+        StoragePageCache::CacheKey key("bcd", 0, i);
         PageCacheHandle handle;
         Slice data(new char[1024], 1024);
         cache.insert(key, data, &handle, page_type, false);
     }
 
-    // cache miss
+    // cache miss, different offset
     {
         PageCacheHandle handle;
-        StoragePageCache::CacheKey miss_key("abc", 1);
+        StoragePageCache::CacheKey miss_key("abc", 0, 1);
+        auto found = cache.lookup(miss_key, &handle, page_type);
+        EXPECT_FALSE(found);
+    }
+
+    // cache miss, different file size
+    {
+        PageCacheHandle handle;
+        StoragePageCache::CacheKey miss_key("abc", 1, 0);
         auto found = cache.lookup(miss_key, &handle, page_type);
         EXPECT_FALSE(found);
     }
@@ -93,8 +101,8 @@ TEST(StoragePageCacheTest, data_page_only) {
 TEST(StoragePageCacheTest, index_page_only) {
     StoragePageCache cache(kNumShards * 2048, 100, kNumShards);
 
-    StoragePageCache::CacheKey key("abc", 0);
-    StoragePageCache::CacheKey memory_key("mem", 0);
+    StoragePageCache::CacheKey key("abc", 0, 0);
+    StoragePageCache::CacheKey memory_key("mem", 0, 0);
 
     segment_v2::PageTypePB page_type = segment_v2::INDEX_PAGE;
 
@@ -127,16 +135,24 @@ TEST(StoragePageCacheTest, index_page_only) {
 
     // put too many page to eliminate first page
     for (int i = 0; i < 10 * kNumShards; ++i) {
-        StoragePageCache::CacheKey key("bcd", i);
+        StoragePageCache::CacheKey key("bcd", 0, i);
         PageCacheHandle handle;
         Slice data(new char[1024], 1024);
         cache.insert(key, data, &handle, page_type, false);
     }
 
-    // cache miss
+    // cache miss, different offset
+    {
+        PageCacheHandle handle;
+        StoragePageCache::CacheKey miss_key("abc", 1, 1);
+        auto found = cache.lookup(miss_key, &handle, page_type);
+        EXPECT_FALSE(found);
+    }
+
+    // cache miss, different file size
     {
         PageCacheHandle handle;
-        StoragePageCache::CacheKey miss_key("abc", 1);
+        StoragePageCache::CacheKey miss_key("abc", 1, 0);
         auto found = cache.lookup(miss_key, &handle, page_type);
         EXPECT_FALSE(found);
     }
@@ -153,10 +169,10 @@ TEST(StoragePageCacheTest, index_page_only) {
 TEST(StoragePageCacheTest, mixed_pages) {
     StoragePageCache cache(kNumShards * 2048, 10, kNumShards);
 
-    StoragePageCache::CacheKey data_key("data", 0);
-    StoragePageCache::CacheKey index_key("index", 0);
-    StoragePageCache::CacheKey data_key_mem("data_mem", 0);
-    StoragePageCache::CacheKey index_key_mem("index_mem", 0);
+    StoragePageCache::CacheKey data_key("data", 0, 0);
+    StoragePageCache::CacheKey index_key("index", 0, 0);
+    StoragePageCache::CacheKey data_key_mem("data_mem", 0, 0);
+    StoragePageCache::CacheKey index_key_mem("index_mem", 0, 0);
 
     segment_v2::PageTypePB page_type_data = segment_v2::DATA_PAGE;
     segment_v2::PageTypePB page_type_index = segment_v2::INDEX_PAGE;
@@ -201,7 +217,7 @@ TEST(StoragePageCacheTest, mixed_pages) {
 
     // put too many page to eliminate first page of both cache
     for (int i = 0; i < 10 * kNumShards; ++i) {
-        StoragePageCache::CacheKey key("bcd", i);
+        StoragePageCache::CacheKey key("bcd", 0, i);
         PageCacheHandle handle;
         Slice data(new char[1024], 1024), index(new char[1024], 1024);
         cache.insert(key, data, &handle, page_type_data, false);
@@ -211,7 +227,7 @@ TEST(StoragePageCacheTest, mixed_pages) {
     // cache miss by key
     {
         PageCacheHandle data_handle, index_handle;
-        StoragePageCache::CacheKey miss_key("abc", 1);
+        StoragePageCache::CacheKey miss_key("abc", 0, 1);
         auto found_data = cache.lookup(miss_key, &data_handle, page_type_data);
         auto found_index = cache.lookup(miss_key, &index_handle, 
page_type_index);
         EXPECT_FALSE(found_data);
@@ -221,8 +237,8 @@ TEST(StoragePageCacheTest, mixed_pages) {
     // cache miss by page type
     {
         PageCacheHandle data_handle, index_handle;
-        StoragePageCache::CacheKey miss_key_data("data_miss", 1);
-        StoragePageCache::CacheKey miss_key_index("index_miss", 1);
+        StoragePageCache::CacheKey miss_key_data("data_miss", 0, 1);
+        StoragePageCache::CacheKey miss_key_index("index_miss", 0, 1);
         char* buf_data = new char[1024];
         char* buf_index = new char[1024];
         Slice data(buf_data, 1024), index(buf_index, 1024);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to