Copilot commented on code in PR #61527:
URL: https://github.com/apache/doris/pull/61527#discussion_r2959240752


##########
be/src/pipeline/query_cache/query_cache.h:
##########
@@ -109,27 +109,76 @@ class QueryCache : public LRUCachePolicy {
     static Status build_cache_key(const std::vector<TScanRangeParams>& 
scan_ranges,
                                   const TQueryCacheParam& cache_param, 
std::string* cache_key,
                                   int64_t* version) {
-        if (scan_ranges.size() > 1) {
-            return Status::InternalError(
-                    "CacheSourceOperator only support one scan range, plan 
error");
+        if (scan_ranges.empty()) {
+            return Status::InternalError("scan_ranges is empty, plan error");
         }
-        auto& scan_range = scan_ranges[0];
-        DCHECK(scan_range.scan_range.__isset.palo_scan_range);
-        auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
-
-        std::from_chars(scan_range.scan_range.palo_scan_range.version.data(),
-                        scan_range.scan_range.palo_scan_range.version.data() +
-                                
scan_range.scan_range.palo_scan_range.version.size(),
-                        *version);
-
-        auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
-        if (find_tablet == cache_param.tablet_to_range.end()) {
-            return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+
+        std::string digest;
+        try {
+            digest = cache_param.digest;
+        } catch (const std::exception&) {
+            return Status::InternalError("digest is invalid, plan error");
+        }
+        if (digest.empty()) {
+            return Status::InternalError("digest is empty, plan error");
+        }
+
+        if (cache_param.tablet_to_range.empty()) {
+            return Status::InternalError("tablet_to_range is empty, plan 
error");
+        }
+
+        std::vector<int64_t> tablet_ids;
+        tablet_ids.reserve(scan_ranges.size());
+        for (const auto& scan_range : scan_ranges) {
+            auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
+            tablet_ids.push_back(tablet_id);
+        }
+        std::sort(tablet_ids.begin(), tablet_ids.end());
+
+        int64_t first_version = -1;
+        std::string first_tablet_range;
+        for (size_t i = 0; i < tablet_ids.size(); ++i) {
+            auto tablet_id = tablet_ids[i];
+
+            auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
+            if (find_tablet == cache_param.tablet_to_range.end()) {
+                return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+            }
+
+            auto scan_range_iter =
+                    std::find_if(scan_ranges.begin(), scan_ranges.end(),
+                                 [&tablet_id](const TScanRangeParams& range) {
+                                     return 
range.scan_range.palo_scan_range.tablet_id == tablet_id;
+                                 });
+            int64_t current_version = -1;
+            
std::from_chars(scan_range_iter->scan_range.palo_scan_range.version.data(),
+                            
scan_range_iter->scan_range.palo_scan_range.version.data() +
+                                    
scan_range_iter->scan_range.palo_scan_range.version.size(),
+                            current_version);
+
+            if (i == 0) {
+                first_version = current_version;
+                first_tablet_range = find_tablet->second;
+            } else {
+                if (current_version != first_version) {
+                    return Status::InternalError(
+                            "All tablets in one instance must have the same 
version, plan error");
+                }
+                if (find_tablet->second != first_tablet_range) {
+                    return Status::InternalError(
+                            "All tablets in one instance must have the same 
tablet_to_range, plan "
+                            "error");
+                }

Review Comment:
   build_cache_key currently returns an InternalError if tablets in the same 
instance have different `tablet_to_range` values, but FE populates 
`tablet_to_range` per partition (ranges can legitimately differ across tablets 
in the same scan). This makes multi-tablet cache key building prone to failing 
valid queries at runtime. Consider making the cache key deterministic by 
incorporating each tablet's associated range into the key (e.g., append sorted 
(tablet_id, range) pairs), instead of requiring all ranges to be identical.



##########
be/src/pipeline/query_cache/query_cache.h:
##########
@@ -109,27 +109,76 @@ class QueryCache : public LRUCachePolicy {
     static Status build_cache_key(const std::vector<TScanRangeParams>& 
scan_ranges,
                                   const TQueryCacheParam& cache_param, 
std::string* cache_key,
                                   int64_t* version) {
-        if (scan_ranges.size() > 1) {
-            return Status::InternalError(
-                    "CacheSourceOperator only support one scan range, plan 
error");
+        if (scan_ranges.empty()) {
+            return Status::InternalError("scan_ranges is empty, plan error");
         }
-        auto& scan_range = scan_ranges[0];
-        DCHECK(scan_range.scan_range.__isset.palo_scan_range);
-        auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
-
-        std::from_chars(scan_range.scan_range.palo_scan_range.version.data(),
-                        scan_range.scan_range.palo_scan_range.version.data() +
-                                
scan_range.scan_range.palo_scan_range.version.size(),
-                        *version);
-
-        auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
-        if (find_tablet == cache_param.tablet_to_range.end()) {
-            return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+
+        std::string digest;
+        try {
+            digest = cache_param.digest;
+        } catch (const std::exception&) {
+            return Status::InternalError("digest is invalid, plan error");
+        }
+        if (digest.empty()) {
+            return Status::InternalError("digest is empty, plan error");
+        }
+
+        if (cache_param.tablet_to_range.empty()) {
+            return Status::InternalError("tablet_to_range is empty, plan 
error");
+        }
+
+        std::vector<int64_t> tablet_ids;
+        tablet_ids.reserve(scan_ranges.size());
+        for (const auto& scan_range : scan_ranges) {
+            auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
+            tablet_ids.push_back(tablet_id);
+        }
+        std::sort(tablet_ids.begin(), tablet_ids.end());
+
+        int64_t first_version = -1;
+        std::string first_tablet_range;
+        for (size_t i = 0; i < tablet_ids.size(); ++i) {
+            auto tablet_id = tablet_ids[i];
+
+            auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
+            if (find_tablet == cache_param.tablet_to_range.end()) {
+                return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");

Review Comment:
   Error message says "Not find tablet in partition_to_tablets" but the actual 
field being checked is `tablet_to_range`. Updating the message would make 
failures easier to diagnose (and avoid referring to a non-existent concept in 
this context).
   



##########
be/src/pipeline/query_cache/query_cache.h:
##########
@@ -109,27 +109,76 @@ class QueryCache : public LRUCachePolicy {
     static Status build_cache_key(const std::vector<TScanRangeParams>& 
scan_ranges,
                                   const TQueryCacheParam& cache_param, 
std::string* cache_key,
                                   int64_t* version) {
-        if (scan_ranges.size() > 1) {
-            return Status::InternalError(
-                    "CacheSourceOperator only support one scan range, plan 
error");
+        if (scan_ranges.empty()) {
+            return Status::InternalError("scan_ranges is empty, plan error");
         }
-        auto& scan_range = scan_ranges[0];
-        DCHECK(scan_range.scan_range.__isset.palo_scan_range);
-        auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
-
-        std::from_chars(scan_range.scan_range.palo_scan_range.version.data(),
-                        scan_range.scan_range.palo_scan_range.version.data() +
-                                
scan_range.scan_range.palo_scan_range.version.size(),
-                        *version);
-
-        auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
-        if (find_tablet == cache_param.tablet_to_range.end()) {
-            return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+
+        std::string digest;
+        try {
+            digest = cache_param.digest;
+        } catch (const std::exception&) {
+            return Status::InternalError("digest is invalid, plan error");
+        }

Review Comment:
   The try/catch around `cache_param.digest` is effectively dead code: reading 
a Thrift-generated `std::string` field does not throw when the optional field 
is unset (it just defaults to empty). This is confusing and adds overhead; it 
would be clearer to check the Thrift `__isset.digest` (or equivalent) and/or 
treat empty digest as invalid without exceptions.
   



##########
be/src/pipeline/query_cache/query_cache.h:
##########
@@ -109,27 +109,76 @@ class QueryCache : public LRUCachePolicy {
     static Status build_cache_key(const std::vector<TScanRangeParams>& 
scan_ranges,
                                   const TQueryCacheParam& cache_param, 
std::string* cache_key,
                                   int64_t* version) {
-        if (scan_ranges.size() > 1) {
-            return Status::InternalError(
-                    "CacheSourceOperator only support one scan range, plan 
error");
+        if (scan_ranges.empty()) {
+            return Status::InternalError("scan_ranges is empty, plan error");
         }
-        auto& scan_range = scan_ranges[0];
-        DCHECK(scan_range.scan_range.__isset.palo_scan_range);
-        auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
-
-        std::from_chars(scan_range.scan_range.palo_scan_range.version.data(),
-                        scan_range.scan_range.palo_scan_range.version.data() +
-                                
scan_range.scan_range.palo_scan_range.version.size(),
-                        *version);
-
-        auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
-        if (find_tablet == cache_param.tablet_to_range.end()) {
-            return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+
+        std::string digest;
+        try {
+            digest = cache_param.digest;
+        } catch (const std::exception&) {
+            return Status::InternalError("digest is invalid, plan error");
+        }
+        if (digest.empty()) {
+            return Status::InternalError("digest is empty, plan error");
+        }
+
+        if (cache_param.tablet_to_range.empty()) {
+            return Status::InternalError("tablet_to_range is empty, plan 
error");
+        }
+
+        std::vector<int64_t> tablet_ids;
+        tablet_ids.reserve(scan_ranges.size());
+        for (const auto& scan_range : scan_ranges) {
+            auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
+            tablet_ids.push_back(tablet_id);
+        }

Review Comment:
   `scan_range.scan_range.palo_scan_range` is accessed without verifying 
`__isset.palo_scan_range` for each element. The previous single-range 
implementation at least had a DCHECK; with multi-range support this should be 
validated (return an error or DCHECK) to avoid reading unset fields and 
building an incorrect key/crashing on unexpected scan range types.



##########
be/test/pipeline/operator/query_cache_operator_test.cpp:
##########
@@ -136,6 +140,7 @@ TEST_F(QueryCacheOperatorTest, test_no_hit_cache1) {
             new MockRowDescriptor 
{{std::make_shared<vectorized::DataTypeInt64>()}, &pool});
     TQueryCacheParam cache_param;
     cache_param.node_id = 0;
+    cache_param.digest = "test_digest";
     cache_param.output_slot_mapping[0] = 0;
     cache_param.tablet_to_range.insert({42, "test"});
     cache_param.force_refresh_query_cache = false;

Review Comment:
   In these tests, `digest` is set via direct assignment on an optional Thrift 
field (`cache_param.digest = ...`). Prefer using the generated 
`__set_digest(...)` setter for consistency and to ensure the field's `__isset` 
flag is set (some code paths may rely on `__isset`).
   



##########
be/src/pipeline/query_cache/query_cache.h:
##########
@@ -109,27 +109,76 @@ class QueryCache : public LRUCachePolicy {
     static Status build_cache_key(const std::vector<TScanRangeParams>& 
scan_ranges,
                                   const TQueryCacheParam& cache_param, 
std::string* cache_key,
                                   int64_t* version) {
-        if (scan_ranges.size() > 1) {
-            return Status::InternalError(
-                    "CacheSourceOperator only support one scan range, plan 
error");
+        if (scan_ranges.empty()) {
+            return Status::InternalError("scan_ranges is empty, plan error");
         }
-        auto& scan_range = scan_ranges[0];
-        DCHECK(scan_range.scan_range.__isset.palo_scan_range);
-        auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
-
-        std::from_chars(scan_range.scan_range.palo_scan_range.version.data(),
-                        scan_range.scan_range.palo_scan_range.version.data() +
-                                
scan_range.scan_range.palo_scan_range.version.size(),
-                        *version);
-
-        auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
-        if (find_tablet == cache_param.tablet_to_range.end()) {
-            return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+
+        std::string digest;
+        try {
+            digest = cache_param.digest;
+        } catch (const std::exception&) {
+            return Status::InternalError("digest is invalid, plan error");
+        }
+        if (digest.empty()) {
+            return Status::InternalError("digest is empty, plan error");
+        }
+
+        if (cache_param.tablet_to_range.empty()) {
+            return Status::InternalError("tablet_to_range is empty, plan 
error");
+        }
+
+        std::vector<int64_t> tablet_ids;
+        tablet_ids.reserve(scan_ranges.size());
+        for (const auto& scan_range : scan_ranges) {
+            auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
+            tablet_ids.push_back(tablet_id);
+        }
+        std::sort(tablet_ids.begin(), tablet_ids.end());
+
+        int64_t first_version = -1;
+        std::string first_tablet_range;
+        for (size_t i = 0; i < tablet_ids.size(); ++i) {
+            auto tablet_id = tablet_ids[i];
+
+            auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
+            if (find_tablet == cache_param.tablet_to_range.end()) {
+                return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+            }
+
+            auto scan_range_iter =
+                    std::find_if(scan_ranges.begin(), scan_ranges.end(),
+                                 [&tablet_id](const TScanRangeParams& range) {
+                                     return 
range.scan_range.palo_scan_range.tablet_id == tablet_id;
+                                 });
+            int64_t current_version = -1;
+            
std::from_chars(scan_range_iter->scan_range.palo_scan_range.version.data(),
+                            
scan_range_iter->scan_range.palo_scan_range.version.data() +
+                                    
scan_range_iter->scan_range.palo_scan_range.version.size(),
+                            current_version);
+

Review Comment:
   `std::from_chars` results are ignored, so a non-numeric/overflowing 
`palo_scan_range.version` will silently keep `current_version` at -1 and still 
produce an OK status (and potentially a bad cache version). It would be safer 
to check the returned `ec`/parsed pointer and return an error if parsing fails.



##########
be/src/pipeline/query_cache/query_cache.h:
##########
@@ -109,27 +109,76 @@ class QueryCache : public LRUCachePolicy {
     static Status build_cache_key(const std::vector<TScanRangeParams>& 
scan_ranges,
                                   const TQueryCacheParam& cache_param, 
std::string* cache_key,
                                   int64_t* version) {
-        if (scan_ranges.size() > 1) {
-            return Status::InternalError(
-                    "CacheSourceOperator only support one scan range, plan 
error");
+        if (scan_ranges.empty()) {
+            return Status::InternalError("scan_ranges is empty, plan error");
         }
-        auto& scan_range = scan_ranges[0];
-        DCHECK(scan_range.scan_range.__isset.palo_scan_range);
-        auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
-
-        std::from_chars(scan_range.scan_range.palo_scan_range.version.data(),
-                        scan_range.scan_range.palo_scan_range.version.data() +
-                                
scan_range.scan_range.palo_scan_range.version.size(),
-                        *version);
-
-        auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
-        if (find_tablet == cache_param.tablet_to_range.end()) {
-            return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+
+        std::string digest;
+        try {
+            digest = cache_param.digest;
+        } catch (const std::exception&) {
+            return Status::InternalError("digest is invalid, plan error");
+        }
+        if (digest.empty()) {
+            return Status::InternalError("digest is empty, plan error");
+        }
+
+        if (cache_param.tablet_to_range.empty()) {
+            return Status::InternalError("tablet_to_range is empty, plan 
error");
+        }
+
+        std::vector<int64_t> tablet_ids;
+        tablet_ids.reserve(scan_ranges.size());
+        for (const auto& scan_range : scan_ranges) {
+            auto tablet_id = scan_range.scan_range.palo_scan_range.tablet_id;
+            tablet_ids.push_back(tablet_id);
+        }
+        std::sort(tablet_ids.begin(), tablet_ids.end());
+
+        int64_t first_version = -1;
+        std::string first_tablet_range;
+        for (size_t i = 0; i < tablet_ids.size(); ++i) {
+            auto tablet_id = tablet_ids[i];
+
+            auto find_tablet = cache_param.tablet_to_range.find(tablet_id);
+            if (find_tablet == cache_param.tablet_to_range.end()) {
+                return Status::InternalError("Not find tablet in 
partition_to_tablets, plan error");
+            }
+
+            auto scan_range_iter =
+                    std::find_if(scan_ranges.begin(), scan_ranges.end(),
+                                 [&tablet_id](const TScanRangeParams& range) {
+                                     return 
range.scan_range.palo_scan_range.tablet_id == tablet_id;
+                                 });

Review Comment:
   The current implementation sorts `tablet_ids` and then does a `find_if` over 
`scan_ranges` for each tablet, making cache key construction O(n^2). Since this 
runs during operator init, it would be more efficient and simpler to compute 
(tablet_id, version, range) in a single pass and then sort that vector once.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to