github-actions[bot] commented on code in PR #64229:
URL: https://github.com/apache/doris/pull/64229#discussion_r3377500976


##########
be/src/storage/index/inverted/inverted_index_reader.cpp:
##########
@@ -487,6 +513,150 @@ InvertedIndexReaderType FullTextIndexReader::type() {
     return InvertedIndexReaderType::FULLTEXT;
 }
 
+namespace {
+
+// Reconstruct the build-time analyzer configuration from the index 
properties. This must
+// mirror InvertedIndexColumnWriter::init_fulltext_index() exactly (including 
the
+// lower_case ReturnTrue=true default) so that the signature computed here 
equals the one
+// stored in the "tbf" sub-file when the same analyzer is in play.
+InvertedIndexAnalyzerConfig build_index_analyzer_config(
+        const std::map<std::string, std::string>& properties) {
+    InvertedIndexAnalyzerConfig cfg;
+    cfg.analyzer_name = get_analyzer_name_from_properties(properties);
+    cfg.parser_type = get_inverted_index_parser_type_from_string(
+            get_parser_string_from_properties(properties));
+    cfg.parser_mode = get_parser_mode_string_from_properties(properties);
+    cfg.char_filter_map = 
get_parser_char_filter_map_from_properties(properties);
+    cfg.lower_case = get_parser_lowercase_from_properties<true>(properties);
+    cfg.stop_words = get_parser_stopwords_from_properties(properties);
+    return cfg;
+}
+
+} // namespace
+
+// Load + validate the token BF from the "tbf" sub-file. Returns nullptr 
(treated as "unknown" ->
+// normal query) on any failure: missing sub-file, corruption, reader error, 
or an analyzer-sig
+// mismatch (A3, stale BF). Never throws. The caller caches the returned BF so 
this runs at most
+// once per (segment, index).
+std::shared_ptr<InvertedIndexTermBloomFilter> 
FullTextIndexReader::load_term_bf_from_subfile(
+        const IndexQueryContextPtr& context) {
+    lucene::store::IndexInput* bf_in = nullptr;
+    lucene::store::Directory* dir = nullptr;
+    bool owned_dir = false;
+    std::shared_ptr<InvertedIndexTermBloomFilter> bf;
+    try {
+        auto st =
+                
_index_file_reader->init(config::inverted_index_read_buffer_size, 
context->io_ctx);
+        if (!st.ok()) {
+            return nullptr;
+        }
+        auto open_res = _index_file_reader->open(&_index_meta, 
context->io_ctx);
+        if (!open_res.has_value()) {
+            return nullptr; // real error surfaces on the normal path
+        }
+        dir = open_res.value().release();
+        owned_dir = true;
+
+        const char* bf_file_name = 
InvertedIndexDescriptor::get_temporary_term_bf_file_name();
+        if (!dir->fileExists(bf_file_name)) {
+            if (owned_dir) {
+                FINALIZE_INPUT(dir);
+            }
+            return nullptr; // no BF sub-file
+        }
+        bf_in = dir->openInput(bf_file_name);
+        auto bf_res = InvertedIndexTermBloomFilter::load(bf_in);
+        FINALIZE_INPUT(bf_in);
+        if (owned_dir) {
+            FINALIZE_INPUT(dir);
+        }
+        if (!bf_res.has_value()) {
+            return nullptr; // corrupt / incompatible
+        }
+        bf = std::move(bf_res.value());
+    } catch (CLuceneError& e) {
+        FINALLY_FINALIZE_INPUT(bf_in);
+        if (owned_dir) {
+            FINALLY_FINALIZE_INPUT(dir);
+        }
+        // Treat any reader error here as "unknown": never turn it into an 
empty result.
+        LOG(WARNING) << "term bloom filter load error, falling back to normal 
query: " << e.what();
+        return nullptr;
+    }
+
+    // A3 (staleness guard): reject a BF whose build-time analyzer fingerprint 
differs from the
+    // current index's analyzer fingerprint (e.g. the index was 
re-built/altered with different
+    // analyzer properties after this BF blob was written). Not a correctness 
load-bearer (the
+    // no-false-negative property holds regardless), only a guard against a 
*stale* BF blob.
+    uint64_t index_sig =
+            
compute_analyzer_sig(build_index_analyzer_config(_index_meta.properties()));
+    if (bf->analyzer_sig() != index_sig) {
+        return nullptr; // stale / incompatible BF
+    }
+    return bf;
+}
+
+Status FullTextIndexReader::try_term_bf_fast_path(
+        const IndexQueryContextPtr& context, InvertedIndexQueryType query_type,
+        const InvertedIndexQueryInfo& query_info,
+        const InvertedIndexQueryCache::CacheKey& cache_key,
+        std::shared_ptr<roaring::Roaring>& bit_map, bool* absent) {
+    *absent = false;
+
+    // Reuse the parsed BF for this (segment, index) so a warm absent query 
does zero IO. The BF
+    // lives in the searcher cache (same LRU + memory budget, no separate 
cache); the cache owns the
+    // key namespacing, so we pass the logical index-file key and never see 
how it coexists with the
+    // searcher entry. On a miss, load the "tbf" sub-file once and cache the 
validated BF.
+    auto* searcher_cache = InvertedIndexSearcherCache::instance();
+    if (searcher_cache == nullptr) {
+        return Status::OK(); // no cache wired (e.g. some tools) -> normal 
query
+    }
+    const auto index_file_key = 
_index_file_reader->get_index_file_cache_key(&_index_meta);
+    InvertedIndexCacheHandle bf_cache_handle;

Review Comment:
   The analyzer-signature guard is only applied when the BF is loaded from 
disk. On a cache hit, this reuses the cached BF without checking 
`bf->analyzer_sig()` against the current `_index_meta.properties()`. That 
leaves a stale/mismatched BF authoritative for later readers of the same 
`index_file_key`: for example, query once with matching metadata to cache the 
BF, then read the same segment/index key with changed analyzer properties (the 
A3 scenario already simulated in the tests). The second query hits the cache 
here, skips `load_term_bf_from_subfile()`, and can return an empty bitmap from 
tokens analyzed under the new config even though the cached BF was built under 
the old one. Please either include the analyzer signature in the BF cache key 
or revalidate the cached BF before probing and treat mismatches as 
unavailable/fall back.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to