This is an automated email from the ASF dual-hosted git repository. suxiaogang223 pushed a commit to branch codex/fix-hdfs-file-handle-cache-key in repository https://gitbox.apache.org/repos/asf/doris.git
commit d6e069d74dcbb43bdc7d7e876f480fd3e0aad528 Author: Socrates <[email protected]> AuthorDate: Fri May 22 11:59:56 2026 +0800 [fix](be) Include HDFS connection in file handle cache key ### What problem does this PR solve? Issue Number: None Related PR: None Problem Summary: HDFS file handles were cached only by path and mtime, so a later query using a different hdfsFS authentication context could reuse a handle opened from another context when the same file path and mtime matched. ### Release note None ### Check List (For Author) - Test: Manual test - `build-support/clang-format.sh be/src/io/fs/file_handle_cache.cpp be/src/io/fs/file_handle_cache.h be/test/io/fs/file_handle_cache_test.cpp` - `git diff --cached --check` - BE UT not run locally per request - Behavior changed: No - Does this need documentation: No --- be/src/io/fs/file_handle_cache.cpp | 20 ++++++++++++--- be/src/io/fs/file_handle_cache.h | 21 +++++++++++++--- be/test/io/fs/file_handle_cache_test.cpp | 43 ++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/be/src/io/fs/file_handle_cache.cpp b/be/src/io/fs/file_handle_cache.cpp index fbb904e3473..41617ba1015 100644 --- a/be/src/io/fs/file_handle_cache.cpp +++ b/be/src/io/fs/file_handle_cache.cpp @@ -21,6 +21,7 @@ #include "io/fs/file_handle_cache.h" +#include <cstdint> #include <thread> #include <tuple> @@ -100,7 +101,7 @@ FileHandleCache::Accessor::~Accessor() { #ifdef USE_HADOOP_HDFS if (hdfsUnbufferFile(get()->file()) != 0) { VLOG_FILE << "FS does not support file handle unbuffering, closing file=" - << _cache_accessor.get_key()->first; + << _cache_accessor.get_key()->second.first; destroy(); } else { // Calling explicit release to handle metrics @@ -148,11 +149,13 @@ Status FileHandleCache::get_file_handle(const hdfsFS& fs, const std::string& fna FileHandleCache::Accessor* accessor, bool* cache_hit) { DCHECK_GE(mtime, 0); // Hash the key and get appropriate partition - int index = - HashUtil::hash(fname.data(), cast_set<int>(fname.size()), 0) % _cache_partitions.size(); + uintptr_t fs_identity = reinterpret_cast<uintptr_t>(fs); + uint32_t seed = HashUtil::hash(&fs_identity, sizeof(fs_identity), 0); + int index = HashUtil::hash(fname.data(), cast_set<int>(fname.size()), seed) % + _cache_partitions.size(); FileHandleCachePartition& p = _cache_partitions[index]; - auto cache_key = std::make_pair(fname, mtime); + auto cache_key = make_cache_key(fs, fname, mtime); // If this requires a new handle, skip to the creation codepath. Otherwise, // find an unused entry with the same mtime @@ -187,6 +190,15 @@ Status FileHandleCache::get_file_handle(const hdfsFS& fs, const std::string& fna return Status::OK(); } +#ifdef BE_TEST +bool FileHandleCache::same_cache_key_for_test(const hdfsFS& lhs_fs, const std::string& lhs_fname, + int64_t lhs_mtime, const hdfsFS& rhs_fs, + const std::string& rhs_fname, int64_t rhs_mtime) { + return make_cache_key(lhs_fs, lhs_fname, lhs_mtime) == + make_cache_key(rhs_fs, rhs_fname, rhs_mtime); +} +#endif + void FileHandleCache::_evict_handles_loop() { while (!_is_shut_down.load()) { if (_unused_handle_timeout_secs) { diff --git a/be/src/io/fs/file_handle_cache.h b/be/src/io/fs/file_handle_cache.h index 057bdefc61d..ce3c708ba99 100644 --- a/be/src/io/fs/file_handle_cache.h +++ b/be/src/io/fs/file_handle_cache.h @@ -22,9 +22,12 @@ #pragma once #include <array> +#include <cstdint> #include <list> #include <map> #include <memory> +#include <string> +#include <utility> #include "common/status.h" #include "io/fs/file_system.h" @@ -111,13 +114,15 @@ public: /// mtime is older than the file's current mtime. class FileHandleCache { private: + using CacheKey = std::pair<hdfsFS, std::pair<std::string, int64_t>>; + /// Each partition operates independently, and thus has its own thread-safe cache. /// To avoid contention on the lock_ due to false sharing the partitions are /// aligned to cache line boundaries. struct FileHandleCachePartition : public CacheLineAligned { - // Cache key is a pair of filename and mtime - // Using std::pair to spare boilerplate of hash function - typedef LruMultiCache<std::pair<std::string, int64_t>, CachedHdfsFileHandle> CacheType; + // The same HDFS path can be opened through different hdfsFS instances with + // different authentication contexts, so the filesystem handle is part of the key. + typedef LruMultiCache<CacheKey, CachedHdfsFileHandle> CacheType; CacheType cache; }; @@ -176,7 +181,17 @@ public: int64_t file_size, bool require_new_handle, Accessor* accessor, bool* cache_hit) WARN_UNUSED_RESULT; +#ifdef BE_TEST + static bool same_cache_key_for_test(const hdfsFS& lhs_fs, const std::string& lhs_fname, + int64_t lhs_mtime, const hdfsFS& rhs_fs, + const std::string& rhs_fname, int64_t rhs_mtime); +#endif + private: + static CacheKey make_cache_key(const hdfsFS& fs, const std::string& fname, int64_t mtime) { + return {fs, {fname, mtime}}; + } + /// Periodic check to evict unused file handles. Only executed by _eviction_thread. void _evict_handles_loop(); diff --git a/be/test/io/fs/file_handle_cache_test.cpp b/be/test/io/fs/file_handle_cache_test.cpp new file mode 100644 index 00000000000..5c1f7d1d9e0 --- /dev/null +++ b/be/test/io/fs/file_handle_cache_test.cpp @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "io/fs/file_handle_cache.h" + +#include <gtest/gtest.h> + +#include <cstdint> +#include <string> + +namespace doris::io { + +TEST(FileHandleCacheTest, CacheKeyIncludesHdfsFs) { + auto first_fs = reinterpret_cast<hdfsFS>(static_cast<uintptr_t>(0x1)); + auto second_fs = reinterpret_cast<hdfsFS>(static_cast<uintptr_t>(0x2)); + const std::string fname = "/user/hive/warehouse/table/data.parquet"; + constexpr int64_t mtime = 12345; + + EXPECT_TRUE(FileHandleCache::same_cache_key_for_test(first_fs, fname, mtime, first_fs, fname, + mtime)); + EXPECT_FALSE(FileHandleCache::same_cache_key_for_test(first_fs, fname, mtime, second_fs, fname, + mtime)); + EXPECT_FALSE(FileHandleCache::same_cache_key_for_test(first_fs, fname, mtime, first_fs, + fname + ".other", mtime)); + EXPECT_FALSE(FileHandleCache::same_cache_key_for_test(first_fs, fname, mtime, first_fs, fname, + mtime + 1)); +} + +} // namespace doris::io --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
