This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch faiss
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/faiss by this push:
new 032afe95f67 feat: support ivf-on-disk (#386)
032afe95f67 is described below
commit 032afe95f671cd50b82d52d901345600776d7855
Author: zhiqiang <[email protected]>
AuthorDate: Mon Mar 23 09:24:01 2026 +0800
feat: support ivf-on-disk (#386)
---
faiss/CMakeLists.txt | 4 +
faiss/IndexIVF.cpp | 12 +++
faiss/impl/random_access_reader.cpp | 96 +++++++++++++++++
faiss/impl/random_access_reader.h | 85 ++++++++++++++++
faiss/invlists/PreadInvertedLists.cpp | 187 ++++++++++++++++++++++++++++++++++
faiss/invlists/PreadInvertedLists.h | 88 ++++++++++++++++
6 files changed, 472 insertions(+)
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index fbf0419e2fd..f76f7365171 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -243,6 +243,10 @@ set(FAISS_HEADERS
if(NOT WIN32)
list(APPEND FAISS_SRC invlists/OnDiskInvertedLists.cpp)
list(APPEND FAISS_HEADERS invlists/OnDiskInvertedLists.h)
+ list(APPEND FAISS_SRC impl/random_access_reader.cpp)
+ list(APPEND FAISS_HEADERS impl/random_access_reader.h)
+ list(APPEND FAISS_SRC invlists/PreadInvertedLists.cpp)
+ list(APPEND FAISS_HEADERS invlists/PreadInvertedLists.h)
endif()
# Export FAISS_HEADERS variable to parent scope.
diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp
index 21819d8692a..f96211d2d68 100644
--- a/faiss/IndexIVF.cpp
+++ b/faiss/IndexIVF.cpp
@@ -1341,6 +1341,10 @@ size_t InvertedListScanner::iterate_codes(
if (!keep_max) {
for (; it->is_available(); it->next()) {
auto id_and_codes = it->get_id_and_codes();
+ if (sel != nullptr && !sel->is_member(id_and_codes.first)) {
+ list_size++;
+ continue;
+ }
float dis = distance_to_code(id_and_codes.second);
if (dis < simi[0]) {
maxheap_replace_top(k, simi, idxi, dis, id_and_codes.first);
@@ -1351,6 +1355,10 @@ size_t InvertedListScanner::iterate_codes(
} else {
for (; it->is_available(); it->next()) {
auto id_and_codes = it->get_id_and_codes();
+ if (sel != nullptr && !sel->is_member(id_and_codes.first)) {
+ list_size++;
+ continue;
+ }
float dis = distance_to_code(id_and_codes.second);
if (dis > simi[0]) {
minheap_replace_top(k, simi, idxi, dis, id_and_codes.first);
@@ -1389,6 +1397,10 @@ void InvertedListScanner::iterate_codes_range(
list_size = 0;
for (; it->is_available(); it->next()) {
auto id_and_codes = it->get_id_and_codes();
+ if (sel != nullptr && !sel->is_member(id_and_codes.first)) {
+ list_size++;
+ continue;
+ }
float dis = distance_to_code(id_and_codes.second);
bool keep = !keep_max
? dis < radius
diff --git a/faiss/impl/random_access_reader.cpp
b/faiss/impl/random_access_reader.cpp
new file mode 100644
index 00000000000..d48cf989fbf
--- /dev/null
+++ b/faiss/impl/random_access_reader.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/impl/random_access_reader.h>
+
+#include <cerrno>
+#include <cstring>
+
+#include <faiss/impl/FaissAssert.h>
+
+#ifndef _WIN32
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+namespace faiss {
+
+/*******************************************************
+ * ReadRef — default borrow() implementation
+ *******************************************************/
+
+namespace {
+
+/// ReadRef that owns a heap-allocated buffer.
+struct OwnedReadRef : ReadRef {
+ std::unique_ptr<uint8_t[]> buf_;
+ OwnedReadRef(std::unique_ptr<uint8_t[]> buf, size_t len)
+ : buf_(std::move(buf)) {
+ data_ = buf_.get();
+ size_ = len;
+ }
+};
+
+} // namespace
+
+std::unique_ptr<ReadRef> RandomAccessReader::borrow(
+ size_t offset,
+ size_t nbytes) const {
+ auto buf = std::make_unique<uint8_t[]>(nbytes);
+ read_at(offset, buf.get(), nbytes);
+ return std::make_unique<OwnedReadRef>(std::move(buf), nbytes);
+}
+
+/*******************************************************
+ * FileRandomAccessReader — default POSIX pread backend
+ *******************************************************/
+
+FileRandomAccessReader::FileRandomAccessReader(const std::string& filename) {
+#ifndef _WIN32
+ fd_ = ::open(filename.c_str(), O_RDONLY);
+ FAISS_THROW_IF_NOT_FMT(
+ fd_ >= 0,
+ "FileRandomAccessReader: cannot open %s: %s",
+ filename.c_str(),
+ strerror(errno));
+#else
+ FAISS_THROW_MSG("FileRandomAccessReader is not supported on Windows");
+#endif
+}
+
+FileRandomAccessReader::~FileRandomAccessReader() {
+#ifndef _WIN32
+ if (fd_ >= 0) {
+ ::close(fd_);
+ }
+#endif
+}
+
+void FileRandomAccessReader::read_at(
+ size_t offset,
+ void* ptr,
+ size_t nbytes) const {
+#ifndef _WIN32
+ size_t done = 0;
+ auto* out = static_cast<uint8_t*>(ptr);
+ while (done < nbytes) {
+ ssize_t nr = ::pread(fd_, out + done, nbytes - done, offset + done);
+ FAISS_THROW_IF_NOT_MSG(
+ nr >= 0, "pread failed in FileRandomAccessReader");
+ FAISS_THROW_IF_NOT_MSG(
+ nr > 0, "unexpected EOF in FileRandomAccessReader");
+ done += static_cast<size_t>(nr);
+ }
+#else
+ (void)offset;
+ (void)ptr;
+ (void)nbytes;
+ FAISS_THROW_MSG("FileRandomAccessReader is not supported on Windows");
+#endif
+}
+
+} // namespace faiss
diff --git a/faiss/impl/random_access_reader.h
b/faiss/impl/random_access_reader.h
new file mode 100644
index 00000000000..6bcebe13ca4
--- /dev/null
+++ b/faiss/impl/random_access_reader.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+namespace faiss {
+
+/**
+ * Opaque handle returned by RandomAccessReader::borrow().
+ *
+ * Keeps the underlying data alive (e.g. a pinned cache entry, an mmap
+ * region, or a plain heap buffer). The data() pointer is valid for the
+ * lifetime of this object.
+ */
+struct ReadRef {
+ virtual ~ReadRef() = default;
+
+ const uint8_t* data() const {
+ return data_;
+ }
+ size_t size() const {
+ return size_;
+ }
+
+protected:
+ const uint8_t* data_ = nullptr;
+ size_t size_ = 0;
+};
+
+/**
+ * Abstract interface for random-access (pread-like) reads.
+ *
+ * Unlike IOReader (which is a sequential stream), RandomAccessReader
+ * supports positional reads without internal state, making it safe
+ * for concurrent use from multiple threads.
+ *
+ * This is the runtime data-access interface used during search to
+ * fetch inverted-list data on demand, as opposed to IOReader which
+ * is used for index serialization / deserialization.
+ */
+struct RandomAccessReader {
+ virtual ~RandomAccessReader() = default;
+
+ /**
+ * Read exactly @p nbytes starting at byte @p offset into @p ptr.
+ * Must throw on short read or I/O error.
+ */
+ virtual void read_at(size_t offset, void* ptr, size_t nbytes) const = 0;
+
+ /**
+ * Borrow a region of data and return a ReadRef that keeps it alive.
+ *
+ * The default implementation allocates a buffer and calls read_at().
+ * Subclasses (e.g. a cache-backed reader) can override this to return
+ * a direct pointer into cached / mapped memory without any copy.
+ */
+ virtual std::unique_ptr<ReadRef> borrow(
+ size_t offset,
+ size_t nbytes) const;
+};
+
+/**
+ * Default RandomAccessReader backed by pread(fd) on a local file.
+ * Only available on POSIX systems.
+ */
+struct FileRandomAccessReader : RandomAccessReader {
+ explicit FileRandomAccessReader(const std::string& filename);
+ ~FileRandomAccessReader() override;
+
+ void read_at(size_t offset, void* ptr, size_t nbytes) const override;
+
+private:
+ int fd_ = -1;
+};
+
+} // namespace faiss
diff --git a/faiss/invlists/PreadInvertedLists.cpp
b/faiss/invlists/PreadInvertedLists.cpp
new file mode 100644
index 00000000000..8f67918761e
--- /dev/null
+++ b/faiss/invlists/PreadInvertedLists.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/invlists/PreadInvertedLists.h>
+
+#include <faiss/impl/FaissAssert.h>
+
+namespace faiss {
+
+/*******************************************************
+ * PreadInvertedListsIterator
+ *******************************************************/
+
+namespace {
+
+struct PreadInvertedListsIterator : InvertedListsIterator {
+ // --- list-level invariants (set once in ctor) ---
+ size_t list_size = 0;
+ size_t code_size = 0;
+
+ // --- borrowed data for the entire list ---
+ std::unique_ptr<ReadRef> codes_ref_;
+ std::unique_ptr<ReadRef> ids_ref_;
+
+ // --- iteration state ---
+ size_t i = 0;
+
+ PreadInvertedListsIterator(
+ const PreadInvertedLists* parent,
+ size_t list_no) {
+ const OnDiskOneList& l = parent->lists.at(list_no);
+ if (l.size == 0) {
+ return;
+ }
+ list_size = l.size;
+ code_size = parent->code_size;
+ const auto* reader = &parent->reader();
+
+ const size_t codes_offset = l.offset;
+ const size_t ids_offset = l.offset + l.capacity * code_size;
+
+ // Borrow the entire list's codes and ids in one shot.
+ // A cache-backed reader can return pinned references here,
+ // giving zero-copy access on repeated queries.
+ codes_ref_ = reader->borrow(codes_offset, list_size * code_size);
+ ids_ref_ = reader->borrow(ids_offset, list_size * sizeof(idx_t));
+ }
+
+ // non-copyable
+ PreadInvertedListsIterator(const PreadInvertedListsIterator&) = delete;
+ PreadInvertedListsIterator& operator=(const PreadInvertedListsIterator&) =
+ delete;
+
+ bool is_available() const override {
+ return i < list_size;
+ }
+
+ void next() override {
+ ++i;
+ }
+
+ std::pair<idx_t, const uint8_t*> get_id_and_codes() override {
+ auto* ids = reinterpret_cast<const idx_t*>(ids_ref_->data());
+ return {ids[i], codes_ref_->data() + i * code_size};
+ }
+};
+
+} // namespace
+
+/*******************************************************
+ * PreadInvertedLists
+ *******************************************************/
+
+PreadInvertedLists::PreadInvertedLists(const OnDiskInvertedLists& src)
+ : ReadOnlyInvertedLists(src.nlist, src.code_size), lists(src.lists) {
+ use_iterator = true;
+}
+
+PreadInvertedLists::PreadInvertedLists(
+ size_t nlist,
+ size_t code_size,
+ std::vector<OnDiskOneList> lists)
+ : ReadOnlyInvertedLists(nlist, code_size),
+ lists(std::move(lists)) {
+ use_iterator = true;
+}
+
+void PreadInvertedLists::set_reader(
+ std::unique_ptr<RandomAccessReader> reader) {
+ reader_ = std::move(reader);
+}
+
+const RandomAccessReader& PreadInvertedLists::reader() const {
+ FAISS_THROW_IF_NOT_MSG(
+ reader_,
+ "PreadInvertedLists: reader not set, call set_reader() first");
+ return *reader_;
+}
+
+size_t PreadInvertedLists::list_size(size_t list_no) const {
+ return lists.at(list_no).size;
+}
+
+const uint8_t* PreadInvertedLists::get_codes(size_t list_no) const {
+ const OnDiskOneList& l = lists.at(list_no);
+ if (l.size == 0) {
+ return nullptr;
+ }
+ auto* out = new uint8_t[l.size * code_size];
+ FAISS_THROW_IF_NOT_MSG(
+ reader_,
+ "PreadInvertedLists: reader not set, call set_reader() first");
+ reader_->read_at(l.offset, out, l.size * code_size);
+ return out;
+}
+
+const idx_t* PreadInvertedLists::get_ids(size_t list_no) const {
+ const OnDiskOneList& l = lists.at(list_no);
+ if (l.size == 0) {
+ return nullptr;
+ }
+ auto* out = new idx_t[l.size];
+ FAISS_THROW_IF_NOT_MSG(
+ reader_,
+ "PreadInvertedLists: reader not set, call set_reader() first");
+ reader_->read_at(
+ l.offset + l.capacity * code_size, out, l.size * sizeof(idx_t));
+ return out;
+}
+
+void PreadInvertedLists::release_codes(
+ size_t /*list_no*/,
+ const uint8_t* codes) const {
+ delete[] codes;
+}
+
+void PreadInvertedLists::release_ids(
+ size_t /*list_no*/,
+ const idx_t* ids) const {
+ delete[] ids;
+}
+
+bool PreadInvertedLists::is_empty(size_t list_no, void*) const {
+ return lists.at(list_no).size == 0;
+}
+
+InvertedListsIterator* PreadInvertedLists::get_iterator(
+ size_t list_no,
+ void*) const {
+ return new PreadInvertedListsIterator(this, list_no);
+}
+
+void PreadInvertedLists::prefetch_lists(
+ const idx_t* /*list_nos*/,
+ int /*nlist*/) const {
+ // Intentionally empty. The iterator constructor already borrows
+ // the entire list in one shot, so a synchronous prefetch here
+ // would just duplicate the same I/O. If an asynchronous prefetch
+ // mechanism becomes available in the future, this is the place to
+ // add it.
+}
+
+/*******************************************************
+ * Helper: replace OnDiskInvertedLists with PreadInvertedLists
+ *******************************************************/
+
+PreadInvertedLists* replace_with_pread_invlists(Index* index) {
+ auto* ivf = dynamic_cast<IndexIVF*>(index);
+ FAISS_THROW_IF_NOT_MSG(
+ ivf, "replace_with_pread_invlists expects an IndexIVF");
+
+ auto* od = dynamic_cast<OnDiskInvertedLists*>(ivf->invlists);
+ FAISS_THROW_IF_NOT_MSG(
+ od,
+ "replace_with_pread_invlists expects IndexIVF with "
+ "OnDiskInvertedLists");
+
+ auto* pread = new PreadInvertedLists(*od);
+ ivf->replace_invlists(pread, true);
+ return pread;
+}
+
+} // namespace faiss
diff --git a/faiss/invlists/PreadInvertedLists.h
b/faiss/invlists/PreadInvertedLists.h
new file mode 100644
index 00000000000..97668e4a125
--- /dev/null
+++ b/faiss/invlists/PreadInvertedLists.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <faiss/IndexIVF.h>
+#include <faiss/impl/random_access_reader.h>
+#include <faiss/invlists/InvertedLists.h>
+#include <faiss/invlists/OnDiskInvertedLists.h>
+
+namespace faiss {
+
+/**
+ * Read-only inverted lists backed by a pluggable RandomAccessReader.
+ *
+ * Unlike OnDiskInvertedLists (which requires mmap), PreadInvertedLists
+ * uses positional reads (pread-style) via a RandomAccessReader, making
+ * it suitable for environments where:
+ * - the data lives inside a compound file (no standalone fd to mmap),
+ * - a user-space cache is interposed between FAISS and storage, or
+ * - the backing store is remote (object storage, HDFS, etc.).
+ *
+ * The list metadata layout is identical to OnDiskInvertedLists:
+ * [codes: capacity * code_size bytes] [ids: capacity * sizeof(idx_t) bytes]
+ * Only the first `size` entries in each region are valid.
+ *
+ * The iterator path (enabled by default) borrows entire lists via
+ * RandomAccessReader::borrow(), which allows zero-copy access when the
+ * reader is backed by a cache or memory map.
+ */
+struct PreadInvertedLists : ReadOnlyInvertedLists {
+ /// Per-list metadata (same layout as OnDiskInvertedLists).
+ std::vector<OnDiskOneList> lists;
+
+ /// Construct from an existing OnDiskInvertedLists, copying its metadata.
+ explicit PreadInvertedLists(const OnDiskInvertedLists& src);
+
+ /// Construct directly from list metadata.
+ PreadInvertedLists(
+ size_t nlist,
+ size_t code_size,
+ std::vector<OnDiskOneList> lists);
+
+ /// Bind the reader that provides random access to the ivfdata content.
+ /// Must be called before any read operation.
+ void set_reader(std::unique_ptr<RandomAccessReader> reader);
+
+ /// Return the bound reader (must have been set via set_reader()).
+ const RandomAccessReader& reader() const;
+
+ // ---------- InvertedLists interface ----------
+
+ size_t list_size(size_t list_no) const override;
+
+ const uint8_t* get_codes(size_t list_no) const override;
+ const idx_t* get_ids(size_t list_no) const override;
+ void release_codes(size_t list_no, const uint8_t* codes) const override;
+ void release_ids(size_t list_no, const idx_t* ids) const override;
+
+ bool is_empty(size_t list_no, void* inverted_list_context = nullptr)
+ const override;
+ InvertedListsIterator* get_iterator(
+ size_t list_no,
+ void* inverted_list_context = nullptr) const override;
+
+ void prefetch_lists(const idx_t* list_nos, int nlist) const override;
+
+private:
+ std::unique_ptr<RandomAccessReader> reader_;
+};
+
+/**
+ * Replace an IndexIVF's OnDiskInvertedLists with a PreadInvertedLists.
+ *
+ * The returned object has no reader yet — the caller must call
+ * set_reader() before any search.
+ */
+FAISS_API PreadInvertedLists* replace_with_pread_invlists(Index* index);
+
+} // namespace faiss
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]