This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch faiss
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/faiss by this push:
     new 032afe95f67 feat: support ivf-on-disk (#386)
032afe95f67 is described below

commit 032afe95f671cd50b82d52d901345600776d7855
Author: zhiqiang <[email protected]>
AuthorDate: Mon Mar 23 09:24:01 2026 +0800

    feat: support ivf-on-disk (#386)
---
 faiss/CMakeLists.txt                  |   4 +
 faiss/IndexIVF.cpp                    |  12 +++
 faiss/impl/random_access_reader.cpp   |  96 +++++++++++++++++
 faiss/impl/random_access_reader.h     |  85 ++++++++++++++++
 faiss/invlists/PreadInvertedLists.cpp | 187 ++++++++++++++++++++++++++++++++++
 faiss/invlists/PreadInvertedLists.h   |  88 ++++++++++++++++
 6 files changed, 472 insertions(+)

diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index fbf0419e2fd..f76f7365171 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -243,6 +243,10 @@ set(FAISS_HEADERS
 if(NOT WIN32)
   list(APPEND FAISS_SRC invlists/OnDiskInvertedLists.cpp)
   list(APPEND FAISS_HEADERS invlists/OnDiskInvertedLists.h)
+  list(APPEND FAISS_SRC impl/random_access_reader.cpp)
+  list(APPEND FAISS_HEADERS impl/random_access_reader.h)
+  list(APPEND FAISS_SRC invlists/PreadInvertedLists.cpp)
+  list(APPEND FAISS_HEADERS invlists/PreadInvertedLists.h)
 endif()
 
 # Export FAISS_HEADERS variable to parent scope.
diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp
index 21819d8692a..f96211d2d68 100644
--- a/faiss/IndexIVF.cpp
+++ b/faiss/IndexIVF.cpp
@@ -1341,6 +1341,10 @@ size_t InvertedListScanner::iterate_codes(
     if (!keep_max) {
         for (; it->is_available(); it->next()) {
             auto id_and_codes = it->get_id_and_codes();
+            if (sel != nullptr && !sel->is_member(id_and_codes.first)) {
+                list_size++;
+                continue;
+            }
             float dis = distance_to_code(id_and_codes.second);
             if (dis < simi[0]) {
                 maxheap_replace_top(k, simi, idxi, dis, id_and_codes.first);
@@ -1351,6 +1355,10 @@ size_t InvertedListScanner::iterate_codes(
     } else {
         for (; it->is_available(); it->next()) {
             auto id_and_codes = it->get_id_and_codes();
+            if (sel != nullptr && !sel->is_member(id_and_codes.first)) {
+                list_size++;
+                continue;
+            }
             float dis = distance_to_code(id_and_codes.second);
             if (dis > simi[0]) {
                 minheap_replace_top(k, simi, idxi, dis, id_and_codes.first);
@@ -1389,6 +1397,10 @@ void InvertedListScanner::iterate_codes_range(
     list_size = 0;
     for (; it->is_available(); it->next()) {
         auto id_and_codes = it->get_id_and_codes();
+        if (sel != nullptr && !sel->is_member(id_and_codes.first)) {
+            list_size++;
+            continue;
+        }
         float dis = distance_to_code(id_and_codes.second);
         bool keep = !keep_max
                 ? dis < radius
diff --git a/faiss/impl/random_access_reader.cpp 
b/faiss/impl/random_access_reader.cpp
new file mode 100644
index 00000000000..d48cf989fbf
--- /dev/null
+++ b/faiss/impl/random_access_reader.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/impl/random_access_reader.h>
+
+#include <cerrno>
+#include <cstring>
+
+#include <faiss/impl/FaissAssert.h>
+
+#ifndef _WIN32
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+namespace faiss {
+
+/*******************************************************
+ * ReadRef — default borrow() implementation
+ *******************************************************/
+
+namespace {
+
+/// ReadRef that owns a heap-allocated buffer.
+struct OwnedReadRef : ReadRef {
+    std::unique_ptr<uint8_t[]> buf_;
+    OwnedReadRef(std::unique_ptr<uint8_t[]> buf, size_t len)
+            : buf_(std::move(buf)) {
+        data_ = buf_.get();
+        size_ = len;
+    }
+};
+
+} // namespace
+
+std::unique_ptr<ReadRef> RandomAccessReader::borrow(
+        size_t offset,
+        size_t nbytes) const {
+    auto buf = std::make_unique<uint8_t[]>(nbytes);
+    read_at(offset, buf.get(), nbytes);
+    return std::make_unique<OwnedReadRef>(std::move(buf), nbytes);
+}
+
+/*******************************************************
+ * FileRandomAccessReader — default POSIX pread backend
+ *******************************************************/
+
+FileRandomAccessReader::FileRandomAccessReader(const std::string& filename) {
+#ifndef _WIN32
+    fd_ = ::open(filename.c_str(), O_RDONLY);
+    FAISS_THROW_IF_NOT_FMT(
+            fd_ >= 0,
+            "FileRandomAccessReader: cannot open %s: %s",
+            filename.c_str(),
+            strerror(errno));
+#else
+    FAISS_THROW_MSG("FileRandomAccessReader is not supported on Windows");
+#endif
+}
+
+FileRandomAccessReader::~FileRandomAccessReader() {
+#ifndef _WIN32
+    if (fd_ >= 0) {
+        ::close(fd_);
+    }
+#endif
+}
+
+void FileRandomAccessReader::read_at(
+        size_t offset,
+        void* ptr,
+        size_t nbytes) const {
+#ifndef _WIN32
+    size_t done = 0;
+    auto* out = static_cast<uint8_t*>(ptr);
+    while (done < nbytes) {
+        ssize_t nr = ::pread(fd_, out + done, nbytes - done, offset + done);
+        FAISS_THROW_IF_NOT_MSG(
+                nr >= 0, "pread failed in FileRandomAccessReader");
+        FAISS_THROW_IF_NOT_MSG(
+                nr > 0, "unexpected EOF in FileRandomAccessReader");
+        done += static_cast<size_t>(nr);
+    }
+#else
+    (void)offset;
+    (void)ptr;
+    (void)nbytes;
+    FAISS_THROW_MSG("FileRandomAccessReader is not supported on Windows");
+#endif
+}
+
+} // namespace faiss
diff --git a/faiss/impl/random_access_reader.h 
b/faiss/impl/random_access_reader.h
new file mode 100644
index 00000000000..6bcebe13ca4
--- /dev/null
+++ b/faiss/impl/random_access_reader.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+namespace faiss {
+
+/**
+ * Opaque handle returned by RandomAccessReader::borrow().
+ *
+ * Keeps the underlying data alive (e.g. a pinned cache entry, an mmap
+ * region, or a plain heap buffer).  The data() pointer is valid for the
+ * lifetime of this object.
+ */
+struct ReadRef {
+    virtual ~ReadRef() = default;
+
+    const uint8_t* data() const {
+        return data_;
+    }
+    size_t size() const {
+        return size_;
+    }
+
+protected:
+    const uint8_t* data_ = nullptr;
+    size_t size_ = 0;
+};
+
+/**
+ * Abstract interface for random-access (pread-like) reads.
+ *
+ * Unlike IOReader (which is a sequential stream), RandomAccessReader
+ * supports positional reads without internal state, making it safe
+ * for concurrent use from multiple threads.
+ *
+ * This is the runtime data-access interface used during search to
+ * fetch inverted-list data on demand, as opposed to IOReader which
+ * is used for index serialization / deserialization.
+ */
+struct RandomAccessReader {
+    virtual ~RandomAccessReader() = default;
+
+    /**
+     * Read exactly @p nbytes starting at byte @p offset into @p ptr.
+     * Must throw on short read or I/O error.
+     */
+    virtual void read_at(size_t offset, void* ptr, size_t nbytes) const = 0;
+
+    /**
+     * Borrow a region of data and return a ReadRef that keeps it alive.
+     *
+     * The default implementation allocates a buffer and calls read_at().
+     * Subclasses (e.g. a cache-backed reader) can override this to return
+     * a direct pointer into cached / mapped memory without any copy.
+     */
+    virtual std::unique_ptr<ReadRef> borrow(
+            size_t offset,
+            size_t nbytes) const;
+};
+
+/**
+ * Default RandomAccessReader backed by pread(fd) on a local file.
+ * Only available on POSIX systems.
+ */
+struct FileRandomAccessReader : RandomAccessReader {
+    explicit FileRandomAccessReader(const std::string& filename);
+    ~FileRandomAccessReader() override;
+
+    void read_at(size_t offset, void* ptr, size_t nbytes) const override;
+
+private:
+    int fd_ = -1;
+};
+
+} // namespace faiss
diff --git a/faiss/invlists/PreadInvertedLists.cpp 
b/faiss/invlists/PreadInvertedLists.cpp
new file mode 100644
index 00000000000..8f67918761e
--- /dev/null
+++ b/faiss/invlists/PreadInvertedLists.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/invlists/PreadInvertedLists.h>
+
+#include <faiss/impl/FaissAssert.h>
+
+namespace faiss {
+
+/*******************************************************
+ * PreadInvertedListsIterator
+ *******************************************************/
+
+namespace {
+
+struct PreadInvertedListsIterator : InvertedListsIterator {
+    // --- list-level invariants (set once in ctor) ---
+    size_t list_size = 0;
+    size_t code_size = 0;
+
+    // --- borrowed data for the entire list ---
+    std::unique_ptr<ReadRef> codes_ref_;
+    std::unique_ptr<ReadRef> ids_ref_;
+
+    // --- iteration state ---
+    size_t i = 0;
+
+    PreadInvertedListsIterator(
+            const PreadInvertedLists* parent,
+            size_t list_no) {
+        const OnDiskOneList& l = parent->lists.at(list_no);
+        if (l.size == 0) {
+            return;
+        }
+        list_size = l.size;
+        code_size = parent->code_size;
+        const auto* reader = &parent->reader();
+
+        const size_t codes_offset = l.offset;
+        const size_t ids_offset = l.offset + l.capacity * code_size;
+
+        // Borrow the entire list's codes and ids in one shot.
+        // A cache-backed reader can return pinned references here,
+        // giving zero-copy access on repeated queries.
+        codes_ref_ = reader->borrow(codes_offset, list_size * code_size);
+        ids_ref_ = reader->borrow(ids_offset, list_size * sizeof(idx_t));
+    }
+
+    // non-copyable
+    PreadInvertedListsIterator(const PreadInvertedListsIterator&) = delete;
+    PreadInvertedListsIterator& operator=(const PreadInvertedListsIterator&) =
+            delete;
+
+    bool is_available() const override {
+        return i < list_size;
+    }
+
+    void next() override {
+        ++i;
+    }
+
+    std::pair<idx_t, const uint8_t*> get_id_and_codes() override {
+        auto* ids = reinterpret_cast<const idx_t*>(ids_ref_->data());
+        return {ids[i], codes_ref_->data() + i * code_size};
+    }
+};
+
+} // namespace
+
+/*******************************************************
+ * PreadInvertedLists
+ *******************************************************/
+
+PreadInvertedLists::PreadInvertedLists(const OnDiskInvertedLists& src)
+        : ReadOnlyInvertedLists(src.nlist, src.code_size), lists(src.lists) {
+    use_iterator = true;
+}
+
+PreadInvertedLists::PreadInvertedLists(
+        size_t nlist,
+        size_t code_size,
+        std::vector<OnDiskOneList> lists)
+        : ReadOnlyInvertedLists(nlist, code_size),
+          lists(std::move(lists)) {
+    use_iterator = true;
+}
+
+void PreadInvertedLists::set_reader(
+        std::unique_ptr<RandomAccessReader> reader) {
+    reader_ = std::move(reader);
+}
+
+const RandomAccessReader& PreadInvertedLists::reader() const {
+    FAISS_THROW_IF_NOT_MSG(
+            reader_,
+            "PreadInvertedLists: reader not set, call set_reader() first");
+    return *reader_;
+}
+
+size_t PreadInvertedLists::list_size(size_t list_no) const {
+    return lists.at(list_no).size;
+}
+
+const uint8_t* PreadInvertedLists::get_codes(size_t list_no) const {
+    const OnDiskOneList& l = lists.at(list_no);
+    if (l.size == 0) {
+        return nullptr;
+    }
+    auto* out = new uint8_t[l.size * code_size];
+    FAISS_THROW_IF_NOT_MSG(
+            reader_,
+            "PreadInvertedLists: reader not set, call set_reader() first");
+    reader_->read_at(l.offset, out, l.size * code_size);
+    return out;
+}
+
+const idx_t* PreadInvertedLists::get_ids(size_t list_no) const {
+    const OnDiskOneList& l = lists.at(list_no);
+    if (l.size == 0) {
+        return nullptr;
+    }
+    auto* out = new idx_t[l.size];
+    FAISS_THROW_IF_NOT_MSG(
+            reader_,
+            "PreadInvertedLists: reader not set, call set_reader() first");
+    reader_->read_at(
+            l.offset + l.capacity * code_size, out, l.size * sizeof(idx_t));
+    return out;
+}
+
+void PreadInvertedLists::release_codes(
+        size_t /*list_no*/,
+        const uint8_t* codes) const {
+    delete[] codes;
+}
+
+void PreadInvertedLists::release_ids(
+        size_t /*list_no*/,
+        const idx_t* ids) const {
+    delete[] ids;
+}
+
+bool PreadInvertedLists::is_empty(size_t list_no, void*) const {
+    return lists.at(list_no).size == 0;
+}
+
+InvertedListsIterator* PreadInvertedLists::get_iterator(
+        size_t list_no,
+        void*) const {
+    return new PreadInvertedListsIterator(this, list_no);
+}
+
+void PreadInvertedLists::prefetch_lists(
+        const idx_t* /*list_nos*/,
+        int /*nlist*/) const {
+    // Intentionally empty.  The iterator constructor already borrows
+    // the entire list in one shot, so a synchronous prefetch here
+    // would just duplicate the same I/O.  If an asynchronous prefetch
+    // mechanism becomes available in the future, this is the place to
+    // add it.
+}
+
+/*******************************************************
+ * Helper: replace OnDiskInvertedLists with PreadInvertedLists
+ *******************************************************/
+
+PreadInvertedLists* replace_with_pread_invlists(Index* index) {
+    auto* ivf = dynamic_cast<IndexIVF*>(index);
+    FAISS_THROW_IF_NOT_MSG(
+            ivf, "replace_with_pread_invlists expects an IndexIVF");
+
+    auto* od = dynamic_cast<OnDiskInvertedLists*>(ivf->invlists);
+    FAISS_THROW_IF_NOT_MSG(
+            od,
+            "replace_with_pread_invlists expects IndexIVF with "
+            "OnDiskInvertedLists");
+
+    auto* pread = new PreadInvertedLists(*od);
+    ivf->replace_invlists(pread, true);
+    return pread;
+}
+
+} // namespace faiss
diff --git a/faiss/invlists/PreadInvertedLists.h 
b/faiss/invlists/PreadInvertedLists.h
new file mode 100644
index 00000000000..97668e4a125
--- /dev/null
+++ b/faiss/invlists/PreadInvertedLists.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <faiss/IndexIVF.h>
+#include <faiss/impl/random_access_reader.h>
+#include <faiss/invlists/InvertedLists.h>
+#include <faiss/invlists/OnDiskInvertedLists.h>
+
+namespace faiss {
+
+/**
+ * Read-only inverted lists backed by a pluggable RandomAccessReader.
+ *
+ * Unlike OnDiskInvertedLists (which requires mmap), PreadInvertedLists
+ * uses positional reads (pread-style) via a RandomAccessReader, making
+ * it suitable for environments where:
+ * - the data lives inside a compound file (no standalone fd to mmap),
+ * - a user-space cache is interposed between FAISS and storage, or
+ * - the backing store is remote (object storage, HDFS, etc.).
+ *
+ * The list metadata layout is identical to OnDiskInvertedLists:
+ *   [codes: capacity * code_size bytes] [ids: capacity * sizeof(idx_t) bytes]
+ * Only the first `size` entries in each region are valid.
+ *
+ * The iterator path (enabled by default) borrows entire lists via
+ * RandomAccessReader::borrow(), which allows zero-copy access when the
+ * reader is backed by a cache or memory map.
+ */
+struct PreadInvertedLists : ReadOnlyInvertedLists {
+    /// Per-list metadata (same layout as OnDiskInvertedLists).
+    std::vector<OnDiskOneList> lists;
+
+    /// Construct from an existing OnDiskInvertedLists, copying its metadata.
+    explicit PreadInvertedLists(const OnDiskInvertedLists& src);
+
+    /// Construct directly from list metadata.
+    PreadInvertedLists(
+            size_t nlist,
+            size_t code_size,
+            std::vector<OnDiskOneList> lists);
+
+    /// Bind the reader that provides random access to the ivfdata content.
+    /// Must be called before any read operation.
+    void set_reader(std::unique_ptr<RandomAccessReader> reader);
+
+    /// Return the bound reader (must have been set via set_reader()).
+    const RandomAccessReader& reader() const;
+
+    // ---------- InvertedLists interface ----------
+
+    size_t list_size(size_t list_no) const override;
+
+    const uint8_t* get_codes(size_t list_no) const override;
+    const idx_t* get_ids(size_t list_no) const override;
+    void release_codes(size_t list_no, const uint8_t* codes) const override;
+    void release_ids(size_t list_no, const idx_t* ids) const override;
+
+    bool is_empty(size_t list_no, void* inverted_list_context = nullptr)
+            const override;
+    InvertedListsIterator* get_iterator(
+            size_t list_no,
+            void* inverted_list_context = nullptr) const override;
+
+    void prefetch_lists(const idx_t* list_nos, int nlist) const override;
+
+private:
+    std::unique_ptr<RandomAccessReader> reader_;
+};
+
+/**
+ * Replace an IndexIVF's OnDiskInvertedLists with a PreadInvertedLists.
+ *
+ * The returned object has no reader yet — the caller must call
+ * set_reader() before any search.
+ */
+FAISS_API PreadInvertedLists* replace_with_pread_invlists(Index* index);
+
+} // namespace faiss


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to