This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 47e780187e [python] Use pyroaring.BitMap64 for better performance 
(#7185)
47e780187e is described below

commit 47e780187e50cf0132816e56144eaab127cd7c30
Author: Jingsong Lee <[email protected]>
AuthorDate: Tue Feb 3 13:06:54 2026 +0800

    [python] Use pyroaring.BitMap64 for better performance (#7185)
---
 .../apply_deletion_vector_reader.py                |  13 +-
 .../deletionvectors/bitmap_deletion_vector.py      |   8 +-
 .../pypaimon/deletionvectors/deletion_vector.py    |   3 +-
 paimon-python/pypaimon/globalindex/__init__.py     |   2 -
 .../globalindex/btree/btree_index_reader.py        |   2 +-
 .../globalindex/faiss/faiss_vector_reader.py       |   2 +-
 .../pypaimon/globalindex/global_index_result.py    |   2 +-
 .../pypaimon/globalindex/vector_search.py          |   2 +-
 .../pypaimon/globalindex/vector_search_result.py   |   2 +-
 .../{globalindex => utils}/roaring_bitmap.py       | 143 +++++++++++++++++----
 10 files changed, 134 insertions(+), 45 deletions(-)

diff --git 
a/paimon-python/pypaimon/deletionvectors/apply_deletion_vector_reader.py 
b/paimon-python/pypaimon/deletionvectors/apply_deletion_vector_reader.py
index c9a143bdc9..8adc3d8ea0 100644
--- a/paimon-python/pypaimon/deletionvectors/apply_deletion_vector_reader.py
+++ b/paimon-python/pypaimon/deletionvectors/apply_deletion_vector_reader.py
@@ -23,9 +23,7 @@ from pyarrow import RecordBatch
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
 from pypaimon.read.reader.iface.record_iterator import RecordIterator
 from pypaimon.deletionvectors.deletion_vector import DeletionVector
-
-from pyroaring import BitMap
-
+from pypaimon.utils.roaring_bitmap import RoaringBitmap
 from pypaimon.read.reader.iface.record_reader import RecordReader
 
 
@@ -57,10 +55,11 @@ class ApplyDeletionVectorReader(RecordBatchReader):
         if arrow_batch is None:
             return None
         # Remove the deleted rows from the batch
-        range_bitmap = BitMap(
-            range(self._reader.return_batch_pos() - arrow_batch.num_rows, 
self._reader.return_batch_pos()))
-        intersection_bitmap = range_bitmap - self._deletion_vector.bit_map()
-        added_row_list = [x - (self._reader.return_batch_pos() - 
arrow_batch.num_rows) for x in
+        range_bitmap = RoaringBitmap()
+        return_batch_pos = self._reader.return_batch_pos()
+        range_bitmap.add_range(return_batch_pos - arrow_batch.num_rows, 
return_batch_pos - 1)
+        intersection_bitmap = RoaringBitmap.remove_all(range_bitmap, 
self._deletion_vector.bit_map())
+        added_row_list = [x - (return_batch_pos - arrow_batch.num_rows) for x 
in
                           list(intersection_bitmap)]
         return arrow_batch.take(pyarrow.array(added_row_list, 
type=pyarrow.int32()))
 
diff --git a/paimon-python/pypaimon/deletionvectors/bitmap_deletion_vector.py 
b/paimon-python/pypaimon/deletionvectors/bitmap_deletion_vector.py
index 0afbe642df..5d3a5170f5 100644
--- a/paimon-python/pypaimon/deletionvectors/bitmap_deletion_vector.py
+++ b/paimon-python/pypaimon/deletionvectors/bitmap_deletion_vector.py
@@ -18,7 +18,7 @@
 from pypaimon.deletionvectors.deletion_vector import DeletionVector
 import struct
 import zlib
-from pyroaring import BitMap
+from pypaimon.utils.roaring_bitmap import RoaringBitmap
 
 
 class BitmapDeletionVector(DeletionVector):
@@ -31,14 +31,14 @@ class BitmapDeletionVector(DeletionVector):
     MAGIC_NUMBER_SIZE_BYTES = 4
     MAX_VALUE = 2147483647
 
-    def __init__(self, bitmap: BitMap = None):
+    def __init__(self, bitmap: RoaringBitmap = None):
         """
         Initialize a BitmapDeletionVector.
 
         Args:
             bitmap: Optional RoaringBitmap instance. If None, creates an empty 
bitmap.
         """
-        self._bitmap = bitmap if bitmap is not None else BitMap()
+        self._bitmap = bitmap if bitmap is not None else RoaringBitmap()
 
     def delete(self, position: int) -> None:
         """
@@ -121,7 +121,7 @@ class BitmapDeletionVector(DeletionVector):
         Returns:
             A BitmapDeletionVector instance.
         """
-        bitmap = BitMap.deserialize(data)
+        bitmap = RoaringBitmap.deserialize(data)
         return BitmapDeletionVector(bitmap)
 
     def bit_map(self):
diff --git a/paimon-python/pypaimon/deletionvectors/deletion_vector.py 
b/paimon-python/pypaimon/deletionvectors/deletion_vector.py
index 44c179653f..866aed177c 100644
--- a/paimon-python/pypaimon/deletionvectors/deletion_vector.py
+++ b/paimon-python/pypaimon/deletionvectors/deletion_vector.py
@@ -19,6 +19,7 @@ from abc import ABC, abstractmethod
 
 from pypaimon.common.file_io import FileIO
 from pypaimon.table.source.deletion_file import DeletionFile
+from pypaimon.utils.roaring_bitmap import RoaringBitmap
 
 
 class DeletionVector(ABC):
@@ -28,7 +29,7 @@ class DeletionVector(ABC):
     """
 
     @abstractmethod
-    def bit_map(self):
+    def bit_map(self) -> RoaringBitmap:
         """
         Returns the bitmap of the DeletionVector.
         """
diff --git a/paimon-python/pypaimon/globalindex/__init__.py 
b/paimon-python/pypaimon/globalindex/__init__.py
index ae8e6328d1..f9d3aa9357 100644
--- a/paimon-python/pypaimon/globalindex/__init__.py
+++ b/paimon-python/pypaimon/globalindex/__init__.py
@@ -30,7 +30,6 @@ from pypaimon.globalindex.global_index_scan_builder import (
     GlobalIndexScanBuilder,
     RowRangeGlobalIndexScanner,
 )
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
 from pypaimon.globalindex.range import Range
 
 __all__ = [
@@ -46,6 +45,5 @@ __all__ = [
     'GlobalIndexEvaluator',
     'GlobalIndexScanBuilder',
     'RowRangeGlobalIndexScanner',
-    'RoaringBitmap64',
     'Range',
 ]
diff --git a/paimon-python/pypaimon/globalindex/btree/btree_index_reader.py 
b/paimon-python/pypaimon/globalindex/btree/btree_index_reader.py
index f5172b167a..cff97c8093 100644
--- a/paimon-python/pypaimon/globalindex/btree/btree_index_reader.py
+++ b/paimon-python/pypaimon/globalindex/btree/btree_index_reader.py
@@ -33,7 +33,7 @@ from pypaimon.globalindex.btree.key_serializer import 
KeySerializer
 from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
 from pypaimon.globalindex.global_index_reader import FieldRef, 
GlobalIndexReader
 from pypaimon.globalindex.global_index_result import GlobalIndexResult
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+from pypaimon.utils.roaring_bitmap import RoaringBitmap64
 from pypaimon.globalindex.btree.btree_file_footer import BTreeFileFooter
 from pypaimon.globalindex.btree.sst_file_reader import SstFileReader
 from pypaimon.globalindex.btree.memory_slice_input import MemorySliceInput
diff --git a/paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py 
b/paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py
index 3201894d27..501364075f 100644
--- a/paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py
+++ b/paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py
@@ -31,7 +31,7 @@ from pypaimon.globalindex.global_index_reader import 
GlobalIndexReader
 from pypaimon.globalindex.global_index_result import GlobalIndexResult
 from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
 from pypaimon.globalindex.vector_search_result import 
DictBasedScoredIndexResult
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+from pypaimon.utils.roaring_bitmap import RoaringBitmap64
 from pypaimon.globalindex.faiss.faiss_options import (
     FaissVectorIndexOptions,
     FaissVectorMetric,
diff --git a/paimon-python/pypaimon/globalindex/global_index_result.py 
b/paimon-python/pypaimon/globalindex/global_index_result.py
index 9ba191ce78..664467e42e 100644
--- a/paimon-python/pypaimon/globalindex/global_index_result.py
+++ b/paimon-python/pypaimon/globalindex/global_index_result.py
@@ -19,7 +19,7 @@
 from abc import ABC, abstractmethod
 from typing import Callable, List, Optional
 
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+from pypaimon.utils.roaring_bitmap import RoaringBitmap64
 from pypaimon.globalindex.range import Range
 
 
diff --git a/paimon-python/pypaimon/globalindex/vector_search.py 
b/paimon-python/pypaimon/globalindex/vector_search.py
index f7d7209ff3..525b8deb51 100644
--- a/paimon-python/pypaimon/globalindex/vector_search.py
+++ b/paimon-python/pypaimon/globalindex/vector_search.py
@@ -65,7 +65,7 @@ class VectorSearch:
         """
         Create a new VectorSearch with include_row_ids offset to the given 
range.
         """
-        from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+        from pypaimon.utils.roaring_bitmap import RoaringBitmap64
 
         if self.include_row_ids is not None:
             range_bitmap = RoaringBitmap64()
diff --git a/paimon-python/pypaimon/globalindex/vector_search_result.py 
b/paimon-python/pypaimon/globalindex/vector_search_result.py
index e4fdcd6cc8..60afe331a9 100644
--- a/paimon-python/pypaimon/globalindex/vector_search_result.py
+++ b/paimon-python/pypaimon/globalindex/vector_search_result.py
@@ -22,7 +22,7 @@ from abc import abstractmethod
 from typing import Callable, Dict, Optional
 
 from pypaimon.globalindex.global_index_result import GlobalIndexResult
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+from pypaimon.utils.roaring_bitmap import RoaringBitmap64
 
 
 # Type alias for score getter function
diff --git a/paimon-python/pypaimon/globalindex/roaring_bitmap.py 
b/paimon-python/pypaimon/utils/roaring_bitmap.py
similarity index 52%
rename from paimon-python/pypaimon/globalindex/roaring_bitmap.py
rename to paimon-python/pypaimon/utils/roaring_bitmap.py
index c91f0c149a..c3f5a647ad 100644
--- a/paimon-python/pypaimon/globalindex/roaring_bitmap.py
+++ b/paimon-python/pypaimon/utils/roaring_bitmap.py
@@ -20,20 +20,20 @@
 Roaring Bitmap.
 """
 
-from typing import Iterator, Set
-import struct
+from typing import Iterator
 
 
 class RoaringBitmap64:
     """
     A 64-bit roaring bitmap implementation.
+
     This class provides efficient storage and operations for sets of 64-bit 
integers.
-    It uses a set-based implementation for simplicity, which can be replaced 
with
-    a more efficient roaring bitmap library if needed.
+    It uses pyroaring.BitMap64 for better performance and memory efficiency.
     """
 
     def __init__(self):
-        self._data: Set[int] = set()
+        from pyroaring import BitMap64
+        self._data = BitMap64()
 
     def add(self, value: int) -> None:
         """Add a single value to the bitmap."""
@@ -41,8 +41,7 @@ class RoaringBitmap64:
 
     def add_range(self, from_: int, to: int) -> None:
         """Add a range of values [from_, to] to the bitmap."""
-        for i in range(from_, to + 1):
-            self._data.add(i)
+        self._data.add_range(from_, to + 1)
 
     def contains(self, value: int) -> bool:
         """Check if the bitmap contains the given value."""
@@ -58,7 +57,7 @@ class RoaringBitmap64:
 
     def __iter__(self) -> Iterator[int]:
         """Iterate over all values in the bitmap in sorted order."""
-        return iter(sorted(self._data))
+        return iter(self._data)
 
     def __len__(self) -> int:
         """Return the number of elements in the bitmap."""
@@ -74,7 +73,7 @@ class RoaringBitmap64:
 
     def to_list(self) -> list:
         """Return a sorted list of all values in the bitmap."""
-        return sorted(self._data)
+        return list(self._data)
 
     def to_range_list(self) -> list:
         """
@@ -85,8 +84,9 @@ class RoaringBitmap64:
         if self.is_empty():
             return []
 
-        sorted_values = sorted(self._data)
+        # Use pyroaring's efficient iteration
         ranges = []
+        sorted_values = list(self._data)
         start = sorted_values[0]
         end = start
 
@@ -127,23 +127,14 @@ class RoaringBitmap64:
 
     def serialize(self) -> bytes:
         """Serialize the bitmap to bytes."""
-        # Simple serialization format: count followed by sorted values
-        values = sorted(self._data)
-        data = struct.pack('>Q', len(values))  # 8-byte count
-        for v in values:
-            data += struct.pack('>q', v)  # 8-byte signed value
-        return data
+        return self._data.serialize()
 
     @staticmethod
     def deserialize(data: bytes) -> 'RoaringBitmap64':
         """Deserialize a bitmap from bytes."""
         result = RoaringBitmap64()
-        count = struct.unpack('>Q', data[:8])[0]
-        offset = 8
-        for _ in range(count):
-            value = struct.unpack('>q', data[offset:offset + 8])[0]
-            result.add(value)
-            offset += 8
+        from pyroaring import BitMap64
+        result._data = BitMap64.deserialize(data)
         return result
 
     def __eq__(self, other: object) -> bool:
@@ -152,9 +143,109 @@ class RoaringBitmap64:
         return self._data == other._data
 
     def __hash__(self) -> int:
-        return hash(frozenset(self._data))
+        return hash(tuple(sorted(self._data)))
+
+    def __repr__(self) -> str:
+        values = list(self._data)
+        if len(values) <= 10:
+            return f"RoaringBitmap64({values})"
+        return f"RoaringBitmap64({len(values)} elements)"
+
+
+class RoaringBitmap:
+    """
+    A 32-bit roaring bitmap implementation.
+
+    This class provides efficient storage and operations for sets of 32-bit 
integers.
+    It uses pyroaring.BitMap for better performance and memory efficiency.
+    """
+
+    def __init__(self):
+        from pyroaring import BitMap
+        self._data = BitMap()
+
+    def add(self, value: int) -> None:
+        """Add a single value to the bitmap."""
+        self._data.add(value)
+
+    def add_range(self, from_: int, to: int) -> None:
+        """Add a range of values [from_, to] to the bitmap."""
+        self._data.add_range(from_, to + 1)
+
+    def contains(self, value: int) -> bool:
+        """Check if the bitmap contains the given value."""
+        return value in self._data
+
+    def is_empty(self) -> bool:
+        """Check if the bitmap is empty."""
+        return len(self._data) == 0
+
+    def cardinality(self) -> int:
+        """Return the number of elements in the bitmap."""
+        return len(self._data)
+
+    def __iter__(self) -> Iterator[int]:
+        """Iterate over all values in the bitmap in sorted order."""
+        return iter(self._data)
+
+    def __len__(self) -> int:
+        """Return the number of elements in the bitmap."""
+        return len(self._data)
+
+    def __contains__(self, value: int) -> bool:
+        """Check if the bitmap contains the given value."""
+        return self.contains(value)
+
+    def clear(self) -> None:
+        """Clear all values from the bitmap."""
+        self._data.clear()
+
+    def to_list(self) -> list:
+        """Return a sorted list of all values in the bitmap."""
+        return list(self._data)
+
+    @staticmethod
+    def and_(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
+        """Return the intersection of two bitmaps."""
+        result = RoaringBitmap()
+        result._data = a._data & b._data
+        return result
+
+    @staticmethod
+    def or_(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
+        """Return the union of two bitmaps."""
+        result = RoaringBitmap()
+        result._data = a._data | b._data
+        return result
+
+    @staticmethod
+    def remove_all(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
+        result = RoaringBitmap()
+        result._data = a._data - b._data
+        return result
+
+    def serialize(self) -> bytes:
+        """Serialize the bitmap to bytes."""
+        return self._data.serialize()
+
+    @staticmethod
+    def deserialize(data: bytes) -> 'RoaringBitmap':
+        """Deserialize a bitmap from bytes."""
+        result = RoaringBitmap()
+        from pyroaring import BitMap
+        result._data = BitMap.deserialize(data)
+        return result
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, RoaringBitmap):
+            return False
+        return self._data == other._data
+
+    def __hash__(self) -> int:
+        return hash(tuple(sorted(self._data)))
 
     def __repr__(self) -> str:
-        if len(self._data) <= 10:
-            return f"RoaringBitmap64({sorted(self._data)})"
-        return f"RoaringBitmap64({len(self._data)} elements)"
+        values = list(self._data)
+        if len(values) <= 10:
+            return f"RoaringBitmap({values})"
+        return f"RoaringBitmap({len(values)} elements)"

Reply via email to