This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 47e780187e [python] Use pyroaring.BitMap64 for better performance
(#7185)
47e780187e is described below
commit 47e780187e50cf0132816e56144eaab127cd7c30
Author: Jingsong Lee <[email protected]>
AuthorDate: Tue Feb 3 13:06:54 2026 +0800
[python] Use pyroaring.BitMap64 for better performance (#7185)
---
.../apply_deletion_vector_reader.py | 13 +-
.../deletionvectors/bitmap_deletion_vector.py | 8 +-
.../pypaimon/deletionvectors/deletion_vector.py | 3 +-
paimon-python/pypaimon/globalindex/__init__.py | 2 -
.../globalindex/btree/btree_index_reader.py | 2 +-
.../globalindex/faiss/faiss_vector_reader.py | 2 +-
.../pypaimon/globalindex/global_index_result.py | 2 +-
.../pypaimon/globalindex/vector_search.py | 2 +-
.../pypaimon/globalindex/vector_search_result.py | 2 +-
.../{globalindex => utils}/roaring_bitmap.py | 143 +++++++++++++++++----
10 files changed, 134 insertions(+), 45 deletions(-)
diff --git
a/paimon-python/pypaimon/deletionvectors/apply_deletion_vector_reader.py
b/paimon-python/pypaimon/deletionvectors/apply_deletion_vector_reader.py
index c9a143bdc9..8adc3d8ea0 100644
--- a/paimon-python/pypaimon/deletionvectors/apply_deletion_vector_reader.py
+++ b/paimon-python/pypaimon/deletionvectors/apply_deletion_vector_reader.py
@@ -23,9 +23,7 @@ from pyarrow import RecordBatch
from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
from pypaimon.read.reader.iface.record_iterator import RecordIterator
from pypaimon.deletionvectors.deletion_vector import DeletionVector
-
-from pyroaring import BitMap
-
+from pypaimon.utils.roaring_bitmap import RoaringBitmap
from pypaimon.read.reader.iface.record_reader import RecordReader
@@ -57,10 +55,11 @@ class ApplyDeletionVectorReader(RecordBatchReader):
if arrow_batch is None:
return None
# Remove the deleted rows from the batch
- range_bitmap = BitMap(
- range(self._reader.return_batch_pos() - arrow_batch.num_rows,
self._reader.return_batch_pos()))
- intersection_bitmap = range_bitmap - self._deletion_vector.bit_map()
- added_row_list = [x - (self._reader.return_batch_pos() -
arrow_batch.num_rows) for x in
+ range_bitmap = RoaringBitmap()
+ return_batch_pos = self._reader.return_batch_pos()
+ range_bitmap.add_range(return_batch_pos - arrow_batch.num_rows,
return_batch_pos - 1)
+ intersection_bitmap = RoaringBitmap.remove_all(range_bitmap,
self._deletion_vector.bit_map())
+ added_row_list = [x - (return_batch_pos - arrow_batch.num_rows) for x
in
list(intersection_bitmap)]
return arrow_batch.take(pyarrow.array(added_row_list,
type=pyarrow.int32()))
diff --git a/paimon-python/pypaimon/deletionvectors/bitmap_deletion_vector.py
b/paimon-python/pypaimon/deletionvectors/bitmap_deletion_vector.py
index 0afbe642df..5d3a5170f5 100644
--- a/paimon-python/pypaimon/deletionvectors/bitmap_deletion_vector.py
+++ b/paimon-python/pypaimon/deletionvectors/bitmap_deletion_vector.py
@@ -18,7 +18,7 @@
from pypaimon.deletionvectors.deletion_vector import DeletionVector
import struct
import zlib
-from pyroaring import BitMap
+from pypaimon.utils.roaring_bitmap import RoaringBitmap
class BitmapDeletionVector(DeletionVector):
@@ -31,14 +31,14 @@ class BitmapDeletionVector(DeletionVector):
MAGIC_NUMBER_SIZE_BYTES = 4
MAX_VALUE = 2147483647
- def __init__(self, bitmap: BitMap = None):
+ def __init__(self, bitmap: RoaringBitmap = None):
"""
Initialize a BitmapDeletionVector.
Args:
bitmap: Optional RoaringBitmap instance. If None, creates an empty
bitmap.
"""
- self._bitmap = bitmap if bitmap is not None else BitMap()
+ self._bitmap = bitmap if bitmap is not None else RoaringBitmap()
def delete(self, position: int) -> None:
"""
@@ -121,7 +121,7 @@ class BitmapDeletionVector(DeletionVector):
Returns:
A BitmapDeletionVector instance.
"""
- bitmap = BitMap.deserialize(data)
+ bitmap = RoaringBitmap.deserialize(data)
return BitmapDeletionVector(bitmap)
def bit_map(self):
diff --git a/paimon-python/pypaimon/deletionvectors/deletion_vector.py
b/paimon-python/pypaimon/deletionvectors/deletion_vector.py
index 44c179653f..866aed177c 100644
--- a/paimon-python/pypaimon/deletionvectors/deletion_vector.py
+++ b/paimon-python/pypaimon/deletionvectors/deletion_vector.py
@@ -19,6 +19,7 @@ from abc import ABC, abstractmethod
from pypaimon.common.file_io import FileIO
from pypaimon.table.source.deletion_file import DeletionFile
+from pypaimon.utils.roaring_bitmap import RoaringBitmap
class DeletionVector(ABC):
@@ -28,7 +29,7 @@ class DeletionVector(ABC):
"""
@abstractmethod
- def bit_map(self):
+ def bit_map(self) -> RoaringBitmap:
"""
Returns the bitmap of the DeletionVector.
"""
diff --git a/paimon-python/pypaimon/globalindex/__init__.py
b/paimon-python/pypaimon/globalindex/__init__.py
index ae8e6328d1..f9d3aa9357 100644
--- a/paimon-python/pypaimon/globalindex/__init__.py
+++ b/paimon-python/pypaimon/globalindex/__init__.py
@@ -30,7 +30,6 @@ from pypaimon.globalindex.global_index_scan_builder import (
GlobalIndexScanBuilder,
RowRangeGlobalIndexScanner,
)
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
from pypaimon.globalindex.range import Range
__all__ = [
@@ -46,6 +45,5 @@ __all__ = [
'GlobalIndexEvaluator',
'GlobalIndexScanBuilder',
'RowRangeGlobalIndexScanner',
- 'RoaringBitmap64',
'Range',
]
diff --git a/paimon-python/pypaimon/globalindex/btree/btree_index_reader.py
b/paimon-python/pypaimon/globalindex/btree/btree_index_reader.py
index f5172b167a..cff97c8093 100644
--- a/paimon-python/pypaimon/globalindex/btree/btree_index_reader.py
+++ b/paimon-python/pypaimon/globalindex/btree/btree_index_reader.py
@@ -33,7 +33,7 @@ from pypaimon.globalindex.btree.key_serializer import
KeySerializer
from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
from pypaimon.globalindex.global_index_reader import FieldRef,
GlobalIndexReader
from pypaimon.globalindex.global_index_result import GlobalIndexResult
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+from pypaimon.utils.roaring_bitmap import RoaringBitmap64
from pypaimon.globalindex.btree.btree_file_footer import BTreeFileFooter
from pypaimon.globalindex.btree.sst_file_reader import SstFileReader
from pypaimon.globalindex.btree.memory_slice_input import MemorySliceInput
diff --git a/paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py
b/paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py
index 3201894d27..501364075f 100644
--- a/paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py
+++ b/paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py
@@ -31,7 +31,7 @@ from pypaimon.globalindex.global_index_reader import
GlobalIndexReader
from pypaimon.globalindex.global_index_result import GlobalIndexResult
from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
from pypaimon.globalindex.vector_search_result import
DictBasedScoredIndexResult
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+from pypaimon.utils.roaring_bitmap import RoaringBitmap64
from pypaimon.globalindex.faiss.faiss_options import (
FaissVectorIndexOptions,
FaissVectorMetric,
diff --git a/paimon-python/pypaimon/globalindex/global_index_result.py
b/paimon-python/pypaimon/globalindex/global_index_result.py
index 9ba191ce78..664467e42e 100644
--- a/paimon-python/pypaimon/globalindex/global_index_result.py
+++ b/paimon-python/pypaimon/globalindex/global_index_result.py
@@ -19,7 +19,7 @@
from abc import ABC, abstractmethod
from typing import Callable, List, Optional
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+from pypaimon.utils.roaring_bitmap import RoaringBitmap64
from pypaimon.globalindex.range import Range
diff --git a/paimon-python/pypaimon/globalindex/vector_search.py
b/paimon-python/pypaimon/globalindex/vector_search.py
index f7d7209ff3..525b8deb51 100644
--- a/paimon-python/pypaimon/globalindex/vector_search.py
+++ b/paimon-python/pypaimon/globalindex/vector_search.py
@@ -65,7 +65,7 @@ class VectorSearch:
"""
Create a new VectorSearch with include_row_ids offset to the given
range.
"""
- from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+ from pypaimon.utils.roaring_bitmap import RoaringBitmap64
if self.include_row_ids is not None:
range_bitmap = RoaringBitmap64()
diff --git a/paimon-python/pypaimon/globalindex/vector_search_result.py
b/paimon-python/pypaimon/globalindex/vector_search_result.py
index e4fdcd6cc8..60afe331a9 100644
--- a/paimon-python/pypaimon/globalindex/vector_search_result.py
+++ b/paimon-python/pypaimon/globalindex/vector_search_result.py
@@ -22,7 +22,7 @@ from abc import abstractmethod
from typing import Callable, Dict, Optional
from pypaimon.globalindex.global_index_result import GlobalIndexResult
-from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
+from pypaimon.utils.roaring_bitmap import RoaringBitmap64
# Type alias for score getter function
diff --git a/paimon-python/pypaimon/globalindex/roaring_bitmap.py
b/paimon-python/pypaimon/utils/roaring_bitmap.py
similarity index 52%
rename from paimon-python/pypaimon/globalindex/roaring_bitmap.py
rename to paimon-python/pypaimon/utils/roaring_bitmap.py
index c91f0c149a..c3f5a647ad 100644
--- a/paimon-python/pypaimon/globalindex/roaring_bitmap.py
+++ b/paimon-python/pypaimon/utils/roaring_bitmap.py
@@ -20,20 +20,20 @@
Roaring Bitmap.
"""
-from typing import Iterator, Set
-import struct
+from typing import Iterator
class RoaringBitmap64:
"""
A 64-bit roaring bitmap implementation.
+
This class provides efficient storage and operations for sets of 64-bit
integers.
- It uses a set-based implementation for simplicity, which can be replaced
with
- a more efficient roaring bitmap library if needed.
+ It uses pyroaring.BitMap64 for better performance and memory efficiency.
"""
def __init__(self):
- self._data: Set[int] = set()
+ from pyroaring import BitMap64
+ self._data = BitMap64()
def add(self, value: int) -> None:
"""Add a single value to the bitmap."""
@@ -41,8 +41,7 @@ class RoaringBitmap64:
def add_range(self, from_: int, to: int) -> None:
"""Add a range of values [from_, to] to the bitmap."""
- for i in range(from_, to + 1):
- self._data.add(i)
+ self._data.add_range(from_, to + 1)
def contains(self, value: int) -> bool:
"""Check if the bitmap contains the given value."""
@@ -58,7 +57,7 @@ class RoaringBitmap64:
def __iter__(self) -> Iterator[int]:
"""Iterate over all values in the bitmap in sorted order."""
- return iter(sorted(self._data))
+ return iter(self._data)
def __len__(self) -> int:
"""Return the number of elements in the bitmap."""
@@ -74,7 +73,7 @@ class RoaringBitmap64:
def to_list(self) -> list:
"""Return a sorted list of all values in the bitmap."""
- return sorted(self._data)
+ return list(self._data)
def to_range_list(self) -> list:
"""
@@ -85,8 +84,9 @@ class RoaringBitmap64:
if self.is_empty():
return []
- sorted_values = sorted(self._data)
+ # Use pyroaring's efficient iteration
ranges = []
+ sorted_values = list(self._data)
start = sorted_values[0]
end = start
@@ -127,23 +127,14 @@ class RoaringBitmap64:
def serialize(self) -> bytes:
"""Serialize the bitmap to bytes."""
- # Simple serialization format: count followed by sorted values
- values = sorted(self._data)
- data = struct.pack('>Q', len(values)) # 8-byte count
- for v in values:
- data += struct.pack('>q', v) # 8-byte signed value
- return data
+ return self._data.serialize()
@staticmethod
def deserialize(data: bytes) -> 'RoaringBitmap64':
"""Deserialize a bitmap from bytes."""
result = RoaringBitmap64()
- count = struct.unpack('>Q', data[:8])[0]
- offset = 8
- for _ in range(count):
- value = struct.unpack('>q', data[offset:offset + 8])[0]
- result.add(value)
- offset += 8
+ from pyroaring import BitMap64
+ result._data = BitMap64.deserialize(data)
return result
def __eq__(self, other: object) -> bool:
@@ -152,9 +143,109 @@ class RoaringBitmap64:
return self._data == other._data
def __hash__(self) -> int:
- return hash(frozenset(self._data))
+ return hash(tuple(sorted(self._data)))
+
+ def __repr__(self) -> str:
+ values = list(self._data)
+ if len(values) <= 10:
+ return f"RoaringBitmap64({values})"
+ return f"RoaringBitmap64({len(values)} elements)"
+
+
+class RoaringBitmap:
+ """
+ A 32-bit roaring bitmap implementation.
+
+ This class provides efficient storage and operations for sets of 32-bit
integers.
+ It uses pyroaring.BitMap for better performance and memory efficiency.
+ """
+
+ def __init__(self):
+ from pyroaring import BitMap
+ self._data = BitMap()
+
+ def add(self, value: int) -> None:
+ """Add a single value to the bitmap."""
+ self._data.add(value)
+
+ def add_range(self, from_: int, to: int) -> None:
+ """Add a range of values [from_, to] to the bitmap."""
+ self._data.add_range(from_, to + 1)
+
+ def contains(self, value: int) -> bool:
+ """Check if the bitmap contains the given value."""
+ return value in self._data
+
+ def is_empty(self) -> bool:
+ """Check if the bitmap is empty."""
+ return len(self._data) == 0
+
+ def cardinality(self) -> int:
+ """Return the number of elements in the bitmap."""
+ return len(self._data)
+
+ def __iter__(self) -> Iterator[int]:
+ """Iterate over all values in the bitmap in sorted order."""
+ return iter(self._data)
+
+ def __len__(self) -> int:
+ """Return the number of elements in the bitmap."""
+ return len(self._data)
+
+ def __contains__(self, value: int) -> bool:
+ """Check if the bitmap contains the given value."""
+ return self.contains(value)
+
+ def clear(self) -> None:
+ """Clear all values from the bitmap."""
+ self._data.clear()
+
+ def to_list(self) -> list:
+ """Return a sorted list of all values in the bitmap."""
+ return list(self._data)
+
+ @staticmethod
+ def and_(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
+ """Return the intersection of two bitmaps."""
+ result = RoaringBitmap()
+ result._data = a._data & b._data
+ return result
+
+ @staticmethod
+ def or_(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
+ """Return the union of two bitmaps."""
+ result = RoaringBitmap()
+ result._data = a._data | b._data
+ return result
+
+ @staticmethod
+ def remove_all(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
+ result = RoaringBitmap()
+ result._data = a._data - b._data
+ return result
+
+ def serialize(self) -> bytes:
+ """Serialize the bitmap to bytes."""
+ return self._data.serialize()
+
+ @staticmethod
+ def deserialize(data: bytes) -> 'RoaringBitmap':
+ """Deserialize a bitmap from bytes."""
+ result = RoaringBitmap()
+ from pyroaring import BitMap
+ result._data = BitMap.deserialize(data)
+ return result
+
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, RoaringBitmap):
+ return False
+ return self._data == other._data
+
+ def __hash__(self) -> int:
+ return hash(tuple(sorted(self._data)))
def __repr__(self) -> str:
- if len(self._data) <= 10:
- return f"RoaringBitmap64({sorted(self._data)})"
- return f"RoaringBitmap64({len(self._data)} elements)"
+ values = list(self._data)
+ if len(values) <= 10:
+ return f"RoaringBitmap({values})"
+ return f"RoaringBitmap({len(values)} elements)"