This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new ee095bccaf [core] Refactor VectorSearch to keep required fields
ee095bccaf is described below
commit ee095bccaf8bde3aaea3240f3cb8f208108550ae
Author: JingsongLi <[email protected]>
AuthorDate: Tue Dec 23 18:10:42 2025 +0800
[core] Refactor VectorSearch to keep required fields
---
.gitignore | 1 +
.../paimon/globalindex/GlobalIndexEvaluator.java | 6 +--
.../org/apache/paimon/predicate/VectorSearch.java | 55 +++++++---------------
.../index/LuceneVectorGlobalIndexReader.java | 54 ++++++++-------------
.../index/LuceneVectorGlobalIndexWriter.java | 8 ++--
.../lucene/index/LuceneVectorGlobalIndexer.java | 4 +-
.../lucene/index/LuceneVectorIndexFactory.java | 6 +--
.../index/LuceneVectorSearchGlobalIndexResult.java | 2 +-
.../index/LuceneVectorGlobalIndexScanTest.java | 7 ++-
.../lucene/index/LuceneVectorGlobalIndexTest.java | 24 ++++------
10 files changed, 63 insertions(+), 104 deletions(-)
diff --git a/.gitignore b/.gitignore
index 50d63fc69f..d961d026a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ paimon-python/.idea/
paimon-python/dist/
paimon-python/*.egg-info/
paimon-python/dev/log
+paimon-lucene/.idea/
### VS Code ###
.vscode/
diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexEvaluator.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexEvaluator.java
index 3eee4807d2..694480882d 100644
--- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexEvaluator.java
+++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexEvaluator.java
@@ -63,9 +63,9 @@ public class GlobalIndexEvaluator
int fieldId = rowType.getField(vectorSearch.fieldName()).id();
Collection<GlobalIndexReader> readers =
indexReadersCache.computeIfAbsent(fieldId,
readersFunction::apply);
- compoundResult.ifPresent(
- globalIndexResult ->
-
vectorSearch.withIncludeRowIds(globalIndexResult.results().iterator()));
+ if (compoundResult.isPresent()) {
+ vectorSearch =
vectorSearch.withIncludeRowIds(compoundResult.get().results());
+ }
for (GlobalIndexReader fileIndexReader : readers) {
GlobalIndexResult childResult =
vectorSearch.visit(fileIndexReader);
// AND Operation
diff --git a/paimon-common/src/main/java/org/apache/paimon/predicate/VectorSearch.java b/paimon-common/src/main/java/org/apache/paimon/predicate/VectorSearch.java
index d4ede50b6f..9097317821 100644
--- a/paimon-common/src/main/java/org/apache/paimon/predicate/VectorSearch.java
+++ b/paimon-common/src/main/java/org/apache/paimon/predicate/VectorSearch.java
@@ -20,30 +20,26 @@ package org.apache.paimon.predicate;
import org.apache.paimon.globalindex.GlobalIndexReader;
import org.apache.paimon.globalindex.GlobalIndexResult;
+import org.apache.paimon.utils.RoaringNavigableMap64;
import javax.annotation.Nullable;
import java.io.Serializable;
-import java.util.Iterator;
-import java.util.Optional;
/** VectorSearch to perform vector similarity search. * */
public class VectorSearch implements Serializable {
+
private static final long serialVersionUID = 1L;
- private Object search;
- private String fieldName;
- private Optional<String> similarityFunction;
- private int limit;
- private Iterator<Long> includeRowIds;
-
- public VectorSearch(
- Object search,
- int limit,
- String fieldName,
- @Nullable Iterator<Long> includeRowIds,
- @Nullable String similarityFunction) {
- if (search == null) {
+ // float[] or byte[]
+ private final Object vector;
+ private final String fieldName;
+ private final int limit;
+
+ @Nullable private RoaringNavigableMap64 includeRowIds;
+
+ public VectorSearch(Object vector, int limit, String fieldName) {
+ if (vector == null) {
throw new IllegalArgumentException("Search cannot be null");
}
if (limit <= 0) {
@@ -52,23 +48,14 @@ public class VectorSearch implements Serializable {
if (fieldName == null || fieldName.isEmpty()) {
throw new IllegalArgumentException("Field name cannot be null or
empty");
}
- this.search = search;
+ this.vector = vector;
this.limit = limit;
this.fieldName = fieldName;
- this.similarityFunction = Optional.ofNullable(similarityFunction);
- this.includeRowIds = includeRowIds;
- }
-
- public VectorSearch(Object search, int limit, String fieldName) {
- this(search, limit, fieldName, null, null);
- }
-
- public VectorSearch(Object search, int limit, String fieldName,
Iterator<Long> includeRowIds) {
- this(search, limit, fieldName, includeRowIds, null);
}
- public Object search() {
- return search;
+ // float[] or byte[]
+ public Object vector() {
+ return vector;
}
public int limit() {
@@ -79,15 +66,11 @@ public class VectorSearch implements Serializable {
return fieldName;
}
- public Optional<String> similarityFunction() {
- return similarityFunction;
- }
-
- public Iterator<Long> includeRowIds() {
+ public RoaringNavigableMap64 includeRowIds() {
return includeRowIds;
}
- public VectorSearch withIncludeRowIds(Iterator<Long> includeRowIds) {
+ public VectorSearch withIncludeRowIds(RoaringNavigableMap64 includeRowIds)
{
this.includeRowIds = includeRowIds;
return this;
}
@@ -98,8 +81,6 @@ public class VectorSearch implements Serializable {
@Override
public String toString() {
- return String.format(
- "FieldName(%s), SimilarityFunction(%s), Limit(%s)",
- fieldName, similarityFunction, limit);
+ return String.format("FieldName(%s), Limit(%s)", fieldName, limit);
}
}
diff --git a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexReader.java b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexReader.java
index 7d8af60332..818644027b 100644
--- a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexReader.java
+++ b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexReader.java
@@ -29,6 +29,7 @@ import org.apache.paimon.types.ArrayType;
import org.apache.paimon.types.DataType;
import org.apache.paimon.types.FloatType;
import org.apache.paimon.types.TinyIntType;
+import org.apache.paimon.utils.IOUtils;
import org.apache.paimon.utils.Range;
import org.apache.paimon.utils.RoaringNavigableMap64;
@@ -68,17 +69,14 @@ public class LuceneVectorGlobalIndexReader implements
GlobalIndexReader {
private final GlobalIndexFileReader fileReader;
private final GlobalIndexResult defaultResult;
private volatile boolean indicesLoaded = false;
- private final LuceneVectorIndexOptions vectorIndexOptions;
private final DataType fieldType;
public LuceneVectorGlobalIndexReader(
GlobalIndexFileReader fileReader,
List<GlobalIndexIOMeta> ioMetas,
- LuceneVectorIndexOptions options,
DataType fieldType) {
this.fileReader = fileReader;
this.ioMetas = ioMetas;
- this.vectorIndexOptions = options;
this.fieldType = fieldType;
this.searchers = new ArrayList<>();
this.directories = new ArrayList<>();
@@ -88,23 +86,17 @@ public class LuceneVectorGlobalIndexReader implements
GlobalIndexReader {
@Override
public GlobalIndexResult visitVectorSearch(VectorSearch vectorSearch) {
try {
- if (vectorSearch.similarityFunction().isEmpty()
- ||
LuceneVectorMetric.fromString(vectorSearch.similarityFunction().get())
- == vectorIndexOptions.metric()) {
- ensureLoadIndices(fileReader, ioMetas);
- Query query = query(vectorSearch, fieldType);
- return search(query, vectorSearch.limit());
- }
+ ensureLoadIndices(fileReader, ioMetas);
+ Query query = query(vectorSearch, fieldType);
+ return search(query, vectorSearch.limit());
} catch (IOException e) {
throw new RuntimeException(
String.format(
- "Failed to search vector index with fieldName=%s,
similarity=%s, limit=%d",
+ "Failed to search vector index with fieldName=%s,
limit=%d",
vectorSearch.fieldName(),
- vectorIndexOptions.metric(),
vectorSearch.limit()),
e);
}
- return defaultResult;
}
@Override
@@ -153,32 +145,35 @@ public class LuceneVectorGlobalIndexReader implements
GlobalIndexReader {
private Query query(VectorSearch vectorSearch, DataType dataType) {
Query idFilterQuery = null;
- Iterator<Long> includeRowIds = vectorSearch.includeRowIds();
+ RoaringNavigableMap64 includeRowIds = vectorSearch.includeRowIds();
if (includeRowIds != null) {
- ArrayList<Long> targetIds = new ArrayList<>();
- includeRowIds.forEachRemaining(id -> targetIds.add(id));
+ long[] targetIds = new long[includeRowIds.getIntCardinality()];
+ Iterator<Long> iterator = includeRowIds.iterator();
+ for (int i = 0; i < targetIds.length; i++) {
+ targetIds[i] = iterator.next();
+ }
idFilterQuery = LongPoint.newSetQuery(ROW_ID_FIELD, targetIds);
}
if (dataType instanceof ArrayType
&& ((ArrayType) dataType).getElementType() instanceof
FloatType) {
- if (!(vectorSearch.search() instanceof float[])) {
+ if (!(vectorSearch.vector() instanceof float[])) {
throw new IllegalArgumentException(
- "Expected float[] vector but got: " +
vectorSearch.search().getClass());
+ "Expected float[] vector but got: " +
vectorSearch.vector().getClass());
}
return new KnnFloatVectorQuery(
LuceneVectorIndex.VECTOR_FIELD,
- (float[]) vectorSearch.search(),
+ (float[]) vectorSearch.vector(),
vectorSearch.limit(),
idFilterQuery);
} else if (dataType instanceof ArrayType
&& ((ArrayType) dataType).getElementType() instanceof
TinyIntType) {
- if (!(vectorSearch.search() instanceof byte[])) {
+ if (!(vectorSearch.vector() instanceof byte[])) {
throw new IllegalArgumentException(
- "Expected byte[] vector but got: " +
vectorSearch.search().getClass());
+ "Expected byte[] vector but got: " +
vectorSearch.vector().getClass());
}
return new KnnByteVectorQuery(
LuceneVectorIndex.VECTOR_FIELD,
- (byte[]) vectorSearch.search(),
+ (byte[]) vectorSearch.vector(),
vectorSearch.limit(),
idFilterQuery);
} else {
@@ -249,19 +244,8 @@ public class LuceneVectorGlobalIndexReader implements
GlobalIndexReader {
indicesLoaded = true;
} finally {
if (!indicesLoaded) {
- if (reader != null) {
- try {
- reader.close();
- } catch (IOException e) {
- }
- }
- if (directory != null) {
- try {
- directory.close();
- } catch (Exception e) {
- throw new IOException("Failed to
close directory", e);
- }
- }
+ IOUtils.closeQuietly(reader);
+ IOUtils.closeQuietly(directory);
}
}
}
diff --git a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexWriter.java b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexWriter.java
index 1ebde19803..9c363fa779 100644
--- a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexWriter.java
+++ b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexWriter.java
@@ -52,7 +52,7 @@ public class LuceneVectorGlobalIndexWriter implements
GlobalIndexWriter {
private final LuceneVectorIndexFactory vectorIndexFactory;
private long count = 0;
- private final List<LuceneVectorIndex> vectorIndices;
+ private final List<LuceneVectorIndex<?>> vectorIndices;
private final List<ResultEntry> results;
public LuceneVectorGlobalIndexWriter(
@@ -70,7 +70,7 @@ public class LuceneVectorGlobalIndexWriter implements
GlobalIndexWriter {
@Override
public void write(Object key) {
- LuceneVectorIndex index = vectorIndexFactory.create(count, key);
+ LuceneVectorIndex<?> index = vectorIndexFactory.create(count, key);
index.checkDimension(vectorIndexOptions.dimension());
vectorIndices.add(index);
if (vectorIndices.size() >= sizePerIndex) {
@@ -113,7 +113,7 @@ public class LuceneVectorGlobalIndexWriter implements
GlobalIndexWriter {
}
private void buildIndex(
- List<LuceneVectorIndex> batchVectors,
+ List<LuceneVectorIndex<?>> batchVectors,
int m,
int efConstruction,
int writeBufferSize,
@@ -123,7 +123,7 @@ public class LuceneVectorGlobalIndexWriter implements
GlobalIndexWriter {
try (LuceneIndexMMapDirectory luceneIndexMMapDirectory = new
LuceneIndexMMapDirectory()) {
try (IndexWriter writer =
new IndexWriter(luceneIndexMMapDirectory.directory(),
config)) {
- for (LuceneVectorIndex luceneVectorIndex : batchVectors) {
+ for (LuceneVectorIndex<?> luceneVectorIndex : batchVectors) {
Document doc = new Document();
doc.add(luceneVectorIndex.indexableField(similarityFunction));
doc.add(luceneVectorIndex.rowIdLongPoint());
diff --git a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexer.java b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexer.java
index d08a24a49c..a78eb607e9 100644
--- a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexer.java
+++ b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexer.java
@@ -48,7 +48,7 @@ public class LuceneVectorGlobalIndexer implements
GlobalIndexer {
@Override
public GlobalIndexReader createReader(
- GlobalIndexFileReader fileReader, List<GlobalIndexIOMeta> files)
throws IOException {
- return new LuceneVectorGlobalIndexReader(fileReader, files, options,
fieldType);
+ GlobalIndexFileReader fileReader, List<GlobalIndexIOMeta> files) {
+ return new LuceneVectorGlobalIndexReader(fileReader, files, fieldType);
}
}
diff --git a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorIndexFactory.java b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorIndexFactory.java
index df63c532ac..7215410aff 100644
--- a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorIndexFactory.java
+++ b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorIndexFactory.java
@@ -38,12 +38,12 @@ public abstract class LuceneVectorIndexFactory {
}
}
- public abstract LuceneVectorIndex create(long rowId, Object vector);
+ public abstract LuceneVectorIndex<?> create(long rowId, Object vector);
/** Factory for creating LuceneFloatVectorIndex instances. */
public static class LuceneFloatVectorIndexFactory extends
LuceneVectorIndexFactory {
@Override
- public LuceneVectorIndex create(long rowId, Object vector) {
+ public LuceneVectorIndex<?> create(long rowId, Object vector) {
return new LuceneFloatVectorIndex(rowId, (float[]) vector);
}
}
@@ -51,7 +51,7 @@ public abstract class LuceneVectorIndexFactory {
/** Factory for creating LuceneByteVectorIndex instances. */
public static class LuceneByteVectorIndexFactory extends
LuceneVectorIndexFactory {
@Override
- public LuceneVectorIndex create(long rowId, Object vector) {
+ public LuceneVectorIndex<?> create(long rowId, Object vector) {
return new LuceneByteVectorIndex(rowId, (byte[]) vector);
}
}
diff --git a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorSearchGlobalIndexResult.java b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorSearchGlobalIndexResult.java
index b87ddc048b..da7cc89dc6 100644
--- a/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorSearchGlobalIndexResult.java
+++ b/paimon-lucene/src/main/java/org/apache/paimon/lucene/index/LuceneVectorSearchGlobalIndexResult.java
@@ -38,7 +38,7 @@ public class LuceneVectorSearchGlobalIndexResult implements
VectorSearchGlobalIn
@Override
public ScoreGetter scoreGetter() {
- return rowId -> id2scores.get(rowId);
+ return id2scores::get;
}
@Override
diff --git a/paimon-lucene/src/test/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexScanTest.java b/paimon-lucene/src/test/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexScanTest.java
index 6731c90304..93d9f4ffd8 100644
--- a/paimon-lucene/src/test/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexScanTest.java
+++ b/paimon-lucene/src/test/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexScanTest.java
@@ -68,18 +68,17 @@ public class LuceneVectorGlobalIndexScanTest {
private FileStoreTable table;
private String commitUser;
- private Path tablePath;
private FileIO fileIO;
private RowType rowType;
- private String similarityMetric = "EUCLIDEAN";
- private String vectorFieldName = "vec";
+ private final String vectorFieldName = "vec";
@BeforeEach
public void before() throws Exception {
- tablePath = new Path(tempDir.toString());
+ Path tablePath = new Path(tempDir.toString());
fileIO = new LocalFileIO();
SchemaManager schemaManager = new SchemaManager(fileIO, tablePath);
+ String similarityMetric = "EUCLIDEAN";
Schema schema =
Schema.newBuilder()
.column("id", DataTypes.INT())
diff --git a/paimon-lucene/src/test/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexTest.java b/paimon-lucene/src/test/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexTest.java
index 6385b38304..99f0adbdd8 100644
--- a/paimon-lucene/src/test/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexTest.java
+++ b/paimon-lucene/src/test/java/org/apache/paimon/lucene/index/LuceneVectorGlobalIndexTest.java
@@ -58,8 +58,7 @@ public class LuceneVectorGlobalIndexTest {
private FileIO fileIO;
private Path indexPath;
private DataType vectorType;
- private String defaultMetric = "EUCLIDEAN";
- private String fieldName = "vec";
+ private final String fieldName = "vec";
@BeforeEach
public void setup() {
@@ -127,7 +126,7 @@ public class LuceneVectorGlobalIndexTest {
try (LuceneVectorGlobalIndexReader reader =
new LuceneVectorGlobalIndexReader(
- fileReader, metas, indexOptions, vectorType)) {
+ fileReader, metas, vectorType)) {
VectorSearch vectorSearch = new
VectorSearch(testVectors.get(0), 3, fieldName);
GlobalIndexResult searchResult =
reader.visitVectorSearch(vectorSearch);
assertThat(searchResult).isNotNull();
@@ -166,7 +165,7 @@ public class LuceneVectorGlobalIndexTest {
try (LuceneVectorGlobalIndexReader reader =
new LuceneVectorGlobalIndexReader(
- fileReader, metas, indexOptions, vectorType)) {
+ fileReader, metas, vectorType)) {
// Verify search works with this dimension
VectorSearch vectorSearch = new
VectorSearch(testVectors.get(0), 5, fieldName);
GlobalIndexResult searchResult =
reader.visitVectorSearch(vectorSearch);
@@ -215,8 +214,7 @@ public class LuceneVectorGlobalIndexTest {
GlobalIndexFileReader fileReader = createFileReader(indexPath);
List<GlobalIndexIOMeta> metas = new ArrayList<>();
- for (int i = 0; i < results.size(); i++) {
- GlobalIndexWriter.ResultEntry result = results.get(i);
+ for (GlobalIndexWriter.ResultEntry result : results) {
metas.add(
new GlobalIndexIOMeta(
result.fileName(),
@@ -226,7 +224,7 @@ public class LuceneVectorGlobalIndexTest {
}
try (LuceneVectorGlobalIndexReader reader =
- new LuceneVectorGlobalIndexReader(fileReader, metas,
indexOptions, vectorType)) {
+ new LuceneVectorGlobalIndexReader(fileReader, metas,
vectorType)) {
VectorSearch vectorSearch = new VectorSearch(vectors[0], 1,
fieldName);
LuceneVectorSearchGlobalIndexResult result =
(LuceneVectorSearchGlobalIndexResult)
reader.visitVectorSearch(vectorSearch);
@@ -239,7 +237,7 @@ public class LuceneVectorGlobalIndexTest {
filterResults.add(expectedRowId);
vectorSearch =
new VectorSearch(
- vectors[0], 1, fieldName,
filterResults.iterator(), defaultMetric);
+ vectors[0], 1,
fieldName).withIncludeRowIds(filterResults);
result = (LuceneVectorSearchGlobalIndexResult)
reader.visitVectorSearch(vectorSearch);
assertThat(containsRowId(result, expectedRowId)).isTrue();
@@ -281,8 +279,7 @@ public class LuceneVectorGlobalIndexTest {
GlobalIndexFileReader fileReader = createFileReader(indexPath);
List<GlobalIndexIOMeta> metas = new ArrayList<>();
- for (int i = 0; i < results.size(); i++) {
- GlobalIndexWriter.ResultEntry result = results.get(i);
+ for (GlobalIndexWriter.ResultEntry result : results) {
metas.add(
new GlobalIndexIOMeta(
result.fileName(),
@@ -293,7 +290,7 @@ public class LuceneVectorGlobalIndexTest {
try (LuceneVectorGlobalIndexReader reader =
new LuceneVectorGlobalIndexReader(
- fileReader, metas, indexOptions, byteVectorType)) {
+ fileReader, metas, byteVectorType)) {
VectorSearch vectorSearch = new VectorSearch(vectors[0], 1,
fieldName);
GlobalIndexResult result = reader.visitVectorSearch(vectorSearch);
assertThat(result.results().getLongCardinality()).isEqualTo(1);
@@ -310,10 +307,6 @@ public class LuceneVectorGlobalIndexTest {
@Test
public void testInvalidTopK() {
- assertThatThrownBy(() -> new VectorSearch(null, 10, fieldName))
- .isInstanceOf(IllegalArgumentException.class)
- .hasMessageContaining("Search cannot be null");
-
assertThatThrownBy(() -> new VectorSearch(new float[] {0.1f}, 0,
fieldName))
.isInstanceOf(IllegalArgumentException.class)
.hasMessageContaining("Limit must be positive");
@@ -322,6 +315,7 @@ public class LuceneVectorGlobalIndexTest {
private Options createDefaultOptions(int dimension) {
Options options = new Options();
options.setInteger("vector.dim", dimension);
+ String defaultMetric = "EUCLIDEAN";
options.setString("vector.metric", defaultMetric);
options.setInteger("vector.m", 16);
options.setInteger("vector.ef-construction", 100);