This is an automated email from the ASF dual-hosted git repository.
abenedetti pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new 7f454a658ce [SOLR-17812] Add support for
BinaryQuantizedDenseVectorField (#3468)
7f454a658ce is described below
commit 7f454a658cea143275c7c2ed0fb1153b816af851
Author: Kevin Liang <[email protected]>
AuthorDate: Tue Sep 23 06:35:38 2025 -0400
[SOLR-17812] Add support for BinaryQuantizedDenseVectorField (#3468)
* added binary quantisation + documentation and tests
---------
Co-authored-by: kliang78 <[email protected]>
Co-authored-by: Alessandro Benedetti <[email protected]>
---
solr/CHANGES.txt | 2 +
.../schema/BinaryQuantizedDenseVectorField.java | 28 ++++++++++++
.../collection1/conf/schema-densevector-bq.xml | 50 ++++++++++++++++++++++
.../BinaryQuantizedDenseVectorFieldTest.java | 47 ++++++++++++++++++++
.../query-guide/pages/dense-vector-search.adoc | 25 +++++++++++
5 files changed, 152 insertions(+)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index abe7b8f5563..60a2db6c5f1 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -21,6 +21,8 @@ New Features
* SOLR-17780: Add support for scalar quantized dense vectors (Kevin Liang via
Alessandro Benedetti)
+* SOLR-17812: Add support for binary quantized dense vectors (Kevin Liang via
Alessandro Benedetti)
+
* SOLR-17023: Use Modern NLP Models from Apache OpenNLP with Solr (Jeff
Zemerick, Eric Pugh)
* SOLR-17814: Add support for PatienceKnnVectorQuery. (Ilaria Petreti via
Alessandro Benedetti)
diff --git
a/solr/core/src/java/org/apache/solr/schema/BinaryQuantizedDenseVectorField.java
b/solr/core/src/java/org/apache/solr/schema/BinaryQuantizedDenseVectorField.java
new file mode 100644
index 00000000000..4293e9a6d8b
--- /dev/null
+++
b/solr/core/src/java/org/apache/solr/schema/BinaryQuantizedDenseVectorField.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import
org.apache.lucene.codecs.lucene102.Lucene102HnswBinaryQuantizedVectorsFormat;
+
+public class BinaryQuantizedDenseVectorField extends DenseVectorField {
+
+ @Override
+ public KnnVectorsFormat buildKnnVectorsFormat() {
+ return new Lucene102HnswBinaryQuantizedVectorsFormat(getHnswMaxConn(),
getHnswBeamWidth());
+ }
+}
diff --git
a/solr/core/src/test-files/solr/collection1/conf/schema-densevector-bq.xml
b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-bq.xml
new file mode 100644
index 00000000000..056e0928060
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-bq.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for DenseVectorField types -->
+
+<schema name="schema-densevector-bbq" version="1.0">
+ <fieldType name="string" class="solr.StrField" multiValued="true"/>
+ <fieldType name="plong" class="solr.LongPointField"
useDocValuesAsStored="false"/>
+
+ <!-- Binary Bit Quantized vectors -->
+ <fieldType name="knn_vector_binary_quantized"
class="solr.BinaryQuantizedDenseVectorField" vectorDimension="4"/>
+
+ <field name="v_bq" type="knn_vector_binary_quantized" indexed="true"
stored="true" />
+
+ <field name="string_field" type="string" indexed="true" stored="true"
multiValued="false" required="false"/>
+ <field name="id" type="string" indexed="true" stored="true"
multiValued="false" required="false"/>
+ <field name="_version_" type="plong" indexed="true" stored="true"
multiValued="false" />
+ <field name="_text_" type="text_general" indexed="true" stored="false"
multiValued="true"/>
+ <copyField source="*" dest="_text_"/>
+ <fieldType name="text_general" class="solr.TextField"
positionIncrementGap="100" multiValued="true">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" words="stopwords.txt"
ignoreCase="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" words="stopwords.txt"
ignoreCase="true"/>
+ <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <uniqueKey>id</uniqueKey>
+</schema>
diff --git
a/solr/core/src/test/org/apache/solr/schema/BinaryQuantizedDenseVectorFieldTest.java
b/solr/core/src/test/org/apache/solr/schema/BinaryQuantizedDenseVectorFieldTest.java
new file mode 100644
index 00000000000..de08d3f7ed2
--- /dev/null
+++
b/solr/core/src/test/org/apache/solr/schema/BinaryQuantizedDenseVectorFieldTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.solr.core.AbstractBadConfigTestBase;
+import org.junit.Test;
+
+public class BinaryQuantizedDenseVectorFieldTest extends
AbstractBadConfigTestBase {
+ @Test
+ public void fieldDefinition_correctConfiguration_shouldLoadSchemaField()
throws Exception {
+ try {
+ initCore("solrconfig-basic.xml", "schema-densevector-bq.xml");
+ IndexSchema schema = h.getCore().getLatestSchema();
+
+ SchemaField vector = schema.getField("v_bq");
+ assertNotNull(vector);
+
+ BinaryQuantizedDenseVectorField type = (BinaryQuantizedDenseVectorField)
vector.getType();
+ assertEquals(4, type.getDimension());
+ assertTrue(vector.indexed());
+ assertTrue(vector.stored());
+ } finally {
+ deleteCore();
+ }
+ }
+
+ // there are no major interface differences between
BinaryQuantizedDenseVectorField and
+ // DenseVectorField
+ // so we can rely on those tests for validation cases
+ //
+ // as for behavior, there are no externally visible state differences.
Internal implementation
+ // is tested at Lucene level
+}
diff --git
a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
index d596fbaf985..c52fa7ffb08 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
@@ -316,6 +316,31 @@ preserved when `stored` is true.
+
Accepted values: `BOOLEAN`
+=== BinaryQuantizedDenseVectorField
+
+Binary quantization is a quantization technique that extends scalar
quantization and is even more aggressive in its compression:
+it can reduce the in-memory representation of each vector dimension from a 32-bit
float down to a single bit.
+This is done by normalizing each dimension of a vector relative to a centroid
(mid-point pre-calculated against all vectors in the index)
+with the stored bit representing whether the actual value is "above" or
"below" the centroid's value. A further "corrective factor" is also computed
+and stored to help compensate for the accuracy lost in the estimated distance. At query
time, asymmetric quantization is applied to the query
+vector (reducing its dimension values down to 4 bits each), while still allowing
comparison with the stored binary quantized vector via bit arithmetic.
+
+This implementation builds on LVQ, proposed in
https://arxiv.org/abs/2304.04759[Similarity Search in the Blink of an Eye With
Compressed Indices]
+by Cecilia Aguerrebere et al., previous work on globally optimized scalar
quantization in Apache Lucene, and ideas from
+https://arxiv.org/abs/1908.10396[Accelerating Large-Scale Inference with
Anisotropic Vector Quantization] by Ruiqi Guo et al.
+
+This vector type is best utilized for data sets consisting of large numbers of
high-dimensional vectors.
+
+Here is how a BinaryQuantizedDenseVectorField can be defined in the schema:
+
+[source,xml]
+<fieldType name="binary_quantized_vector"
class="solr.BinaryQuantizedDenseVectorField" vectorDimension="4"/>
+<field name="vector" type="binary_quantized_vector" indexed="true"
stored="true"/>
+
+BinaryQuantizedDenseVectorField accepts the same parameters as
`DenseVectorField` with the only notable exception being
+`similarityFunction`. Binary quantization uses its own distance calculation and
so neither requires nor uses the `similarityFunction`
+param.
+
== Query Time
Apache Solr provides three query parsers that work with dense vector fields,
that each support different ways of matching documents based on vector
similarity: The `knn` query parser, the `vectorSimilarity` query parser and the
`knn_text_to_vector` query parser.