This is an automated email from the ASF dual-hosted git repository.
thomasm pushed a commit to branch OAK-12101
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/OAK-12101 by this push:
new 88cbec0ffb OAK-12101 - Skip indexing of very long tags (#2768)
88cbec0ffb is described below
commit 88cbec0ffba19e26ad404b3c740ce2cb7e027823
Author: Anton Hosgood <[email protected]>
AuthorDate: Fri Feb 27 14:07:38 2026 +0100
OAK-12101 - Skip indexing of very long tags (#2768)
* feat: skip long similarity tags
* feat: add tests
* feat: log warning once per minute
* feat: skip long similarity tags for dynamic boosting
* fix: similarity tag constant naming
* feat: add option to disable filtering
* refactor: standardise naming with existing conventions
* feat: silence logs per property
* fix: comment
* feat: add dynamic boost test
* doc: add details to lucene.md
* fix: test name
---------
Co-authored-by: Anton Hosgood <[email protected]>
---
oak-doc/src/site/markdown/query/lucene.md | 19 ++++++++++
.../plugins/index/lucene/LuceneDocumentMaker.java | 4 +-
.../index/lucene/LuceneDocumentMakerTest.java | 43 +++++++++++++++++++++-
.../dynamicBoost/LuceneDynamicBoostTest.java | 31 +++++++++++++++-
.../index/elastic/index/ElasticDocumentMaker.java | 7 ++--
.../index/search/FulltextIndexConstants.java | 6 +++
.../oak/plugins/index/search/IndexDefinition.java | 12 ++++++
.../search/spi/editor/FulltextDocumentMaker.java | 26 ++++++++++++-
.../oak/plugins/index/DynamicBoostCommonTest.java | 9 +++++
.../plugins/index/search/IndexDefinitionTest.java | 13 +++++++
10 files changed, 158 insertions(+), 12 deletions(-)
diff --git a/oak-doc/src/site/markdown/query/lucene.md
b/oak-doc/src/site/markdown/query/lucene.md
index 0a1c2f5a39..a41649ec3e 100644
--- a/oak-doc/src/site/markdown/query/lucene.md
+++ b/oak-doc/src/site/markdown/query/lucene.md
@@ -154,6 +154,7 @@ Below is the canonical index definition structure
- queryPaths (string) multiple = ['/']
- excludedPaths (string) multiple
- maxFieldLength (long) = 10000
+ - maxTagLength (long) = 100
- refresh (boolean)
- useIfExists (string)
- blobSize (long) = 32768
@@ -233,6 +234,13 @@ selectionPolicy
[maxFieldLength][OAK-2469]
: Numbers of terms indexed per field. Defaults to 10000
+[maxTagLength][OAK-12101]
+: Optional long property. Defaults to 100.
+: Maximum length of similarity tag and dynamic boost tag values to be indexed.
+ Tags with values longer than this limit are skipped during indexing.
+ Set to -1 to disable the length check entirely.
+ See [Dynamic Boost](#dynamic-boost) and [Search by similar feature
vectors](#similar-fv) for details.
+
refresh
: Optional boolean property.
: Used to refresh the stored index definition. See [Effective Index
Definition](#stored-index-definition)
@@ -1231,6 +1239,11 @@ with boost set to the confidence.
This is a replacement for the `IndexFieldProvider`.
See also [OAK-8971][OAK-8971].
+Tag values that exceed the configured `maxTagLength` (default 100) are skipped
during indexing.
+This prevents unexpectedly long values from being indexed as dynamic boost
tags.
+The limit can be changed by setting the `maxTagLength` property on the index
definition,
+or disabled entirely by setting it to -1. See [OAK-12101][OAK-12101].
+
### <a name="native-query"></a>Native Query and Index Selection
`@deprecated Oak 1.46`
@@ -1702,6 +1715,11 @@ As a further improvement for the accuracy of similarity
search results if nodes
holding text values that can be used as keywords or tags that well describe
the feature vector contents, the
`similarityTags` configuration can be set to _true_ for such properties (see
[OAK-8118](https://issues.apache.org/jira/browse/OAK-8118)).
+Similarity tag values that exceed the configured `maxTagLength` (default 100)
are skipped during indexing.
+This prevents unexpectedly long values from being indexed as similarity tags.
+The limit can be changed by setting the `maxTagLength` property on the index
definition,
+or disabled entirely by setting it to -1. See [OAK-12101][OAK-12101].
+
See also [OAK-7575](https://issues.apache.org/jira/browse/OAK-7575).
@@ -2231,6 +2249,7 @@ SELECT rep:facet(title) FROM [app:Asset] WHERE [title] IS
NOT NULL
[OAK-7739]: https://issues.apache.org/jira/browse/OAK-7739
[OAK-8971]: https://issues.apache.org/jira/browse/OAK-8971
[OAK-9625]: https://issues.apache.org/jira/browse/OAK-9625
+[OAK-12101]: https://issues.apache.org/jira/browse/OAK-12101
[luke]: https://code.google.com/p/luke/
[tika]: http://tika.apache.org/
[oak-console]:
https://github.com/apache/jackrabbit-oak/tree/trunk/oak-run#console
diff --git
a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
index f33f86312a..6192b54ae1 100644
---
a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
+++
b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
@@ -415,8 +415,8 @@ public class LuceneDocumentMaker extends
FulltextDocumentMaker<Document> {
}
@Override
- protected boolean indexSimilarityTag(Document doc, PropertyState property)
{
- doc.add(new TextField(FieldNames.SIMILARITY_TAGS,
property.getValue(Type.STRING), Field.Store.YES));
+ protected boolean indexSimilarityTag(Document doc, String value) {
+ doc.add(new TextField(FieldNames.SIMILARITY_TAGS, value,
Field.Store.YES));
return true;
}
diff --git
a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMakerTest.java
b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMakerTest.java
index 8f5cf9b0e9..2c2e804fbe 100644
---
a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMakerTest.java
+++
b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMakerTest.java
@@ -21,23 +21,27 @@ package org.apache.jackrabbit.oak.plugins.index.lucene;
import org.apache.jackrabbit.oak.api.Type;
import
org.apache.jackrabbit.oak.plugins.index.lucene.util.LuceneIndexDefinitionBuilder;
+import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
+import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.lucene.document.Document;
import org.junit.Test;
import java.util.List;
import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT;
import static
org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState.EMPTY_NODE;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
public class LuceneDocumentMakerTest {
private final NodeState root = INITIAL_CONTENT;
- private final LuceneIndexDefinitionBuilder builder = new
LuceneIndexDefinitionBuilder();
@Test
public void excludeSingleProperty() throws Exception{
+ LuceneIndexDefinitionBuilder builder = new
LuceneIndexDefinitionBuilder();
builder.indexRule("nt:base")
.property("foo")
.propertyIndex()
@@ -63,4 +67,39 @@ public class LuceneDocumentMakerTest {
assertNull(docMaker.makeDocument(test.getNodeState()));
}
-}
\ No newline at end of file
+ @Test
+ public void similarityTagMaxLengthFiltering() throws Exception{ // similarity tags longer than maxTagLength must not be indexed
+ LuceneIndexDefinitionBuilder builder = new
LuceneIndexDefinitionBuilder();
+ builder.indexRule("nt:base")
+ .property("jcr:primaryType")
+ .propertyIndex();
+ builder.indexRule("nt:base")
+ .property("tag")
+ .similarityTags(true);
+
+
builder.getBuilderTree().setProperty(FulltextIndexConstants.MAX_TAG_LENGTH, 10); // lower the limit to 10 for this test
+
+ LuceneIndexDefinition defn =
LuceneIndexDefinition.newLuceneBuilder(root, builder.build(), "/foo").build();
+ LuceneDocumentMaker docMaker = new LuceneDocumentMaker(defn,
+ defn.getApplicableIndexingRule("nt:base"), "/x");
+
+ NodeBuilder test = EMPTY_NODE.builder();
+ test.setProperty("tag", "short"); // 5 characters: below the limit
+ Document doc = docMaker.makeDocument(test.getNodeState());
+ assertNotNull(doc);
+ assertEquals("short", doc.get(FieldNames.SIMILARITY_TAGS));
+
+ test = EMPTY_NODE.builder();
+ test.setProperty("tag", "exactly10!"); // exactly 10 characters: the limit is inclusive
+ doc = docMaker.makeDocument(test.getNodeState());
+ assertNotNull(doc);
+ assertEquals("exactly10!", doc.get(FieldNames.SIMILARITY_TAGS));
+
+ test = EMPTY_NODE.builder();
+ test.setProperty("tag", "this is too long"); // 16 characters: above the limit
+ doc = docMaker.makeDocument(test.getNodeState());
+ assertNotNull(doc);
+ assertNull(doc.get(FieldNames.SIMILARITY_TAGS)); // the over-long tag was skipped, so the field is absent
+ }
+
+}
diff --git
a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/LuceneDynamicBoostTest.java
b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/LuceneDynamicBoostTest.java
index 1a745bced6..f8c17f6e6b 100644
---
a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/LuceneDynamicBoostTest.java
+++
b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/LuceneDynamicBoostTest.java
@@ -184,10 +184,37 @@ public class LuceneDynamicBoostTest extends
DynamicBoostCommonTest {
List.of("/test/asset3", "/test/asset2"));
}
+ @Test
+ public void dynamicBoostMaxLengthFiltering() throws Exception { // dynamic boost tags longer than maxTagLength must not be searchable
+ createAssetsIndexAndProperties(false, false, true, 10); // lite=false, similarityTags=false, useInFullTextQuery=true, maxTagLength=10
+
+ Tree testParent = createNodeWithType(root.getTree("/"), "test",
JcrConstants.NT_UNSTRUCTURED, "");
+
+ Tree predicted1 = createAssetNodeWithPredicted(testParent, "asset1",
"test");
+ createPredictedTag(predicted1, "short", 0.9);
+ createPredictedTag(predicted1, "exactly10!", 0.8); // exactly 10 characters: still within the limit
+ createPredictedTag(predicted1, "this is too long", 0.7); // 16 characters: above the limit of 10
+
+ Tree predicted2 = createAssetNodeWithPredicted(testParent, "asset2",
"test");
+ createPredictedTag(predicted2, "short", 0.9);
+ createPredictedTag(predicted2, "exactly10!", 0.8);
+
+ root.commit();
+
+ assertEventually(() -> { // tags within the limit match both assets; the over-long tag matches nothing
+ assertQuery("select [jcr:path] from [dam:Asset] where contains(*,
'short')", SQL2,
+ List.of("/test/asset1", "/test/asset2"));
+ assertQuery("select [jcr:path] from [dam:Asset] where contains(*,
'exactly10!')", SQL2,
+ List.of("/test/asset1", "/test/asset2"));
+
+ assertQuery("select [jcr:path] from [dam:Asset] where contains(*,
'this is too long')", SQL2, List.of());
+ });
+ }
+
@Override
- protected void createAssetsIndexAndProperties(boolean lite, boolean
similarityTags) throws Exception {
+ protected void createAssetsIndexAndProperties(boolean lite, boolean
similarityTags, boolean useInFullTextQuery, Integer maxTagLength) throws
Exception {
factory.queryTermsProvider = new FulltextQueryTermsProviderImpl();
- super.createAssetsIndexAndProperties(lite, similarityTags);
+ super.createAssetsIndexAndProperties(lite, similarityTags,
useInFullTextQuery, maxTagLength);
}
private String runIndexingTest(Class<?> loggerClass, boolean nameProperty)
throws CommitFailedException {
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
index bc22980771..dd856dfd07 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
@@ -242,10 +242,9 @@ public class ElasticDocumentMaker extends
FulltextDocumentMaker<ElasticDocument>
}
@Override
- protected boolean indexSimilarityTag(ElasticDocument doc, PropertyState
property) {
- String val = property.getValue(Type.STRING);
- if (!val.isEmpty()) {
- doc.addSimilarityTag(val);
+ protected boolean indexSimilarityTag(ElasticDocument doc, String value) {
+ if (!value.isEmpty()) {
+ doc.addSimilarityTag(value);
return true;
}
return false;
diff --git
a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
index c0fc932116..aa29ddad5d 100644
---
a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
+++
b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
@@ -251,6 +251,12 @@ public interface FulltextIndexConstants {
*/
String MAX_FIELD_LENGTH = "maxFieldLength";
+ /**
+ * Maximum length of similarity and dynamic boost tag values to be
indexed. Tags longer than this value will be skipped.
+ * Set to -1 (or any negative value) to disable the length check entirely.
+ */
+ String MAX_TAG_LENGTH = "maxTagLength";
+
/**
* whether use this property values for suggestions
*/
diff --git
a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java
b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java
index 318fd30330..0798a2485c 100644
---
a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java
+++
b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java
@@ -146,6 +146,11 @@ public class IndexDefinition implements
Aggregate.AggregateMapper {
*/
public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
+ /**
+ * Default value for property {@link #maxTagLength}.
+ */
+ public static final int DEFAULT_MAX_TAG_LENGTH = 100;
+
public static final int DEFAULT_MAX_EXTRACT_LENGTH = -10;
/**
@@ -274,6 +279,8 @@ public class IndexDefinition implements
Aggregate.AggregateMapper {
private final int maxFieldLength;
+ private final int maxTagLength;
+
private final int maxExtractLength;
private final int suggesterUpdateFrequencyMinutes;
@@ -470,6 +477,7 @@ public class IndexDefinition implements
Aggregate.AggregateMapper {
}
this.maxFieldLength = getOptionalValue(defn,
FulltextIndexConstants.MAX_FIELD_LENGTH, DEFAULT_MAX_FIELD_LENGTH);
+ this.maxTagLength = getOptionalValue(defn,
FulltextIndexConstants.MAX_TAG_LENGTH, DEFAULT_MAX_TAG_LENGTH);
this.costPerEntry = getOptionalValue(defn,
FulltextIndexConstants.COST_PER_ENTRY, getDefaultCostPerEntry(version));
this.costPerExecution = getOptionalValue(defn,
FulltextIndexConstants.COST_PER_EXECUTION, 1.0);
this.hasCustomTikaConfig = getTikaConfigNode().exists();
@@ -690,6 +698,10 @@ public class IndexDefinition implements
Aggregate.AggregateMapper {
return indexSelectionPolicy;
}
+ public int getMaxTagLength() { // limit read from the maxTagLength property; DEFAULT_MAX_TAG_LENGTH when it is not set
+ return maxTagLength;
+ }
+
public int getMaxExtractLength() {
return maxExtractLength;
}
diff --git
a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java
b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java
index 6965403ea0..ee3f676826 100644
---
a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java
+++
b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java
@@ -23,6 +23,7 @@ import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.commons.collections.IterableUtils;
+import org.apache.jackrabbit.oak.commons.log.LogSilencer;
import org.apache.jackrabbit.oak.plugins.index.search.Aggregate;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
@@ -58,6 +59,9 @@ import static java.util.Objects.requireNonNull;
public abstract class FulltextDocumentMaker<D> implements DocumentMaker<D> {
private final Logger log = LoggerFactory.getLogger(getClass());
+
+ private static final LogSilencer LOG_SILENCER = new LogSilencer();
+
public static final String WARN_LOG_STRING_SIZE_THRESHOLD_KEY =
"oak.repository.property.index.logWarnStringSizeThreshold";
private static final int DEFAULT_WARN_LOG_STRING_SIZE_THRESHOLD_VALUE =
102400;
@@ -343,7 +347,13 @@ public abstract class FulltextDocumentMaker<D> implements
DocumentMaker<D> {
dirty |= indexFacets(doc, property, pname, pd);
}
if (pd.similarityTags) {
- dirty |= indexSimilarityTag(doc, property);
+ String value = property.getValue(Type.STRING);
+ if (isTagWithinLengthLimit(value)) {
+ dirty |= indexSimilarityTag(doc, value);
+ } else if (!LOG_SILENCER.silence(pname)) {
+ log.warn("[{}] Skipping similarity tag for property {}.
Value length {} exceeds maximum allowed length",
+ getIndexName(), pname, value.length());
+ }
}
}
@@ -377,7 +387,7 @@ public abstract class FulltextDocumentMaker<D> implements
DocumentMaker<D> {
return true;
}
- protected abstract boolean indexSimilarityTag(D doc, PropertyState
property);
+ protected abstract boolean indexSimilarityTag(D doc, String value);
protected abstract void indexSimilarityBinaries(D doc, PropertyDefinition
pd, Blob blob) throws IOException;
@@ -704,6 +714,13 @@ public abstract class FulltextDocumentMaker<D> implements
DocumentMaker<D> {
continue;
}
String dynaTagValue = p.getValue(Type.STRING);
+ if (!isTagWithinLengthLimit(dynaTagValue)) {
+ if (!LOG_SILENCER.silence(p.getName())) {
+ log.warn("[{}] Skipping dynamic boost tag for property {}.
Value length {} exceeds maximum allowed length",
+ getIndexName(), p.getName(),
dynaTagValue.length());
+ }
+ continue;
+ }
p = dynaTag.getProperty(DYNAMIC_BOOST_TAG_CONFIDENCE);
if (p == null) {
// here we don't log a warning, because possibly it will be
added later
@@ -736,6 +753,11 @@ public abstract class FulltextDocumentMaker<D> implements
DocumentMaker<D> {
return definition.getIndexName();
}
+ private boolean isTagWithinLengthLimit(String value) { // true when the tag value may be indexed under the index definition's maxTagLength
+ int maxLength = definition.getMaxTagLength();
+ return maxLength < 0 || value.length() <= maxLength; // a negative limit disables the check; the limit itself is inclusive
+ }
+
/*
* Extracts the local name of the current node ignoring any namespace
prefix
*/
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
index d71426d9af..6c5e6998a1 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java
@@ -234,6 +234,10 @@ public abstract class DynamicBoostCommonTest extends
AbstractQueryTest {
}
protected void createAssetsIndexAndProperties(boolean lite, boolean
similarityTags, boolean useInFullTextQuery) throws Exception {
+ createAssetsIndexAndProperties(lite, similarityTags,
useInFullTextQuery, null);
+ }
+
+ protected void createAssetsIndexAndProperties(boolean lite, boolean
similarityTags, boolean useInFullTextQuery, Integer maxTagLength) throws
Exception {
NodeTypeRegistry.register(root, new
ByteArrayInputStream(ASSET_NODE_TYPE.getBytes()), "test nodeType");
Tree indexRuleProps = createIndex("dam:Asset", lite);
@@ -250,6 +254,11 @@ public abstract class DynamicBoostCommonTest extends
AbstractQueryTest {
predictedTags.setProperty("similarityTags", true);
}
+ if (maxTagLength != null) {
+ Tree indexDef = root.getTree("/oak:index/" + TEST_INDEX_NAME);
+ indexDef.setProperty(FulltextIndexConstants.MAX_TAG_LENGTH,
maxTagLength);
+ }
+
root.commit();
}
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinitionTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinitionTest.java
index 43dcafbe97..d74e82202b 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinitionTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinitionTest.java
@@ -513,6 +513,19 @@ public class IndexDefinitionTest {
assertEquals("application/test-unmapped",
defn.getTikaMappedMimeType("application/test-unmapped"));
}
+ @Test
+ public void maxTagLength() { // checks the default and an explicitly configured maxTagLength
+ NodeBuilder defnb =
newFTIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME), "foo",
+ "lucene", Set.of(TYPENAME_STRING));
+ IndexDefinition defn = new IndexDefinition(root, defnb.getNodeState(),
"/foo");
+ assertEquals(IndexDefinition.DEFAULT_MAX_TAG_LENGTH,
defn.getMaxTagLength()); // no maxTagLength property set yet: the default applies
+
+ defnb.setProperty(FulltextIndexConstants.MAX_TAG_LENGTH, 50); // explicit override on the index definition
+
+ defn = new IndexDefinition(root, defnb.getNodeState(), "/foo"); // rebuild so the new property value is read
+ assertEquals(50, defn.getMaxTagLength());
+ }
+
@Test
public void maxExtractLength() {
NodeBuilder defnb =
newFTIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME), "foo",