This is an automated email from the ASF dual-hosted git repository.
thomasm pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 25df4149f2 OAK-11568 Elastic: improved compatibility for aggregation
definitions (#2193)
25df4149f2 is described below
commit 25df4149f22d8b05da3f2f7a5a57b3feebf908a3
Author: Thomas Mueller <[email protected]>
AuthorDate: Mon Mar 31 17:11:36 2025 +0200
OAK-11568 Elastic: improved compatibility for aggregation definitions
(#2193)
* OAK-11568 Elastic: improved compatibility for aggregation definitions
* OAK-11568 Elastic: improved compatibility for aggregation definitions
* OAK-11568 Elastic: improved compatibility for aggregation definitions
* OAK-11568 Elastic: improved compatibility for aggregation definitions
* OAK-11568 Elastic: improved compatibility for aggregation definitions
* OAK-11568 Elastic: improved compatibility for aggregation definitions
---
.../index/elastic/ElasticIndexProviderService.java | 69 +--------
.../elastic/index/ElasticBulkProcessorHandler.java | 2 +-
.../index/elastic/index/ElasticCustomAnalyzer.java | 136 +++++++++++++++---
.../index/elastic/index/ElasticDocument.java | 2 +
.../elastic/index/ElasticIndexEditorContext.java | 2 +-
.../index/elastic/index/ElasticIndexWriter.java | 11 +-
.../index/elastic/util/ElasticIndexUtils.java | 3 +-
.../index/elastic/ElasticInferenceTest.java | 5 +-
.../oak/plugins/index/elastic/ElasticPerfTest.java | 5 +-
.../plugins/index/elastic/ElasticTestServer.java | 1 +
.../elastic/index/ElasticIndexHelperTest.java | 61 ++++++++
.../oak/plugins/index/search/util/ConfigUtil.java | 2 +-
.../plugins/index/FullTextAnalyzerCommonTest.java | 156 +++++++++++++++++++++
13 files changed, 362 insertions(+), 93 deletions(-)
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java
index 85ceaba30f..550042d97a 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java
@@ -16,9 +16,6 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;
-import org.apache.commons.io.FilenameUtils;
-import org.apache.jackrabbit.oak.api.jmx.CacheStatsMBean;
-import org.apache.jackrabbit.oak.cache.CacheStats;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.apache.jackrabbit.oak.osgi.OsgiWhiteboard;
import org.apache.jackrabbit.oak.plugins.index.AsyncIndexInfoService;
@@ -50,13 +47,11 @@ import
org.osgi.service.metatype.annotations.ObjectClassDefinition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.File;
import java.util.ArrayList;
import java.util.Dictionary;
import java.util.Hashtable;
import java.util.List;
-import static org.apache.commons.io.FileUtils.ONE_MB;
import static
org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.registerMBean;
import static
org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.scheduleWithFixedDelay;
@@ -130,8 +125,6 @@ public class ElasticIndexProviderService {
private static final Logger LOG =
LoggerFactory.getLogger(ElasticIndexProviderService.class);
- private static final String REPOSITORY_HOME = "repository.home";
-
@Reference
private StatisticsProvider statisticsProvider;
@@ -149,11 +142,10 @@ public class ElasticIndexProviderService {
private ExtractedTextCache extractedTextCache;
- private final List<ServiceRegistration> regs = new ArrayList<>();
+ private final List<ServiceRegistration<?>> regs = new ArrayList<>();
private final List<Registration> oakRegs = new ArrayList<>();
private Whiteboard whiteboard;
- private File textExtractionDir;
private ElasticConnection elasticConnection;
private ElasticMetricHandler metricHandler;
@@ -207,7 +199,7 @@ public class ElasticIndexProviderService {
@Deactivate
private void deactivate() {
- for (ServiceRegistration reg : regs) {
+ for (ServiceRegistration<?> reg : regs) {
reg.unregister();
}
@@ -245,63 +237,6 @@ public class ElasticIndexProviderService {
Dictionary<String, Object> props = new Hashtable<>();
props.put("type", ElasticIndexDefinition.TYPE_ELASTICSEARCH);
regs.add(bundleContext.registerService(IndexEditorProvider.class.getName(),
editorProvider, props));
-// oakRegs.add(registerMBean(whiteboard,
-// TextExtractionStatsMBean.class,
-// editorProvider.getExtractedTextCache().getStatsMBean(),
-// TextExtractionStatsMBean.TYPE,
-// "TextExtraction statistics"));
- }
-
- private void initializeExtractedTextCache(final Config config,
StatisticsProvider statisticsProvider) {
-
- extractedTextCache = new ExtractedTextCache(
- config.extractedTextCacheSizeInMB() * ONE_MB,
- config.extractedTextCacheExpiryInSecs(),
- config.alwaysUsePreExtractedCache(),
- textExtractionDir,
- statisticsProvider);
- if (extractedTextProvider != null) {
- registerExtractedTextProvider(extractedTextProvider);
- }
- CacheStats stats = extractedTextCache.getCacheStats();
- if (stats != null) {
- oakRegs.add(registerMBean(whiteboard,
- CacheStatsMBean.class, stats,
- CacheStatsMBean.TYPE, stats.getName()));
- LOG.info("Extracted text caching enabled with maxSize {} MB,
expiry time {} secs",
- config.extractedTextCacheSizeInMB(),
config.extractedTextCacheExpiryInSecs());
- }
- }
-
- private void initializeTextExtractionDir(BundleContext bundleContext,
Config config) {
- String textExtractionDir = config.localTextExtractionDir();
- if (textExtractionDir.trim().isEmpty()) {
- String repoHome = bundleContext.getProperty(REPOSITORY_HOME);
- if (repoHome != null) {
- textExtractionDir = FilenameUtils.concat(repoHome, "index");
- }
- }
-
- if (textExtractionDir == null) {
- throw new IllegalStateException(String.format("Text extraction
directory cannot be determined as neither " +
- "directory path [%s] nor repository home [%s] defined",
PROP_LOCAL_TEXT_EXTRACTION_DIR, REPOSITORY_HOME));
- }
-
- this.textExtractionDir = new File(textExtractionDir);
- }
-
- private void registerExtractedTextProvider(PreExtractedTextProvider
provider) {
- if (extractedTextCache != null) {
- if (provider != null) {
- String usage =
extractedTextCache.isAlwaysUsePreExtractedCache() ?
- "always" : "only during reindexing phase";
- LOG.info("Registering PreExtractedTextProvider {} with
extracted text cache. " +
- "It would be used {}", provider, usage);
- } else {
- LOG.info("Unregistering PreExtractedTextProvider with
extracted text cache");
- }
- extractedTextCache.setExtractedTextProvider(provider);
- }
}
private ElasticConnection getElasticConnection(Config contextConfig) {
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticBulkProcessorHandler.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticBulkProcessorHandler.java
index 64485b03f8..673488a5ea 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticBulkProcessorHandler.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticBulkProcessorHandler.java
@@ -158,7 +158,7 @@ class ElasticBulkProcessorHandler {
private void checkFailures() throws IOException {
if (!suppressedErrorCauses.isEmpty()) {
- IOException ioe = new IOException("Exception while indexing. See
suppressed for details");
+ IOException ioe = new IOException("Exception while indexing " +
indexName + ". See suppressed for details");
suppressedErrorCauses.stream().map(ec -> new
IllegalStateException(ec.reason())).forEach(ioe::addSuppressed);
throw ioe;
}
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java
index 05026f9e20..02690691b7 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java
@@ -19,6 +19,7 @@ package org.apache.jackrabbit.oak.plugins.index.elastic.index;
import co.elastic.clients.elasticsearch._types.analysis.Analyzer;
import co.elastic.clients.elasticsearch._types.analysis.CharFilterDefinition;
import co.elastic.clients.elasticsearch._types.analysis.CustomAnalyzer;
+import co.elastic.clients.elasticsearch._types.analysis.NGramTokenizer;
import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;
import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis;
@@ -40,6 +41,7 @@ import org.apache.jackrabbit.oak.spi.state.NodeStateUtils;
import org.apache.lucene.analysis.AbstractAnalysisFactory;
import org.apache.lucene.analysis.CharFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
import org.apache.lucene.util.ResourceLoader;
import org.jetbrains.annotations.NotNull;
@@ -55,6 +57,7 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -97,7 +100,13 @@ public class ElasticCustomAnalyzer {
NodeState defaultAnalyzer =
state.getChildNode(FulltextIndexConstants.ANL_DEFAULT);
if (defaultAnalyzer.exists()) {
IndexSettingsAnalysis.Builder builder = new
IndexSettingsAnalysis.Builder();
- Map<String, Object> analyzer =
convertNodeState(defaultAnalyzer);
+ Map<String, Object> analyzer;
+ try {
+ analyzer = convertNodeState(defaultAnalyzer);
+ } catch (IOException e) {
+ LOG.warn("Can not load analyzer; using an empty
configuration", e);
+ analyzer = Map.of();
+ }
String builtIn =
defaultAnalyzer.getString(FulltextIndexConstants.ANL_CLASS);
if (builtIn == null) {
builtIn =
defaultAnalyzer.getString(FulltextIndexConstants.ANL_NAME);
@@ -107,11 +116,14 @@ public class ElasticCustomAnalyzer {
// content params, usually stop words
for (ChildNodeEntry nodeEntry :
defaultAnalyzer.getChildNodeEntries()) {
+ List<String> list;
try {
- analyzer.put(normalize(nodeEntry.getName()),
loadContent(nodeEntry.getNodeState(), nodeEntry.getName(),
NOOP_TRANSFORMATION));
+ list = loadContent(nodeEntry.getNodeState(),
nodeEntry.getName(), NOOP_TRANSFORMATION);
} catch (IOException e) {
- throw new IllegalStateException("Unable to load
content for node entry " + nodeEntry.getName(), e);
+ LOG.warn("Unable to load analyzer content for
entry '" + nodeEntry.getName() + "'; using empty list", e);
+ list = List.of();
}
+ analyzer.put(normalize(nodeEntry.getName()), list);
}
builder.analyzer(analyzerName, new Analyzer(null,
JsonData.of(analyzer)));
@@ -145,31 +157,68 @@ public class ElasticCustomAnalyzer {
@NotNull
private static TokenizerDefinition loadTokenizer(NodeState state) {
- String name =
normalize(Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME)));
- Map<String, Object> args = convertNodeState(state);
+ String name;
+ Map<String, Object> args;
+ if (!state.exists()) {
+            LOG.warn("No tokenizer specified; using the standard tokenizer with an empty configuration");
+ name = "Standard";
+ args = new HashMap<String, Object>();
+ } else {
+ name =
Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME));
+ try {
+ args = convertNodeState(state);
+ } catch (IOException e) {
+ LOG.warn("Can not load tokenizer; using an empty
configuration", e);
+ args = new HashMap<String, Object>();
+ }
+ }
+ name = normalize(name);
+ if ("n_gram".equals(name)) {
+ // OAK-11568
+ //
https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
+ Integer minGramSize = getIntegerSetting(args, "minGramSize", 2);
+ Integer maxGramSize = getIntegerSetting(args, "maxGramSize", 3);
+ TokenizerDefinition ngram = TokenizerDefinition.of(t -> t.ngram(
+ NGramTokenizer.of(n ->
n.minGram(minGramSize).maxGram(maxGramSize))));
+ return ngram;
+ }
args.put(ANALYZER_TYPE, name);
return new TokenizerDefinition(name, JsonData.of(args));
}
+ private static Integer getIntegerSetting(Map<String, Object> args, String
name, Integer defaultValue) {
+ Object value = args.getOrDefault(name, defaultValue);
+ if (!(value instanceof Integer)) {
+ LOG.warn("Setting {} value {} is not an integer; using default:
{}", name, value, defaultValue);
+ return defaultValue;
+ }
+ return (Integer) value;
+ }
+
private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
Function<String,
Class<? extends AbstractAnalysisFactory>> lookup,
BiFunction<String, JsonData, FD> factory) {
LinkedHashMap<String, FD> filters = new LinkedHashMap<>();
int i = 0;
- //Need to read children in order
+ // Need to read children in order
Tree tree = TreeFactory.createReadOnlyTree(state);
+
+ // We need to remember that a "WordDelimiter" was configured,
+ // because we have to remove it if a synonyms filter is configured as
well
+ String wordDelimiterFilterKey = null;
for (Tree t : tree.getChildren()) {
NodeState child = state.getChildNode(t.getName());
String name;
List<String> content = null;
List<ParameterTransformer> transformers;
+ boolean skipEntry = false;
try {
- Class<? extends AbstractAnalysisFactory> tff =
lookup.apply(t.getName());
+ Class<? extends AbstractAnalysisFactory> analysisFactory =
lookup.apply(t.getName());
List<String> unsupportedParameters =
UNSUPPORTED_LUCENE_PARAMETERS.entrySet().stream()
- .filter(k -> k.getKey().isAssignableFrom(tff))
+ .filter(k ->
k.getKey().isAssignableFrom(analysisFactory))
.map(Map.Entry::getValue)
.findFirst().orElseGet(Collections::emptyList);
Map<String, String> luceneArgs =
StreamSupport.stream(child.getProperties().spliterator(), false)
@@ -177,17 +226,24 @@ public class ElasticCustomAnalyzer {
.filter(ps ->
!unsupportedParameters.contains(ps.getName()))
.collect(Collectors.toMap(PropertyState::getName, ps
-> ps.getValue(Type.STRING)));
- AbstractAnalysisFactory luceneFactory =
tff.getConstructor(Map.class).newInstance(luceneArgs);
+ AbstractAnalysisFactory luceneFactory =
analysisFactory.getConstructor(Map.class).newInstance(luceneArgs);
if (luceneFactory instanceof AbstractWordsFileFilterFactory) {
AbstractWordsFileFilterFactory wordsFF =
((AbstractWordsFileFilterFactory) luceneFactory);
// this will parse/load the content handling different
formats, comments, etc
wordsFF.inform(new NodeStateResourceLoader(child));
content = wordsFF.getWords().stream().map(w -> new
String(((char[]) w))).collect(Collectors.toList());
}
+ if (luceneFactory instanceof MappingCharFilterFactory) {
+ MappingCharFilterFactory map = (MappingCharFilterFactory)
luceneFactory;
+ if (map.getOriginalArgs().isEmpty()) {
+ skipEntry = true;
+ LOG.warn("Empty CharFilter mapping: ignoring");
+ }
+ }
- name = normalize((String) tff.getField("NAME").get(null));
+ name = normalize((String)
analysisFactory.getField("NAME").get(null));
transformers = LUCENE_ELASTIC_TRANSFORMERS.entrySet().stream()
- .filter(k -> k.getKey().isAssignableFrom(tff))
+ .filter(k ->
k.getKey().isAssignableFrom(analysisFactory))
.map(Map.Entry::getValue)
.collect(Collectors.toList());
} catch (Exception e) {
@@ -201,6 +257,21 @@ public class ElasticCustomAnalyzer {
Map<String, Object> args = convertNodeState(child, transformers,
content);
+ if (name.equals("word_delimiter")) {
+ //
https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-tokenfilter.html
+ // We recommend using the word_delimiter_graph instead of the
word_delimiter filter.
+ // The word_delimiter filter can produce invalid token graphs.
+ LOG.info("Replacing the word delimiter filter with the word
delimiter graph");
+ name = "word_delimiter_graph";
+ }
+ if (name.equals("hyphenation_compound_word")) {
+ name = "hyphenation_decompounder";
+            String hyphenator = args.getOrDefault("hyphenator", "").toString();
+            LOG.info("Using the hyphenation_decompounder: " + hyphenator);
+ args.put("hyphenation_patterns_path",
"analysis/hyphenation_patterns.xml");
+ args.put("word_list", List.of());
+ }
+
// stemmer in elastic don't have language based configurations.
They all stay under the stemmer config with
// a language parameter
if (name.endsWith("_stem")) {
@@ -221,14 +292,31 @@ public class ElasticCustomAnalyzer {
}
args.put(ANALYZER_TYPE, name);
- filters.put(name + "_" + i, factory.apply(name,
JsonData.of(args)));
+ if (skipEntry) {
+ continue;
+ }
+ String key = name + "_" + i;
+ filters.put(key, factory.apply(name, JsonData.of(args)));
+ if (name.equals("word_delimiter_graph")) {
+ wordDelimiterFilterKey = key;
+ } else if (name.equals("synonym")) {
+ if (wordDelimiterFilterKey != null) {
+ LOG.info("Removing word delimiter because there is a
synonyms filter as well: " + wordDelimiterFilterKey);
+ filters.remove(wordDelimiterFilterKey);
+ }
+ }
i++;
}
return filters;
}
private static List<String> loadContent(NodeState file, String name,
ContentTransformer transformer) throws IOException {
- Blob blob = ConfigUtil.getBlob(file, name);
+ Blob blob;
+ try {
+ blob = ConfigUtil.getBlob(file, name);
+ } catch (IllegalArgumentException | IllegalStateException e) {
+ throw new IOException("Could not load " + name, e);
+ }
try (Reader content = new
InputStreamReader(Objects.requireNonNull(blob).getNewStream(),
StandardCharsets.UTF_8)) {
try (BufferedReader br = new BufferedReader(content)) {
return br.lines()
@@ -264,11 +352,25 @@ public class ElasticCustomAnalyzer {
return name;
}
- private static Map<String, Object> convertNodeState(NodeState state) {
- return convertNodeState(state, List.of(), List.of());
+ private static Map<String, Object> convertNodeState(NodeState state)
throws IOException {
+ try {
+ return convertNodeState(state, List.of(), List.of());
+ } catch (IllegalStateException e) {
+ // convert runtime exception back to checked exception
+ throw new IOException("Can not convert", e);
+ }
}
- private static Map<String, Object> convertNodeState(NodeState state,
List<ParameterTransformer> transformers, List<String> preloadedContent) {
+ /**
+ * Read analyzer configuration.
+ *
+ * @param state the node state
+ * @param transformers
+ * @param preloadedContent
+ * @return
+ * @throws IllegalStateException
+ */
+ private static Map<String, Object> convertNodeState(NodeState state,
List<ParameterTransformer> transformers, List<String> preloadedContent) throws
IllegalStateException {
Map<String, Object> luceneParams =
StreamSupport.stream(Spliterators.spliteratorUnknownSize(state.getProperties().iterator(),
Spliterator.ORDERED), false)
.filter(ElasticCustomAnalyzer::isPropertySupported)
.collect(Collectors.toMap(PropertyState::getName, ps -> {
@@ -280,6 +382,8 @@ public class ElasticCustomAnalyzer {
return
loadContent(state.getChildNode(v.trim()), v.trim(),
CONTENT_TRANSFORMERS.getOrDefault(ps.getName(), NOOP_TRANSFORMATION)).stream();
} catch (IOException e) {
+                            // convert checked exception to runtime exception,
+                            // because the stream API doesn't support checked exceptions
throw new IllegalStateException(e);
}
}).collect(Collectors.toList()));
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
index 2f1ee7e26e..a7918ec83f 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
@@ -97,6 +97,7 @@ public class ElasticDocument {
map -> {
Object existingValue =
map.get(ElasticIndexHelper.DYNAMIC_PROPERTY_VALUE);
if (existingValue instanceof Set) {
+ @SuppressWarnings("unchecked")
Set<Object> existingSet = (Set<Object>)
existingValue;
existingSet.add(value);
} else {
@@ -134,6 +135,7 @@ public class ElasticDocument {
if (existingValue == null) {
finalValue = value;
} else if (existingValue instanceof Set) {
+ @SuppressWarnings("unchecked")
Set<Object> existingSet = (Set<Object>) existingValue;
existingSet.add(value);
finalValue = existingSet;
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexEditorContext.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexEditorContext.java
index 330ebe416c..4e7f68cf95 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexEditorContext.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexEditorContext.java
@@ -40,7 +40,7 @@ class ElasticIndexEditorContext extends
FulltextIndexEditorContext<ElasticDocume
}
@Override
- public IndexDefinition.Builder newDefinitionBuilder() {
+ public ElasticIndexDefinition.Builder newDefinitionBuilder() {
return new ElasticIndexDefinition.Builder(((ElasticIndexDefinition)
definition).getIndexPrefix());
}
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
index f77de098e6..abd1e94fe8 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexWriter.java
@@ -192,7 +192,16 @@ class ElasticIndexWriter implements
FulltextIndexWriter<ElasticDocument> {
LOG.error("Failed to create index {}: {}", indexName,
e.toString());
throw e;
}
- LOG.debug("Creating Index with request {}", request);
+ if (LOG.isDebugEnabled()) {
+ int old = JsonpUtils.maxToStringLength();
+ try {
+ // temporarily increase the length, to avoid truncation
+ JsonpUtils.maxToStringLength(1_000_000);
+ LOG.debug("Creating Index with request {}", request);
+ } finally {
+ JsonpUtils.maxToStringLength(old);
+ }
+ }
// create the new index
try {
final CreateIndexResponse response = esClient.create(request);
diff --git
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
index b34a908f5e..c57118e9ad 100644
---
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
+++
b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
@@ -107,7 +107,8 @@ public class ElasticIndexUtils {
byte[] pathBytes = path.getBytes(StandardCharsets.UTF_8);
if (pathBytes.length > 512) {
try {
- return new
String(MessageDigest.getInstance("SHA-256").digest(pathBytes));
+ return new
String(MessageDigest.getInstance("SHA-256").digest(pathBytes),
+ StandardCharsets.UTF_8);
} catch (NoSuchAlgorithmException e) {
throw new IllegalStateException(e);
}
diff --git
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticInferenceTest.java
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticInferenceTest.java
index 22a334f344..d0c459f0c5 100644
---
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticInferenceTest.java
+++
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticInferenceTest.java
@@ -145,12 +145,13 @@ public class ElasticInferenceTest extends
ElasticAbstractQueryTest {
for (String path : paths) {
URL json = this.getClass().getResource("/inference" + path +
".json");
if (json != null) {
- Map<String, Object> map = mapper.readValue(json, Map.class);
+ @SuppressWarnings("unchecked")
+ Map<String, Collection<Double>> map = mapper.readValue(json,
Map.class);
ObjectNode updateDoc = mapper.createObjectNode();
ObjectNode inferenceNode =
updateDoc.putObject(ElasticIndexDefinition.INFERENCE);
ArrayNode embeddingsNode =
inferenceNode.putObject("embeddings").putArray("value");
inferenceNode.putObject("metadata").put("updatedAt",
Instant.now().toEpochMilli());
- for (Double d : (Collection<Double>) map.get("embedding")) {
+ for (Double d : map.get("embedding")) {
embeddingsNode.add(d);
}
updateDocument(index, path, updateDoc);
diff --git
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPerfTest.java
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPerfTest.java
index c81412f755..77ef7ce225 100644
---
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPerfTest.java
+++
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPerfTest.java
@@ -167,12 +167,11 @@ public class ElasticPerfTest extends
ElasticAbstractQueryTest {
private void testQuery(String query, String language) throws Exception {
Result result = executeQuery(query, language, NO_BINDINGS);
- Iterable<ResultRow> it = (Iterable<ResultRow>) result.getRows();
- Iterator<ResultRow> iterator = it.iterator();
+ Iterator<? extends ResultRow> iterator = result.getRows().iterator();
long start = LOG_PERF.startForInfoLog("Getting result rows");
int i = 0;
while (iterator.hasNext()) {
- ResultRow row = iterator.next();
+ iterator.next();
i++;
}
LOG_PERF.end(start, -1,-1, "{} Results fetched", i);
diff --git
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java
index 4efcc665e5..cf694cb673 100644
---
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java
+++
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java
@@ -62,6 +62,7 @@ public class ElasticTestServer implements AutoCloseable {
return CONTAINER;
}
+ @SuppressWarnings("resource")
private synchronized void setup() {
String esDockerImageVersion = ELASTIC_DOCKER_IMAGE_VERSION != null ?
ELASTIC_DOCKER_IMAGE_VERSION : Version.VERSION.toString();
LOG.info("Elasticsearch test Docker image version: {}.",
esDockerImageVersion);
diff --git
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java
index 9b7372967e..3a7bdb4d15 100644
---
a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java
+++
b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java
@@ -95,6 +95,67 @@ public class ElasticIndexHelperTest {
ElasticIndexHelper.createIndexRequest("prefix.path", definition);
}
+ @Test
+ public void analyzerWithEmptyTokenizer() {
+ IndexDefinitionBuilder builder = new ElasticIndexDefinitionBuilder();
+ IndexDefinitionBuilder.IndexRule indexRule =
builder.indexRule("idxRule");
+ indexRule.property("foo").type("String").useInSimilarity();
+
+ Tree analyzer = builder.getBuilderTree().addChild("analyzers");
+ Tree defaultAnalyzer = analyzer.addChild("default");
+ defaultAnalyzer.setProperty(FulltextIndexConstants.ANL_CLASS,
"org.apache.lucene.analysis.en.EnglishAnalyzer");
+ defaultAnalyzer.addChild("tokenizer");
+ defaultAnalyzer.addChild("filters");
+
+ NodeState nodeState = builder.build();
+ ElasticIndexDefinition definition =
+ new ElasticIndexDefinition(nodeState, nodeState, "path",
"prefix");
+ ElasticIndexHelper.createIndexRequest("prefix.path", definition);
+ }
+
+ @Test
+ public void analyzerWithEmptyDefault() {
+ IndexDefinitionBuilder builder = new ElasticIndexDefinitionBuilder();
+ IndexDefinitionBuilder.IndexRule indexRule =
builder.indexRule("idxRule");
+ indexRule.property("foo").type("String").useInSimilarity();
+
+ Tree analyzer = builder.getBuilderTree().addChild("analyzers");
+ analyzer.addChild("default");
+
+ NodeState nodeState = builder.build();
+ ElasticIndexDefinition definition =
+ new ElasticIndexDefinition(nodeState, nodeState, "path",
"prefix");
+ ElasticIndexHelper.createIndexRequest("prefix.path", definition);
+ }
+
+ @Test
+ public void analyzerWithWordDelimiter() {
+ IndexDefinitionBuilder builder = new ElasticIndexDefinitionBuilder();
+ IndexDefinitionBuilder.IndexRule indexRule =
builder.indexRule("idxRule");
+ indexRule.property("foo").type("String").useInSimilarity();
+
+ Tree analyzer = builder.getBuilderTree().addChild("analyzers");
+ Tree defaultAnalyzer = analyzer.addChild("default");
+ Tree tokenizer = defaultAnalyzer.addChild("tokenizer");
+ tokenizer.setProperty("name", "Standard");
+ Tree filters = defaultAnalyzer.addChild("filters");
+ filters.addChild("LowerCase");
+ filters.addChild("WordDelimiter");
+ Tree synonym = filters.addChild("Synonym");
+ synonym.setProperty("format", "solr");
+ synonym.setProperty("ignoreCase", true);
+ synonym.setProperty("synonyms", "synonyms.txt");
+ Tree synonymsText = filters.addChild("synonyms.txt");
+ Tree synonymsContent = synonymsText.addChild("jcr:content");
+ synonymsContent.setProperty("jcr:data", "test");
+ filters.addChild("PorterStem");
+
+ NodeState nodeState = builder.build();
+ ElasticIndexDefinition definition =
+ new ElasticIndexDefinition(nodeState, nodeState, "path",
"prefix");
+ ElasticIndexHelper.createIndexRequest("prefix.path", definition);
+ }
+
@Test()
public void indexSettingsAreCorrectlySet() {
IndexDefinitionBuilder builder = new ElasticIndexDefinitionBuilder();
diff --git
a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java
b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java
index 9b5c3d0491..517e055d7b 100644
---
a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java
+++
b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/ConfigUtil.java
@@ -110,7 +110,7 @@ public class ConfigUtil {
* the jcr:content/@jcr:data property to get the binary content
*/
@Nullable
- public static Blob getBlob(NodeState state, String resourceName){
+ public static Blob getBlob(NodeState state, String resourceName) {
NodeState contentNode = state.getChildNode(JcrConstants.JCR_CONTENT);
checkArgument(contentNode.exists(), "Was expecting to find jcr:content
node to read resource %s", resourceName);
PropertyState property =
contentNode.getProperty(JcrConstants.JCR_DATA);
diff --git
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
index d6db2b511d..77f0893be9 100644
---
a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
+++
b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/FullTextAnalyzerCommonTest.java
@@ -24,6 +24,7 @@ import
org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import
org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
import org.apache.jackrabbit.oak.query.AbstractQueryTest;
import org.junit.Assert;
+import org.junit.Ignore;
import org.junit.Test;
import java.io.ByteArrayInputStream;
@@ -1075,6 +1076,161 @@ public abstract class FullTextAnalyzerCommonTest
extends AbstractQueryTest {
});
}
+ // OAK-11568
+ @Test
+ public void analyzerWithEmptyCharFilterMapping() throws Exception {
+ setup(List.of("foo"), idx -> {
+ Tree analyzers = idx.addChild(FulltextIndexConstants.ANALYZERS);
+ Tree defaultAnalyzers =
analyzers.addChild(FulltextIndexConstants.ANL_DEFAULT);
+ Tree charFilters =
defaultAnalyzers.addChild(FulltextIndexConstants.ANL_CHAR_FILTERS);
+ charFilters.addChild("HTMLStrip");
+
+ // adding the "Mapping" char filter without any mapping content resulted in:
+ // co.elastic.clients.elasticsearch._types.ElasticsearchException:
+ // [es/indices.create] failed: [illegal_argument_exception]
+ // mapping requires either `mappings` or `mappings_path` to be
configured
+ charFilters.addChild("Mapping");
+
+ defaultAnalyzers.addChild(FulltextIndexConstants.ANL_TOKENIZER)
+ .setProperty(FulltextIndexConstants.ANL_NAME, "Standard");
+ Tree filters =
defaultAnalyzers.addChild(FulltextIndexConstants.ANL_FILTERS);
+ filters.setOrderableChildren(true);
+ filters.addChild("LowerCase");
+ });
+
+ Tree content = root.getTree("/").addChild("content");
+ content.addChild("bar").setProperty("foo", "foo bar");
+ root.commit();
+
+ assertEventually(() -> {
+ assertQuery("select * from [nt:base] where CONTAINS(*, 'foo')",
List.of("/content/bar"));
+ });
+ }
+
+ // OAK-11568
+ @Test
+ public void analyzerWithNGramTokenizer() throws Exception {
+ setup(List.of("foo"), idx -> {
+ Tree analyzers = idx.addChild(FulltextIndexConstants.ANALYZERS);
+ Tree defaultAnalyzers =
analyzers.addChild(FulltextIndexConstants.ANL_DEFAULT);
+ Tree tokenizer =
defaultAnalyzers.addChild(FulltextIndexConstants.ANL_TOKENIZER);
+ tokenizer.setProperty(FulltextIndexConstants.ANL_NAME, "NGram");
+ tokenizer.setProperty("minGramSize", 2);
+ tokenizer.setProperty("maxGramSize", 3);
+ });
+
+ Tree content = root.getTree("/").addChild("content");
+ content.addChild("bar").setProperty("foo", "foob bart");
+ root.commit();
+
+ assertEventually(() -> {
+ assertQuery("select * from [nt:base] where contains(*, 'fo')",
List.of("/content/bar"));
+ assertQuery("select * from [nt:base] where contains(*, 'foo')",
List.of("/content/bar"));
+ assertQuery("select * from [nt:base] where contains(*, 'oob')",
List.of("/content/bar"));
+ assertQuery("select * from [nt:base] where contains(*, 'ba')",
List.of("/content/bar"));
+ assertQuery("select * from [nt:base] where contains(*, 'bar')",
List.of("/content/bar"));
+ assertQuery("select * from [nt:base] where contains(*, 'art')",
List.of("/content/bar"));
+ // not found with Elasticsearch, but found with Lucene
+ // assertQuery("select * from [nt:base] where contains(*, 'foo
bar')", List.of("/content/bar"));
+ });
+ }
+
+ // OAK-11568
+ @Test
+ public void analyzerWithPatternTokenizer() throws Exception {
+ setup(List.of("foo"), idx -> {
+ Tree analyzers = idx.addChild(FulltextIndexConstants.ANALYZERS);
+ Tree defaultAnalyzers =
analyzers.addChild(FulltextIndexConstants.ANL_DEFAULT);
+ Tree tokenizer =
defaultAnalyzers.addChild(FulltextIndexConstants.ANL_TOKENIZER);
+ tokenizer.setProperty(FulltextIndexConstants.ANL_NAME, "pattern");
+ tokenizer.setProperty("pattern", "[^\\p{L}\\d-_]");
+ });
+
+ Tree content = root.getTree("/").addChild("content");
+ content.addChild("bar").setProperty("foo", "foo bar");
+ root.commit();
+
+ assertEventually(() -> {
+ assertQuery("select * from [nt:base] where contains(*, 'foo')",
List.of("/content/bar"));
+ });
+ }
+
+ // OAK-11568
+ @Test
+ public void analyzerWithWordDelimiterAndSynonyms() throws Exception {
+ setup(List.of("foo"), idx -> {
+ Tree analyzers = idx.addChild(FulltextIndexConstants.ANALYZERS);
+ Tree defaultAnalyzers =
analyzers.addChild(FulltextIndexConstants.ANL_DEFAULT);
+ Tree tokenizer =
defaultAnalyzers.addChild(FulltextIndexConstants.ANL_TOKENIZER);
+ tokenizer.setProperty(FulltextIndexConstants.ANL_NAME, "Standard");
+ Tree filters =
defaultAnalyzers.addChild(FulltextIndexConstants.ANL_FILTERS);
+ filters.setOrderableChildren(true);
+ filters.addChild("LowerCase");
+ // internally, the WordDelimiter filter is re-ordered to run _after_ the synonyms filter
+ filters.addChild("WordDelimiter");
+ Tree synonym = filters.addChild("Synonym");
+ synonym.setProperty("format", "solr");
+ synonym.setProperty("ignoreCase", true);
+ synonym.setProperty("synonyms", "synonyms.txt");
+ Tree synonymTxt = synonym.addChild("synonyms.txt");
+ Tree content = synonymTxt.addChild("jcr:content");
+ content.setProperty("jcr:data", "find => replace\n" +
+ "madison => mad");
+ content.setProperty("jcr:mimeType", "text/plain");
+ filters.addChild("PorterStem");
+ });
+
+ Tree content = root.getTree("/").addChild("content");
+ content.addChild("bar").setProperty("foo", "replacing wi-fi Madison
Square Garden email [email protected]");
+ root.commit();
+
+ assertEventually(() -> {
+ assertQuery("select * from [nt:base] where contains(*, 'find')",
List.of("/content/bar"));
+ });
+ }
+
+ // OAK-11568
+ @Test
+ @Ignore
+ public void analyzerWithHyphenationCompoundWord() throws Exception {
+ setup(List.of("foo"), idx -> {
+ Tree analyzers = idx.addChild(FulltextIndexConstants.ANALYZERS);
+ Tree defaultAnalyzers =
analyzers.addChild(FulltextIndexConstants.ANL_DEFAULT);
+ Tree tokenizer =
defaultAnalyzers.addChild(FulltextIndexConstants.ANL_TOKENIZER);
+ tokenizer.setProperty(FulltextIndexConstants.ANL_NAME, "Standard");
+ Tree filters =
defaultAnalyzers.addChild(FulltextIndexConstants.ANL_FILTERS);
+ filters.setOrderableChildren(true);
+ filters.addChild("Standard");
+ filters.addChild("LowerCase");
+ Tree synFilter = addFilter(filters, "Synonym");
+ synFilter.setProperty("synonyms", "syn.txt");
+ synFilter.addChild("syn.txt").addChild(JCR_CONTENT)
+ .setProperty(JCR_DATA, "plane, airplane, aircraft\n" +
+ "flies=>scars\n" +
+ // this rule is incorrect: "term: + was completely
eliminated by analyzer"
+ // by default, the configuration has to be lenient
and not fail on such cases
+ "plus,+,addition");
+ Tree hyphenation = filters.addChild("HyphenationCompoundWord");
+ hyphenation.setProperty("hyphenator", "de.xml");
+ Tree deXml = hyphenation.addChild("de.xml");
+ Tree deXmlContent = deXml.addChild("jcr:content");
+ deXmlContent.setProperty("jcr:data", "<...>");
+ deXmlContent.setProperty("jcr:mimeType", "application/xml");
+ filters.addChild("GermanLightStem");
+ filters.addChild("FrenchLightStem");
+ filters.addChild("ItalianLightStem");
+ filters.addChild("PorterStem");
+ });
+
+ Tree content = root.getTree("/").addChild("content");
+ content.addChild("bar").setProperty("foo", "replace");
+ root.commit();
+
+ assertEventually(() -> {
+ assertQuery("select * from [nt:base] where contains(*, 'find')",
List.of("/content/bar"));
+ });
+ }
+
protected Tree addFilter(Tree analyzer, String filterName) {
Tree filter = analyzer.addChild(filterName);
// mimics nodes api