This is an automated email from the ASF dual-hosted git repository. reschke pushed a commit to branch OAK-10694 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 307531dfb819a8ce67ca821fa66cb3eeeaf30395 Author: Julian Reschke <julian.resc...@gmx.de> AuthorDate: Tue Apr 9 11:57:22 2024 +0100 OAK-10694: Remove oak-search-mt --- oak-search-mt/pom.xml | 160 --------------------- .../index/mt/MTFulltextQueryTermsProvider.java | 144 ------------------- .../mt/MTFulltextQueryTermsProviderFactory.java | 144 ------------------- .../index/mt/MTFulltextQueryTermsProviderTest.java | 64 --------- pom.xml | 1 - 5 files changed, 513 deletions(-) diff --git a/oak-search-mt/pom.xml b/oak-search-mt/pom.xml deleted file mode 100644 index e32873ad6d..0000000000 --- a/oak-search-mt/pom.xml +++ /dev/null @@ -1,160 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - --> - -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd "> - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.jackrabbit</groupId> - <artifactId>oak-parent</artifactId> - <version>1.63-SNAPSHOT</version> - <relativePath>../oak-parent/pom.xml</relativePath> - </parent> - - <artifactId>oak-search-mt</artifactId> - <name>Oak Search Machine Translation</name> - <packaging>bundle</packaging> - <description>Machine Translation extension for Oak search</description> - - <build> - <plugins> - <plugin> - <groupId>org.apache.felix</groupId> - <artifactId>maven-bundle-plugin</artifactId> - <extensions>true</extensions> - <configuration> - <instructions> - <Export-Package> - !* - </Export-Package> - <Embed-Dependency>*;scope=compile,artifactId=!oak-lucene</Embed-Dependency> - <Import-Package> - com.ibm.uvm.tools.*;resolution:=optional, - com.sun.jdmk.comm.*;resolution:=optional, - com.sun.net.httpserver.*;resolution:=optional, - edu.uci.ics.*;resolution:=optional, - javax.jms.*;resolution:=optional, - javax.jmdns.*;resolution:=optional, - junit.framework.*;resolution:=optional, - org.apache.commons.collections15.*;resolution:=optional, - org.apache.tools.ant.*;resolution:=optional, - org.apache.tools.ant.types.*;resolution:=optional, - org.easymock.*;resolution:=optional, - org.jmock.core.*;resolution:=optional, - sun.misc.*;resolution:=optional, - EDU.oswego.cs.dl.util.concurrent.*;resolution:=optional, - org.kohsuke.args4j.*;resolution:=optional, - * - </Import-Package> - </instructions> - </configuration> - <executions> - <execution> - <id>baseline</id> - <goals> - <goal>baseline</goal> - </goals> - <phase>pre-integration-test</phase> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.rat</groupId> - <artifactId>apache-rat-plugin</artifactId> - </plugin> - </plugins> - </build> - - <dependencies> - <!-- Optional OSGi dependencies, used only when running within OSGi --> - <dependency> - <groupId>org.osgi</groupId> - <artifactId>org.osgi.service.component.annotations</artifactId> - <scope>provided</scope> - </dependency> - - <dependency> - <groupId>org.apache.jackrabbit</groupId> - <artifactId>oak-lucene</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.apache.joshua</groupId> - <artifactId>joshua-incubating</artifactId> - <version>6.1</version> - <exclusions> - <exclusion> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-log4j12</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-queryparser</artifactId> - <version>${lucene.version}</version> - <scope>provided</scope> - </dependency> - - <!-- Nullability annotations --> - <dependency> - <groupId>org.jetbrains</groupId> - <artifactId>annotations</artifactId> - <scope>provided</scope> - </dependency> - - <!-- Test Dependencies --> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.mockito</groupId> - <artifactId>mockito-core</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>colt</groupId> - <artifactId>colt</artifactId> - <version>1.2.0</version> - </dependency> - <dependency> - <groupId>edu.berkeley.nlp</groupId> - <artifactId>berkeleylm</artifactId> - <version>1.1.2</version> - </dependency> - <dependency> - <groupId>commons-cli</groupId> - <artifactId>commons-cli</artifactId> - <version>1.2</version> - </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-math3</artifactId> - </dependency> - - </dependencies> -</project> - diff --git a/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java b/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java deleted file mode 100644 index 6d99147233..0000000000 --- a/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProvider.java +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.jackrabbit.oak.plugins.index.mt; - -import java.io.StringReader; -import java.util.List; -import java.util.Set; - -import org.apache.jackrabbit.oak.plugins.index.lucene.OakAnalyzer; -import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider; -import org.apache.jackrabbit.oak.plugins.index.search.FieldNames; -import org.apache.jackrabbit.oak.spi.state.NodeState; -import org.apache.joshua.decoder.Decoder; -import org.apache.joshua.decoder.StructuredTranslation; -import org.apache.joshua.decoder.Translation; -import org.apache.joshua.decoder.segment_file.Sentence; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.index.Term; -import org.apache.lucene.queryparser.simple.SimpleQueryParser; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.util.Version; -import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * {@link FulltextQueryTermsProvider} that performs machine translation on full text returning a query containing - * translated tokens. - * @deprecated see OAK-10694 - */ -@Deprecated(forRemoval = true) -public class MTFulltextQueryTermsProvider implements FulltextQueryTermsProvider { - - private final Logger log = LoggerFactory.getLogger(getClass()); - - private final Decoder decoder; - private final Set<String> nodeTypes; - private final float minScore; - private final SimpleQueryParser qp; - - public MTFulltextQueryTermsProvider(Decoder decoder, Set<String> nodeTypes, float minScore) { - this.decoder = decoder; - this.nodeTypes = nodeTypes; - this.minScore = minScore; - this.qp = new SimpleQueryParser(new OakAnalyzer(Version.LUCENE_47), FieldNames.FULLTEXT); - } - - @Override - public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) { - - BooleanQuery query = new BooleanQuery(); - try { - Sentence sentence = new Sentence(text, text.hashCode(), decoder.getJoshuaConfiguration()); - Translation translation = decoder.decode(sentence); - log.debug("{} decoded into {}", text, translation); - query.add(new BooleanClause(new TermQuery(new Term(FieldNames.FULLTEXT, translation.toString())), BooleanClause.Occur.SHOULD)); - - - // try phrase translation first - List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations(); - log.debug("found {} structured translations", structuredTranslations.size()); - if (!structuredTranslations.isEmpty()) { - log.debug("phrase translation"); - addTranslations(query, structuredTranslations); - } else { - // if phrase cannot be translated, perform token by token translation - log.debug("per token translation"); - - TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text)); - tokenStream.addAttribute(CharTermAttribute.class); - tokenStream.reset(); - while (tokenStream.incrementToken()) { - CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class); - String source = attribute.toString(); - Translation translatedToken = decoder.decode(new Sentence(source, source.hashCode(), - decoder.getJoshuaConfiguration())); - addTranslations(query, translatedToken.getStructuredTranslations()); - } - tokenStream.end(); - } - - } catch (Exception e) { - log.error("could not translate query", e); - } - return query.clauses().size() > 0 ? query : null; - } - - private void addTranslations(BooleanQuery query, List<StructuredTranslation> structuredTranslations) { - for (StructuredTranslation st : structuredTranslations) { - String translationString = st.getTranslationString(); - float translationScore = st.getTranslationScore(); - log.debug("translation {} has score {}", translationString, translationScore); - if (translationScore > minScore) { - log.debug("translation score for {} is {}", translationString, translationScore); - query.add(new BooleanClause(qp.createPhraseQuery(FieldNames.FULLTEXT, translationString), - BooleanClause.Occur.SHOULD)); - log.debug("added query for translated phrase {}", translationString); - List<String> translationTokens = st.getTranslationTokens(); - int i = 0; - // if output is a phrase, look for tokens having a word alignment to the original sentence terms - for (List<Integer> wa : st.getTranslationWordAlignments()) { - if (!wa.isEmpty()) { - String translatedTerm = translationTokens.get(i); - Query termQuery = qp.parse(translatedTerm); - query.add(new BooleanClause(termQuery, BooleanClause.Occur.SHOULD)); - log.debug("added query for translated token {}", translatedTerm); - } - i++; - } - } - } - } - - public void clearResources() { - decoder.cleanUp(); - } - - @NotNull - @Override - public Set<String> getSupportedTypes() { - return nodeTypes; - } -} diff --git a/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java b/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java deleted file mode 100644 index 821f094640..0000000000 --- a/oak-search-mt/src/main/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderFactory.java +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.jackrabbit.oak.plugins.index.mt; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; -import org.osgi.service.component.annotations.Activate; -import org.osgi.service.component.annotations.Component; -import org.osgi.service.component.annotations.ConfigurationPolicy; -import org.osgi.service.component.annotations.Deactivate; -import org.osgi.service.metatype.annotations.AttributeDefinition; -import org.osgi.service.metatype.annotations.AttributeType; -import org.osgi.service.metatype.annotations.Designate; -import org.osgi.service.metatype.annotations.ObjectClassDefinition; -import org.apache.jackrabbit.oak.commons.PropertiesUtil; -import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider; -import org.apache.jackrabbit.oak.spi.state.NodeState; -import org.apache.joshua.decoder.Decoder; -import org.apache.joshua.decoder.JoshuaConfiguration; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.search.Query; -import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Factory for {@link MTFulltextQueryTermsProvider} - * @deprecated see OAK-10694 - */ -@Component( - service = { FulltextQueryTermsProvider.class }, - configurationPolicy = ConfigurationPolicy.REQUIRE -) -@Designate( - ocd = MTFulltextQueryTermsProviderFactory.Configuration.class, - factory = true ) -@Deprecated(forRemoval = true) -public class MTFulltextQueryTermsProviderFactory implements FulltextQueryTermsProvider { - - @ObjectClassDefinition( - id = "org.apache.jackrabbit.oak.plugins.index.mt.MTFulltextQueryTermsProviderFactory", - name = "Apache Jackrabbit Oak Machine Translation Fulltext Query Terms Provider" - ) - @interface Configuration { - - @AttributeDefinition( - name = "Joshua Config Path", - description = "The absolute filesystem path to Apache Joshua configuration file" - ) - String path_to_config(); - - @AttributeDefinition( - name = "Node types", - description = "List of node types for which expanding the query via MT", - cardinality = 10 - ) - String[] node_types(); - - @AttributeDefinition( - name = "Minimum score", - description = "Minimum allowed score for a translated phrase/term to be used for expansion", - type = AttributeType.FLOAT - ) - float min_score() default DEFAULT_MIN_SCORE; - } - - private static final float DEFAULT_MIN_SCORE = 0.5f; - - private final Logger log = LoggerFactory.getLogger(getClass()); - - private MTFulltextQueryTermsProvider queryTermsProvider; - - @Activate - public void activate(Configuration config) { - String pathToJoshuaConfig = PropertiesUtil.toString(config.path_to_config(), "."); - String[] nts = PropertiesUtil.toStringArray(config.node_types(), new String[]{"Oak:unstructured"}); - float minScore = (float) PropertiesUtil.toDouble(config.min_score(), DEFAULT_MIN_SCORE); - log.info("activating MT FulltextQueryTermProvider from Joshua config at {} on {} nodetypes, minScore {}", pathToJoshuaConfig, nts, minScore); - Decoder decoder = null; - try { - log.debug("reading joshua config"); - JoshuaConfiguration configuration = new JoshuaConfiguration(); - configuration.readConfigFile(pathToJoshuaConfig); - configuration.setConfigFilePath(new File(pathToJoshuaConfig).getCanonicalFile().getParent()); - configuration.use_structured_output = true; - decoder = new Decoder(configuration, pathToJoshuaConfig); - log.debug("decoder initialized"); - Set<String> nodeTypes = new HashSet<>(); - nodeTypes.addAll(Arrays.asList(nts)); - queryTermsProvider = new MTFulltextQueryTermsProvider(decoder, nodeTypes, minScore); - } catch (Exception e) { - log.error("could not initialize MTFulltextQueryTermProvider", e); - if (decoder != null) { - decoder.cleanUp(); - } - } - } - - @Deactivate - public void deactivate() { - if (queryTermsProvider != null) { - log.debug("clearing resources"); - queryTermsProvider.clearResources(); - } - } - - @Override - public Query getQueryTerm(String text, Analyzer analyzer, NodeState indexDefinition) { - if (queryTermsProvider != null) { - return queryTermsProvider.getQueryTerm(text, analyzer, indexDefinition); - } else { - return null; - } - } - - @NotNull - @Override - public Set<String> getSupportedTypes() { - if (queryTermsProvider != null) { - return queryTermsProvider.getSupportedTypes(); - } else { - return Collections.emptySet(); - } - } -} diff --git a/oak-search-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderTest.java b/oak-search-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderTest.java deleted file mode 100644 index dd402c6b3f..0000000000 --- a/oak-search-mt/src/test/java/org/apache/jackrabbit/oak/plugins/index/mt/MTFulltextQueryTermsProviderTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.jackrabbit.oak.plugins.index.mt; - -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; - -import org.apache.jackrabbit.oak.spi.state.NodeState; -import org.apache.joshua.decoder.Decoder; -import org.apache.joshua.decoder.JoshuaConfiguration; -import org.apache.joshua.decoder.StructuredTranslation; -import org.apache.joshua.decoder.Translation; -import org.apache.joshua.decoder.segment_file.Sentence; -import org.apache.lucene.analysis.Analyzer; -import org.junit.Test; - -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -/** - * Tests for {@link MTFulltextQueryTermsProvider} - * @deprecated see OAK-10694 - */ -@Deprecated(forRemoval = true) -public class MTFulltextQueryTermsProviderTest { - - @Test - public void testGetQueryTermWithPhraseTranslation() throws Exception { - Decoder decoder = mock(Decoder.class); - Translation translation = mock(Translation.class); - List<StructuredTranslation> translations = new LinkedList<>(); - StructuredTranslation structuredTranslation = mock(StructuredTranslation.class); - when(structuredTranslation.getTranslationString()).thenReturn("fou bur"); - translations.add(structuredTranslation); - when(translation.getStructuredTranslations()).thenReturn(translations); - when(decoder.decode(any(Sentence.class))).thenReturn(translation); - JoshuaConfiguration configuration = mock(JoshuaConfiguration.class); - when(decoder.getJoshuaConfiguration()).thenReturn(configuration); - Set<String> nodeTypes = new HashSet<>(); - MTFulltextQueryTermsProvider mtFulltextQueryTermsProvider = new MTFulltextQueryTermsProvider(decoder, nodeTypes, -1); - Analyzer analyzer = mock(Analyzer.class); - NodeState indexDefinition = mock(NodeState.class); - mtFulltextQueryTermsProvider.getQueryTerm("foo bar", analyzer, indexDefinition); - } -} \ No newline at end of file diff --git a/pom.xml b/pom.xml index 1b205655c9..0f065a63e3 100644 --- a/pom.xml +++ b/pom.xml @@ -63,7 +63,6 @@ <module>oak-lucene</module> <module>oak-solr-core</module> <module>oak-solr-osgi</module> - <module>oak-search-mt</module> <module>oak-auth-external</module> <module>oak-auth-ldap</module> <module>oak-run-commons</module>