DCausse has uploaded a new change for review.
https://gerrit.wikimedia.org/r/297960
Change subject: Upgrade to elastic 2.3.4
......................................................................
Upgrade to elastic 2.3.4
cleaned up some tests
Change-Id: I54bbde7f40344306de636428792d283a7e77d753
---
M README.md
M docs/preserve_original.md
M pom.xml
M src/test/java/org/wikimedia/search/extra/analysis/filters/PreserveOriginalFilterTest.java
4 files changed, 173 insertions(+), 126 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/search/extra refs/changes/60/297960/1
diff --git a/README.md b/README.md
index 5be24cc..18f7878 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,8 @@
Analysis:
* [preserve_original](docs/preserve_original.md) - A token filter that wraps a
-filter chain and keeps and emit the original term at the same position. New in
-2.3.3.1.
+filter chain to keep and emit the original term at the same position. New in
+2.3.4.
Installation
------------
diff --git a/docs/preserve_original.md b/docs/preserve_original.md
index c5deb8c..4ddee8e 100644
--- a/docs/preserve_original.md
+++ b/docs/preserve_original.md
@@ -1,8 +1,8 @@
preserve_original
=================
-The `preserve_original` and `preserve_original_recorder` are token filters that
-allows to keep and index original terms. This is very similar to the
+`preserve_original` and `preserve_original_recorder` are token filters that
+allow to keep and index original terms. This is very similar to the
`keyword_repeat` and `unique` filters but will work also on filters that do not
support the keyword attribute.
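
For context, the Lucene-level wiring these two filters expect is the same pattern used by the updated tests further down in this change: a recorder placed before the destructive filters and the preserve filter placed after them. Below is a minimal illustrative sketch; the class name, the whitespace tokenizer and the lowercasing step are arbitrary choices for the example, not part of this change.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.LowerCaseFilter;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.wikimedia.search.extra.analysis.filters.PreserveOriginalFilter;

    // Illustrative sketch only; class and method names are made up for the example.
    public class PreserveOriginalSketch {
        public static Analyzer newAnalyzer() {
            return new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tok = new WhitespaceTokenizer();
                    // record the original terms before any destructive filtering
                    TokenStream ts = new PreserveOriginalFilter.Recorder(tok);
                    ts = new LowerCaseFilter(ts);
                    // emit each recorded original at the same position as the filtered term
                    ts = new PreserveOriginalFilter(ts);
                    return new TokenStreamComponents(tok, ts);
                }
            };
        }
    }

With this chain "Hello" is indexed both as "hello" and "Hello" at the same position, which is what the updated tests below assert.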
diff --git a/pom.xml b/pom.xml
index da3f022..9fe02ba 100644
--- a/pom.xml
+++ b/pom.xml
@@ -44,7 +44,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <elasticsearch.version>2.3.3</elasticsearch.version>
+ <elasticsearch.version>2.3.4</elasticsearch.version>
<lucene.version>5.5.0</lucene.version>
<maven.compiler.target>1.7</maven.compiler.target>
<maven.compiler.source>1.7</maven.compiler.source>
diff --git a/src/test/java/org/wikimedia/search/extra/analysis/filters/PreserveOriginalFilterTest.java b/src/test/java/org/wikimedia/search/extra/analysis/filters/PreserveOriginalFilterTest.java
index 4aedbff..a6bd2e4 100644
--- a/src/test/java/org/wikimedia/search/extra/analysis/filters/PreserveOriginalFilterTest.java
+++ b/src/test/java/org/wikimedia/search/extra/analysis/filters/PreserveOriginalFilterTest.java
@@ -1,7 +1,6 @@
package org.wikimedia.search.extra.analysis.filters;
import java.io.IOException;
-import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
@@ -13,16 +12,17 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -39,10 +39,8 @@
@Test
public void simpleTest() throws IOException {
String input = "Hello the World";
- try (Analyzer ws = new WhitespaceAnalyzer()) {
+ try (Analyzer ws = newPreserveOriginalLowerCase()) {
TokenStream ts = ws.tokenStream("", input);
- ts = new StopFilter(ts, new CharArraySet(new HashSet<>(Arrays.asList("the")), true));
- ts = new PreserveOriginalFilter(ts, TokenFilterFactory.forName("lowercase", Collections.<String,String>emptyMap()));
assertTokenStreamContents(ts,
new String[]{"hello", "Hello", "world", "World"},
new int[]{0,0,10,10}, // start offsets
@@ -54,6 +52,18 @@
null, //keywordAtts, (unsupported)
true);
}
+ }
+
+ private Analyzer newPreserveOriginalLowerCase() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tok = new WhitespaceTokenizer();
+ TokenStream ts = new StopFilter(tok, new CharArraySet(new HashSet<>(Arrays.asList("the")), true));
+ ts = new PreserveOriginalFilter(ts, TokenFilterFactory.forName("lowercase", Collections.<String,String>emptyMap()));
+ return new TokenStreamComponents(tok, ts);
+ }
+ };
}
@Test
@@ -61,12 +71,8 @@
// Same test but with a stop filter wrapped
// testing that if a term is removed our states are still valid
String input = "Hello the World";
- try (Analyzer ws = new WhitespaceAnalyzer()) {
+ try (Analyzer ws = newPreserveOriginalWithStop()) {
TokenStream ts = ws.tokenStream("", input);
- ts = new PreserveOriginalFilter.Recorder(ts);
- ts = new StopFilter(ts, new CharArraySet(new HashSet<>(Arrays.asList("the")), true));
- ts = new LowerCaseFilter(ts);
- ts = new PreserveOriginalFilter(ts);
assertTokenStreamContents(ts,
new String[]{"hello", "Hello", "world", "World"},
new int[]{0,0,10,10}, // start offsets
@@ -79,17 +85,29 @@
true);
}
}
+
+ private Analyzer newPreserveOriginalWithStop() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tok = new WhitespaceTokenizer();
+ TokenStream ts = new PreserveOriginalFilter.Recorder(tok);
+ ts = new StopFilter(ts, new CharArraySet(new HashSet<>(Arrays.asList("the")), true));
+ ts = new LowerCaseFilter(ts);
+ ts = new PreserveOriginalFilter(ts);
+ return new TokenStreamComponents(tok, ts);
+ }
+ };
+ }
@Test(expected=IllegalArgumentException.class)
- public void testBadSetup() throws IOException {
- try(Analyzer analizer = new StandardAnalyzer()) {
- @SuppressWarnings("resource")
- TokenStream ts = tokenStream(analizer, "/Prise de possession.txt");
- ts = new StopFilter(ts, FrenchAnalyzer.getDefaultStopSet());
- ts = new ASCIIFoldingFilter(ts, false);
- ts = new PreserveOriginalFilter(ts); // should fail here
- }
+ public TokenStream testBadSetup() throws IOException {
+ Tokenizer tok = new StandardTokenizer();
+ TokenStream ts = new StopFilter(tok, FrenchAnalyzer.getDefaultStopSet());
+ ts = new ASCIIFoldingFilter(ts, false);
+ ts = new PreserveOriginalFilter(ts); // should fail here
+ return ts;
}
/**
@@ -102,129 +120,158 @@
@Test
public void longTextTest() throws IOException {
String textRes = "/Prise de possession.txt";
- try (StandardAnalyzer one = new StandardAnalyzer();
- StandardAnalyzer two = new StandardAnalyzer()) {
- try (TokenStream expected = stopAndASCIIFoldingPreserve(one, textRes);
- TokenStream actual = stopGenericPreserveASCIIFolding(two, textRes)) {
- assertSameOutput(expected, actual);
- }
-
- // Let's retry with a shingle filter which stores/restores states
- try (TokenStream expected = stopAndASCIIFoldingAndShingle(one, textRes);
- TokenStream actual = stopGenericPreserveASCIIFoldingShingles(two, textRes)) {
- assertSameOutput(expected, actual);
- }
-
- // now with a KW repeat and a stemmer
- try ( TokenStream expected = stopKWRepeatStemmerAndShingles(one, textRes);
- TokenStream actual = stopGenericPreserveStemmerAnsShingles(two, textRes)) {
- assertSameOutput(expected, actual);
- }
+ try (Analyzer expected = stopAndASCIIFoldingPreserve();
+ Analyzer actual = stopGenericPreserveASCIIFolding()) {
+ assertSameOutput(expected, actual, textRes);
+ // test reuse
+ assertSameOutput(expected, actual, textRes);
+ }
+ // Let's retry with a shingle filter which stores/restores states
+ try (Analyzer expected = stopAndASCIIFoldingAndShingle();
+ Analyzer actual = stopGenericPreserveASCIIFoldingShingles()) {
+ assertSameOutput(expected, actual, textRes);
+ // test reuse
+ assertSameOutput(expected, actual, textRes);
+ }
+ // now with a KW repeat and a stemmer
+ try (Analyzer expected = stopKWRepeatStemmerAndShingles();
+ Analyzer actual = stopGenericPreserveStemmerAnsShingles()) {
+ assertSameOutput(expected, actual, textRes);
+ // test reuse
+ assertSameOutput(expected, actual, textRes);
}
}
- private TokenStream stopGenericPreserveStemmerAnsShingles(Analyzer a, String textRes) {
- TokenStream ts = tokenStream(a, textRes);
- ts = new StopFilter(ts, FrenchAnalyzer.getDefaultStopSet());
- ts = new PreserveOriginalFilter.Recorder(ts);
- ts = new FrenchLightStemFilter(ts);
- ts = new PreserveOriginalFilter(ts);
- ts = new ShingleFilter(ts, this.shingleMinSize, this.shingleMaxSize);
- return ts;
- }
-
- private TokenStream stopKWRepeatStemmerAndShingles(Analyzer a, String textRes) {
- TokenStream ts = tokenStream(a, textRes);
- ts = new StopFilter(ts,FrenchAnalyzer.getDefaultStopSet());
- ts = new KeywordRepeatFilter(ts);
- // Keyword repeat emits token in the wrong order (returns the preserved first)
- // this code switches token by pair
- ts = new TokenFilter(ts) {
- private State state = null;
- private final PositionIncrementAttribute pattr = getAttribute(PositionIncrementAttribute.class);
+ private Analyzer stopGenericPreserveStemmerAnsShingles() {
+ return new Analyzer() {
@Override
- public final boolean incrementToken() throws IOException {
- if(state != null) {
- restoreState(state);
- pattr.setPositionIncrement(0);
- state = null;
- return true;
- } else if(input.incrementToken()) {
- state = captureState();
- int posInc = pattr.getPositionIncrement();
- assert input.incrementToken();
- assert pattr.getPositionIncrement() == 0;
- pattr.setPositionIncrement(posInc);
- return true;
- }
- return false;
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tok = new StandardTokenizer();
+ TokenStream ts = new StopFilter(tok, FrenchAnalyzer.getDefaultStopSet());
+ ts = new PreserveOriginalFilter.Recorder(ts);
+ ts = new FrenchLightStemFilter(ts);
+ ts = new PreserveOriginalFilter(ts);
+ ts = new ShingleFilter(ts, shingleMinSize, shingleMaxSize);
+ return new TokenStreamComponents(tok, ts);
}
};
- ts = new FrenchLightStemFilter(ts);
- ts = new RemoveDuplicatesTokenFilter(ts);
- ts = new ShingleFilter(ts, this.shingleMinSize, this.shingleMaxSize);
- return ts;
}
- private TokenStream stopGenericPreserveASCIIFoldingShingles(Analyzer a, String textRes) {
- TokenStream ts = tokenStream(a, textRes);
- ts = new StopFilter(ts, FrenchAnalyzer.getDefaultStopSet());
- ts = new PreserveOriginalFilter.Recorder(ts);
- ts = new ASCIIFoldingFilter(ts);
- ts = new PreserveOriginalFilter(ts);
- ts = new ShingleFilter(ts, this.shingleMinSize, this.shingleMaxSize);
- return ts;
+ private Analyzer stopKWRepeatStemmerAndShingles() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tok = new StandardTokenizer();
+ TokenStream ts = new StopFilter(tok,FrenchAnalyzer.getDefaultStopSet());
+ ts = new KeywordRepeatFilter(ts);
+ // Keyword repeat emits token in the wrong order (returns the preserved first)
+ // this code switches token by pair
+ ts = new TokenFilter(ts) {
+ private State state = null;
+ private final PositionIncrementAttribute pattr = getAttribute(PositionIncrementAttribute.class);
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if(state != null) {
+ restoreState(state);
+ pattr.setPositionIncrement(0);
+ state = null;
+ return true;
+ } else if(input.incrementToken()) {
+ state = captureState();
+ int posInc = pattr.getPositionIncrement();
+ assert input.incrementToken();
+ assert pattr.getPositionIncrement() == 0;
+ pattr.setPositionIncrement(posInc);
+ return true;
+ }
+ return false;
+ }
+ };
+ ts = new FrenchLightStemFilter(ts);
+ ts = new RemoveDuplicatesTokenFilter(ts);
+ ts = new ShingleFilter(ts, shingleMinSize, shingleMaxSize);
+ return new TokenStreamComponents(tok, ts);
+ }
+ };
}
- private TokenStream stopAndASCIIFoldingAndShingle(Analyzer a, String textRes) {
- TokenStream ts = tokenStream(a, textRes);
- ts = new StopFilter(ts,FrenchAnalyzer.getDefaultStopSet());
- ts = new ASCIIFoldingFilter(ts, true);
- ts = new ShingleFilter(ts, this.shingleMinSize, this.shingleMaxSize);
- return ts;
+ private Analyzer stopGenericPreserveASCIIFoldingShingles() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tok = new StandardTokenizer();
+ TokenStream ts = new StopFilter(tok, FrenchAnalyzer.getDefaultStopSet());
+ ts = new PreserveOriginalFilter.Recorder(ts);
+ ts = new ASCIIFoldingFilter(ts);
+ ts = new PreserveOriginalFilter(ts);
+ ts = new ShingleFilter(ts, shingleMinSize, shingleMaxSize);
+ return new TokenStreamComponents(tok, ts);
+ }
+ };
}
- private TokenStream stopGenericPreserveASCIIFolding(Analyzer a, String textRes) {
- TokenStream ts = tokenStream(a, textRes);
- ts = new StopFilter(ts, FrenchAnalyzer.getDefaultStopSet());
- ts = new PreserveOriginalFilter.Recorder(ts);
- ts = new ASCIIFoldingFilter(ts, false);
- ts = new PreserveOriginalFilter(ts);
- return ts;
+ private Analyzer stopAndASCIIFoldingAndShingle() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tok = new StandardTokenizer();
+ TokenStream ts = new StopFilter(tok, FrenchAnalyzer.getDefaultStopSet());
+ ts = new ASCIIFoldingFilter(ts, true);
+ ts = new ShingleFilter(ts, shingleMinSize, shingleMaxSize);
+ return new TokenStreamComponents(tok, ts);
+ }
+ };
}
- private TokenStream stopAndASCIIFoldingPreserve(Analyzer a, String textRes) {
- TokenStream ts = tokenStream(a, textRes);
- ts = new StopFilter(ts ,FrenchAnalyzer.getDefaultStopSet());
- ts = new ASCIIFoldingFilter(ts ,true);
- return ts;
+ private Analyzer stopGenericPreserveASCIIFolding() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tok = new StandardTokenizer();
+ TokenStream ts = new StopFilter(tok, FrenchAnalyzer.getDefaultStopSet());
+ ts = new PreserveOriginalFilter.Recorder(ts);
+ ts = new ASCIIFoldingFilter(ts, false);
+ ts = new PreserveOriginalFilter(ts);
+ return new TokenStreamComponents(tok, ts);
+ }
+ };
}
- private void assertSameOutput(TokenStream expected, TokenStream actual) throws IOException {
+ private Analyzer stopAndASCIIFoldingPreserve() {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tok = new StandardTokenizer();
+ TokenStream ts = new StopFilter(tok,FrenchAnalyzer.getDefaultStopSet());
+ ts = new ASCIIFoldingFilter(ts ,true);
+ return new TokenStreamComponents(tok, ts);
+ }
+ };
+ }
+
+ private void assertSameOutput(Analyzer expectedAnalyzer, Analyzer actualAnalyzer, String res) throws IOException {
List<String> output = new ArrayList<>();
List<Integer> posInc = new ArrayList<>();
List<Integer> startOffsets = new ArrayList<>();
List<Integer> endOffsets = new ArrayList<>();
int finalOffset = -1;
-
- expected.reset();
- CharTermAttribute cattr = expected.getAttribute(CharTermAttribute.class);
- PositionIncrementAttribute pInc = expected.getAttribute(PositionIncrementAttribute.class);
- OffsetAttribute oattr = expected.getAttribute(OffsetAttribute.class);
- while(expected.incrementToken()) {
- output.add(cattr.toString());
- posInc.add(pInc.getPositionIncrement());
- startOffsets.add(oattr.startOffset());
- endOffsets.add(oattr.endOffset());
+ try (TokenStream expected = expectedAnalyzer.tokenStream("",
+ new InputStreamReader(this.getClass().getResourceAsStream(res), Charsets.UTF_8));
+ TokenStream actual = actualAnalyzer.tokenStream("",
+ new InputStreamReader(this.getClass().getResourceAsStream(res), Charsets.UTF_8))) {
+ expected.reset();
+ CharTermAttribute cattr = expected.getAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute pInc = expected.getAttribute(PositionIncrementAttribute.class);
+ OffsetAttribute oattr = expected.getAttribute(OffsetAttribute.class);
+ while(expected.incrementToken()) {
+ output.add(cattr.toString());
+ posInc.add(pInc.getPositionIncrement());
+ startOffsets.add(oattr.startOffset());
+ endOffsets.add(oattr.endOffset());
+ }
+ expected.end();
+ finalOffset = oattr.endOffset();
+ assertTokenStreamContents(actual, output.toArray(new String[0]), Ints.toArray(startOffsets), Ints.toArray(endOffsets), null, Ints.toArray(posInc), null, finalOffset, null, true);
}
- expected.end();
- finalOffset = oattr.endOffset();
- assertTokenStreamContents(actual, output.toArray(new String[0]), Ints.toArray(startOffsets), Ints.toArray(endOffsets), null, Ints.toArray(posInc), null, finalOffset, null, true);
- }
-
- private TokenStream tokenStream(Analyzer a, String res) {
- InputStream is = this.getClass().getResourceAsStream(res);
- closeAfterTest(is);
- return a.tokenStream("", new InputStreamReader(is, Charsets.UTF_8));
}
}
--
To view, visit https://gerrit.wikimedia.org/r/297960
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I54bbde7f40344306de636428792d283a7e77d753
Gerrit-PatchSet: 1
Gerrit-Project: search/extra
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>