EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/326168 )
Change subject: Lucene Stemmer UDF ...................................................................... Lucene Stemmer UDF Implements a stemmer for text using lucene for a variety of languages. This is needed for some processing of search queries, but may be useful for other use cases as well. Bug: T148811 Change-Id: I458e7ac724fefe813732b48fcfcef4728359fca9 --- A refinery-core/src/test/resources/stemmer_test_data.csv M refinery-hive/pom.xml A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/StemmerUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestStemmerUDF.java 4 files changed, 199 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source refs/changes/68/326168/1 diff --git a/refinery-core/src/test/resources/stemmer_test_data.csv b/refinery-core/src/test/resources/stemmer_test_data.csv new file mode 100644 index 0000000..e8c71e4 --- /dev/null +++ b/refinery-core/src/test/resources/stemmer_test_data.csv @@ -0,0 +1,8 @@ +text,lang,stemmed +testing the english stemmer,en,test english stemmer +اختبار محلل جذوع العربية,ar,اختبار محلل جذوع عرب +тестване на българина произтичащо,bg,тестван българин произтичащ +Testando o braço brasileiro,br,test brac brasileir +provar el català derivats,ca,prov catal deriv +Testování české vyplývající,cz,testován česk vyplývajík +afprøvning af dansk hidrører,da,afprøvning dansk hidrør diff --git a/refinery-hive/pom.xml b/refinery-hive/pom.xml index 1c7ba2a..669852e 100644 --- a/refinery-hive/pom.xml +++ b/refinery-hive/pom.xml @@ -30,6 +30,11 @@ <groupId>org.wikimedia.analytics.refinery.core</groupId> <artifactId>refinery-core</artifactId> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>5.5.3</version> + </dependency> <dependency> <groupId>org.apache.hadoop</groupId> diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/StemmerUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/StemmerUDF.java new file mode 100644 index 0000000..849fbb5 --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/StemmerUDF.java @@ -0,0 +1,144 @@ +package org.wikimedia.analytics.refinery.hive; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Analyzer;; +import org.apache.lucene.analysis.ar.ArabicAnalyzer; +import org.apache.lucene.analysis.bg.BulgarianAnalyzer; +import org.apache.lucene.analysis.br.BrazilianAnalyzer; +import org.apache.lucene.analysis.ca.CatalanAnalyzer; +import org.apache.lucene.analysis.cz.CzechAnalyzer; +import org.apache.lucene.analysis.da.DanishAnalyzer; +import org.apache.lucene.analysis.de.GermanAnalyzer; +import org.apache.lucene.analysis.el.GreekAnalyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.es.SpanishAnalyzer; +import org.apache.lucene.analysis.eu.BasqueAnalyzer; +import org.apache.lucene.analysis.fa.PersianAnalyzer; +import org.apache.lucene.analysis.fi.FinnishAnalyzer; +import org.apache.lucene.analysis.fr.FrenchAnalyzer; +import org.apache.lucene.analysis.ga.IrishAnalyzer; +import org.apache.lucene.analysis.gl.GalicianAnalyzer; +import org.apache.lucene.analysis.hi.HindiAnalyzer; +import org.apache.lucene.analysis.hu.HungarianAnalyzer; +import org.apache.lucene.analysis.hy.ArmenianAnalyzer; +import org.apache.lucene.analysis.id.IndonesianAnalyzer; +import org.apache.lucene.analysis.it.ItalianAnalyzer; +import org.apache.lucene.analysis.lt.LithuanianAnalyzer; +import org.apache.lucene.analysis.lv.LatvianAnalyzer; +import org.apache.lucene.analysis.nl.DutchAnalyzer; +import org.apache.lucene.analysis.no.NorwegianAnalyzer; +import org.apache.lucene.analysis.pt.PortugueseAnalyzer; +import org.apache.lucene.analysis.ro.RomanianAnalyzer; +import org.apache.lucene.analysis.ru.RussianAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.sv.SwedishAnalyzer; +import org.apache.lucene.analysis.th.ThaiAnalyzer; +import org.apache.lucene.analysis.tr.TurkishAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import com.google.common.base.Joiner; + +public class StemmerUDF extends UDF { + public String evaluate(String text) { + return evaluate(text, "en"); + } + public String evaluate(String text, String lang) { + if (text == null) { + return ""; + } + List<String> words = new ArrayList<>(); + try ( Analyzer analyzer = getAnalyzer(lang); + TokenStream ts = analyzer.tokenStream("", new StringReader(text)) + ) { + ts.reset(); + CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class); + while(ts.incrementToken()) { + words.add(cattr.toString()); + } + } catch (IOException e) { + return text; + } + return Joiner.on(" ").join(words); + } + + private Analyzer getAnalyzer(String lang) { + if (lang == null) { + // Sane? + return new StandardAnalyzer(); + } + switch(lang) { + case "ar": + return new ArabicAnalyzer(); + case "bg": + return new BulgarianAnalyzer(); + case "br": + return new BrazilianAnalyzer(); + case "ca": + return new CatalanAnalyzer(); + case "cz": + return new CzechAnalyzer(); + case "da": + return new DanishAnalyzer(); + case "de": + return new GermanAnalyzer(); + case "el": + return new GreekAnalyzer(); + case "en": + return new EnglishAnalyzer(); + case "es": + return new SpanishAnalyzer(); + case "eu": + return new BasqueAnalyzer(); + case "fa": + return new PersianAnalyzer(); + case "fi": + return new FinnishAnalyzer(); + case "fr": + return new FrenchAnalyzer(); + case "ga": + return new IrishAnalyzer(); + case "gl": + return new GalicianAnalyzer(); + case "hi": + return new HindiAnalyzer(); + case "hu": + return new HungarianAnalyzer(); + case "hy": + return new ArmenianAnalyzer(); + case "id": + return new IndonesianAnalyzer(); + case "it": + return new ItalianAnalyzer(); + case "lt": + return new LithuanianAnalyzer(); + case "lv": + return new LatvianAnalyzer(); + case "nl": + return new DutchAnalyzer(); + case "no": + return new NorwegianAnalyzer(); + case "pt": + return new PortugueseAnalyzer(); + case "ro": + return new RomanianAnalyzer(); + case "ru": + return new RussianAnalyzer(); + case "sv": + return new SwedishAnalyzer(); + case "th": + return new ThaiAnalyzer(); + case "tr": + return new TurkishAnalyzer(); + default: + // Does this make any sense? + return new StandardAnalyzer(); + } + } + +} diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestStemmerUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestStemmerUDF.java new file mode 100644 index 0000000..fe970c1 --- /dev/null +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestStemmerUDF.java @@ -0,0 +1,42 @@ +/** + * Copyright (C) 2015 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.wikimedia.analytics.refinery.hive; + +import org.junit.Test; +import org.junit.runner.RunWith; +import static org.junit.Assert.assertEquals; + +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; + +@RunWith(JUnitParamsRunner.class) +public class TestStemmerUDF { + + @Test + @FileParameters( + value = "../refinery-core/src/test/resources/stemmer_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testEvaluate( + String text, + String lang, + String stemmed + ) { + StemmerUDF udf = new StemmerUDF(); + assertEquals(stemmed, udf.evaluate(text, lang)); + } +} -- To view, visit https://gerrit.wikimedia.org/r/326168 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I458e7ac724fefe813732b48fcfcef4728359fca9 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits