Repository: metron Updated Branches: refs/heads/master a2bae0bce -> c8e84fa3b
METRON-1061 Add Fuzzy String Scoring to Stellar (ottobackwards) closes apache/metron#667 Project: http://git-wip-us.apache.org/repos/asf/metron/repo Commit: http://git-wip-us.apache.org/repos/asf/metron/commit/d5dbfc20 Tree: http://git-wip-us.apache.org/repos/asf/metron/tree/d5dbfc20 Diff: http://git-wip-us.apache.org/repos/asf/metron/diff/d5dbfc20 Branch: refs/heads/master Commit: d5dbfc20cb42708c27e9a02c5f7eacac98604745 Parents: a2bae0b Author: ottobackwards <ottobackwa...@gmail.com> Authored: Sat Aug 26 10:46:01 2017 -0400 Committer: otto <o...@apache.org> Committed: Sat Aug 26 10:46:01 2017 -0400 ---------------------------------------------------------------------- dependencies_with_url.csv | 1 + metron-stellar/stellar-common/README.md | 14 +++ metron-stellar/stellar-common/pom.xml | 5 + .../stellar/dsl/functions/TextFunctions.java | 112 +++++++++++++++++++ .../dsl/functions/TextFunctionsTest.java | 101 +++++++++++++++++ 5 files changed, 233 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/dependencies_with_url.csv ---------------------------------------------------------------------- diff --git a/dependencies_with_url.csv b/dependencies_with_url.csv index 83078ad..fac1164 100644 --- a/dependencies_with_url.csv +++ b/dependencies_with_url.csv @@ -177,6 +177,7 @@ commons-logging:commons-logging:jar:1.1.3:compile,ASLv2,http://commons.apache.or commons-logging:commons-logging:jar:1.2:compile,ASLv2,http://commons.apache.org/proper/commons-logging/ commons-net:commons-net:jar:3.1:compile,ASLv2,http://commons.apache.org/net/ commons-net:commons-net:jar:3.1:provided,ASLv2,http://commons.apache.org/net/ +commons-text:commons-text:jar:1.1:compile,ASLv2,http://commons.apache.org/proper/commons-text/ commons-validator:commons-validator:jar:1.4.0:compile,ASLv2,http://commons.apache.org/validator/ 
commons-validator:commons-validator:jar:1.5.1:compile,ASLv2,http://commons.apache.org/proper/commons-validator/ commons-validator:commons-validator:jar:1.6:compile,ASLv2,http://commons.apache.org/proper/commons-validator/ http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/metron-stellar/stellar-common/README.md ---------------------------------------------------------------------- diff --git a/metron-stellar/stellar-common/README.md b/metron-stellar/stellar-common/README.md index a25c831..8746e60 100644 --- a/metron-stellar/stellar-common/README.md +++ b/metron-stellar/stellar-common/README.md @@ -131,6 +131,8 @@ In the core language functions, we support basic functional programming primitiv | [ `FILL_RIGHT`](#fill_right) | | [ `FILTER`](#filter) | | [ `FLOOR`](#floor) | +| [ `FUZZY_LANGS`](#fuzzy_langs) | +| [ `FUZZY_SCORE`](#fuzzy_score) | | [ `FORMAT`](#format) | | [ `GEO_GET`](#geo_get) | | [ `GET`](#get) | @@ -412,6 +414,18 @@ In the core language functions, we support basic functional programming primitiv * format - string * arguments... - object(s) * Returns: A formatted string. + +### `FUZZY_LANGS` + * Description: Returns a list of IETF BCP 47 language tags available to the system, such as en, fr, de. + * Returns: A list of IETF BCP 47 language tag strings + +### `FUZZY_SCORE` + * Description: Returns the Fuzzy Score which indicates the similarity score between two strings. One point is given for every matched character. Subsequent matches yield two bonus points. A higher score indicates a higher similarity. + * Input: + * string - The full term that should be matched against. + * string - The query that will be matched against a term. + * string - The IETF BCP 47 language code to use. + * Returns: An Integer representing the score. 
### `GEO_GET` * Description: Look up an IPV4 address and returns geographic information about it http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/metron-stellar/stellar-common/pom.xml ---------------------------------------------------------------------- diff --git a/metron-stellar/stellar-common/pom.xml b/metron-stellar/stellar-common/pom.xml index 2f4cb6e..5945bbd 100644 --- a/metron-stellar/stellar-common/pom.xml +++ b/metron-stellar/stellar-common/pom.xml @@ -97,6 +97,11 @@ <version>1.10</version> </dependency> <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-text</artifactId> + <version>1.1</version> + </dependency> + <dependency> <groupId>commons-validator</groupId> <artifactId>commons-validator</artifactId> <version>1.6</version> http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/metron-stellar/stellar-common/src/main/java/org/apache/metron/stellar/dsl/functions/TextFunctions.java ---------------------------------------------------------------------- diff --git a/metron-stellar/stellar-common/src/main/java/org/apache/metron/stellar/dsl/functions/TextFunctions.java b/metron-stellar/stellar-common/src/main/java/org/apache/metron/stellar/dsl/functions/TextFunctions.java new file mode 100644 index 0000000..01e5da4 --- /dev/null +++ b/metron-stellar/stellar-common/src/main/java/org/apache/metron/stellar/dsl/functions/TextFunctions.java @@ -0,0 +1,112 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with the License. 
You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.metron.stellar.dsl.functions; + +import com.google.common.collect.ImmutableList; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.text.similarity.FuzzyScore; +import org.apache.metron.stellar.dsl.BaseStellarFunction; +import org.apache.metron.stellar.dsl.ParseException; +import org.apache.metron.stellar.dsl.Stellar; + +public class TextFunctions { + + private static final List<String> tagsList; + + static { + List<String> tags = new ArrayList<>(); + for (Locale locale : Locale.getAvailableLocales()) { + tags.add(locale.toLanguageTag()); + } + tagsList = ImmutableList.copyOf(tags); + } + + @Stellar(name = "LANGS", + namespace = "FUZZY", + description = "Returns a list of IETF BCP 47 available to the system, such as en, fr, de. " + + "These values may be passed to FUZZY_SCORE", + params = {}, + returns = "A list of IEF BCP 47 language tag strings") + /** + * GetAvailableLanaguageTags exposes IETF BCP 47 language tags available to the system + */ + public static class GetAvailableLanaguageTags extends BaseStellarFunction { + + @Override + public Object apply(List<Object> list) { + return tagsList; + } + } + + @Stellar(name = "SCORE", + namespace = "FUZZY", + description = + "Returns the Fuzzy Score which indicates the similarity score between two Strings " + + + "One point is given for every matched character. Subsequent matches yield two bonus " + + + "points. 
A higher score indicates a higher similarity", + params = { + "string - The full term that should be matched against", + "string - The query that will be matched against a term", + "string - The IETF BCP 47 language code to use such as en, fr, de " + + + "( SEE FUZZY_LANGS and https://tools.ietf.org/html/bcp47)" + }, + returns = "integer representing the score") + /** + * FuzzyScoreFunction exposes the Apache Commons Text Similarity FuzzyScore through + * Stellar. + */ + public static class FuzzyScoreFunction extends BaseStellarFunction { + + @Override + public Object apply(List<Object> list) { + if (list.size() < 3) { + throw new IllegalStateException("FUZZY_SCORE expects three args: [string, string, string]"); + } + Object oterm = list.get(0); + Object oquery = list.get(1); + Object olang = list.get(2); + + // return 0 here, validate will pass 3 nulls + // if we change validate to pass default of expected type, we can differentiate + if (!(oterm instanceof String) || !(oquery instanceof String) || !(olang instanceof String)) { + return 0; + } + + String term = (String) oterm; + String query = (String) oquery; + String lang = (String) olang; + + if (!tagsList.contains(lang)) { + throw new ParseException( + "FUZZY_SCORE requires a valid IETF BCP47 language code see FUZZY_LANGS and https://tools.ietf.org/html/bcp47"); + } + + if (StringUtils.isEmpty(term) || StringUtils.isEmpty(query)) { + return 0; + } + + Locale locale = Locale.forLanguageTag(lang); + FuzzyScore score = new FuzzyScore(locale); + return score.fuzzyScore(term, query); + } + } +} http://git-wip-us.apache.org/repos/asf/metron/blob/d5dbfc20/metron-stellar/stellar-common/src/test/java/org/apache/metron/stellar/dsl/functions/TextFunctionsTest.java ---------------------------------------------------------------------- diff --git a/metron-stellar/stellar-common/src/test/java/org/apache/metron/stellar/dsl/functions/TextFunctionsTest.java 
b/metron-stellar/stellar-common/src/test/java/org/apache/metron/stellar/dsl/functions/TextFunctionsTest.java new file mode 100644 index 0000000..07b3619 --- /dev/null +++ b/metron-stellar/stellar-common/src/test/java/org/apache/metron/stellar/dsl/functions/TextFunctionsTest.java @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with the License. You may obtain + * a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package org.apache.metron.stellar.dsl.functions; + +import static org.apache.metron.stellar.common.utils.StellarProcessorUtils.run; +import static org.apache.metron.stellar.common.utils.StellarProcessorUtils.runPredicate; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.metron.stellar.dsl.DefaultVariableResolver; +import org.apache.metron.stellar.dsl.ParseException; +import org.junit.Assert; +import org.junit.Test; + +public class TextFunctionsTest { + + static final Map<String, String> variableMap = new HashMap<String, String>() {{ + put("metron", "metron"); + put("sentence", "metron is great"); + put("empty", ""); + put("english", "en"); + put("klingon", "Kling"); + put("asf", "Apache Software Foundation"); + }}; + + @Test + public void testGetAvailableLanguageTags() { + Object ret = run("FUZZY_LANGS()", new HashMap<>()); + Assert.assertNotNull(ret); + Assert.assertTrue(ret instanceof List); + List<String> tags = (List<String>) ret; + Assert.assertTrue(tags.size() > 0); + Assert.assertTrue(tags.contains("en")); + Assert.assertTrue(tags.contains("fr")); + } + + @Test() + public void testNoMatchStrings() throws Exception { + Assert.assertTrue(runPredicate("0 == FUZZY_SCORE(metron,'z',english)", + new DefaultVariableResolver(v -> variableMap.get(v), + v -> variableMap.containsKey(v)))); + } + + @Test(expected = ParseException.class) + public void testMissingLanguage() throws Exception { + runPredicate("0 == FUZZY_SCORE(metron,'z',klingon)", + new DefaultVariableResolver(v -> variableMap.get(v), + v -> variableMap.containsKey(v))); + } + + @Test() + public void testEmptyFirstArg() throws Exception { + Assert.assertTrue(runPredicate("0 == FUZZY_SCORE(empty,'z',english)", + new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v)))); + } + + @Test() + public void testEmptyFirstTwoArgs() throws Exception { + Assert.assertTrue(runPredicate("0 == FUZZY_SCORE(empty,empty,english)", + new 
DefaultVariableResolver(v -> variableMap.get(v), + v -> variableMap.containsKey(v)))); + } + + @Test(expected = ParseException.class) + public void testEmptyArgs() throws Exception { + runPredicate("0 == FUZZY_SCORE(empty,empty,empty)", + new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v))); + } + + @Test(expected = ParseException.class) + public void testNoArgs() throws Exception { + runPredicate("0 == FUZZY_SCORE()", + new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v))); + } + + @Test + public void testHappyStringFunctions() throws Exception { + Assert + .assertTrue(runPredicate("1 == FUZZY_SCORE(metron,'m',english)", + new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v)))); + Assert.assertTrue( + runPredicate("16 == FUZZY_SCORE(metron,'metron',english)", + new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v)))); + Assert.assertTrue(runPredicate("3 == FUZZY_SCORE(asf,'asf',english)", + new DefaultVariableResolver(v -> variableMap.get(v), v -> variableMap.containsKey(v)))); + } +}