This is an automated email from the ASF dual-hosted git repository. kinow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-text.git
The following commit(s) were added to refs/heads/master by this push: new a32b1d2 TEXT-104: deprecate JaroWinkler methods for 2.0, and fix clirr report new 56c060d Merge pull request #102 from kinow/deprecate-jaro-winkler-for-20 a32b1d2 is described below commit a32b1d2948c7b1b4a07eda8a72fb5cd5bdf2dd00 Author: Bruno P. Kinoshita <bruno.kinosh...@niwa.co.nz> AuthorDate: Thu Feb 21 10:44:02 2019 +1300 TEXT-104: deprecate JaroWinkler methods for 2.0, and fix clirr report --- .../text/similarity/JaroWinklerDistance.java | 91 +++++++++++++++++++++- .../text/similarity/JaroWinklerDistanceTest.java | 45 +++++++---- 2 files changed, 117 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java index eaa9fb3..388d0c7 100644 --- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java @@ -16,6 +16,8 @@ */ package org.apache.commons.text.similarity; +import java.util.Arrays; + /** * Measures the Jaro-Winkler distance of two character sequences. * It is the complementary of Jaro-Winkler similarity. @@ -25,6 +27,12 @@ package org.apache.commons.text.similarity; public class JaroWinklerDistance implements EditDistance<Double> { /** + * @deprecated Deprecated as of 1.7. This constant will be removed in 2.0. + */ + @Deprecated + public static final int INDEX_NOT_FOUND = -1; + + /** * Computes the Jaro Winkler Distance between two character sequences. * * <pre> @@ -63,7 +71,86 @@ public class JaroWinklerDistance implements EditDistance<Double> { throw new IllegalArgumentException("CharSequences must not be null"); } - JaroWinklerSimilarity similarity = new JaroWinklerSimilarity(); - return 1 - similarity.apply(left, right); + // TODO: replace the rest of the code by this in 2.0, see TEXT-104 + // + // JaroWinklerSimilarity similarity = new JaroWinklerSimilarity(); + // return 1 - similarity.apply(left, right); + + final double defaultScalingFactor = 0.1; + final int[] mtp = matches(left, right); + final double m = mtp[0]; + if (m == 0) { + return 0D; + } + final double j = ((m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m)) / 3; + final double jw = j < 0.7D ? j : j + defaultScalingFactor * mtp[2] * (1D - j); + return jw; + } + + // TODO: remove this method in 2.0, see TEXT-104 + /** + * This method returns the Jaro-Winkler string matches, half transpositions, prefix array. + * + * @param first the first string to be matched + * @param second the second string to be matched + * @return mtp array containing: matches, half transpositions, and prefix + * @deprecated Deprecated as of 1.7. This method will be removed in 2.0, and moved to a Jaro Winkler similarity + * class. + */ + @Deprecated + protected static int[] matches(final CharSequence first, final CharSequence second) { + CharSequence max, min; + if (first.length() > second.length()) { + max = first; + min = second; + } else { + max = second; + min = first; + } + final int range = Math.max(max.length() / 2 - 1, 0); + final int[] matchIndexes = new int[min.length()]; + Arrays.fill(matchIndexes, -1); + final boolean[] matchFlags = new boolean[max.length()]; + int matches = 0; + for (int mi = 0; mi < min.length(); mi++) { + final char c1 = min.charAt(mi); + for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) { + if (!matchFlags[xi] && c1 == max.charAt(xi)) { + matchIndexes[mi] = xi; + matchFlags[xi] = true; + matches++; + break; + } + } + } + final char[] ms1 = new char[matches]; + final char[] ms2 = new char[matches]; + for (int i = 0, si = 0; i < min.length(); i++) { + if (matchIndexes[i] != -1) { + ms1[si] = min.charAt(i); + si++; + } + } + for (int i = 0, si = 0; i < max.length(); i++) { + if (matchFlags[i]) { + ms2[si] = max.charAt(i); + si++; + } + } + int halfTranspositions = 0; + for (int mi = 0; mi < ms1.length; mi++) { + if (ms1[mi] != ms2[mi]) { + halfTranspositions++; + } + } + int prefix = 0; + for (int mi = 0; mi < Math.min(4, min.length()); mi++) { + if (first.charAt(mi) == second.charAt(mi)) { + prefix++; + } else { + break; + } + } + return new int[] {matches, halfTranspositions, prefix}; } } diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java index eadf1a2..e56ec07 100644 --- a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java @@ -36,23 +36,34 @@ public class JaroWinklerDistanceTest { @Test public void testGetJaroWinklerDistance_StringString() { - assertEquals(0d, distance.apply("", ""), 0.00001d); - assertEquals(0d, distance.apply("foo", "foo"), 0.00001d); - assertEquals(1 - 0.94166d, distance.apply("foo", "foo "), 0.00001d); - assertEquals(1 - 0.90666d, distance.apply("foo", "foo "), 0.00001d); - assertEquals(1 - 0.86666d, distance.apply("foo", " foo "), 0.00001d); - assertEquals(1 - 0.51111d, distance.apply("foo", " foo"), 0.00001d); - assertEquals(1 - 0.92499d, distance.apply("frog", "fog"), 0.00001d); - assertEquals(1.0d, distance.apply("fly", "ant"), 0.00000000000000000001d); - assertEquals(1 - 0.44166d, distance.apply("elephant", "hippo"), 0.00001d); - assertEquals(1 - 0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d); - assertEquals(1 - 0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d); - assertEquals(1 - 0.942d, - distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d); - assertEquals(1 - 0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d); - assertEquals(1 - 0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d); - assertEquals(1 - 0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d); - assertEquals(1 - 0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d); + assertEquals(0.92499d, distance.apply("frog", "fog"), 0.00001d); + assertEquals(0.0d, distance.apply("fly", "ant"), 0.00000000000000000001d); + assertEquals(0.44166d, distance.apply("elephant", "hippo"), 0.00001d); + assertEquals(0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d); + assertEquals(0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d); + assertEquals(0.942d, distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d); + assertEquals(0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d); + assertEquals(0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d); + assertEquals(0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d); + assertEquals(0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d); + // TODO: replace tests in 2.0. See TEXT-104 for more. + // assertEquals(0d, distance.apply("", ""), 0.00001d); + // assertEquals(0d, distance.apply("foo", "foo"), 0.00001d); + // assertEquals(1 - 0.94166d, distance.apply("foo", "foo "), 0.00001d); + // assertEquals(1 - 0.90666d, distance.apply("foo", "foo "), 0.00001d); + // assertEquals(1 - 0.86666d, distance.apply("foo", " foo "), 0.00001d); + // assertEquals(1 - 0.51111d, distance.apply("foo", " foo"), 0.00001d); + // assertEquals(1 - 0.92499d, distance.apply("frog", "fog"), 0.00001d); + // assertEquals(1.0d, distance.apply("fly", "ant"), 0.00000000000000000001d); + // assertEquals(1 - 0.44166d, distance.apply("elephant", "hippo"), 0.00001d); + // assertEquals(1 - 0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d); + // assertEquals(1 - 0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d); + // assertEquals(1 - 0.942d, + // distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d); + // assertEquals(1 - 0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d); + // assertEquals(1 - 0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d); + // assertEquals(1 - 0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d); + // assertEquals(1 - 0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d); } @Test