This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch OPENNLP-1589 in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 9bac8a77c5718ec4a856c434a95eebc2698c0e08 Author: Richard Zowalla <[email protected]> AuthorDate: Thu Jul 4 13:48:40 2024 +0200 OPENNLP-1589 - Implements a more aggressive caching strategy based on a dual caching strategy; relies on cache evication as implemented by the Cache class of OpenNLP. - Use Arrays.equals(...) - Demonstrate that caching based on identity is broken. --- .../tools/namefind/TokenNameFinderFactory.java | 4 +- .../util/featuregen/CachedFeatureGenerator.java | 57 ++++++++++------------ .../featuregen/CachedFeatureGeneratorTest.java | 48 ++++++++++++++---- 3 files changed, 68 insertions(+), 41 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java index 6f31f0fe..da7277a4 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java @@ -212,13 +212,13 @@ public class TokenNameFinderFactory extends BaseToolFactory { AdaptiveFeatureGenerator featureGenerator = createFeatureGenerators(); if (featureGenerator == null) { - featureGenerator = new CachedFeatureGenerator( + featureGenerator = new CachedFeatureGenerator(new AggregatedFeatureGenerator( new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2), new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 2, 2), new OutcomePriorFeatureGenerator(), new PreviousMapFeatureGenerator(), new BigramNameFeatureGenerator(), - new SentenceFeatureGenerator(true, false)); + new SentenceFeatureGenerator(true, false))); } return new DefaultNameContextGenerator(featureGenerator); diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java index e26ffbdd..ff3fb639 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java @@ -19,6 +19,7 @@ package opennlp.tools.util.featuregen; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import opennlp.tools.util.Cache; @@ -30,52 +31,42 @@ public class CachedFeatureGenerator implements AdaptiveFeatureGenerator { private final AdaptiveFeatureGenerator generator; - private String[] prevTokens; - - private final Cache<Integer, List<String>> contextsCache; + private final Cache<Integer, Cache<Integer, List<String>>> contextCaches; + private final int cacheSize; private long numberOfCacheHits; private long numberOfCacheMisses; - @Deprecated - public CachedFeatureGenerator(AdaptiveFeatureGenerator... generators) { - this.generator = new AggregatedFeatureGenerator(generators); - contextsCache = new Cache<>(100); + public CachedFeatureGenerator(AdaptiveFeatureGenerator generator, int cacheSize) { + this.generator = generator; + this.contextCaches = new Cache<>(cacheSize); + this.cacheSize = cacheSize; } public CachedFeatureGenerator(AdaptiveFeatureGenerator generator) { - this.generator = generator; - contextsCache = new Cache<>(100); + this(generator, 100); } @Override public void createFeatures(List<String> features, String[] tokens, int index, - String[] previousOutcomes) { + String[] previousOutcomes) { - List<String> cacheFeatures; + final int tokenHash = Arrays.hashCode(tokens); - if (tokens == prevTokens) { - cacheFeatures = contextsCache.get(index); - - if (cacheFeatures != null) { - numberOfCacheHits++; - features.addAll(cacheFeatures); - return; - } + final Cache<Integer, List<String>> contextCache = contextCaches.computeIfAbsent(tokenHash, + k -> new Cache<>(cacheSize)); + List<String> cacheFeatures = contextCache.get(index); + if (cacheFeatures != null) { + numberOfCacheHits++; + features.addAll(cacheFeatures); } else { - contextsCache.clear(); - prevTokens = tokens; + numberOfCacheMisses++; + cacheFeatures = new ArrayList<>(); + generator.createFeatures(cacheFeatures, tokens, index, previousOutcomes); + contextCache.put(index, cacheFeatures); + features.addAll(cacheFeatures); } - - cacheFeatures = new ArrayList<>(); - - numberOfCacheMisses++; - - generator.createFeatures(cacheFeatures, tokens, index, previousOutcomes); - - contextsCache.put(index, cacheFeatures); - features.addAll(cacheFeatures); } @Override @@ -102,6 +93,12 @@ public class CachedFeatureGenerator implements AdaptiveFeatureGenerator { return numberOfCacheMisses; } + public void clearCache() { + this.contextCaches.clear(); + this.numberOfCacheMisses = 0; + this.numberOfCacheHits = 0; + } + @Override public String toString() { return super.toString() + ": hits=" + numberOfCacheHits diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java index 05861852..6f89fe4b 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java @@ -18,6 +18,7 @@ package opennlp.tools.util.featuregen; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.junit.jupiter.api.Assertions; @@ -47,6 +48,40 @@ public class CachedFeatureGeneratorTest { features = new ArrayList<>(); } + @Test + void testCachingOfRealWorldSentence() { + CachedFeatureGenerator generator = new CachedFeatureGenerator(identityGenerator); + final String[] sentence = "He belongs to Apache \n Software Foundation .".split(" "); + int testIndex = 0; + + // after this call features are cached for testIndex + generator.createFeatures(features, sentence, testIndex, null); + Assertions.assertEquals(1, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(0, generator.getNumberOfCacheHits()); + + generator.createFeatures(features, sentence, testIndex, null); + Assertions.assertEquals(1, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(1, generator.getNumberOfCacheHits()); + + generator.createFeatures(features, sentence, testIndex + 1, null); + Assertions.assertEquals(2, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(1, generator.getNumberOfCacheHits()); + + generator.createFeatures(features, sentence, testIndex + 1, null); + Assertions.assertEquals(2, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(2, generator.getNumberOfCacheHits()); + + generator.clearCache(); + + Assertions.assertEquals(0, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(0, generator.getNumberOfCacheHits()); + + generator.createFeatures(features, sentence, testIndex + 1, null); + Assertions.assertEquals(1, generator.getNumberOfCacheMisses()); + Assertions.assertEquals(0, generator.getNumberOfCacheHits()); + + } + /** * Tests if cache works for one sentence and two different token indexes. */ @@ -70,9 +105,7 @@ public class CachedFeatureGeneratorTest { final String expectedToken = testSentence1[testIndex]; - testSentence1[testIndex] = null; - - generator.createFeatures(features, testSentence1, testIndex, null); + generator.createFeatures(features, Arrays.copyOf(testSentence1, testSentence1.length), testIndex, null); Assertions.assertEquals(1, generator.getNumberOfCacheMisses()); Assertions.assertEquals(1, generator.getNumberOfCacheHits()); @@ -86,7 +119,7 @@ public class CachedFeatureGeneratorTest { int testIndex2 = testIndex + 1; - generator.createFeatures(features, testSentence1, testIndex2, null); + generator.createFeatures(features, Arrays.copyOf(testSentence1, testSentence1.length), testIndex2, null); Assertions.assertEquals(2, generator.getNumberOfCacheMisses()); Assertions.assertEquals(1, generator.getNumberOfCacheHits()); @@ -116,7 +149,7 @@ public class CachedFeatureGeneratorTest { features.clear(); // use another sentence but same index - generator.createFeatures(features, testSentence2, testIndex, null); + generator.createFeatures(features, Arrays.copyOf(testSentence2, testSentence2.length), testIndex, null); Assertions.assertEquals(2, generator.getNumberOfCacheMisses()); Assertions.assertEquals(0, generator.getNumberOfCacheHits()); @@ -128,10 +161,7 @@ public class CachedFeatureGeneratorTest { // check if features are really cached final String expectedToken = testSentence2[testIndex]; - - testSentence2[testIndex] = null; - - generator.createFeatures(features, testSentence2, testIndex, null); + generator.createFeatures(features, Arrays.copyOf(testSentence2, testSentence2.length), testIndex, null); Assertions.assertTrue(features.contains(expectedToken)); Assertions.assertEquals(1, features.size());
