This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch OPENNLP-1589
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit 9bac8a77c5718ec4a856c434a95eebc2698c0e08
Author: Richard Zowalla <[email protected]>
AuthorDate: Thu Jul 4 13:48:40 2024 +0200

    OPENNLP-1589 - Implements a more aggressive caching strategy based on a 
dual caching strategy; relies on cache evication as implemented by the Cache 
class of OpenNLP.
     - Use Arrays.equals(...)
     - Demonstrate that caching based on identity is broken.
---
 .../tools/namefind/TokenNameFinderFactory.java     |  4 +-
 .../util/featuregen/CachedFeatureGenerator.java    | 57 ++++++++++------------
 .../featuregen/CachedFeatureGeneratorTest.java     | 48 ++++++++++++++----
 3 files changed, 68 insertions(+), 41 deletions(-)

diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java
 
b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java
index 6f31f0fe..da7277a4 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java
@@ -212,13 +212,13 @@ public class TokenNameFinderFactory extends 
BaseToolFactory {
     AdaptiveFeatureGenerator featureGenerator = createFeatureGenerators();
 
     if (featureGenerator == null) {
-      featureGenerator = new CachedFeatureGenerator(
+      featureGenerator = new CachedFeatureGenerator(new 
AggregatedFeatureGenerator(
           new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
           new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 2, 
2),
           new OutcomePriorFeatureGenerator(),
           new PreviousMapFeatureGenerator(),
           new BigramNameFeatureGenerator(),
-          new SentenceFeatureGenerator(true, false));
+          new SentenceFeatureGenerator(true, false)));
     }
 
     return new DefaultNameContextGenerator(featureGenerator);
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java
 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java
index e26ffbdd..ff3fb639 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java
@@ -19,6 +19,7 @@
 package opennlp.tools.util.featuregen;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import opennlp.tools.util.Cache;
@@ -30,52 +31,42 @@ public class CachedFeatureGenerator implements 
AdaptiveFeatureGenerator {
 
   private final AdaptiveFeatureGenerator generator;
 
-  private String[] prevTokens;
-
-  private final Cache<Integer, List<String>> contextsCache;
+  private final Cache<Integer, Cache<Integer, List<String>>> contextCaches;
+  private final int cacheSize;
 
   private long numberOfCacheHits;
   private long numberOfCacheMisses;
 
-  @Deprecated
-  public CachedFeatureGenerator(AdaptiveFeatureGenerator... generators) {
-    this.generator = new AggregatedFeatureGenerator(generators);
-    contextsCache = new Cache<>(100);
+  public CachedFeatureGenerator(AdaptiveFeatureGenerator generator, int 
cacheSize) {
+    this.generator = generator;
+    this.contextCaches = new Cache<>(cacheSize);
+    this.cacheSize = cacheSize;
   }
 
   public CachedFeatureGenerator(AdaptiveFeatureGenerator generator) {
-    this.generator = generator;
-    contextsCache = new Cache<>(100);
+    this(generator, 100);
   }
 
   @Override
   public void createFeatures(List<String> features, String[] tokens, int index,
-      String[] previousOutcomes) {
+                             String[] previousOutcomes) {
 
-    List<String> cacheFeatures;
+    final int tokenHash = Arrays.hashCode(tokens);
 
-    if (tokens == prevTokens) {
-      cacheFeatures = contextsCache.get(index);
-
-      if (cacheFeatures != null) {
-        numberOfCacheHits++;
-        features.addAll(cacheFeatures);
-        return;
-      }
+    final Cache<Integer, List<String>> contextCache = 
contextCaches.computeIfAbsent(tokenHash,
+        k -> new Cache<>(cacheSize));
+    List<String> cacheFeatures = contextCache.get(index);
 
+    if (cacheFeatures != null) {
+      numberOfCacheHits++;
+      features.addAll(cacheFeatures);
     } else {
-      contextsCache.clear();
-      prevTokens = tokens;
+      numberOfCacheMisses++;
+      cacheFeatures = new ArrayList<>();
+      generator.createFeatures(cacheFeatures, tokens, index, previousOutcomes);
+      contextCache.put(index, cacheFeatures);
+      features.addAll(cacheFeatures);
     }
-
-    cacheFeatures = new ArrayList<>();
-
-    numberOfCacheMisses++;
-
-    generator.createFeatures(cacheFeatures, tokens, index, previousOutcomes);
-
-    contextsCache.put(index, cacheFeatures);
-    features.addAll(cacheFeatures);
   }
 
   @Override
@@ -102,6 +93,12 @@ public class CachedFeatureGenerator implements 
AdaptiveFeatureGenerator {
     return numberOfCacheMisses;
   }
 
+  public void clearCache() {
+    this.contextCaches.clear();
+    this.numberOfCacheMisses = 0;
+    this.numberOfCacheHits = 0;
+  }
+
   @Override
   public String toString() {
     return super.toString() + ": hits=" + numberOfCacheHits
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java
index 05861852..6f89fe4b 100644
--- 
a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/CachedFeatureGeneratorTest.java
@@ -18,6 +18,7 @@
 package opennlp.tools.util.featuregen;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import org.junit.jupiter.api.Assertions;
@@ -47,6 +48,40 @@ public class CachedFeatureGeneratorTest {
     features = new ArrayList<>();
   }
 
+  @Test
+  void testCachingOfRealWorldSentence() {
+    CachedFeatureGenerator generator = new 
CachedFeatureGenerator(identityGenerator);
+    final String[] sentence = "He belongs to Apache \n Software Foundation 
.".split(" ");
+    int testIndex = 0;
+
+    // after this call features are cached for testIndex
+    generator.createFeatures(features, sentence, testIndex, null);
+    Assertions.assertEquals(1, generator.getNumberOfCacheMisses());
+    Assertions.assertEquals(0, generator.getNumberOfCacheHits());
+
+    generator.createFeatures(features, sentence, testIndex, null);
+    Assertions.assertEquals(1, generator.getNumberOfCacheMisses());
+    Assertions.assertEquals(1, generator.getNumberOfCacheHits());
+
+    generator.createFeatures(features, sentence, testIndex + 1, null);
+    Assertions.assertEquals(2, generator.getNumberOfCacheMisses());
+    Assertions.assertEquals(1, generator.getNumberOfCacheHits());
+
+    generator.createFeatures(features, sentence, testIndex + 1, null);
+    Assertions.assertEquals(2, generator.getNumberOfCacheMisses());
+    Assertions.assertEquals(2, generator.getNumberOfCacheHits());
+
+    generator.clearCache();
+
+    Assertions.assertEquals(0, generator.getNumberOfCacheMisses());
+    Assertions.assertEquals(0, generator.getNumberOfCacheHits());
+
+    generator.createFeatures(features, sentence, testIndex + 1, null);
+    Assertions.assertEquals(1, generator.getNumberOfCacheMisses());
+    Assertions.assertEquals(0, generator.getNumberOfCacheHits());
+
+  }
+
   /**
    * Tests if cache works for one sentence and two different token indexes.
    */
@@ -70,9 +105,7 @@ public class CachedFeatureGeneratorTest {
 
     final String expectedToken = testSentence1[testIndex];
 
-    testSentence1[testIndex] = null;
-
-    generator.createFeatures(features, testSentence1, testIndex, null);
+    generator.createFeatures(features, Arrays.copyOf(testSentence1, 
testSentence1.length), testIndex, null);
 
     Assertions.assertEquals(1, generator.getNumberOfCacheMisses());
     Assertions.assertEquals(1, generator.getNumberOfCacheHits());
@@ -86,7 +119,7 @@ public class CachedFeatureGeneratorTest {
 
     int testIndex2 = testIndex + 1;
 
-    generator.createFeatures(features, testSentence1, testIndex2, null);
+    generator.createFeatures(features, Arrays.copyOf(testSentence1, 
testSentence1.length), testIndex2, null);
 
     Assertions.assertEquals(2, generator.getNumberOfCacheMisses());
     Assertions.assertEquals(1, generator.getNumberOfCacheHits());
@@ -116,7 +149,7 @@ public class CachedFeatureGeneratorTest {
     features.clear();
 
     // use another sentence but same index
-    generator.createFeatures(features, testSentence2, testIndex, null);
+    generator.createFeatures(features, Arrays.copyOf(testSentence2, 
testSentence2.length), testIndex, null);
 
     Assertions.assertEquals(2, generator.getNumberOfCacheMisses());
     Assertions.assertEquals(0, generator.getNumberOfCacheHits());
@@ -128,10 +161,7 @@ public class CachedFeatureGeneratorTest {
 
     // check if features are really cached
     final String expectedToken = testSentence2[testIndex];
-
-    testSentence2[testIndex] = null;
-
-    generator.createFeatures(features, testSentence2, testIndex, null);
+    generator.createFeatures(features, Arrays.copyOf(testSentence2, 
testSentence2.length), testIndex, null);
 
     Assertions.assertTrue(features.contains(expectedToken));
     Assertions.assertEquals(1, features.size());

Reply via email to