This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/main by this push:
     new e91ceb17 OPENNLP-1661: Fix custom models being wiped from OpenNLP 
user.home directory (#704)
e91ceb17 is described below

commit e91ceb17e36120b66dafdf15d2e11fc6a59c92c6
Author: Martin Wiesner <[email protected]>
AuthorDate: Tue Dec 3 07:35:19 2024 +0100

    OPENNLP-1661: Fix custom models being wiped from OpenNLP user.home 
directory (#704)
    
    - deletes AbstractDownloadUtilTest.java removing historical code that wiped 
models
    - adds package-private DownloadUtil#existsModel(..) method to check models 
for certain language exist locally
    - adds to and adjusts related test classes
---
 .../main/java/opennlp/tools/util/DownloadUtil.java | 60 +++++++++++++---
 .../tools/util/AbstractDownloadUtilTest.java       | 79 ----------------------
 .../tools/util/DownloadUtilDownloadTwiceTest.java  | 30 ++++----
 .../java/opennlp/tools/util/DownloadUtilTest.java  | 30 ++++++--
 4 files changed, 90 insertions(+), 109 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java 
b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
index 11b328da..cd80792f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
@@ -77,9 +77,45 @@ public class DownloadUtil {
       System.getProperty("OPENNLP_DOWNLOAD_BASE_URL", 
"https://dlcdn.apache.org/opennlp/";);
   private static final String MODEL_URI_PATH =
       System.getProperty("OPENNLP_DOWNLOAD_MODEL_PATH", 
"models/ud-models-1.2/");
+  private static final String OPENNLP_DOWNLOAD_HOME = "OPENNLP_DOWNLOAD_HOME";
 
   private static Map<String, Map<ModelType, String>> availableModels;
 
+  /**
+   * Checks if a model of the specified {@code modelType} has been downloaded 
already
+   * for a particular {@code language}.
+   *
+   * @param language  The ISO language code of the requested model.
+   * @param modelType The {@link DownloadUtil.ModelType type} of model.
+   * @return {@code true} if a model exists locally, {@code false} otherwise.
+   * @throws IOException Thrown if IO errors occurred or the computed hash sum
+   * of an associated, local model file was incorrect.
+   */
+  static boolean existsModel(String language, ModelType modelType) throws 
IOException {
+    Map<ModelType, String> modelsByLanguage = 
getAvailableModels().get(language);
+    if (modelsByLanguage == null) {
+      return false;
+    } else {
+      final String url = modelsByLanguage.get(modelType);
+      if (url != null) {
+        final Path homeDirectory = getDownloadHome();
+        final String filename = url.substring(url.lastIndexOf("/") + 1);
+        final Path localFile = homeDirectory.resolve(filename);
+        boolean exists;
+        if (Files.exists(localFile)) {
+          // if this does not throw the requested model is valid!
+          validateModel(new URL(url + ".sha512"), localFile);
+          exists = true;
+        } else {
+          exists = false;
+        }
+        return exists;
+      } else {
+        return false;
+      }
+    }
+  }
+
   /**
    * Triggers a download for the specified {@link DownloadUtil.ModelType}.
    *
@@ -94,7 +130,7 @@ public class DownloadUtil {
                                                       Class<T> type) throws 
IOException {
 
     if (getAvailableModels().containsKey(language)) {
-      final String url = (getAvailableModels().get(language).get(modelType));
+      final String url = getAvailableModels().get(language).get(modelType);
       if (url != null) {
         return downloadModel(new URL(url), type);
       }
@@ -119,8 +155,7 @@ public class DownloadUtil {
    */
   public static <T extends BaseModel> T downloadModel(URL url, Class<T> type) 
throws IOException {
 
-    final Path homeDirectory = 
Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
-        System.getProperty("user.home"))).resolve(".opennlp");
+    final Path homeDirectory = getDownloadHome();
 
     if (!Files.isDirectory(homeDirectory)) {
       try {
@@ -131,20 +166,17 @@ public class DownloadUtil {
     }
 
     final String filename = 
url.toString().substring(url.toString().lastIndexOf("/") + 1);
-    final Path localFile = Paths.get(homeDirectory.toString(), filename);
+    final Path localFile = homeDirectory.resolve(filename);
 
     if (!Files.exists(localFile)) {
-      logger.debug("Downloading model from {} to {}.", url, localFile);
+      logger.debug("Downloading model to {}.", localFile);
 
       try (final InputStream in = url.openStream()) {
         Files.copy(in, localFile, StandardCopyOption.REPLACE_EXISTING);
       }
-
       validateModel(new URL(url + ".sha512"), localFile);
-
       logger.debug("Download complete.");
     } else {
-      System.out.println("Model file already exists. Skipping download.");
       logger.debug("Model file '{}' already exists. Skipping download.", 
filename);
     }
 
@@ -167,7 +199,7 @@ public class DownloadUtil {
   }
 
   /**
-   * Validates the downloaded model.
+   * Validates a downloaded model via the specified {@link Path 
downloadedModel path}.
    *
    * @param sha512          the url to get the sha512 hash
    * @param downloadedModel the model file to check
@@ -187,8 +219,8 @@ public class DownloadUtil {
     // Validate SHA512 checksum
     final String actualChecksum = calculateSHA512(downloadedModel);
     if (!actualChecksum.equalsIgnoreCase(expectedChecksum)) {
-      throw new IOException("SHA512 checksum validation failed. Expected: "
-          + expectedChecksum + ", but got: " + actualChecksum);
+      throw new IOException("SHA512 checksum validation failed for " + 
downloadedModel.getFileName() +
+          ". Expected: " + expectedChecksum + ", but got: " + actualChecksum);
     }
   }
 
@@ -198,6 +230,7 @@ public class DownloadUtil {
       try (InputStream fis = Files.newInputStream(file);
            DigestInputStream dis = new DigestInputStream(fis, digest)) {
         byte[] buffer = new byte[4096];
+        //noinspection StatementWithEmptyBody
         while (dis.read(buffer) != -1) {
           // Reading the file to update the digest
         }
@@ -217,6 +250,11 @@ public class DownloadUtil {
     }
   }
 
+  private static Path getDownloadHome() {
+    return Paths.get(System.getProperty(OPENNLP_DOWNLOAD_HOME,
+            System.getProperty("user.home"))).resolve(".opennlp");
+  }
+
   @Internal
   static class DownloadParser {
 
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java 
b/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java
deleted file mode 100644
index 9fdde8b6..00000000
--- 
a/opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.util;
-
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.nio.file.DirectoryStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
-import org.junit.jupiter.api.BeforeAll;
-
-import opennlp.tools.EnabledWhenCDNAvailable;
-
-import static org.junit.jupiter.api.Assertions.fail;
-
-@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
-public abstract class AbstractDownloadUtilTest {
-
-  private static final String APACHE_CDN = "dlcdn.apache.org";
-
-  @BeforeAll
-  public static void cleanupWhenOnline() {
-    boolean isOnline;
-    try (Socket socket = new Socket()) {
-      socket.connect(new InetSocketAddress(APACHE_CDN, 80), 
EnabledWhenCDNAvailable.TIMEOUT_MS);
-      isOnline = true;
-    } catch (IOException e) {
-      // Unreachable, unresolvable or timeout
-      isOnline = false;
-    }
-    // If CDN is available -> go cleanup in preparation of the actual tests
-    if (isOnline) {
-      wipeExistingModelFiles("-tokens-");
-      wipeExistingModelFiles("-sentence-");
-      wipeExistingModelFiles("-pos-");
-      wipeExistingModelFiles("-lemmas-");
-    }
-  }
-
-
-  /*
-   * Helper method that wipes out mode files if they exist on the text 
execution env.
-   * Those model files are wiped from a hidden '.opennlp' subdirectory.
-   *
-   * Thereby, a clean download can be guaranteed - ín CDN is available and 
test are executed.
-   */
-  private static void wipeExistingModelFiles(final String fragment) {
-    final Path dir = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
-        System.getProperty("user.home"))).resolve(".opennlp");
-    if (Files.exists(dir)) {
-      try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, 
"*opennlp-*" + fragment + "*")) {
-        for (Path modelFileToWipe : stream) {
-          Files.deleteIfExists(modelFileToWipe);
-        }
-      } catch (IOException e) {
-        fail(e.getLocalizedMessage());
-      }
-    }
-  }
-
-}
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java
 
b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java
index 5f328b00..24bad516 100644
--- 
a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java
@@ -35,7 +35,7 @@ import opennlp.tools.sentdetect.SentenceModel;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
-public class DownloadUtilDownloadTwiceTest extends AbstractDownloadUtilTest {
+public class DownloadUtilDownloadTwiceTest {
 
   /*
    * Programmatic change to debug log to ensure that we can see log messages to
@@ -60,24 +60,24 @@ public class DownloadUtilDownloadTwiceTest extends 
AbstractDownloadUtilTest {
 
   @Test
   public void testDownloadModelTwice() throws IOException {
+    String lang = "de";
+    DownloadUtil.ModelType type = DownloadUtil.ModelType.SENTENCE_DETECTOR;
+    
     try (LogCaptor logCaptor = LogCaptor.forClass(DownloadUtil.class)) {
-
-      DownloadUtil.downloadModel("de",
-          DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
-
-      assertEquals(2, logCaptor.getDebugLogs().size());
-      checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "Download 
complete.");
+      boolean alreadyDownloaded = DownloadUtil.existsModel(lang, type);
+      DownloadUtil.downloadModel(lang, type, SentenceModel.class);
+
+      if (! alreadyDownloaded) {
+        assertEquals(2, logCaptor.getDebugLogs().size());
+        checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), 
"Download complete.");
+      } else {
+        assertEquals(1, logCaptor.getDebugLogs().size());
+        checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), 
"already exists. Skipping download.");
+      }
       logCaptor.clearLogs();
 
       // try to download again
-      DownloadUtil.downloadModel("de",
-          DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
-      assertEquals(1, logCaptor.getDebugLogs().size());
-      checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already 
exists. Skipping download.");
-      logCaptor.clearLogs();
-
-      DownloadUtil.downloadModel("de",
-          DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
+      DownloadUtil.downloadModel(lang, type, SentenceModel.class);
       assertEquals(1, logCaptor.getDebugLogs().size());
       checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already 
exists. Skipping download.");
       logCaptor.clearLogs();
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java 
b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
index b8a61f91..c5355601 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.net.URL;
 import java.util.stream.Stream;
 
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;
@@ -32,11 +33,12 @@ import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.tokenize.TokenizerModel;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-public class DownloadUtilTest extends AbstractDownloadUtilTest {
+public class DownloadUtilTest {
 
   @ParameterizedTest(name = "Verify \"{0}\" sentence model")
   @ValueSource(strings = {"en", "fr", "de", "it", "nl", "bg", "ca", "cs", 
"da", "el",
@@ -62,13 +64,33 @@ public class DownloadUtilTest extends 
AbstractDownloadUtilTest {
     assertTrue(model.isLoadedFromSerialized());
   }
 
+  @Test
+  @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
+  public void testExistsModel() throws IOException {
+    final String lang = "en";
+    final DownloadUtil.ModelType type = 
DownloadUtil.ModelType.SENTENCE_DETECTOR;
+    // Prepare
+    SentenceModel model = DownloadUtil.downloadModel(lang, type, 
SentenceModel.class);
+    assertNotNull(model);
+    assertEquals(lang, model.getLanguage());
+    // Test
+    assertTrue(DownloadUtil.existsModel(lang, type));
+  }
+
+  @ParameterizedTest
+  @NullAndEmptySource
+  @ValueSource(strings = {"xy", "\t", "\n"})
+  @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
+  public void testExistsModelInvalid(String input) throws IOException {
+    assertFalse(DownloadUtil.existsModel(input, 
DownloadUtil.ModelType.SENTENCE_DETECTOR));
+  }
+
   @ParameterizedTest(name = "Detect invalid input: \"{0}\"")
   @NullAndEmptySource
   @ValueSource(strings = {" ", "\t", "\n"})
   public void testDownloadModelInvalid(String input) {
-    assertThrows(IOException.class, () -> DownloadUtil.downloadModel(
-            input, DownloadUtil.ModelType.SENTENCE_DETECTOR, 
SentenceModel.class),
-        "Invalid model");
+    assertThrows(IOException.class, () -> DownloadUtil.downloadModel(input,
+            DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class), 
"Invalid model");
   }
 
   private static final DownloadUtil.ModelType MT_TOKENIZER = 
DownloadUtil.ModelType.TOKENIZER;

Reply via email to