This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/master by this push:
     new 34f3c4e  Resource download component added.
     new 3b28a77  Merge branch 'NLPCRAFT-480'
34f3c4e is described below

commit 34f3c4e06274ab9f488558601a2fee7169b81dd5
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Feb 25 13:36:07 2022 +0300

    Resource download component added.
---
 .../nlpcraft/internal/util/NCResourceReader.java   |  55 +++++++
 .../internal/util/NCResourceReaderImpl.scala       | 182 +++++++++++++++++++++
 .../nlpcraft/internal/util/NCResourceSpec.scala    |  42 +++++
 3 files changed, 279 insertions(+)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCResourceReader.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCResourceReader.java
new file mode 100644
index 0000000..7d3e1c5
--- /dev/null
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCResourceReader.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.util;
+
+import java.io.File;
+
+/**
+ * Java-friendly facade that delegates to {@link NCResourceReaderImpl}. TODO: move it into right folder.
+ */
+public class NCResourceReader {
+    /**
+     * Delegate performing the lookup; assigned exactly once by each constructor.
+     */
+    private final NCResourceReaderImpl impl;
+
+    /**
+     * Creates a reader via {@code NCResourceReaderImpl.apply()}, i.e. without an explicit folder.
+     */
+    public NCResourceReader() {
+        impl = NCResourceReaderImpl.apply();
+    }
+
+    /**
+     * Creates a reader via {@code NCResourceReaderImpl.apply(dir)}.
+     * @param dir Folder to save downloaded data.
+     */
+    public NCResourceReader(String dir) {
+        impl = NCResourceReaderImpl.apply(dir);
+    }
+
+    /**
+     * Gets the resource file if it exists, otherwise downloads it.
+     *
+     * @param path Resource path, passed unchanged to {@code NCResourceReaderImpl.get}.
+     * @return File returned by the underlying implementation.
+     */
+    public File get(String path) {
+        return impl.get(path);
+    }
+}
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCResourceReaderImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCResourceReaderImpl.scala
new file mode 100644
index 0000000..2363d1a
--- /dev/null
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCResourceReaderImpl.scala
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.util
+
+import org.apache.nlpcraft.NCException
+
+import java.io.*
+import java.net.URL
+import scala.util.Using
+import scala.io.Source
+import com.typesafe.scalalogging.LazyLogging
+import org.apache.commons.io.IOUtils
+import org.apache.commons.codec.digest.DigestUtils
+import java.nio.file.Files
+
+/**
+  * Companion holding the download locations and the file-system/MD5 helpers shared with the reader class.
+  */
+object NCResourceReaderImpl extends LazyLogging:
+    private final val DFLT_DIR = new File(System.getProperty("user.home"), ".nlpcraft/extcfg").getAbsolutePath
+    private final val BASE_URL = "https://github.com/apache/incubator-nlpcraft/raw/external_config/external"
+    private final val MD5_FILE_URL = s"$BASE_URL/md5.txt"
+
+    /**
+      * Returns the given folder, creating it (with parents) when missing; throws `NCException` on failure.
+      * @param dir Folder path; `null` selects the default folder under the user home.
+      * @return The folder as a `File`.
+      */
+    private def mkDir(dir: String): File =
+        val normDir = if dir != null then dir else DFLT_DIR
+        val f = new File(normDir)
+
+        // `mkdirs` returns false if the folder already exists (e.g. created concurrently), hence the re-check.
+        if f.exists then
+            if !f.isDirectory then throw new NCException(s"Invalid folder: $normDir")
+        else if !f.mkdirs && !f.isDirectory then throw new NCException(s"Cannot create folder: $normDir")
+
+        f
+
+    /**
+      * Reads the checksum listing: every non-empty, non-`#` line must hold two whitespace-separated fields.
+      * @param url Location of the listing.
+      * @return Map of the first field to the second one (callers treat it as resource path to MD5).
+      */
+    private def readMd5(url: String): Map[String, String] =
+        try
+            Using.resource(Source.fromURL(url)) { src =>
+                src.getLines().map(_.trim()).filter(s => s.nonEmpty && !s.startsWith("#")).map(p => {
+                    // Split on any whitespace run so that tab- or multi-space-separated lines are accepted too.
+                    val seq = p.split("\\s+")
+                    if seq.length != 2 || seq.exists(_.isEmpty) then
+                        throw new NCException(s"Unexpected '$url' file line format: '$p'")
+
+                    seq.head -> seq.last
+                }).toList.toMap
+            }
+        catch case e: IOException => throw new NCException(s"Failed to read: '$url'", e)
+
+    /**
+      * Deletes the file, logging success and throwing `NCException` on failure.
+      * @param f File to delete.
+      */
+    private def delete(f: File): Unit =
+        if !f.delete() then throw new NCException(s"Couldn't delete file: ${f.getAbsolutePath}")
+        else logger.info(s"File deleted: ${f.getAbsolutePath}")
+
+    /**
+      * Creates a reader for the given folder, creating the folder first if needed.
+      * @param dir Folder path; `null` means the default folder.
+      * @return New reader instance.
+      */
+    def apply(dir: String): NCResourceReaderImpl = new NCResourceReaderImpl(mkDir(dir))
+
+    /**
+      * Creates a reader for the default folder (`.nlpcraft/extcfg` under the user home).
+      * @return New reader instance.
+      */
+    def apply(): NCResourceReaderImpl = new NCResourceReaderImpl(mkDir(null))
+
+import NCResourceReaderImpl.*
+
+/**
+  * Resolves external resources locally, downloading and MD5-checking them on demand; reads the MD5 listing on creation.
+  * @param dir Folder used for resources that are not found at the requested path as-is.
+  */
+class NCResourceReaderImpl(dir: File) extends LazyLogging:
+    private val md5 = readMd5(MD5_FILE_URL)
+
+    /**
+      * Checks that the path denotes an existing normal file (not a folder).
+      * @param f Path to check.
+      * @return `true` only for an existing file.
+      */
+    private def isExists(f: File): Boolean = f.exists() && f.isFile
+
+    /**
+      * Looks up the expected MD5 of a file by matching the tail of its absolute path against the listing keys.
+      * @param f File to look up.
+      * @return Expected MD5 string; `NCException` is thrown when no key matches.
+      */
+    private def getMd5(f: File): String =
+        val path = f.getAbsolutePath
+        val nameLen = f.getName.length
+
+        // A key must be at least as long as the file name, so a bare suffix of the name cannot match.
+        md5.
+            collectFirst { case (resPath, sum) if path.endsWith(resPath) && resPath.length >= nameLen => sum }.
+            getOrElse(throw new NCException(s"MD5 data not found for: '$path'"))
+
+    /**
+      * Compares the MD5 of the file content with the value expected for it in the listing.
+      * @param f Existing file to verify.
+      * @return `true` when both digests match.
+      */
+    private def isValid(f: File): Boolean =
+        val v1 = getMd5(f)
+
+        val v2 =
+            try Using.resource(Files.newInputStream(f.toPath)) { in => DigestUtils.md5Hex(in) }
+            catch case e: IOException => throw new NCException(s"Failed to get MD5 for: '${f.getAbsolutePath}'", e)
+
+        // Hex digests denote the same value in either letter case, so compare case-insensitively.
+        v1.equalsIgnoreCase(v2)
+
+    /**
+      * Downloads `BASE_URL/path` into `outFile` (creating its parent folder) and verifies the MD5 of the result.
+      * @param path Resource path relative to the base URL.
+      * @param outFile Path of the file to write (callers pass an absolute path).
+      * @return Downloaded file; `NCException` is thrown on I/O failure or MD5 mismatch.
+      */
+    private def download(path: String, outFile: String): File =
+        mkDir(new File(outFile).getParent)
+
+        val url = s"$BASE_URL/$path"
+
+        try
+            Using.resource(new BufferedInputStream(new URL(url).openStream())) { src =>
+                Using.resource(new FileOutputStream(outFile)) { out => IOUtils.copy(src, out) }
+                logger.info(s"One-time download for external config [url='$url', file='$outFile']")
+
+                val f = new File(outFile)
+                if !isValid(f) then throw new NCException(s"Invalid downloaded file [url='$url']")
+                f
+            }
+        catch case e: IOException => throw new NCException(s"Failed to download external config [url='$url', file='$outFile']", e)
+
+    /**
+      * Gets the resource: tries `path` as-is, then `path` under the reader folder; downloads it if absent or invalid.
+      * @param path Resource path relative to the base URL, or a path to an already present local file.
+      * @return Local file whose MD5 matched the listing.
+      */
+    def get(path: String): File =
+        def process(f: File): File =
+            if isValid(f) then
+                logger.info(s"File found: ${f.getAbsolutePath}")
+                f
+            else
+                delete(f)
+                download(path, f.getAbsolutePath)
+
+        val asIs = new File(path)
+        // Fall back to the folder this reader was created with, so that a custom `dir` is honored.
+        val inDir = new File(dir, path)
+
+        if isExists(asIs) then process(asIs)
+        else if isExists(inDir) then process(inDir)
+        else download(path, inDir.getAbsolutePath)
\ No newline at end of file
diff --git 
a/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCResourceSpec.scala
 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCResourceSpec.scala
new file mode 100644
index 0000000..be15240
--- /dev/null
+++ 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCResourceSpec.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.util
+
+import org.junit.jupiter.api.Test
+
+/**
+  * Exercises `NCResourceReader.get` for one resource: after local deletion, repeated lookup and absolute path.
+  */
+class NCResourceSpec:
+    @Test
+    def test(): Unit =
+        val resPath = "opennlp/en-lemmatizer.dict"
+        val reader = new NCResourceReader()
+
+        // Resolve once, then remove that local copy so the next lookup cannot reuse it.
+        val first = reader.get(resPath)
+        require(first.delete())
+        println(s"Deleted: ${first.getAbsolutePath}")
+
+        // Download.
+        val downloaded = reader.get(resPath)
+        // From cache.
+        val cached = reader.get(resPath)
+        // By absolute path.
+        val byAbsPath = reader.get(cached.getAbsolutePath)
+        require(byAbsPath.exists())
\ No newline at end of file

Reply via email to