ctubbsii commented on code in PR #35: URL: https://github.com/apache/accumulo-classloaders/pull/35#discussion_r2677428530
########## modules/local-caching-classloader/src/main/java/org/apache/accumulo/classloader/lcc/util/LocalStore.java: ########## @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.classloader.lcc.util; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.StandardCopyOption.ATOMIC_MOVE; +import static java.nio.file.StandardOpenOption.CREATE_NEW; +import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING; +import static java.util.Objects.requireNonNull; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UncheckedIOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLClassLoader; +import java.nio.file.FileAlreadyExistsException; +import java.nio.file.Files; +import java.nio.file.NoSuchFileException; +import java.nio.file.Path; +import java.util.LinkedHashSet; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.FutureTask; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.regex.Pattern; + +import 
org.apache.accumulo.classloader.lcc.Constants; +import org.apache.accumulo.classloader.lcc.definition.ContextDefinition; +import org.apache.accumulo.classloader.lcc.definition.Resource; +import org.apache.accumulo.classloader.lcc.resolvers.FileResolver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A simple storage service backed by a local file system for storing downloaded + * {@link ContextDefinition} files and the {@link Resource} objects it references. + * <p> + * The layout of the storage area consists of two directories: + * <ul> + * <li><b>contexts</b> stores a copy of the {@link ContextDefinition} JSON files for each context, + * and exist primarily for user convenience (they aren't used again by this factory) + * <li><b>resources</b> stores a copy of all the {@link Resource} files for all contexts + * </ul> + * + * <p> + * When downloading any file, the file is first downloaded to a temporary file with a unique name, + * and then atomically renamed, so it works correctly even if there are multiple processes or + * threads doing the same thing. The use of `CREATE_NEW` and the lack of `REPLACE_EXISTING` when + * creating the temporary files is to ensure we do not silently collide with other processes, when + * these temporary files should be unique. + * + * <p> + * When downloading resource files, an additional "in-progress" signal file derived from the name of + * the file being downloaded with the suffix ".downloading" is also used. This file is updated with + * the current process' PID every 5 seconds so long as the file is still being downloaded. This + * serves as a signal to other processes or threads that they can wait instead of attempting to + * download the same file. This avoids duplicate work. If an attempt to download a resource file is + * detected, and it has been updated within the last 30 seconds, it is skipped and the remaining + * files are downloaded before attempting it again. 
If this signal file hasn't been updated in the + * last 30 seconds, a download will be attempted. Failures to download are propagated up to the + * higher level. `CREATE_NEW` is used when creating this signal file, in order to atomically detect + * race collisions with other threads or processes, and to try to avoid duplicate work. + * + * <p> + * Once all files for a context are downloaded, an array of {@link URL}s will be returned, which + * point to the local files that have been downloaded for the context, in the same order as + * specified in the {@link ContextDefinition} file, to be used for constructing a + * {@link URLClassLoader}. + */ +public final class LocalStore { + private static final Logger LOG = LoggerFactory.getLogger(LocalStore.class); + private static final String PID = Long.toString(ProcessHandle.current().pid()); + + private final Path contextsDir; + private final Path resourcesDir; + + public LocalStore(final Path baseDir) throws IOException { + this.contextsDir = requireNonNull(baseDir).toAbsolutePath().resolve("contexts"); + this.resourcesDir = baseDir.resolve("resources"); + Files.createDirectories(contextsDir); + Files.createDirectories(resourcesDir); + } + + Path contextsDir() { + return contextsDir; + } + + Path resourcesDir() { + return resourcesDir; + } + + // pattern to match regular files that have at least one non-dot character preceding a dot and a + // non-zero suffix; these files can be easily converted so the local store retains the original + // file name extension, while non-matching files will not attempt to retain the original file name + // extension, and will instead just append the checksum to the original file name + private static Pattern fileNamesWithExtensionPattern = Pattern.compile("^(.*[^.].*)[.]([^.]+)$"); + + static String localName(String remoteFileName, String checksum) { + requireNonNull(remoteFileName); + requireNonNull(checksum); + var matcher = fileNamesWithExtensionPattern.matcher(remoteFileName); + if 
(matcher.matches()) { + return String.format("%s-%s.%s", matcher.group(1), checksum, matcher.group(2)); + } + return String.format("%s-%s", remoteFileName, checksum); + } + + private static String tempName(String baseName) { + return "." + requireNonNull(baseName) + "_PID" + PID + "_" + UUID.randomUUID() + ".tmp"; + } + + public URL[] storeContextResources(final ContextDefinition contextDefinition) throws IOException { + requireNonNull(contextDefinition, "definition must be supplied"); + // use a LinkedHashSet to preserve the order of the context resources + final Set<Path> localFiles = new LinkedHashSet<>(); + // store it with a .json suffix, if the original file didn't have one + final String origSourceName = contextDefinition.getSourceFileName(); + final String sourceNameWithSuffix = + origSourceName.toLowerCase().endsWith(".json") ? origSourceName : origSourceName + ".json"; + final String destinationName = localName(sourceNameWithSuffix, contextDefinition.getChecksum()); + try { + storeContextDefinition(contextDefinition, destinationName); + boolean successful = false; + while (!successful) { + localFiles.clear(); + for (Resource resource : contextDefinition.getResources()) { + Path path = storeResource(resource); + if (path == null) { + LOG.debug("Skipped resource {} while another process or thread is downloading it", + resource.getLocation()); + continue; + } + localFiles.add(path); + LOG.trace("Added resource {} to classpath", path); + } + successful = localFiles.size() == contextDefinition.getResources().size(); + } + + } catch (IOException | RuntimeException e) { + LOG.error("Error initializing context: " + destinationName, e); Review Comment: `ContextDefinition.getSourceFileName()` isn't particularly useful here: it's just the last portion of the original URL (which we don't have at this point), and it isn't unique in any way — many context definitions may use that same name. 
The destination file includes the specific checksum for the definition and can be used to find it on the local disk. It's also derived from the original source name, so that information isn't lost. The only thing lost is the original source URL, which we don't really need here. The error will get propagated up and the original URL will be included there. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
