github-advanced-security[bot] commented on code in PR #456: URL: https://github.com/apache/uima-uimaj/pull/456#discussion_r3428472723
########## src/main/bin_distr_license_notices/CheckLicenseNotices.java: ########## @@ -0,0 +1,429 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Keeps the binary distribution's third-party license information in sync with + * what is actually shipped, following the ASF policy at + * https://infra.apache.org/licensing-howto.html + * + * The binary distribution bundles a fixed set of third-party JARs in lib/ (see + * the assembly descriptor src/main/assembly/bin.xml). For each of those JARs + * this tool: + * + * - extracts the LICENSE / NOTICE files the JAR carries in its META-INF folder + * VERBATIM. 
These are NOT checked into the repository; they are emitted at + * build time (--emit) into the assembly's work directory, from where the + * assembly ships them under the distribution's licenses/ directory; + * - regenerates the pointer section of the top-level LICENSE.txt (one entry per + * bundled dependency: name, version, license type, and a pointer to its + * licenses/ folder), delimited by stable BEGIN/END markers; and + * - ASSISTS with the curated NOTICE file by reporting which bundled JARs carry + * a NOTICE, which ones are new/removed since the last run, and printing their + * NOTICE text so a maintainer can bubble up the required portions by hand. + * It never edits NOTICE.md. + * + * The bundled set and its versions are read from the build itself (the assembly + * descriptor and the POMs), so there is no second list to keep in sync and the + * emitted licenses/ tree can never over- or under-disclose relative to lib/. + * + * Zero install, single solution for Linux/Mac/Windows: run with the JDK + * single-file source launcher (launcher available since Java 11; this file uses records, so it needs Java 16+), which the project already requires to + * build. The build invokes --emit via exec-maven-plugin at prepare-package.
+ * + * Emit the verbatim licenses/ tree into <dir> (used by the build): + * java src/main/bin_distr_license_notices/CheckLicenseNotices.java --emit <dir> + * + * Verify the LICENSE.txt pointer block and NOTICE manifest (exit 1 on drift): + * java src/main/bin_distr_license_notices/CheckLicenseNotices.java + * + * Regenerate the LICENSE.txt pointer block and the NOTICE manifest: + * java src/main/bin_distr_license_notices/CheckLicenseNotices.java --apply + * + * Print only the NOTICE assist report: + * java src/main/bin_distr_license_notices/CheckLicenseNotices.java --notice-report + * + * Optional overrides: + * -Dbasedir=<repo-root> (default: current directory) + * -Dmaven.repo.local=<repo-path> (default: $M2_REPO or ~/.m2/repository) + */ + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.zip.ZipFile; + +public class CheckLicenseNotices { + + enum Mode { VERIFY, APPLY, NOTICE_REPORT, EMIT } + + static final String BEGIN_MARKER = + "=== BEGIN GENERATED THIRD-PARTY DEPENDENCY POINTERS (do not edit by hand) ==="; + static final String END_MARKER = + "=== END GENERATED THIRD-PARTY DEPENDENCY POINTERS ==="; + + static final String LICENSE_TXT = "src/main/bin_distr_license_notices/LICENSE.txt"; + static final String NOTICE_MANIFEST = "src/main/bin_distr_license_notices/verified-notice-deps.txt"; + static final String BIN_XML = "src/main/assembly/bin.xml"; + static final String PARENT_POM = "uimaj-parent-internal/pom.xml"; + static final String ROOT_POM = "pom.xml"; + + // A bundled third-party dependency. 
+ record Dep(String groupId, String artifactId, String version) { + String key() { return artifactId + "-" + version; } + } + + // One verbatim license/notice file extracted from a JAR. + record LicenseFile(String depKey, String fileName, boolean isNotice, byte[] bytes) { + String relPath() { return depKey + "/" + fileName; } + } + + static Path baseDir; + static Path localRepo; + + public static void main(String[] args) throws IOException { + Mode mode = Mode.VERIFY; + String emitDir = null; + for (int i = 0; i < args.length; i++) { + String a = args[i]; + switch (a) { + case "--apply", "--fix" -> mode = Mode.APPLY; + case "--verify", "--check" -> mode = Mode.VERIFY; + case "--notice-report" -> mode = Mode.NOTICE_REPORT; + case "--emit" -> { + mode = Mode.EMIT; + if (i + 1 >= args.length) { System.err.println("--emit requires a target directory"); System.exit(64); } + emitDir = args[++i]; + } + case "-h", "--help" -> { printUsage(); return; } + default -> { System.err.println("Unknown argument: " + a); printUsage(); System.exit(64); } + } + } + + baseDir = Path.of(System.getProperty("basedir", System.getProperty("user.dir"))); + localRepo = resolveLocalRepo(); + System.out.println("Base dir : " + baseDir.toAbsolutePath()); + System.out.println("Local repo : " + localRepo); + System.out.println("Mode : " + mode); + System.out.println(); + + // 1. Determine the bundled set and resolve versions. + var deps = bundledDependencies(); + System.out.println("Bundled third-party dependencies (" + deps.size() + "):"); + for (var d : deps) System.out.println(" " + d.groupId() + ":" + d.artifactId() + ":" + d.version()); + System.out.println(); + + // 2. Extract verbatim license/notice files from each JAR. + var files = new ArrayList<LicenseFile>(); + for (var d : deps) files.addAll(extractLicenseFiles(d)); + + // 3. Build the expected artifacts: the licenses/ tree, the NOTICE manifest, + // and the LICENSE.txt pointer block. 
+ var tree = new LinkedHashMap<String, byte[]>(); + for (var f : files) tree.put(f.relPath(), f.bytes()); + var noticeDeps = new TreeSet<String>(); + for (var f : files) if (f.isNotice()) noticeDeps.add(f.depKey()); + var manifest = String.join("\n", noticeDeps) + (noticeDeps.isEmpty() ? "" : "\n"); + var pointerBlock = pointerBlock(deps, tree); + + switch (mode) { + case EMIT -> emit(tree, Path.of(emitDir)); + case APPLY -> apply(manifest, pointerBlock); + case VERIFY -> { boolean drift = verify(manifest, pointerBlock); assist(deps, files); + if (drift) { System.out.println(); + System.out.println("LICENSE.txt pointer block / NOTICE manifest are OUT OF DATE. Run with --apply."); + System.exit(1); } + System.out.println("\nLICENSE.txt pointer block and NOTICE manifest are up to date."); } + case NOTICE_REPORT -> assist(deps, files); + } + } + + // ---- 1. bundled set + versions ------------------------------------------ + + // The bundled third-party set is exactly the non-UIMA <include> entries in the + // assembly descriptor - the single source of truth for what ships in lib/. + static List<Dep> bundledDependencies() throws IOException { + var bin = readString(baseDir.resolve(BIN_XML)); + var parentPom = readString(baseDir.resolve(PARENT_POM)); + var rootPom = readString(baseDir.resolve(ROOT_POM)); + var deps = new ArrayList<Dep>(); + var m = Pattern.compile("<include>\\s*([^<:\\s]+):([^<:\\s]+)\\s*</include>").matcher(bin); + while (m.find()) { + String groupId = m.group(1), artifactId = m.group(2); + if (groupId.equals("org.apache.uima")) continue; // first-party + deps.add(new Dep(groupId, artifactId, resolveVersion(artifactId, parentPom, rootPom))); + } + return deps; + } + + // Version comes from a property in the parent POM where one exists, otherwise + // from the hard-coded <version> beside the <artifactId> in the root POM. 
+ static String resolveVersion(String artifactId, String parentPom, String rootPom) { + var prop = switch (artifactId) { + case "jackson-core" -> "jackson-version"; + case "commons-io" -> "commons-io-version"; + case "commons-lang3" -> "commons-lang3-version"; + default -> artifactId.startsWith("spring-") ? "spring-version" + : artifactId.startsWith("slf4j-") ? "slf4j-version" + : null; + }; + if (prop != null) { + var v = group(parentPom, "<" + Pattern.quote(prop) + ">([^<]+)</" + Pattern.quote(prop) + ">"); + if (v != null) return v.strip(); + } + var v = group(rootPom, + "<artifactId>\\s*" + Pattern.quote(artifactId) + "\\s*</artifactId>\\s*<version>([^<]+)</version>"); + if (v != null) return v.strip(); + System.err.println("Could not resolve a version for " + artifactId + + " (no property in parent POM, no hard-coded <version> in root POM)"); + System.exit(2); + return null; + } + + // ---- 2. extraction ------------------------------------------------------- + + static List<LicenseFile> extractLicenseFiles(Dep d) throws IOException { + var jar = jar(d.groupId(), d.artifactId(), d.version()); + var out = new ArrayList<LicenseFile>(); + try (var zf = new ZipFile(jar.toFile())) { + var entries = zf.entries(); + while (entries.hasMoreElements()) { + var e = entries.nextElement(); + if (e.isDirectory()) continue; + var name = e.getName(); Review Comment: ## CodeQL / Arbitrary file access during archive extraction ("Zip Slip") Unsanitized archive entry, which may contain '..', is used in a [file system operation](1). Unsanitized archive entry, which may contain '..', is used in a [file system operation](2). [Show more details](https://github.com/apache/uima-uimaj/security/code-scanning/18) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
