This is an automated email from the ASF dual-hosted git repository.
aicam pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git
The following commit(s) were added to refs/heads/main by this push:
new 6060877a4d ci: enrich check_binary_deps failure report with license +
target file (#5057)
6060877a4d is described below
commit 6060877a4d0a94ba9b3ce186334d1a230055c585
Author: Matthew B. <[email protected]>
AuthorDate: Fri May 15 16:41:43 2026 -0700
ci: enrich check_binary_deps failure report with license + target file
(#5057)
### What changes were proposed in this PR?
Enrich the failure output of `bin/licensing/check_binary_deps.py` with
two pieces of context that previously required manual lookup for every
offending package: the dep's declared license string (read from
`3rdpartylicenses.json` for npm/agent-npm or `pip-licenses.csv` for
python) and the per-module `LICENSE-binary` file to edit. Each bullet
now renders as `+ name@version (license: X) → add to <file>` for new
packages, `→ remove from <file>` for stale, and `→ update in <file>` for
drift, with the `ACTION REQUIRED` block naming the same file. Transitive
deps are covered the same way because the input files already list
everything bundled. No CI workflow changes; The script's exit semantics
and aggregation behavior are unchanged.
### Any related issues, documentation, or discussions?
Closes: #5056
### How was this PR tested?
Added five unit tests in `bin/licensing/test_check_binary_deps.py`
covering license rendering, target-file rendering, the jar no-license
case, stale and drift hints, and the no-target-file fallback. Full suite
of 34 tests passes locally via `python3 -m unittest discover -s
bin/licensing -p "test_*.py"`. Also smoke-tested by running the script
against a synthetic `3rdpartylicenses.json` and confirming the enriched
bullets render correctly for new, stale, and drifted entries.
### Was this PR authored or co-authored using generative AI tooling?
Co-authored with Claude Opus 4.7 in compliance with ASF
---------
Co-authored-by: ali risheh <[email protected]>
---
bin/licensing/check_binary_deps.py | 119 ++++++++++++++++++++++++------
bin/licensing/test_check_binary_deps.py | 127 ++++++++++++++++++++++++++++++++
2 files changed, 222 insertions(+), 24 deletions(-)
diff --git a/bin/licensing/check_binary_deps.py
b/bin/licensing/check_binary_deps.py
index c85f093062..0e9574203b 100755
--- a/bin/licensing/check_binary_deps.py
+++ b/bin/licensing/check_binary_deps.py
@@ -103,6 +103,15 @@ ECO_HEADERS = {
"agent-npm": "Agent service npm packages",
}
+# Used to point the user at a concrete file in the failure report when
--license-binary is
+# not passed (npm / agent-npm / python CI invocations don't pass it).
+DEFAULT_TARGET_FILE = {
+ "npm": "frontend/LICENSE-binary",
+ "agent-npm": "agent-service/LICENSE-binary",
+ "python": "amber/LICENSE-binary-python",
+ "jar": "amber/LICENSE-binary-java",
+}
+
JAR_BULLET = re.compile(r"^\s*-\s+(\S+\.jar)\b")
# ` - <name>@<version>` — npm form, name may start with @scope/.
NPM_BULLET = re.compile(r"^\s*-\s+(@?[\w@/.\-]+)@([^\s@]+)\s*$")
@@ -250,12 +259,26 @@ def collect_jars(lib_dirs) -> set[str]:
return result
-def collect_npm(path: Path) -> set[str]:
+def collect_npm(path: Path) -> tuple[set[str], dict[str, str]]:
"""3rdpartylicenses.json emitted by license-webpack-plugin (configured in
frontend/custom-webpack.config.js): a JSON array of {name, version,
license}
- entries scoped to the actual webpack bundle."""
+ entries scoped to the actual webpack bundle. Returns (items, licenses)
+ where items is the set of '<name>@<version>' keys and licenses maps each
+ key to its declared license string (when present)."""
data = json.loads(path.read_text())
- return {f"{e['name']}@{e['version']}" for e in data if e.get('name') and
e.get('version')}
+ items: set[str] = set()
+ licenses: dict[str, str] = {}
+ for e in data:
+ name = e.get('name')
+ version = e.get('version')
+ if not (name and version):
+ continue
+ key = f"{name}@{version}"
+ items.add(key)
+ lic = e.get('license')
+ if isinstance(lic, str) and lic.strip():
+ licenses[key] = lic.strip()
+ return items, licenses
def canonicalize_python_name(name: str) -> str:
@@ -270,12 +293,15 @@ def canonicalize_python_version(version: str) -> str:
return version.split("+", 1)[0]
-def collect_python(path: Path) -> set[str]:
+def collect_python(path: Path) -> tuple[set[str], dict[str, str]]:
"""pip-licenses CSV: Name,Version,License (header row). Names are
canonicalized per PEP 503 so the compare is indifferent to whether
a distribution uses hyphens, underscores, or dots; versions are
- canonicalized to the public release form (no PEP 440 +local suffix)."""
- result: set[str] = set()
+ canonicalized to the public release form (no PEP 440 +local suffix).
+ Returns (items, licenses) where items is the set of '<name>==<version>'
+ keys and licenses maps each key to its declared license string."""
+ items: set[str] = set()
+ licenses: dict[str, str] = {}
with path.open(newline="") as f:
reader = csv.reader(f)
next(reader, None) # header
@@ -283,8 +309,11 @@ def collect_python(path: Path) -> set[str]:
if row and row[0] and row[1]:
name = canonicalize_python_name(row[0])
ver = canonicalize_python_version(row[1])
- result.add(f"{name}=={ver}")
- return result
+ key = f"{name}=={ver}"
+ items.add(key)
+ if len(row) >= 3 and row[2].strip():
+ licenses[key] = row[2].strip()
+ return items, licenses
# --- diff ------------------------------------------------------------------
@@ -368,17 +397,40 @@ def report(
label: str,
kind: str,
ignore_transitive_version: bool,
+ licenses: dict[str, str] | None = None,
+ target_file: str | None = None,
) -> int:
+ """Print a consolidated, actionable report. `licenses` maps an `added`
+ entry key to its declared license string (npm/python; empty for jar).
+ `target_file` is the per-module LICENSE-binary path the user should edit
+ — quoted in both the per-bullet hint and the ACTION REQUIRED block so a
+ failing CI step tells the user exactly which file to update."""
+ licenses = licenses or {}
rc = 0
+ add_hint = f" → add to {target_file}" if target_file else ""
+ remove_hint = f" → remove from {target_file}" if target_file else ""
+ update_hint = f" → update in {target_file}" if target_file else ""
+
+ def _added_line(entry: str) -> str:
+ lic = licenses.get(entry)
+ lic_part = f" (license: {lic})" if lic else ""
+ return f" + {entry}{lic_part}{add_hint}"
+
if added:
print(f"NEW {label} not claimed by LICENSE-binary:")
for a in sorted(added):
- print(f" + {a}")
+ print(_added_line(a))
print()
print("ACTION REQUIRED")
print(f" 1. Verify each dep's license is ASF Category A or B.")
- print(f" 2. Add a bullet in LICENSE-binary under the matching
license")
- print(f" section, either as '{kind}-compatible token' (see format
below).")
+ if target_file:
+ print(f" 2. Add a bullet for each dep above to {target_file}")
+ print(f" under the matching license section (see existing")
+ print(f" '{kind}-compatible token' bullets for format).")
+ else:
+ print(f" 2. Add a bullet in the matching per-module
LICENSE-binary")
+ print(f" under the matching license section (see existing")
+ print(f" '{kind}-compatible token' bullets for format).")
print(f" 3. If an upstream NOTICE must be bubbled up, add to
NOTICE-binary.")
print()
rc = 1
@@ -386,17 +438,23 @@ def report(
if stale:
print(f"STALE {label} claimed by LICENSE-binary but not actually
bundled:")
for s in sorted(stale):
- print(f" - {s}")
+ print(f" - {s}{remove_hint}")
print()
print("ACTION REQUIRED")
- print(f" 1. Remove the matching bullet / token from LICENSE-binary.")
+ if target_file:
+ print(f" 1. Remove the matching bullet / token from
{target_file}.")
+ else:
+ print(f" 1. Remove the matching bullet / token from the
per-module LICENSE-binary.")
print(f" 2. Remove any matching attribution from NOTICE-binary.")
print()
rc = 1
def _fmt_drift(entry: tuple[str, list[str], list[str]]) -> str:
name, cvers, rvers = entry
- return f" ~ {name}: LICENSE-binary={', '.join(cvers)} bundled={',
'.join(rvers)}"
+ return (
+ f" ~ {name}: LICENSE-binary={', '.join(cvers)} "
+ f"bundled={', '.join(rvers)}{update_hint}"
+ )
if drift_direct:
print(f"DRIFT (direct) {label} — claimed versions differ from
bundled:")
@@ -404,8 +462,12 @@ def report(
print(_fmt_drift(entry))
print()
print("ACTION REQUIRED")
- print(f" Update LICENSE-binary to match the bundled versions. Direct
deps")
- print(f" always block CI — a version bump may carry license changes.")
+ if target_file:
+ print(f" Update {target_file} to match the bundled versions.
Direct")
+ print(f" deps always block CI — a version bump may carry license
changes.")
+ else:
+ print(f" Update LICENSE-binary to match the bundled versions.
Direct deps")
+ print(f" always block CI — a version bump may carry license
changes.")
print()
rc = 1
@@ -423,7 +485,10 @@ def report(
print(_fmt_drift(entry))
print()
print("ACTION REQUIRED")
- print(f" Update LICENSE-binary to match the bundled versions, or
rerun")
+ if target_file:
+ print(f" Update {target_file} to match the bundled versions,
or rerun")
+ else:
+ print(f" Update LICENSE-binary to match the bundled versions,
or rerun")
print(f" with --ignore-transitive-version to treat transitive
drift as")
print(f" informational.")
print()
@@ -523,48 +588,54 @@ def main() -> int:
if args.license_binary is None:
lb = build_default_license_binary()
+ target_file = DEFAULT_TARGET_FILE.get(args.kind)
else:
lb = Path(args.license_binary)
if not lb.exists():
sys.stderr.write(f"error: {lb} not found\n")
return 2
+ target_file = args.license_binary
if args.kind == "jar":
claimed = parse_prose(lb, "jar")
reality = collect_jars(args.inputs)
direct_artifacts = load_direct_jar_artifacts()
added, stale, dd, dt = diff_jars(_index_jar(claimed),
_index_jar(reality), direct_artifacts)
- rc = report(added, stale, dd, dt, "JVM jars", "jar",
args.ignore_transitive_version)
+ rc = report(added, stale, dd, dt, "JVM jars", "jar",
+ args.ignore_transitive_version, target_file=target_file)
if rc == 0:
print(f"OK: {len(reality)} JVM jars match LICENSE-binary.")
return rc
if args.kind == "npm":
claimed = parse_prose(lb, "npm")
- reality = collect_npm(Path(args.inputs[0]))
+ reality, lic_map = collect_npm(Path(args.inputs[0]))
direct = load_direct_npm("frontend/package.json")
added, stale, dd, dt = diff_simple(_index_npm(claimed),
_index_npm(reality), direct, joiner="@")
- rc = report(added, stale, dd, dt, "npm packages", "npm",
args.ignore_transitive_version)
+ rc = report(added, stale, dd, dt, "npm packages", "npm",
+ args.ignore_transitive_version, licenses=lic_map,
target_file=target_file)
if rc == 0:
print(f"OK: {len(reality)} npm packages match LICENSE-binary.")
return rc
if args.kind == "agent-npm":
claimed = parse_prose(lb, "agent-npm")
- reality = collect_npm(Path(args.inputs[0]))
+ reality, lic_map = collect_npm(Path(args.inputs[0]))
direct = load_direct_npm("agent-service/package.json")
added, stale, dd, dt = diff_simple(_index_npm(claimed),
_index_npm(reality), direct, joiner="@")
- rc = report(added, stale, dd, dt, "agent-service npm packages",
"agent-npm", args.ignore_transitive_version)
+ rc = report(added, stale, dd, dt, "agent-service npm packages",
"agent-npm",
+ args.ignore_transitive_version, licenses=lic_map,
target_file=target_file)
if rc == 0:
print(f"OK: {len(reality)} agent-service npm packages match
LICENSE-binary.")
return rc
if args.kind == "python":
claimed = parse_prose(lb, "python")
- reality = collect_python(Path(args.inputs[0]))
+ reality, lic_map = collect_python(Path(args.inputs[0]))
direct = load_direct_python()
added, stale, dd, dt = diff_simple(_index_python(claimed),
_index_python(reality), direct, joiner="==")
- rc = report(added, stale, dd, dt, "Python packages", "python",
args.ignore_transitive_version)
+ rc = report(added, stale, dd, dt, "Python packages", "python",
+ args.ignore_transitive_version, licenses=lic_map,
target_file=target_file)
if rc == 0:
print(f"OK: {len(reality)} Python packages match LICENSE-binary.")
return rc
diff --git a/bin/licensing/test_check_binary_deps.py
b/bin/licensing/test_check_binary_deps.py
index 36fbb0c41f..82fa008a63 100644
--- a/bin/licensing/test_check_binary_deps.py
+++ b/bin/licensing/test_check_binary_deps.py
@@ -354,5 +354,132 @@ class EndToEndPython(unittest.TestCase):
self.assertEqual(rc, 0)
+class CollectNpmReturnsLicenses(unittest.TestCase):
+ """`collect_npm` parses 3rdpartylicenses.json into (items, licenses).
+ The license string is what the failure report quotes back to the user
+ so they can verify ASF compatibility without leaving the CI log."""
+
+ def _write_json(self, payload: list) -> Path:
+ import json as _json
+ p = Path(tempfile.mkstemp(suffix=".json")[1])
+ p.write_text(_json.dumps(payload))
+ return p
+
+ def test_returns_items_and_licenses(self):
+ p = self._write_json([
+ {"name": "react", "version": "18.2.0", "license": "MIT"},
+ {"name": "@angular/core", "version": "18.0.0", "license": "MIT"},
+ ])
+ items, licenses = cbd.collect_npm(p)
+ self.assertEqual(items, {"[email protected]", "@angular/[email protected]"})
+ self.assertEqual(licenses["[email protected]"], "MIT")
+ self.assertEqual(licenses["@angular/[email protected]"], "MIT")
+
+ def test_missing_license_field_omitted(self):
+ # No 'license' key — the entry is still bundled (we must surface
+ # it) but no license string is recorded.
+ p = self._write_json([
+ {"name": "obscure", "version": "1.0.0"},
+ ])
+ items, licenses = cbd.collect_npm(p)
+ self.assertEqual(items, {"[email protected]"})
+ self.assertNotIn("[email protected]", licenses)
+
+
+class CollectPythonReturnsLicenses(unittest.TestCase):
+ """`collect_python` parses pip-licenses CSV into (items, licenses)."""
+
+ def test_returns_items_and_licenses(self):
+ p = Path(tempfile.mkstemp(suffix=".csv")[1])
+ with p.open("w", newline="") as f:
+ w = csv.writer(f)
+ w.writerow(["Name", "Version", "License"])
+ w.writerow(["NumPy", "2.1.0", "BSD"])
+ w.writerow(["requests", "2.31.0", "Apache 2.0"])
+ items, licenses = cbd.collect_python(p)
+ self.assertEqual(items, {"numpy==2.1.0", "requests==2.31.0"})
+ self.assertEqual(licenses["numpy==2.1.0"], "BSD")
+ self.assertEqual(licenses["requests==2.31.0"], "Apache 2.0")
+
+
+class ReportRendersLicenseAndTargetFile(unittest.TestCase):
+ """The failure report must include (a) each added bullet's declared
+ license string when available, and (b) the target per-module file in
+ both the per-bullet hint and the ACTION REQUIRED block — that's the
+ actionable info that was missing before."""
+
+ def _capture(self, **kwargs) -> str:
+ buf = io.StringIO()
+ with redirect_stdout(buf):
+ cbd.report(**kwargs)
+ return buf.getvalue()
+
+ def test_added_line_includes_license_and_target(self):
+ out = self._capture(
+ added=["[email protected]"],
+ stale=[],
+ drift_direct=[],
+ drift_transitive=[],
+ label="npm packages",
+ kind="npm",
+ ignore_transitive_version=False,
+ licenses={"[email protected]": "MIT"},
+ target_file="frontend/LICENSE-binary",
+ )
+ self.assertIn("+ [email protected]", out)
+ self.assertIn("(license: MIT)", out)
+ self.assertIn("→ add to frontend/LICENSE-binary", out)
+ self.assertIn("frontend/LICENSE-binary", out.split("ACTION REQUIRED",
1)[1])
+
+ def test_added_without_license_still_renders_target(self):
+ # Jar case: no license info available; we still tell the user
+ # which file to update.
+ out = self._capture(
+ added=["foo-1.0.0.jar"],
+ stale=[],
+ drift_direct=[],
+ drift_transitive=[],
+ label="JVM jars",
+ kind="jar",
+ ignore_transitive_version=False,
+ licenses={},
+ target_file="amber/LICENSE-binary-java",
+ )
+ self.assertIn("+ foo-1.0.0.jar", out)
+ self.assertNotIn("(license:", out)
+ self.assertIn("→ add to amber/LICENSE-binary-java", out)
+
+ def test_stale_and_drift_include_target_file_hint(self):
+ out = self._capture(
+ added=[],
+ stale=["gone-pkg==1.0"],
+ drift_direct=[("foo", ["1.0"], ["1.1"])],
+ drift_transitive=[("bar", ["2.0"], ["2.1"])],
+ label="Python packages",
+ kind="python",
+ ignore_transitive_version=False,
+ licenses={},
+ target_file="amber/LICENSE-binary-python",
+ )
+ self.assertIn("- gone-pkg==1.0", out)
+ self.assertIn("→ remove from amber/LICENSE-binary-python", out)
+ self.assertIn("→ update in amber/LICENSE-binary-python", out)
+
+ def test_no_target_file_falls_back_to_generic_text(self):
+ out = self._capture(
+ added=["[email protected]"],
+ stale=[],
+ drift_direct=[],
+ drift_transitive=[],
+ label="npm packages",
+ kind="npm",
+ ignore_transitive_version=False,
+ licenses={"[email protected]": "MIT"},
+ target_file=None,
+ )
+ self.assertIn("(license: MIT)", out)
+ self.assertNotIn("→ add to", out)
+
+
if __name__ == "__main__":
unittest.main()