This is an automated email from the ASF dual-hosted git repository. akm pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tooling-agents.git
commit 98608bffa816417257456ab6c7ff6c2bb84ddaf1 Author: Andrew Musselman <[email protected]> AuthorDate: Thu Apr 2 17:05:08 2026 -0700 Pushing github review code up --- .../{code.py => agents/publishing.py} | 13 +- repos/apache/github-review/agents/security.py | 771 +++++++++++++++++++++ repos/apache/github-review/agents/summary.py | 406 +++++++++++ repos/apache/github-review/monitor-agent.sh | 176 ----- repos/apache/github-review/report.md | 203 ------ 5 files changed, 1187 insertions(+), 382 deletions(-) diff --git a/repos/apache/github-review/code.py b/repos/apache/github-review/agents/publishing.py similarity index 98% rename from repos/apache/github-review/code.py rename to repos/apache/github-review/agents/publishing.py index 8d8ff40..662f9ed 100644 --- a/repos/apache/github-review/code.py +++ b/repos/apache/github-review/agents/publishing.py @@ -572,7 +572,10 @@ async def run(input_dict, tools): entry = {"repo": repo, **w} entry["ecosystems"] = ecosystems_raw by_category[cat].append(entry) - publishing_repos.add(repo) + + # Only count as "publishing" for supply-chain purposes if release or snapshot + if cat in ("release_artifact", "snapshot_artifact"): + publishing_repos.add(repo) for eco in ecosystems_raw: if eco and eco != "github_actions_artifacts": @@ -595,7 +598,7 @@ async def run(input_dict, tools): lines.append(f"| Repositories scanned | {stats['repos_scanned']} |") lines.append(f"| Repositories with workflows | {stats['repos_with_workflows']} |") lines.append(f"| Total workflow files | {stats['total_workflows']} |") - lines.append(f"| **Repos with any publishing** | **{len(publishing_repos)}** |") + lines.append(f"| **Repos publishing to registries** | **{len(publishing_repos)}** |") lines.append(f"| Release artifact workflows | {len(release_wfs)} |") lines.append(f"| Snapshot / nightly workflows | {len(snapshot_wfs)} |") lines.append(f"| CI infrastructure image workflows | {len(ci_wfs)} |") @@ -813,7 +816,11 @@ async def run(input_dict, tools): 
from agent_factory.remote_mcp_client import RemoteMCPClient
from services.llm_service import call_llm
import httpx
# Fix: `asyncio` (rate-limit backoff / retry sleeps) and `re` (expression
# extraction, anchor slugs) are used throughout this module but were never
# imported — every scan would die with NameError on the first retry/report.
import asyncio
import re


async def run(input_dict, tools):
    """Scan cached GitHub Actions workflow YAML for CI security issues.

    Reads the workflow files cached by the Publishing Analyzer agent from the
    `ci-workflows:<owner>` namespace, runs a battery of static checks
    (pull_request_target + checkout, script injection, unpinned actions,
    broad permissions, cache poisoning, CODEOWNERS gaps, dependency-update
    config, composite actions), caches per-repo findings in
    `ci-security:<owner>`, and returns a markdown report.

    Args:
        input_dict: expects `owner` (default "apache"), `github_pat`
            (required), and optional `clear_cache` truthy string.
        tools: mapping of MCP remote URLs (clients are built but unused here).

    Returns:
        {"outputText": <markdown report or error message>}

    NOTE(review): `data_store` is not imported in this module — presumably
    injected into module globals by the agent framework (summary.py relies on
    it the same way); confirm before running standalone.
    """
    # Kept for parity with the other agents; not referenced below.
    mcpc = {url: RemoteMCPClient(remote_url=url) for url in tools.keys()}
    http_client = httpx.AsyncClient()
    try:
        owner = input_dict.get("owner", "apache")
        github_pat = input_dict.get("github_pat", "").strip()
        clear_cache_raw = input_dict.get("clear_cache", "false")
        clear_cache = str(clear_cache_raw).lower().strip() in ("true", "1", "yes")

        if not github_pat:
            return {"outputText": "Error: `github_pat` is required."}

        GITHUB_API = "https://api.github.com"
        gh_headers = {"Accept": "application/vnd.github.v3+json", "Authorization": f"token {github_pat}"}

        workflow_ns = data_store.use_namespace(f"ci-workflows:{owner}")
        security_ns = data_store.use_namespace(f"ci-security:{owner}")

        if clear_cache:
            print("Clearing security cache...", flush=True)
            for key in security_ns.list_keys():
                security_ns.delete(key)
            print("Cache cleared.", flush=True)

        all_wf_keys = workflow_ns.list_keys()
        if not all_wf_keys:
            return {"outputText": "Error: no cached workflows found in `ci-workflows:" + owner + "`. "
                    "Run the Publishing Analyzer agent first."}

        # Cache keys are "<repo>/<workflow path>"; group workflow names by repo.
        repos = {}
        for key in all_wf_keys:
            if "/" in key:
                repo, wf_name = key.split("/", 1)
                repos.setdefault(repo, []).append(wf_name)

        print(f"Found {len(all_wf_keys)} cached workflows across {len(repos)} repos\n", flush=True)

        async def github_get(url, max_retries=3):
            """GET with rate-limit backoff; returns the response or None."""
            for attempt in range(max_retries):
                try:
                    resp = await http_client.get(url, headers=gh_headers, timeout=15.0)
                    # 429, or 403 with an exhausted quota, means we are rate-limited.
                    if resp.status_code == 429 or (resp.status_code == 403 and
                            resp.headers.get("X-RateLimit-Remaining", "1") == "0"):
                        await asyncio.sleep(30)
                        continue
                    return resp
                except Exception:
                    if attempt < max_retries - 1:
                        await asyncio.sleep(2)
            return None

        # Action publishers considered low supply-chain risk.
        TRUSTED_ORGS = {
            "actions", "github", "docker", "google-github-actions", "aws-actions",
            "azure", "hashicorp", "gradle", "ruby", "codecov", "peaceiris",
            "pypa", "peter-evans", "softprops", "JamesIves", "crazy-max",
            "dorny", "EnricoMi", "pnpm", "apache",
        }

        # Triggers an external contributor can fire.
        PR_TRIGGERS = {"pull_request", "pull_request_target", "issue_comment"}

        # --- Pattern matching helpers --- 

        def is_sha_pinned(ref):
            # A pinned ref is a full 40-char hex commit SHA.
            if not ref:
                return False
            return len(ref) == 40 and all(c in "0123456789abcdef" for c in ref.lower())

        def extract_action_refs(content):
            """Collect every `uses:` target in the YAML (string scan, not a parser)."""
            refs = []
            for line in content.split("\n"):
                stripped = line.strip()
                if "uses:" in stripped:
                    idx = stripped.index("uses:")
                    action_ref = stripped[idx + 5:].strip().strip("'\"")
                    # Drop trailing comments (e.g. "@sha # v4").
                    if "#" in action_ref:
                        action_ref = action_ref[:action_ref.index("#")].strip()
                    # Skip expression-templated refs like ${{ matrix.action }}.
                    if action_ref and not action_ref.startswith("$"):
                        refs.append(action_ref)
            return refs

        def parse_action_ref(ref):
            """Split an action ref into org/name/version; classify local vs remote."""
            if ref.startswith("./"):
                return {"type": "local", "path": ref, "raw": ref}
            if "@" in ref:
                action_path, version = ref.rsplit("@", 1)
                parts = action_path.split("/")
                org = parts[0] if parts else ""
                name = "/".join(parts[:2]) if len(parts) >= 2 else action_path
                return {"type": "remote", "org": org, "name": name, "full": action_path,
                        "version": version, "pinned": is_sha_pinned(version), "raw": ref}
            return {"type": "unknown", "raw": ref}

        def extract_triggers(content):
            """Best-effort parse of the `on:` block (inline list, scalar, or mapping)."""
            triggers = set()
            in_on = False
            for line in content.split("\n"):
                stripped = line.strip()
                if stripped.startswith("on:"):
                    in_on = True
                    rest = stripped[3:].strip()
                    if rest.startswith("["):
                        # Inline form: on: [push, pull_request]
                        for t in rest.strip("[]").split(","):
                            triggers.add(t.strip())
                        in_on = False
                    elif rest and not rest.startswith("#"):
                        # Scalar form: on: push
                        triggers.add(rest.rstrip(":"))
                    continue
                if in_on:
                    if stripped and not stripped.startswith("#"):
                        # A non-indented line ends the `on:` mapping.
                        if not line.startswith(" ") and not line.startswith("\t"):
                            in_on = False
                            continue
                        if ":" in stripped:
                            trigger_name = stripped.split(":")[0].strip()
                            if trigger_name and not trigger_name.startswith("-"):
                                triggers.add(trigger_name)
            return triggers

        def extract_permissions(content):
            """Parse the first `permissions:` block into {scope: level}.

            A scalar form (e.g. `permissions: write-all`) is returned as
            {"_level": <value>}.
            """
            perms = {}
            in_perms = False
            indent = 0
            for line in content.split("\n"):
                stripped = line.strip()
                if stripped.startswith("permissions:"):
                    rest = stripped[12:].strip()
                    if rest and rest != "{}" and not rest.startswith("#"):
                        perms["_level"] = rest
                        return perms
                    in_perms = True
                    indent = len(line) - len(line.lstrip())
                    continue
                if in_perms:
                    if not stripped or stripped.startswith("#"):
                        continue
                    cur_indent = len(line) - len(line.lstrip())
                    # Dedent (or same level) ends the mapping.
                    if cur_indent <= indent and stripped:
                        break
                    if ":" in stripped:
                        key, val = stripped.split(":", 1)
                        perms[key.strip()] = val.strip()
            return perms

        def find_injection_in_run_blocks(content, context_label=""):
            """Find ${{ }} interpolation in run: blocks. Returns list of (severity, detail)."""
            findings = []
            in_run = False
            run_indent = 0
            current_step = ""

            for line in content.split("\n"):
                stripped = line.strip()

                if stripped.startswith("- name:"):
                    current_step = stripped[7:].strip().strip("'\"")

                if stripped.startswith("run:"):
                    in_run = True
                    run_indent = len(line) - len(line.lstrip())
                    run_content = stripped[4:].strip()
                    # Block scalars (| or >) continue on the following lines.
                    if run_content.startswith("|") or run_content.startswith(">"):
                        continue
                    if "${{" in run_content:
                        findings.extend(_classify_interpolation(run_content, current_step, context_label))
                    in_run = False
                    continue

                if in_run:
                    cur_indent = len(line) - len(line.lstrip())
                    if stripped and cur_indent <= run_indent:
                        in_run = False
                    elif "${{" in line:
                        findings.extend(_classify_interpolation(line, current_step, context_label))

            return findings

        def _classify_interpolation(line, step_name, context_label=""):
            """Grade each ${{ expr }} on a line: untrusted input > secrets > inputs."""
            findings = []
            prefix = f" in {context_label}" if context_label else ""
            step_info = f" at step '{step_name}'" if step_name else ""

            exprs = re.findall(r'\$\{\{([^}]+)\}\}', line)

            for expr in exprs:
                expr = expr.strip()
                expr_lower = expr.lower()

                # Fields an external contributor fully controls.
                untrusted_patterns = [
                    "event.pull_request.title", "event.pull_request.body",
                    "event.pull_request.head.ref", "event.pull_request.head.label",
                    "event.issue.title", "event.issue.body",
                    "event.comment.body", "event.review.body",
                    "event.discussion.title", "event.discussion.body",
                ]
                if any(p in expr_lower for p in untrusted_patterns):
                    findings.append(("CRITICAL",
                        f"Direct interpolation of untrusted input `${{{{ {expr} }}}}` in run block"
                        f"{step_info}{prefix}. Exploitable by external contributors."))
                    continue

                if "secrets." in expr_lower:
                    findings.append(("LOW",
                        f"Secret `${{{{ {expr} }}}}` directly interpolated in run block"
                        f"{step_info}{prefix}. Trusted value but risks log leakage."))
                    continue

                if "event.inputs." in expr_lower or "inputs." in expr_lower:
                    findings.append(("LOW",
                        f"Workflow input `${{{{ {expr} }}}}` directly interpolated in run block"
                        f"{step_info}{prefix}. Trusted committer input but should use env: block."))
                    continue

                # Values GitHub itself controls — not injectable; ignore.
                github_controlled = [
                    "github.actor", "github.sha", "github.ref", "github.repository",
                    "github.run_id", "github.run_number", "github.workspace",
                    "github.ref_name", "github.head_ref", "github.base_ref",
                    "runner.", "matrix.", "steps.", "needs.", "env.",
                ]
                if any(p in expr_lower for p in github_controlled):
                    continue

            return findings

        def check_prt_checkout(content):
            """Flag pull_request_target workflows that check out PR head code."""
            triggers = extract_triggers(content)
            if "pull_request_target" not in triggers:
                return None
            has_checkout = False
            checks_head = False
            for line in content.split("\n"):
                stripped = line.strip()
                if "actions/checkout" in stripped:
                    has_checkout = True
                if has_checkout and ("pull_request.head.sha" in stripped or
                        "pull_request.head.ref" in stripped or
                        "github.event.pull_request.head" in stripped):
                    checks_head = True
                    break
            if has_checkout and checks_head:
                return ("CRITICAL", "pull_request_target trigger with checkout of PR head code. "
                        "Untrusted PR code executes with base repo secrets and write permissions.")
            elif has_checkout:
                return ("LOW", "pull_request_target trigger with checkout action present. "
                        "Verify the checkout uses the base ref, not PR head.")
            return None

        def check_self_hosted(content, triggers):
            """Self-hosted runners reachable from PR triggers are a code-exec risk."""
            has_self_hosted = "self-hosted" in content
            has_pr_trigger = bool(triggers & PR_TRIGGERS)
            if has_self_hosted and has_pr_trigger:
                return ("HIGH", "Self-hosted runner with PR trigger. External contributors can "
                        "execute arbitrary code on self-hosted infrastructure.")
            elif has_self_hosted:
                return ("INFO", "Uses self-hosted runners. Ensure runners are ephemeral.")
            return None

        def check_permissions(content):
            """Flag write-all token scope and unusually broad write grants."""
            perms = extract_permissions(content)
            findings = []
            level = perms.get("_level", "")
            if level in ("write-all", "read-all|write-all"):
                findings.append(("HIGH", "Workflow uses `permissions: write-all`. "
                        "Follow least-privilege principle."))
            write_perms = [k for k, v in perms.items() if v == "write" and k != "_level"]
            if len(write_perms) > 3:
                findings.append(("LOW", f"Requests write access to {len(write_perms)} scopes: "
                        f"{', '.join(write_perms)}."))
            return findings

        def check_cache_poisoning(content, triggers):
            """actions/cache keyed on PR-controlled values can be poisoned."""
            has_pr = bool(triggers & {"pull_request", "pull_request_target"})
            has_cache = "actions/cache" in content
            if has_cache and has_pr:
                for line in content.split("\n"):
                    if "key:" in line and ("pull_request" in line or "head_ref" in line):
                        return ("HIGH", "Cache key derived from PR-controlled value. "
                                "A malicious PR could poison the cache.")
                return ("INFO", "Uses actions/cache with PR trigger. Verify cache keys "
                        "are not PR-controlled.")
            return None

        def deduplicate_findings(findings):
            """Collapse repeated same-check same-file findings into summaries."""
            deduped = []
            # Group by (check, file, severity)
            groups = {}
            for f in findings:
                key = (f["check"], f["file"], f["severity"])
                groups.setdefault(key, []).append(f)

            for (check, file, severity), items in groups.items():
                if len(items) == 1:
                    deduped.append(items[0])
                elif check in ("run_block_injection", "composite_action_injection"):
                    # Summarize: extract unique expressions from the detail text.
                    exprs = set()
                    for item in items:
                        found = re.findall(r'`\$\{\{ ([^}]+) \}\}`', item["detail"])
                        exprs.update(found)
                    expr_list = sorted(exprs)[:5]
                    expr_str = ", ".join(f"`{e}`" for e in expr_list)
                    more = f" +{len(exprs) - 5} more" if len(exprs) > 5 else ""
                    deduped.append({
                        "check": check,
                        "file": file,
                        "severity": severity,
                        "detail": (f"{len(items)} instances of direct interpolation in run blocks. "
                                   f"Expressions: {expr_str}{more}."),
                        "count": len(items),
                    })
                elif check == "composite_action_unpinned":
                    # Summarize unpinned refs inside one composite action.
                    refs = [item["detail"].split("`")[1] if "`" in item["detail"] else "?"
                            for item in items]
                    unique_refs = sorted(set(refs))
                    deduped.append({
                        "check": check,
                        "file": file,
                        "severity": severity,
                        "detail": (f"{len(items)} unpinned action refs in composite action: "
                                   f"{', '.join(f'`{r}`' for r in unique_refs[:5])}" +
                                   (f" +{len(unique_refs)-5} more" if len(unique_refs) > 5 else "")),
                        "count": len(items),
                    })
                else:
                    # For other checks, keep first and note count.
                    entry = dict(items[0])
                    if len(items) > 1:
                        entry["detail"] = f"({len(items)}x) {entry['detail']}"
                        entry["count"] = len(items)
                    deduped.append(entry)

            return deduped

        # ===== Main scan loop =====
        all_findings = {}
        repos_scanned = 0

        for repo_name, wf_names in sorted(repos.items()):
            repos_scanned += 1

            if repos_scanned % 10 == 1:
                print(f"[{repos_scanned}/{len(repos)}] Scanning {repo_name}...", flush=True)

            # Per-repo result cache; skipped entirely when clearing.
            cached = security_ns.get(f"findings:{repo_name}")
            if cached is not None and not clear_cache:
                if cached:
                    all_findings[repo_name] = cached
                continue

            repo_findings = []
            all_action_refs = []
            repo_triggers = set()

            # --- Analyze each cached workflow ---
            for wf_name in wf_names:
                # Skip composite action files — analyzed separately in Check 9
                if ".github/actions/" in wf_name:
                    continue

                content = workflow_ns.get(f"{repo_name}/{wf_name}")
                if not content or not isinstance(content, str):
                    continue

                triggers = extract_triggers(content)
                repo_triggers.update(triggers)
                action_refs = extract_action_refs(content)
                all_action_refs.extend([(wf_name, ref) for ref in action_refs])

                # Check 1: pull_request_target + checkout
                prt = check_prt_checkout(content)
                if prt:
                    repo_findings.append({"check": "prt_checkout", "severity": prt[0],
                                          "file": wf_name, "detail": prt[1]})

                # Check 2: Self-hosted runners
                sh = check_self_hosted(content, triggers)
                if sh:
                    repo_findings.append({"check": "self_hosted_runner", "severity": sh[0],
                                          "file": wf_name, "detail": sh[1]})

                # Check 3: Permissions
                for sev, detail in check_permissions(content):
                    repo_findings.append({"check": "broad_permissions", "severity": sev,
                                          "file": wf_name, "detail": detail})

                # Check 4: Cache poisoning
                cp = check_cache_poisoning(content, triggers)
                if cp:
                    repo_findings.append({"check": "cache_poisoning", "severity": cp[0],
                                          "file": wf_name, "detail": cp[1]})

                # Check 5: Injection in workflow run blocks
                injections = find_injection_in_run_blocks(content, context_label=f"workflow {wf_name}")
                for sev, detail in injections:
                    repo_findings.append({"check": "run_block_injection", "severity": sev,
                                          "file": wf_name, "detail": detail})

            # Check 6: Unpinned actions (repo-wide summary)
            # (loop variable renamed from `wf_name` to avoid shadowing the
            # workflow loop variable above)
            unpinned = []
            third_party = []
            for ref_file, ref in all_action_refs:
                parsed = parse_action_ref(ref)
                if parsed["type"] == "local":
                    continue
                if parsed["type"] == "remote":
                    if not parsed["pinned"]:
                        unpinned.append({"file": ref_file, "action": parsed["raw"],
                                         "org": parsed["org"], "name": parsed["name"]})
                    if parsed["org"] not in TRUSTED_ORGS:
                        third_party.append({"file": ref_file, "action": parsed["raw"],
                                            "org": parsed["org"], "name": parsed["name"]})

            if unpinned:
                by_action = {}
                for u in unpinned:
                    by_action.setdefault(u["name"], []).append(u["file"])
                top = sorted(by_action.items(), key=lambda x: -len(x[1]))[:5]
                detail_parts = [f"`{name}` ({len(files)})" for name, files in top]
                repo_findings.append({
                    "check": "unpinned_actions", "severity": "MEDIUM",
                    "file": "(repo-wide)",
                    "detail": (f"{len(unpinned)} unpinned action refs (mutable tags). "
                               f"Top: {', '.join(detail_parts)}."),
                    "count": len(unpinned), "total_refs": len(all_action_refs),
                })

            if third_party:
                unique = sorted(set(t["name"] for t in third_party))
                repo_findings.append({
                    "check": "third_party_actions", "severity": "INFO",
                    "file": "(repo-wide)",
                    "detail": (f"{len(unique)} third-party actions: "
                               f"{', '.join(unique[:10])}" +
                               (f" +{len(unique)-10} more" if len(unique) > 10 else "")),
                    "count": len(unique),
                })

            # --- Fetch extra files from GitHub ---

            # Check 7: CODEOWNERS
            resp = await github_get(f"{GITHUB_API}/repos/{owner}/{repo_name}/contents/.github/CODEOWNERS")
            if resp and resp.status_code == 200:
                try:
                    co_url = resp.json().get("download_url")
                    if co_url:
                        co_resp = await http_client.get(co_url, follow_redirects=True, timeout=10.0)
                        if co_resp.status_code == 200:
                            co_content = co_resp.text
                            has_github_rule = any(".github" in line and not line.strip().startswith("#")
                                                  for line in co_content.split("\n"))
                            if not has_github_rule:
                                repo_findings.append({
                                    "check": "codeowners_gap", "severity": "LOW",
                                    "file": "CODEOWNERS",
                                    "detail": "CODEOWNERS exists but has no rule covering `.github/`. "
                                              "Workflow changes can bypass security-focused review.",
                                })
                except Exception:
                    pass
            elif resp and resp.status_code == 404:
                repo_findings.append({
                    "check": "missing_codeowners", "severity": "LOW",
                    "file": "(missing)",
                    "detail": "No CODEOWNERS file. Workflow changes have no mandatory review.",
                })

            # Check 8: Dependabot / Renovate
            has_deps = False
            for path in [".github/dependabot.yml", ".github/dependabot.yaml",
                         "renovate.json", ".github/renovate.json", ".renovaterc.json"]:
                resp = await github_get(f"{GITHUB_API}/repos/{owner}/{repo_name}/contents/{path}")
                if resp and resp.status_code == 200:
                    has_deps = True
                    break

            if not has_deps:
                repo_findings.append({
                    "check": "missing_dependency_updates", "severity": "INFO",
                    "file": "(missing)",
                    "detail": "No dependabot.yml or renovate.json found.",
                })

            # Check 9: Composite actions via recursive Git Trees API
            # One API call gets the entire tree, handles any nesting depth
            composite_findings = []
            composite_analyzed = 0
            composite_total = 0

            resp = await github_get(
                f"{GITHUB_API}/repos/{owner}/{repo_name}/git/trees/HEAD?recursive=1")
            if resp and resp.status_code == 200:
                try:
                    tree = resp.json().get("tree", [])
                    action_files = [
                        item["path"] for item in tree
                        if item.get("path", "").startswith(".github/actions/")
                        and item.get("path", "").endswith(("/action.yml", "/action.yaml"))
                        and item.get("type") == "blob"
                    ]
                    composite_total = len(action_files)

                    for action_path in action_files:
                        # Extract action name: .github/actions/build/rust/action.yml -> build/rust
                        action_name = action_path.replace(".github/actions/", "").rsplit("/", 1)[0]

                        # Fetch the action.yml content
                        aresp = await github_get(
                            f"{GITHUB_API}/repos/{owner}/{repo_name}/contents/{action_path}")
                        if not aresp or aresp.status_code != 200:
                            continue

                        try:
                            dl_url = aresp.json().get("download_url")
                            if not dl_url:
                                continue
                            dl_resp = await http_client.get(dl_url, follow_redirects=True, timeout=10.0)
                            if dl_resp.status_code != 200:
                                continue
                            action_content = dl_resp.text
                        except Exception:
                            continue

                        composite_analyzed += 1
                        short_path = f".github/actions/{action_name}/action.yml"

                        # Store for other agents
                        workflow_ns.set(f"{repo_name}/{short_path}", action_content)

                        # Run injection checks
                        context = f"composite action .github/actions/{action_name}"
                        injections = find_injection_in_run_blocks(action_content, context_label=context)
                        for sev, detail in injections:
                            composite_findings.append({
                                "check": "composite_action_injection",
                                "severity": sev,
                                "file": short_path,
                                "detail": detail,
                            })

                        # Check unpinned actions inside composite
                        ca_refs = extract_action_refs(action_content)
                        for ref in ca_refs:
                            parsed = parse_action_ref(ref)
                            if parsed["type"] == "remote" and not parsed["pinned"]:
                                composite_findings.append({
                                    "check": "composite_action_unpinned",
                                    "severity": "MEDIUM",
                                    "file": short_path,
                                    "detail": (f"Composite action uses unpinned action `{parsed['raw']}`. "
                                               "Supply chain risk."),
                                })

                        # Check inputs.* directly in run blocks (hidden injection)
                        has_input_injection = False
                        in_run = False
                        run_indent = 0
                        for cline in action_content.split("\n"):
                            cs = cline.strip()
                            if cs.startswith("run:"):
                                in_run = True
                                run_indent = len(cline) - len(cline.lstrip())
                                rest = cs[4:].strip()
                                if rest.startswith("|") or rest.startswith(">"):
                                    continue
                                if "inputs." in rest and "${{" in rest:
                                    has_input_injection = True
                                    break
                            elif in_run:
                                ci = len(cline) - len(cline.lstrip())
                                if cs and ci <= run_indent:
                                    in_run = False
                                elif "inputs." in cline and "${{" in cline:
                                    has_input_injection = True
                                    break

                        if has_input_injection:
                            composite_findings.append({
                                "check": "composite_action_input_injection",
                                "severity": "HIGH",
                                "file": short_path,
                                "detail": (f"Composite action `{action_name}` directly interpolates "
                                           "`inputs.*` in run block. Callers may pass untrusted values — "
                                           "the injection is hidden from workflow-level analysis."),
                            })

                except Exception as e:
                    print(f"  Error scanning composite actions for {repo_name}: {str(e)[:100]}", flush=True)

            # Deduplicate composite findings per file before adding
            composite_findings = deduplicate_findings(composite_findings)
            repo_findings.extend(composite_findings)

            if composite_total > 0:
                repo_findings.append({
                    "check": "composite_actions_scanned", "severity": "INFO",
                    "file": ".github/actions/",
                    "detail": (f"{composite_analyzed}/{composite_total} composite actions analyzed. "
                               f"{len(composite_findings)} finding(s)."),
                })

            # Deduplicate all findings for this repo
            repo_findings = deduplicate_findings(repo_findings)

            # Store
            security_ns.set(f"findings:{repo_name}", repo_findings)
            if repo_findings:
                all_findings[repo_name] = repo_findings

            # Gentle pacing between repos to stay under API limits.
            await asyncio.sleep(0.1)

        print(f"\n{'=' * 60}", flush=True)
        print(f"Security scan complete! {repos_scanned} repos", flush=True)
        total_findings = sum(len(f) for f in all_findings.values())
        print(f"Total findings: {total_findings} across {len(all_findings)} repos", flush=True)
        print(f"{'=' * 60}\n", flush=True)

        # ===== Build report =====
        report_title = f"CI Security Scan: {owner}"

        severity_counts = {}
        check_counts = {}
        for repo, findings in all_findings.items():
            for f in findings:
                sev = f.get("severity", "INFO")
                severity_counts[sev] = severity_counts.get(sev, 0) + 1
                chk = f.get("check", "unknown")
                check_counts[chk] = check_counts.get(chk, 0) + 1

        lines = []
        lines.append(f"Analyzed **{repos_scanned}** repositories using cached workflow YAML "
                     f"from the Publishing Analyzer.\n")

        lines.append("## Executive Summary\n")
        lines.append("| Severity | Count |")
        lines.append("|----------|-------|")
        for sev in ["CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"]:
            count = severity_counts.get(sev, 0)
            if count > 0:
                lines.append(f"| **{sev}** | **{count}** |")
        lines.append("")

        check_descriptions = {
            "prt_checkout": "pull_request_target + checkout of untrusted PR code",
            "self_hosted_runner": "Self-hosted runners exposed to PR triggers",
            "broad_permissions": "Overly broad GITHUB_TOKEN permissions",
            "cache_poisoning": "Potential cache poisoning via PR-controlled keys",
            "run_block_injection": "Direct ${{ }} interpolation in workflow run blocks",
            "unpinned_actions": "Mutable tag refs (not SHA-pinned)",
            "third_party_actions": "Actions from unverified third-party sources",
            "codeowners_gap": "CODEOWNERS missing .github/ coverage",
            "missing_codeowners": "No CODEOWNERS file",
            "missing_dependency_updates": "No dependabot/renovate configuration",
            "composite_actions_scanned": "Composite actions analyzed",
            "composite_action_injection": "Injection in composite action run block",
            "composite_action_unpinned": "Unpinned action ref inside composite action",
            "composite_action_input_injection": "Composite action passes inputs.* directly to run block",
        }

        lines.append("## Findings by Check Type\n")
        lines.append("| Check | Count | Description |")
        lines.append("|-------|-------|-------------|")
        for chk, count in sorted(check_counts.items(), key=lambda x: -x[1]):
            desc = check_descriptions.get(chk, chk)
            lines.append(f"| {chk} | {count} | {desc} |")
        lines.append("")

        by_severity = {"CRITICAL": [], "HIGH": [], "MEDIUM": [], "LOW": [], "INFO": []}
        for repo, findings in all_findings.items():
            for f in findings:
                sev = f.get("severity", "INFO")
                if sev in by_severity:
                    by_severity[sev].append((repo, f))

        if by_severity["CRITICAL"]:
            lines.append("## CRITICAL Findings\n")
            lines.append("Untrusted external input directly interpolated in shell execution contexts.\n")
            for repo, f in sorted(by_severity["CRITICAL"], key=lambda x: (x[0], x[1].get("file", ""))):
                lines.append(f"- **{owner}/{repo}** (`{f['file']}`): [{f['check']}] {f['detail']}")
            lines.append("")

        if by_severity["HIGH"]:
            lines.append("## HIGH Findings\n")
            for repo, f in sorted(by_severity["HIGH"], key=lambda x: (x[0], x[1].get("file", ""))):
                lines.append(f"- **{owner}/{repo}** (`{f['file']}`): [{f['check']}] {f['detail']}")
            lines.append("")

        if by_severity["MEDIUM"]:
            lines.append("## MEDIUM Findings\n")
            lines.append(f"<details>\n<summary>Show {len(by_severity['MEDIUM'])} medium findings</summary>\n")
            for repo, f in sorted(by_severity["MEDIUM"], key=lambda x: (x[0], x[1].get("file", ""))):
                lines.append(f"- **{owner}/{repo}** (`{f['file']}`): [{f['check']}] {f['detail']}")
            lines.append(f"\n</details>\n")

        if by_severity["LOW"]:
            lines.append("## LOW Findings\n")
            lines.append(f"<details>\n<summary>Show {len(by_severity['LOW'])} low findings</summary>\n")
            for repo, f in sorted(by_severity["LOW"], key=lambda x: (x[0], x[1].get("file", ""))):
                lines.append(f"- **{owner}/{repo}** (`{f['file']}`): [{f['check']}] {f['detail']}")
            lines.append(f"\n</details>\n")

        if by_severity["INFO"]:
            lines.append("## INFO Findings\n")
            lines.append(f"<details>\n<summary>Show {len(by_severity['INFO'])} info findings</summary>\n")
            for repo, f in sorted(by_severity["INFO"], key=lambda x: (x[0], x[1].get("file", ""))):
                lines.append(f"- **{owner}/{repo}** (`{f['file']}`): [{f['check']}] {f['detail']}")
            lines.append(f"\n</details>\n")

        lines.append("## Detailed Results by Repository\n")
        for repo in sorted(all_findings.keys()):
            findings = all_findings[repo]
            if not findings:
                continue
            sev_summary = {}
            for f in findings:
                sev_summary[f["severity"]] = sev_summary.get(f["severity"], 0) + 1
            sev_str = ", ".join(f"{s}: {c}" for s, c in sorted(sev_summary.items()))

            lines.append(f"### {owner}/{repo}\n")
            lines.append(f"**{len(findings)}** findings | {sev_str}\n")

            sev_order = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2, "LOW": 3, "INFO": 4}
            for f in sorted(findings, key=lambda x: sev_order.get(x["severity"], 99)):
                lines.append(f"- **[{f['severity']}]** `{f['file']}` — [{f['check']}] {f['detail']}")
            lines.append("")

        lines.append("---\n")
        lines.append(f"*Findings cached in `ci-security:{owner}`. "
                     f"Set `clear_cache` to `true` to re-scan.*")

        report_body = "\n".join(lines)

        def to_anchor(text):
            """GitHub-style markdown heading -> anchor slug."""
            anchor = text.lower().strip()
            anchor = re.sub(r'[^\w\s-]', '', anchor)
            anchor = re.sub(r'\s+', '-', anchor)
            anchor = re.sub(r'-+', '-', anchor)
            return anchor.strip('-')

        toc_lines = [f"# {report_title}\n", "## Contents\n"]
        toc_lines.append(f"- [Executive Summary](#{to_anchor('Executive Summary')})")
        toc_lines.append(f"- [Findings by Check Type](#{to_anchor('Findings by Check Type')})")
        for sev in ["CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"]:
            if by_severity.get(sev):
                toc_lines.append(f"- [{sev} Findings](#{to_anchor(f'{sev} Findings')}) ({len(by_severity[sev])})")
        toc_lines.append(f"- [Detailed Results](#{to_anchor('Detailed Results by Repository')})")
        for repo in sorted(all_findings.keys()):
            toc_lines.append(f"  - [{owner}/{repo}](#{to_anchor(f'{owner}/{repo}')})")

        toc = "\n".join(toc_lines)
        full_report = toc + "\n\n---\n\n" + report_body

        security_ns.set("latest_report", full_report)
        security_ns.set("latest_stats", {
            "repos_scanned": repos_scanned,
            "repos_with_findings": len(all_findings),
            "total_findings": total_findings,
            "severity_counts": severity_counts,
            "check_counts": check_counts,
        })

        return {"outputText": full_report}

    finally:
        await http_client.aclose()
url) for url in tools.keys() } + http_client = httpx.AsyncClient() + try: + owner = input_dict.get("owner", "apache") + print(f"Agent 3 starting for owner={owner}", flush=True) + + report_ns = data_store.use_namespace(f"ci-report:{owner}") + security_ns = data_store.use_namespace(f"ci-security:{owner}") + + pub_stats = report_ns.get("latest_stats") + sec_stats = security_ns.get("latest_stats") + pub_report = report_ns.get("latest_report") + sec_report = security_ns.get("latest_report") + + if not pub_stats or not sec_stats: + return {"outputText": "Error: Run Agent 1 and Agent 2 first."} + + print(f"Publishing report: {len(pub_report or '')} chars", flush=True) + print(f"Security report: {len(sec_report or '')} chars", flush=True) + + # --- Parse per-repo ecosystems from publishing report text --- + # Matches lines like: ### apache/iggy\n**4** ... | Ecosystems: **crates_io, docker_hub** | ... + repo_ecosystems = {} + repo_categories = {} # repo -> {release: N, snapshot: N} + if pub_report: + # Match detailed results headers + header_pattern = re.compile( + r'### ' + re.escape(f'{owner}/') + r'(\S+)\s*\n+' + r'\*\*(\d+)\*\* release/snapshot workflows \| Ecosystems: \*\*([^*]+)\*\*' + r' \|(.+)') + for m in header_pattern.finditer(pub_report): + repo = m.group(1) + ecosystems = [e.strip() for e in m.group(3).split(",")] + repo_ecosystems[repo] = ecosystems + cats_str = m.group(4) + cats = {} + for cat_m in re.finditer(r'(Release Artifacts|Snapshot[^:]*): (\d+)', cats_str): + if "Release" in cat_m.group(1): + cats["release"] = int(cat_m.group(2)) + else: + cats["snapshot"] = int(cat_m.group(2)) + repo_categories[repo] = cats + + print(f"Parsed ecosystems for {len(repo_ecosystems)} repos", flush=True) + + # --- Read per-repo security findings --- + all_sec_keys = security_ns.list_keys() + finding_keys = [k for k in all_sec_keys if k.startswith("findings:")] + + repo_security = {} # repo -> {severities, total, worst, top_checks} + SEV_ORDER = {"CRITICAL": 0, "HIGH": 1, 
"MEDIUM": 2, "LOW": 3, "INFO": 4} + + for k in finding_keys: + repo = k.replace("findings:", "") + findings = security_ns.get(k) + if not findings or not isinstance(findings, list): + continue + + sev_counts = {} + check_counts = {} + for f in findings: + sev = f.get("severity", "INFO") + sev_counts[sev] = sev_counts.get(sev, 0) + 1 + chk = f.get("check", "unknown") + # Skip info-level noise for top checks + if sev != "INFO": + check_counts[chk] = check_counts.get(chk, 0) + 1 + + worst = "INFO" + for s in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]: + if sev_counts.get(s, 0) > 0: + worst = s + break + + # Top 3 non-INFO checks by count + top_checks = sorted(check_counts.items(), key=lambda x: -x[1])[:3] + + repo_security[repo] = { + "severities": sev_counts, + "total": len(findings), + "worst": worst, + "top_checks": top_checks, + } + + print(f"Security data for {len(repo_security)} repos", flush=True) + + # --- Parse trusted publishing opportunities --- + # Which repos have TP opportunities (using long-lived tokens where OIDC is available) + tp_repos = set() + tp_opportunities = pub_stats.get("trusted_publishing_opportunities", []) + if isinstance(tp_opportunities, list): + for opp in tp_opportunities: + if isinstance(opp, dict): + tp_repos.add(opp.get("repo", "")) + elif isinstance(opp, str): + tp_repos.add(opp) + # Also try parsing from report text for reliability + if pub_report: + tp_section = False + for line in pub_report.split("\n"): + if "Trusted Publishing Migration" in line: + tp_section = True + continue + if tp_section and line.startswith("## ") and "Trusted" not in line: + break + if tp_section and "| " in line and "`" in line: + parts = line.split("|") + if len(parts) > 1: + repo_name = parts[1].strip() + if repo_name and repo_name != "Repository": + tp_repos.add(repo_name) + + print(f"Trusted publishing opportunity repos: {len(tp_repos)}", flush=True) + + # --- Identify repos already using OIDC --- + oidc_repos = set() + if pub_report: + for line in 
pub_report.split("\n"): + if "OIDC" in line and ("trusted publishing" in line.lower() or "id-token" in line.lower()): + # Find which repo section we're in + pass # Complex to parse; skip for now + + # --- Build combined risk table --- + publishing_repos = set(pub_stats.get("publishing_repos", [])) + all_repos = publishing_repos | set(repo_security.keys()) + + repo_rows = [] + for repo in sorted(all_repos): + ecosystems = repo_ecosystems.get(repo, []) + sec = repo_security.get(repo, {}) + worst = sec.get("worst", "—") + total = sec.get("total", 0) + sev_counts = sec.get("severities", {}) + top_checks = sec.get("top_checks", []) + cats = repo_categories.get(repo, {}) + publishes = repo in publishing_repos + has_tp_opportunity = repo in tp_repos + + # Risk score for sorting: publishing breadth * security severity + eco_score = len(ecosystems) if ecosystems else (1 if publishes else 0) + sev_score = {"CRITICAL": 100, "HIGH": 50, "MEDIUM": 10, "LOW": 3, "INFO": 1, "—": 0}.get(worst, 0) + risk_score = eco_score * sev_score + total + + repo_rows.append({ + "repo": repo, + "ecosystems": ecosystems, + "publishes": publishes, + "worst": worst, + "total": total, + "sev_counts": sev_counts, + "top_checks": top_checks, + "cats": cats, + "has_tp": has_tp_opportunity, + "risk_score": risk_score, + }) + + repo_rows.sort(key=lambda r: -r["risk_score"]) + + # --- Classify into tiers --- + critical_repos = [r for r in repo_rows if r["worst"] == "CRITICAL"] + high_repos = [r for r in repo_rows if r["worst"] == "HIGH"] + medium_repos = [r for r in repo_rows if r["worst"] == "MEDIUM" and r["publishes"]] + low_repos = [r for r in repo_rows if r["worst"] in ("LOW", "INFO", "—") and r["publishes"]] + + # --- Generate report --- + PUB = "apache-github-publishing.md" + SEC = "apache-github-security.md" + + def anchor(text): + a = text.lower().strip() + a = re.sub(r'[^\w\s-]', '', a) + a = re.sub(r'\s+', '-', a) + a = re.sub(r'-+', '-', a) + return a.strip('-') + + def repo_pub_link(repo): + 
return f"[publishing]({PUB}#{anchor(f'{owner}/{repo}')})" + + def repo_sec_link(repo): + return f"[security]({SEC}#{anchor(f'{owner}/{repo}')})" + + def eco_str(ecosystems): + if not ecosystems: + return "—" + return ", ".join(ecosystems) + + def sev_summary(sev_counts): + parts = [] + for s in ["CRITICAL", "HIGH", "MEDIUM", "LOW"]: + c = sev_counts.get(s, 0) + if c > 0: + parts.append(f"{c} {s}") + return ", ".join(parts) if parts else "INFO only" + + def check_summary(top_checks): + if not top_checks: + return "" + return ", ".join(f"{chk} ({n})" for chk, n in top_checks) + + lines = [] + lines.append(f"# Apache GitHub Review: Combined Risk Assessment\n") + lines.append(f"Cross-referencing CI publishing analysis with security scan results " + f"across **{len(all_repos)}** repositories.\n") + + lines.append("## Companion Reports\n") + lines.append(f"| Report | Description |") + lines.append(f"|--------|-------------|") + lines.append(f"| [{PUB}]({PUB}) | Which repos publish packages to registries, " + f"what ecosystems, auth methods, trusted publishing opportunities. " + f"{pub_stats.get('total_workflows', '?')} workflows across " + f"{pub_stats.get('repos_scanned', '?')} repos. |") + lines.append(f"| [{SEC}]({SEC}) | Pattern-matching security checks on cached workflow YAML: " + f"injection patterns, unpinned actions, permissions, composite action analysis. " + f"{sec_stats.get('total_findings', '?')} findings across " + f"{sec_stats.get('repos_with_findings', '?')} repos. 
|") + lines.append("") + + # --- At a glance --- + lines.append("## At a Glance\n") + lines.append(f"| Metric | Value |") + lines.append(f"|--------|-------|") + lines.append(f"| Repos scanned | {pub_stats.get('repos_scanned', '?')} |") + lines.append(f"| Repos publishing to registries | {len(publishing_repos)} |") + lines.append(f"| Total security findings | {sec_stats.get('total_findings', '?')} |") + sev = sec_stats.get("severity_counts", {}) + lines.append(f"| CRITICAL findings | {sev.get('CRITICAL', 0)} |") + lines.append(f"| HIGH findings | {sev.get('HIGH', 0)} |") + lines.append(f"| Repos needing trusted publishing migration | {len(tp_repos)} |") + eco = pub_stats.get("ecosystem_counts", {}) + # Filter out documentation/CI targets for the publishing risk summary + doc_targets = {"codecov", "github_pages", "surge_sh", "s3", "gcr"} + release_eco = {k: v for k, v in eco.items() if k not in doc_targets} + top_eco = sorted(release_eco.items(), key=lambda x: -x[1])[:5] + eco_summary = ", ".join(f"{e} ({c})" for e, c in top_eco) + lines.append(f"| Top ecosystems | {eco_summary} |") + lines.append("") + + # --- CRITICAL + HIGH tier --- + if critical_repos or high_repos: + lines.append("## Immediate Attention Required\n") + lines.append("Repos with CRITICAL or HIGH security findings that also publish packages.\n") + + for r in critical_repos + high_repos: + repo = r["repo"] + lines.append(f"### {owner}/{repo}\n") + + eco_display = eco_str(r["ecosystems"]) + cat_parts = [] + if r["cats"].get("release"): + cat_parts.append(f"{r['cats']['release']} release") + if r["cats"].get("snapshot"): + cat_parts.append(f"{r['cats']['snapshot']} snapshot") + cat_display = ", ".join(cat_parts) if cat_parts else "" + + details = [] + if r["publishes"] and r["ecosystems"]: + pub_line = f"**Publishes to:** {eco_display}" + if cat_display: + pub_line += f" ({cat_display})" + details.append(pub_line) + elif r["publishes"]: + details.append(f"**Publishes:** yes (see 
{repo_pub_link(repo)})") + + details.append(f"**Security:** {r['total']} findings — {sev_summary(r['sev_counts'])}") + + if r["top_checks"]: + details.append(f"**Top issues:** {check_summary(r['top_checks'])}") + + if r["has_tp"]: + details.append(f"**Trusted publishing:** migration opportunity — currently using long-lived tokens " + f"([details]({PUB}#trusted-publishing-migration-opportunities))") + + details.append(f"**Details:** {repo_pub_link(repo)} · {repo_sec_link(repo)}") + + # Join with double-space + newline for markdown line breaks + lines.append(" \n".join(details)) + lines.append("") + + # --- MEDIUM tier: publishing repos --- + if medium_repos: + lines.append("## Moderate Risk: Publishing Repos with MEDIUM Findings\n") + lines.append("These repos publish packages and have MEDIUM-severity findings (typically unpinned actions).\n") + lines.append(f"| Repo | Ecosystems | Findings | Top Issue | Trusted Pub | Details |") + lines.append(f"|------|-----------|----------|-----------|------------|---------|") + + for r in medium_repos: + repo = r["repo"] + eco = eco_str(r["ecosystems"]) if r["ecosystems"] else "npm" + top = r["top_checks"][0][0] if r["top_checks"] else "unpinned_actions" + tp = "migrate" if r["has_tp"] else "—" + links = f"{repo_pub_link(repo)} · {repo_sec_link(repo)}" + lines.append(f"| {owner}/{repo} | {eco} | {r['total']} | {top} | {tp} | {links} |") + + lines.append("") + + # --- LOW tier summary --- + if low_repos: + lines.append("## Low Risk: Publishing Repos\n") + lines.append(f"{len(low_repos)} repos publish packages with only LOW/INFO-level security findings " + f"(missing CODEOWNERS, no dependabot config).\n") + lines.append(f"<details>\n<summary>Show {len(low_repos)} repos</summary>\n") + for r in low_repos: + repo = r["repo"] + eco = eco_str(r["ecosystems"]) if r["ecosystems"] else "—" + lines.append(f"- **{owner}/{repo}** — {eco} — {r['total']} findings " + f"({repo_pub_link(repo)} · {repo_sec_link(repo)})") + 
lines.append(f"\n</details>\n") + + # --- Trusted Publishing summary --- + lines.append("## Trusted Publishing Opportunities\n") + lines.append(f"**{len(tp_repos)}** repos use long-lived tokens to publish to ecosystems that support " + f"OIDC trusted publishing. Migrating eliminates stored secrets.\n") + lines.append(f"Full details: [{PUB} → Trusted Publishing]" + f"({PUB}#trusted-publishing-migration-opportunities)\n") + + # Group by ecosystem from pub_report + tp_ecosystems = {} + if pub_report: + current_eco = None + for line in pub_report.split("\n"): + if line.startswith("### ") and "Trusted Publishing" not in line: + # Check if this is an ecosystem header inside TP section + eco_name = line[4:].strip() + if eco_name in ("crates.io", "npm", "NuGet", "PyPI", "RubyGems"): + current_eco = eco_name + continue + if current_eco and line.startswith("## "): + current_eco = None + continue + if current_eco and "| " in line and "`" in line: + parts = [p.strip() for p in line.split("|")] + if len(parts) > 2 and parts[1] and parts[1] != "Repository": + tp_ecosystems.setdefault(current_eco, []).append(parts[1]) + + for eco, repos in sorted(tp_ecosystems.items()): + unique = sorted(set(repos)) + lines.append(f"- **{eco}**: {', '.join(unique)}") + lines.append("") + + # --- Key recommendations --- + lines.append("## Key Recommendations\n") + + rec_num = 1 + + if critical_repos: + crit_names = ", ".join("`" + r["repo"] + "`" for r in critical_repos) + verb = "has" if len(critical_repos) == 1 else "have" + lines.append(f"{rec_num}. **Fix CRITICAL findings immediately.** " + f"{crit_names} {verb} " + f"exploitable vulnerabilities in publishing workflows.") + rec_num += 1 + + lines.append(f"{rec_num}. **Migrate to trusted publishing.** " + f"{len(tp_repos)} repos can eliminate long-lived secrets by adopting OIDC. 
" + f"Start with repos publishing to PyPI and npm — " + f"[migration guide]({PUB}#trusted-publishing-migration-opportunities).") + rec_num += 1 + + if high_repos: + # Count all repos that have any HIGH findings (not just worst=HIGH) + repos_with_high = [r for r in repo_rows + if r["sev_counts"].get("HIGH", 0) > 0] + lines.append(f"{rec_num}. **Review composite action injection patterns.** " + f"{len(repos_with_high)} repos have HIGH findings from `inputs.*` directly interpolated " + f"in composite action run blocks. While these are called from trusted contexts today, " + f"they create hidden injection surfaces.") + rec_num += 1 + + lines.append(f"{rec_num}. **Pin actions to SHA hashes.** " + f"All {sec_stats.get('repos_with_findings', '?')} repos use mutable tag refs. " + f"See the [unpinned actions findings]({SEC}#medium-findings) for per-repo counts.") + rec_num += 1 + + # Count repos with missing_codeowners or codeowners_gap findings + no_codeowners = 0 + for repo, sec in repo_security.items(): + for chk, cnt in sec.get("top_checks", []): + pass # top_checks doesn't have all checks + # Count from check_counts in sec_stats + codeowners_missing = sec_stats.get("check_counts", {}).get("missing_codeowners", 0) + codeowners_gap = sec_stats.get("check_counts", {}).get("codeowners_gap", 0) + lines.append(f"{rec_num}. **Add CODEOWNERS with `.github/` coverage.** " + f"{codeowners_missing} repos have no CODEOWNERS file and " + f"{codeowners_gap} have CODEOWNERS without `.github/` rules. 
" + f"Workflow changes can bypass security review.") + lines.append("") + + lines.append("---\n") + lines.append(f"*Generated from [{PUB}]({PUB}) and [{SEC}]({SEC}).*") + + full_report = "\n".join(lines) + print(f"Report length: {len(full_report)} chars", flush=True) + + combined_ns = data_store.use_namespace(f"ci-combined:{owner}") + combined_ns.set("latest_report", full_report) + + return {"outputText": full_report} + + finally: + await http_client.aclose() \ No newline at end of file diff --git a/repos/apache/github-review/monitor-agent.sh b/repos/apache/github-review/monitor-agent.sh deleted file mode 100755 index 4f08305..0000000 --- a/repos/apache/github-review/monitor-agent.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/bash -# monitor-agent.sh — tmux dashboard for CI analyzer agents -# Monitors: Publishing Analyzer, Security Scanner, Report Combiner -# Usage: chmod +x monitor-agent.sh && ./monitor-agent.sh - -COUCH_URL="http://user:password@localhost:5984" -DB="agent_data_store" -SESSION="ci-monitor" -OWNER="apache" - -# Kill existing session if any -tmux kill-session -t "$SESSION" 2>/dev/null - -tmux new-session -d -s "$SESSION" -x 220 -y 60 - -# ── Pane 0 (top-left): All namespace doc counts grouped by agent ── -tmux send-keys "watch -n 5 'echo \"=== Doc Counts (all agents) ===\"; echo; \ -curl -s \"${COUCH_URL}/${DB}/_find\" \ - -H \"Content-Type: application/json\" \ - -d \"{\\\"selector\\\":{\\\"namespace\\\":{\\\"\\\$in\\\":[\\\"ci-classification:${OWNER}\\\",\\\"ci-workflows:${OWNER}\\\",\\\"ci-report:${OWNER}\\\",\\\"ci-security:${OWNER}\\\",\\\"ci-combined:${OWNER}\\\"]}},\\\"fields\\\":[\\\"namespace\\\"],\\\"limit\\\":9999}\" \ -| python3 -c \" -import sys, json -from collections import Counter -docs = json.load(sys.stdin)[\\\"docs\\\"] -counts = Counter(d[\\\"namespace\\\"] for d in docs) -print(f\\\"Total: {len(docs)} docs\\\") -print() -groups = { - \\\"Agent 1 (Publishing)\\\": [\\\"ci-classification:${OWNER}\\\", \\\"ci-workflows:${OWNER}\\\", 
\\\"ci-report:${OWNER}\\\"], - \\\"Agent 2 (Security)\\\": [\\\"ci-security:${OWNER}\\\"], - \\\"Agent 3 (Combined)\\\": [\\\"ci-combined:${OWNER}\\\"], -} -for label, namespaces in groups.items(): - group_total = sum(counts.get(ns, 0) for ns in namespaces) - print(f\\\"{label}: {group_total} docs\\\") - for ns in namespaces: - c = counts.get(ns, 0) - short = ns.split(\\\":\\\")[0] - print(f\\\" {c}\\t{short}\\\") - print() -if not docs: - print(\\\" (all empty)\\\") -\"'" C-m - -# ── Pane 1 (top-right): Completed repos + classification status ── -tmux split-window -h -tmux send-keys "watch -n 10 'echo \"=== Agent 1: Repo Status ===\"; echo; \ -curl -s \"${COUCH_URL}/${DB}/_find\" \ - -H \"Content-Type: application/json\" \ - -d \"{\\\"selector\\\":{\\\"namespace\\\":\\\"ci-classification:${OWNER}\\\",\\\"key\\\":{\\\"\\\$regex\\\":\\\"^__meta__:\\\"}},\\\"fields\\\":[\\\"key\\\",\\\"value\\\"],\\\"limit\\\":9999}\" \ -| python3 -c \" -import sys, json -docs = json.load(sys.stdin)[\\\"docs\\\"] -done = [d for d in docs if d.get(\\\"value\\\", {}).get(\\\"complete\\\")] -with_wf = [d for d in done if d.get(\\\"value\\\", {}).get(\\\"workflows\\\")] -without_wf = [d for d in done if not d.get(\\\"value\\\", {}).get(\\\"workflows\\\")] -print(f\\\"Completed: {len(done)} repos ({len(with_wf)} with workflows, {len(without_wf)} empty)\\\") -print() -for d in sorted(with_wf, key=lambda x: x[\\\"key\\\"]): - repo = d[\\\"key\\\"].replace(\\\"__meta__:\\\", \\\"\\\") - wfs = d[\\\"value\\\"].get(\\\"workflows\\\", []) - print(f\\\" ✓ {repo}: {len(wfs)} workflows\\\") -\"'" C-m - -# ── Pane 2 (middle-left): In-progress classification ── -tmux select-pane -t 0 -tmux split-window -v -tmux send-keys "watch -n 5 'echo \"=== Agent 1: In-Progress ===\"; echo; \ -curl -s \"${COUCH_URL}/${DB}/_find\" \ - -H \"Content-Type: application/json\" \ - -d 
\"{\\\"selector\\\":{\\\"namespace\\\":\\\"ci-classification:${OWNER}\\\",\\\"key\\\":{\\\"\\\$not\\\":{\\\"\\\$regex\\\":\\\"^__meta__:\\\"}}},\\\"fields\\\":[\\\"key\\\"],\\\"limit\\\":9999}\" \ -| python3 -c \" -import sys, json, subprocess -docs = json.load(sys.stdin)[\\\"docs\\\"] - -meta_raw = subprocess.run( - [\\\"curl\\\", \\\"-s\\\", \\\"${COUCH_URL}/${DB}/_find\\\", - \\\"-H\\\", \\\"Content-Type: application/json\\\", - \\\"-d\\\", json.dumps({\\\"selector\\\": {\\\"namespace\\\": \\\"ci-classification:${OWNER}\\\", \\\"key\\\": {\\\"\\\$regex\\\": \\\"^__meta__:\\\"}}, \\\"fields\\\": [\\\"key\\\", \\\"value\\\"], \\\"limit\\\": 9999})], - capture_output=True, text=True -).stdout -meta_docs = json.loads(meta_raw)[\\\"docs\\\"] -complete_repos = {d[\\\"key\\\"].replace(\\\"__meta__:\\\", \\\"\\\") for d in meta_docs if d.get(\\\"value\\\", {}).get(\\\"complete\\\")} - -by_repo = {} -for d in docs: - key = d[\\\"key\\\"] - repo = key.split(\\\":\\\")[0] if \\\":\\\" in key else \\\"unknown\\\" - by_repo.setdefault(repo, []).append(key) - -in_progress = {r: files for r, files in by_repo.items() if r not in complete_repos} - -if not in_progress: - print(\\\"No repos currently being classified.\\\") - print(f\\\"\\\\nTotal classified: {len(docs)} workflows across {len(by_repo)} repos\\\") -else: - for repo, files in sorted(in_progress.items()): - print(f\\\"⏳ {repo}: {len(files)} classified so far\\\") - recent = sorted(files)[-15:] - for f in recent: - wf_name = f.split(\\\":\\\", 1)[1] if \\\":\\\" in f else f - print(f\\\" {wf_name}\\\") - if len(files) > 15: - print(f\\\" ... 
and {len(files) - 15} more\\\") - print(f\\\"\\\\nTotal classified: {len(docs)} workflows\\\") -\"'" C-m - -# ── Pane 3 (middle-right): Agent 2 security findings summary ── -tmux select-pane -t 1 -tmux split-window -v -tmux send-keys "watch -n 5 'echo \"=== Agent 2: Security Findings ===\"; echo; \ -curl -s \"${COUCH_URL}/${DB}/_find\" \ - -H \"Content-Type: application/json\" \ - -d \"{\\\"selector\\\":{\\\"namespace\\\":\\\"ci-security:${OWNER}\\\"},\\\"fields\\\":[\\\"key\\\",\\\"value\\\"],\\\"limit\\\":9999}\" \ -| python3 -c \" -import sys, json -docs = json.load(sys.stdin)[\\\"docs\\\"] - -if not docs: - print(\\\"No security data yet.\\\") - print(\\\"Run Agent 2 after Agent 1 completes.\\\") -else: - finding_docs = [d for d in docs if d[\\\"key\\\"].startswith(\\\"findings:\\\")] - meta_docs = [d for d in docs if d[\\\"key\\\"].startswith(\\\"latest_\\\")] - - repos_with_findings = 0 - repos_clean = 0 - total_findings = 0 - severity_counts = {} - check_counts = {} - - for d in finding_docs: - findings = d.get(\\\"value\\\", []) - if isinstance(findings, list): - if findings: - repos_with_findings += 1 - total_findings += len(findings) - else: - repos_clean += 1 - for f in findings: - if isinstance(f, dict): - sev = f.get(\\\"severity\\\", \\\"?\\\") - severity_counts[sev] = severity_counts.get(sev, 0) + 1 - chk = f.get(\\\"check\\\", \\\"?\\\") - check_counts[chk] = check_counts.get(chk, 0) + 1 - - print(f\\\"Repos processed: {len(finding_docs)} ({repos_with_findings} with findings, {repos_clean} clean)\\\") - print(f\\\"Total findings: {total_findings}\\\") - print() - - if severity_counts: - print(\\\"By severity:\\\") - for sev in [\\\"CRITICAL\\\", \\\"HIGH\\\", \\\"MEDIUM\\\", \\\"LOW\\\", \\\"INFO\\\"]: - c = severity_counts.get(sev, 0) - if c > 0: - print(f\\\" {sev}: {c}\\\") - print() - - if check_counts: - print(\\\"By check:\\\") - for chk, c in sorted(check_counts.items(), key=lambda x: -x[1])[:10]: - print(f\\\" {c}\\t{chk}\\\") - - if 
meta_docs: - print() - print(\\\"Reports: \\\", \\\", \\\".join(d[\\\"key\\\"] for d in meta_docs)) -\"'" C-m - -# ── Pane 4 (bottom): Docker logs tail ── -tmux select-pane -t 2 -tmux split-window -v -tmux send-keys "docker compose logs -f --tail=100 api 2>&1 | grep --line-buffered -E '(Scanning|Progress|Rate limit|WARNING|ERROR|Scan complete|classified|cached|Preflight|Security scan|findings|composite|call_llm)'" C-m - -# Layout and attach -tmux select-layout -t "$SESSION" tiled -tmux attach -t "$SESSION" \ No newline at end of file diff --git a/repos/apache/github-review/report.md b/repos/apache/github-review/report.md deleted file mode 100644 index 27e375a..0000000 --- a/repos/apache/github-review/report.md +++ /dev/null @@ -1,203 +0,0 @@ -# CI Registry Publishing Analysis: apache - -## Contents - -- [Executive Summary](#executive-summary) -- [Package Ecosystem Distribution](#package-ecosystem-distribution-releases-snapshots-only) -- [Release Artifact Workflows](#release-artifact-workflows) (5) -- [Snapshot / Nightly Workflows](#snapshot-nightly-artifact-workflows) (1) -- [CI Infrastructure Workflows](#ci-infrastructure-image-workflows) (24) -- [Documentation Workflows](#documentation-website-workflows) (6) -- [Security: Low Risk](#security-low-risk-findings) (11) -- [Detailed Results](#detailed-results-release-snapshot-workflows) - - [apache/airflow](#apacheairflow) - - [apache/kafka](#apachekafka) - - [apache/spark](#apachespark) - ---- - -Scanned **3** repositories, **3** had GitHub Actions workflow files, **110** total workflows analyzed. 
- -## Executive Summary - -| Metric | Value | -|--------|-------| -| Repositories scanned | 3 | -| Repositories with workflows | 3 | -| Total workflow files | 110 | -| **Repos with any publishing** | **3** | -| Release artifact workflows | 5 | -| Snapshot / nightly workflows | 1 | -| CI infrastructure image workflows | 24 | -| Documentation / website workflows | 6 | -| Security notes flagged | 14 | - -## Package Ecosystem Distribution (releases + snapshots only) - -| Ecosystem | Workflows | Percentage | -|-----------|-----------|------------| -| docker_hub | 4 | 44.4% | -| maven_central | 2 | 22.2% | -| ghcr | 1 | 11.1% | -| apache_dist | 1 | 11.1% | -| pypi | 1 | 11.1% | - -## Release Artifact Workflows - -These workflows publish versioned packages to public registries consumed by end users. - -| Repository | Workflow | Ecosystems | Trigger | Auth | -|------------|----------|------------|---------|------| -| airflow | `release_dockerhub_image.yml` | docker_hub | workflow_dispatch with airflowVersion input (e.g. 3.0.1, 3.0.1rc1, 3.0.1b1) | DOCKERHUB_USER and DOCKERHUB_TOKEN secrets | -| airflow | `release_single_dockerhub_image.yml` | docker_hub, ghcr | workflow_call | DOCKERHUB_USER/DOCKERHUB_TOKEN secrets for Docker Hub, GITHUB_TOKEN for GHCR | -| kafka | `docker_promote.yml` | docker_hub | workflow_dispatch | secrets.DOCKERHUB_USER and secrets.DOCKERHUB_TOKEN | -| kafka | `docker_rc_release.yml` | docker_hub | workflow_dispatch | secrets.DOCKERHUB_USER and secrets.DOCKERHUB_TOKEN | -| spark | `release.yml` | apache_dist, maven_central, pypi | workflow_dispatch with inputs for branch, release-version, rc-count, and finalize; also scheduled cron | ASF credentials (ASF_USERNAME, ASF_PASSWORD, ASF_NEXUS_TOKEN), GPG key signing (GPG_PRIVATE_KEY, GPG_PASSPHRASE), PyPI API token (PYPI_API_TOKEN) | - -## Snapshot / Nightly Artifact Workflows - -These workflows publish snapshot or nightly builds to staging registries. 
- -| Repository | Workflow | Ecosystems | Trigger | Auth | -|------------|----------|------------|---------|------| -| spark | `publish_snapshot.yml` | maven_central | schedule (daily cron) and workflow_dispatch | ASF Nexus credentials (NEXUS_USER, NEXUS_PW, NEXUS_TOKEN) stored in GitHub secrets | - -## CI Infrastructure Image Workflows - -These workflows push Docker images used only for CI build caching, test execution, or build acceleration. They do not publish end-user artifacts. - -<details> -<summary>Show 24 CI infrastructure workflows</summary> - -| Repository | Workflow | Target | Summary | -|------------|----------|--------|---------| -| airflow | `additional-ci-image-checks.yml` | ghcr | This workflow pushes early BuildX cache images to GitHub Container Registry (GHC | -| airflow | `ci-image-build.yml` | ghcr | This workflow builds CI Docker images for Apache Airflow and conditionally pushe | -| airflow | `finalize-tests.yml` | ghcr | This workflow finalizes test runs by updating constraints and pushing Docker bui | -| airflow | `prod-image-build.yml` | ghcr | This workflow builds Apache Airflow production Docker images for CI/CD purposes. 
| -| airflow | `push-image-cache.yml` | ghcr | This workflow pushes CI and PROD Docker image caches to GitHub Container Registr | -| spark | `build_and_test.yml` | ghcr | This workflow builds and pushes Docker images to GitHub Container Registry (GHCR | -| spark | `build_branch35.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | -| spark | `build_branch40.yml` | ghcr | This workflow is a scheduled build job that calls a reusable workflow (build_and | -| spark | `build_branch40_java21.yml` | ghcr | This workflow is a scheduled CI build that runs every 2 days for Apache Spark's | -| spark | `build_branch40_python_pypy3.10.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | -| spark | `build_branch41.yml` | ghcr | Scheduled nightly build workflow for Apache Spark branch-4.1 that calls a reusab | -| spark | `build_branch41_java21.yml` | ghcr | This workflow is a scheduled nightly build that calls a reusable workflow (build | -| spark | `build_branch41_python_pypy3.10.yml` | ghcr | Scheduled workflow that calls a reusable workflow (build_and_test.yml) with pack | -| spark | `build_infra_images_cache.yml` | ghcr | Builds and pushes Docker images to GHCR for CI/CD infrastructure. 
Multiple test | -| spark | `build_java21.yml` | ghcr | This workflow is a scheduled nightly build that calls a reusable workflow (build | -| spark | `build_java25.yml` | ghcr | This workflow is a scheduled nightly build job that tests Apache Spark with Java | -| spark | `build_main.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | -| spark | `build_python_3.10.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | -| spark | `build_python_3.11.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | -| spark | `build_python_3.12_classic_only.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | -| spark | `build_python_3.12_pandas_3.yml` | ghcr | This workflow is a scheduled nightly build that calls a reusable workflow (build | -| spark | `build_python_3.13.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | -| spark | `build_python_3.14.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | -| spark | `build_python_3.14_nogil.yml` | ghcr | This workflow calls a reusable workflow (build_and_test.yml) with packages:write | - -</details> - -## Documentation / Website Workflows - -<details> -<summary>Show 6 documentation workflows</summary> - -| Repository | Workflow | Target | Summary | -|------------|----------|--------|---------| -| airflow | `ci-image-checks.yml` | s3 | This workflow builds Apache Airflow documentation and publishes it to AWS S3 (s3 | -| airflow | `publish-docs-to-s3.yml` | s3 | This workflow builds Apache Airflow documentation and publishes it to AWS S3 buc | -| airflow | `registry-backfill.yml` | s3 | This workflow backfills Apache Airflow provider registry documentation to S3 buc | -| airflow | `registry-build.yml` | s3 | Builds and publishes Apache Airflow provider registry documentation 
to S3. Extra | -| spark | `build_coverage.yml` | codecov | This workflow runs Python coverage tests on a schedule and uploads results to Co | -| spark | `pages.yml` | github_pages | Builds Apache Spark documentation using Jekyll, Sphinx, and other tools, then de | - -</details> - -## Security: Low Risk Findings - -GitHub-controlled values used directly in `run:` blocks. Not user-injectable but poor practice. - -<details> -<summary>Show 11 low-risk findings</summary> - -- **apache/airflow** (`prod-image-build.yml`): [LOW] Direct interpolation of github.sha in run block at step 'Build PROD images w/ source providers'. While github.sha is GitHub-controlled and not user-injectable, best practice is to pass through env block. -- **apache/airflow** (`publish-docs-to-s3.yml`): [LOW] GitHub-controlled value github.actor used directly in env blocks -- **apache/airflow** (`publish-docs-to-s3.yml`): [LOW] GitHub-controlled value github.repository used directly in env blocks -- **apache/airflow** (`registry-build.yml`): [LOW] GitHub-controlled value github.event.sender.login used in conditional expression -- **apache/airflow** (`release_dockerhub_image.yml`): [LOW] GitHub-controlled value github.event.inputs.airflowVersion used in concurrency.group -- **apache/airflow** (`release_dockerhub_image.yml`): [LOW] Input parameter airflowVersion passed through environment variables and shell scripts in build-info job -- **apache/airflow** (`release_single_dockerhub_image.yml`): [LOW] GitHub-controlled value github.sha used directly in env block COMMIT_SHA -- **apache/airflow** (`release_single_dockerhub_image.yml`): [LOW] GitHub-controlled value github.repository used directly in env block REPOSITORY -- **apache/airflow** (`release_single_dockerhub_image.yml`): [LOW] GitHub-controlled value github.actor used in docker login command via ACTOR env variable -- **apache/spark** (`publish_snapshot.yml`): [LOW] GitHub-controlled value matrix.branch used in checkout ref and GIT_REF 
environment variable -- **apache/spark** (`release.yml`): [LOW] GitHub-controlled value github.actor used directly in GIT_NAME environment variable - -</details> - -## Detailed Results: Release & Snapshot Workflows - -### apache/airflow - -**2** release/snapshot workflows | Ecosystems: **docker_hub, ghcr** | Release Artifacts: 2 - -**`release_dockerhub_image.yml`** — Release PROD images [Release Artifacts] -- **Summary**: This workflow publishes production Apache Airflow Docker images to Docker Hub. It is manually triggered with an Airflow version parameter (supporting release, RC, and beta versions). The workflow builds images for multiple Python versions and platforms (amd64 and optionally arm64), then delegates to a reusable workflow (release_single_dockerhub_image.yml) that performs the actual Docker Hub publishing. Access is restricted to a whitelist of Apache Airflow committers. -- **Ecosystems**: docker_hub -- **Trigger**: workflow_dispatch with airflowVersion input (e.g. 3.0.1, 3.0.1rc1, 3.0.1b1) -- **Auth**: DOCKERHUB_USER and DOCKERHUB_TOKEN secrets -- **Confidence**: high - -**`release_single_dockerhub_image.yml`** — Release single PROD image [Release Artifacts] -- **Summary**: Builds and publishes versioned Apache Airflow production Docker images to Docker Hub for multiple platforms (linux/amd64, linux/arm64) and Python versions. The workflow builds both regular and slim images, verifies them, then merges multi-platform manifests. Images are tagged with specific Airflow versions (e.g., 3.0.1, 3.0.1rc1) and optionally as 'latest'. Also logs into GHCR for intermediate operations. 
-- **Ecosystems**: docker_hub, ghcr -- **Trigger**: workflow_call -- **Auth**: DOCKERHUB_USER/DOCKERHUB_TOKEN secrets for Docker Hub, GITHUB_TOKEN for GHCR -- **Confidence**: high -- **Commands**: `breeze release-management release-prod-images`, `breeze release-management merge-prod-images` - -### apache/kafka - -**2** release/snapshot workflows | Ecosystems: **docker_hub** | Release Artifacts: 2 - -**`docker_promote.yml`** — Promote Release Candidate Docker Image [Release Artifacts] -- **Summary**: This workflow promotes Apache Kafka release candidate Docker images to final release versions on Docker Hub. It uses workflow_dispatch to manually trigger promotion, taking RC image names (e.g., apache/kafka:3.8.0-rc0) and promoted image names (e.g., apache/kafka:3.8.0) as inputs. The workflow authenticates to Docker Hub and uses docker buildx imagetools to copy/tag the RC image as the promoted release image. User inputs are safely passed through env variables before being [...] -- **Ecosystems**: docker_hub -- **Trigger**: workflow_dispatch -- **Auth**: secrets.DOCKERHUB_USER and secrets.DOCKERHUB_TOKEN -- **Confidence**: high -- **GitHub Actions**: `docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef` -- **Commands**: `docker buildx imagetools create --tag $PROMOTED_DOCKER_IMAGE $RC_DOCKER_IMAGE` - -**`docker_rc_release.yml`** — Build and Push Release Candidate Docker Image [Release Artifacts] -- **Summary**: This workflow builds and publishes Apache Kafka release candidate Docker images to Docker Hub. It supports both JVM and native image types, is manually triggered via workflow_dispatch, and uses a Python script (docker_release.py) to build and push multi-architecture images (via QEMU and Docker Buildx) to apache/kafka or apache/kafka-native repositories on Docker Hub. 
-- **Ecosystems**: docker_hub
-- **Trigger**: workflow_dispatch
-- **Auth**: secrets.DOCKERHUB_USER and secrets.DOCKERHUB_TOKEN
-- **Confidence**: high
-- **GitHub Actions**: `docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef`
-- **Commands**: `python docker/docker_release.py $RC_DOCKER_IMAGE --kafka-url $KAFKA_URL --image-type $IMAGE_TYPE`
-
-### apache/spark
-
-**2** release/snapshot workflows | Ecosystems: **apache_dist, maven_central, pypi** | Release Artifacts: 1, Snapshot / Nightly Artifacts: 1
-
-**`release.yml`** — Release Apache Spark [Release Artifacts]
-- **Summary**: This workflow orchestrates the Apache Spark release process, publishing release artifacts to Apache Distribution SVN (apache_dist), Maven Central (via ASF Nexus), and PyPI. It supports both RC creation and finalization modes. The workflow calls dev/create-release/do-release-docker.sh which handles the actual publishing. It includes dry-run capability and is designed to run in forked repositories with manual dispatch. The finalize mode converts RC artifacts to official rele [...]
-- **Ecosystems**: apache_dist, maven_central, pypi
-- **Trigger**: workflow_dispatch with inputs for branch, release-version, rc-count, and finalize; also scheduled cron
-- **Auth**: ASF credentials (ASF_USERNAME, ASF_PASSWORD, ASF_NEXUS_TOKEN), GPG key signing (GPG_PRIVATE_KEY, GPG_PASSPHRASE), PyPI API token (PYPI_API_TOKEN)
-- **Confidence**: high
-- **Commands**: `dev/create-release/do-release-docker.sh`
-
-**`publish_snapshot.yml`** — Publish snapshot [Snapshot / Nightly Artifacts]
-- **Summary**: Publishes Apache Spark snapshot builds to ASF Nexus repository on a daily schedule for multiple branches (master, branch-4.1, branch-4.0, branch-3.5). Uses Maven to build and deploy snapshot artifacts with ASF Nexus authentication.
-- **Ecosystems**: maven_central
-- **Trigger**: schedule (daily cron) and workflow_dispatch
-- **Auth**: ASF Nexus credentials (NEXUS_USER, NEXUS_PW, NEXUS_TOKEN) stored in GitHub secrets
-- **Confidence**: high
-- **Commands**: `./dev/create-release/release-build.sh publish-snapshot`
-
----
-
-*Cached in `ci-classification:apache`. Set `clear_cache` to `true` to force a fresh scan. Raw YAML stored in `ci-workflows:apache`.*
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
