This is an automated email from the ASF dual-hosted git repository.

akm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tooling-agents.git


The following commit(s) were added to refs/heads/main by this push:
     new 3092160  Improvements to deduping
3092160 is described below

commit 30921603fa9b556b710b1c8faaca58cfbabc5387
Author: Andrew Musselman <[email protected]>
AuthorDate: Tue Mar 31 17:06:44 2026 -0700

    Improvements to deduping
---
 .../code.py                                        | 141 ++++++++++++++++-----
 1 file changed, 112 insertions(+), 29 deletions(-)

diff --git a/repos/tooling-trusted-releases/ASVS/agents/consolidate_asvs_security_audit_reports/code.py b/repos/tooling-trusted-releases/ASVS/agents/consolidate_asvs_security_audit_reports/code.py
index 8252370..fb0bbd7 100644
--- a/repos/tooling-trusted-releases/ASVS/agents/consolidate_asvs_security_audit_reports/code.py
+++ b/repos/tooling-trusted-releases/ASVS/agents/consolidate_asvs_security_audit_reports/code.py
@@ -955,35 +955,51 @@ Return ONLY valid JSON."""
                 by_func.setdefault(fn, set()).add(gid)
 
         # Assign cross-references using hard rules
+        # Same file ALONE is too broad (atr/api/__init__.py has 20 findings).
+        # Require same-file AND (same-CWE or same-function), or same-CWE alone, or same-function alone.
         xref_count = 0
         for finding in all_findings:
             gid = finding["global_id"]
             related = set()
 
-            # Rule 1: same primary file
             pf = extract_primary_file(finding)
-            if pf and pf in by_file:
-                related |= by_file[pf]
-
-            # Rule 2: same CWE
             cwe = finding.get("cwe", "")
-            if cwe and cwe != "null" and cwe in by_cwe:
+            if cwe == "null":
+                cwe = ""
+            funcs = extract_function_names(finding)
+
+            # Rule 1: same CWE (regardless of file)
+            if cwe and cwe in by_cwe:
                 related |= by_cwe[cwe]
 
-            # Rule 3: same function name
-            for fn in extract_function_names(finding):
+            # Rule 2: same function name (regardless of file)
+            for fn in funcs:
                 if fn in by_func:
                     related |= by_func[fn]
 
+            # Rule 3: same primary file AND (same CWE or same function)
+            if pf and pf in by_file:
+                same_file_ids = by_file[pf]
+                for other_id in same_file_ids:
+                    if other_id == gid:
+                        continue
+                    other = next((f for f in all_findings if f["global_id"] == other_id), None)
+                    if not other:
+                        continue
+                    other_cwe = other.get("cwe", "")
+                    if other_cwe == "null":
+                        other_cwe = ""
+                    other_funcs = extract_function_names(other)
+                    # Must share CWE or function, not just file
+                    if (cwe and other_cwe and cwe == other_cwe) or (funcs & other_funcs):
+                        related.add(other_id)
+
             # Remove self-reference
             related.discard(gid)
 
-            # Cap at 10 most relevant (same-file first, then same-CWE, then same-func)
+            # Cap at 10
             if len(related) > 10:
-                # Prioritize same-file references
-                same_file = by_file.get(pf, set()) - {gid} if pf else set()
-                others = related - same_file
-                related = same_file | set(list(others)[:10 - len(same_file)])
+                related = set(sorted(related)[:10])
 
             finding["related_findings"] = sorted(related) if related else []
             if related:
@@ -1152,7 +1168,7 @@ Findings data:
 
 Output ONLY Markdown. Include ALL {len(sub_batch)} findings with full 
detail."""
 
-                for attempt in range(2):
+                for attempt in range(3):
                     try:
                         sub_result, _ = await call_llm(
                             provider=FAST_PROVIDER,
@@ -1161,16 +1177,23 @@ Output ONLY Markdown. Include ALL {len(sub_batch)} 
findings with full detail."""
                             parameters={**FAST_PARAMS, "max_tokens": 64000},
                             timeout=900,
                         )
-                        findings_md_parts.append(sub_result)
                         sub_count = len(re.findall(r'#### FINDING-\d{3}', sub_result))
                         print(f"    Batch {sb_idx+1}: {sub_count}/{len(sub_batch)} sections generated")
-                        break
+                        if sub_count >= len(sub_batch):
+                            findings_md_parts.append(sub_result)
+                            break
+                        elif attempt < 2:
+                            print(f"    Missing {len(sub_batch) - sub_count} sections, retrying...", flush=True)
+                            await asyncio.sleep(5)
+                        else:
+                            print(f"    Accepting {sub_count}/{len(sub_batch)} after 3 attempts")
+                            findings_md_parts.append(sub_result)
                     except Exception as e:
-                        if attempt == 0:
-                            print(f"    Batch {sb_idx+1} attempt 1 failed ({type(e).__name__}), retrying...", flush=True)
+                        if attempt < 2:
+                            print(f"    Batch {sb_idx+1} attempt {attempt+1} failed ({type(e).__name__}), retrying...", flush=True)
                             await asyncio.sleep(5)
                         else:
-                            print(f"    Batch {sb_idx+1} FAILED after 2 attempts: {e}")
+                            print(f"    Batch {sb_idx+1} FAILED after 3 attempts: {e}")
 
         # ----------------------------------------------------------
         # Part 3: Tail sections (Sonnet)
@@ -1205,10 +1228,9 @@ Table with columns: ASVS Section | Section Title | Level 
| Status (✅ Pass / 
 Table grouping findings by attack surface with finding IDs organized by 
severity.
 Columns: Attack Surface | ASVS Levels | Critical | High | Medium | Low | Info
 
-## 7. Level Coverage Analysis
-Table showing: Level | Sections Audited | Findings Found | Unique Findings 
(not found in other levels)
+Do NOT generate a Level Coverage section — it will be added separately.
 
-End with: `*End of Consolidated Security Audit Report*`
+End with nothing — more content will be appended after this.
 
 Output ONLY Markdown."""
 
@@ -1238,6 +1260,25 @@ Output ONLY Markdown."""
         consolidated_md += "\n\n---\n\n"
         consolidated_md += tail_result
 
+        # Generate Section 7 deterministically (not via LLM — avoids 
hallucinated numbers)
+        print("Generating Section 7 (Level Coverage) deterministically...")
+        section7_lines = ["\n\n## 7. Level Coverage Analysis\n"]
+        section7_lines.append("| Level | Sections Audited | Findings Found | Unique to Level |")
+        section7_lines.append("|-------|-----------------|----------------|-----------------|")
+        for lv in sorted(set(dir_levels.values())):
+            lv_sections = sum(1 for rk in all_extracted if report_levels.get(rk) == lv)
+            lv_findings = [f for f in all_findings if lv in f.get("asvs_levels", [])]
+            lv_only = [f for f in lv_findings if f.get("asvs_levels") == [lv]]
+            section7_lines.append(f"| {lv} | {lv_sections} | {len(lv_findings)} | {len(lv_only)} |")
+
+        # Both levels
+        both = [f for f in all_findings if len(f.get("asvs_levels", [])) > 1]
+        section7_lines.append(f"| Both | — | {len(both)} | — |")
+        section7_lines.append(f"\n**Total consolidated findings: {len(all_findings)}**")
+        section7_lines.append(f"\n*End of Consolidated Security Audit Report*")
+
+        consolidated_md += "\n".join(section7_lines)
+
         print(f"Consolidated report generated: {len(consolidated_md)} chars")
 
         # ============================================================
@@ -1245,8 +1286,30 @@ Output ONLY Markdown."""
         # ============================================================
         ISSUES_BATCH_SIZE = 75
 
-        actionable_findings = [f for f in all_findings if f.get("severity") != "Informational"]
-        total_batches = (len(actionable_findings) + ISSUES_BATCH_SIZE - 1) // ISSUES_BATCH_SIZE
+        # Filter out Informational findings for issues generation
+        # Robust check: severity field may contain emoji prefix from earlier 
formatting
+        informational_ids = set()
+        actionable_findings = []
+        for f in all_findings:
+            sev = f.get("severity", "").strip()
+            if "informational" in sev.lower():
+                informational_ids.add(f.get("global_id", ""))
+            else:
+                actionable_findings.append(f)
+
+        # Strip Informational finding IDs from related_findings in issues data
+        # so Sonnet doesn't reference or generate issues for them
+        issues_findings_data = []
+        for f in actionable_findings:
+            f_copy = dict(f)
+            if f_copy.get("related_findings"):
+                f_copy["related_findings"] = [
+                    r for r in f_copy["related_findings"]
+                    if r not in informational_ids
+                ]
+            issues_findings_data.append(f_copy)
+
+        total_batches = (len(issues_findings_data) + ISSUES_BATCH_SIZE - 1) // ISSUES_BATCH_SIZE
 
         print(f"\nGenerating {issues_filename} (Sonnet, {total_batches} 
batches)...")
         print(f"Actionable findings for issues: {len(actionable_findings)}")
@@ -1288,8 +1351,8 @@ Output ONLY Markdown."""
         issues_params = {**FAST_PARAMS}
         issues_params["max_tokens"] = 64000
 
-        for batch_idx in range(0, len(actionable_findings), ISSUES_BATCH_SIZE):
-            batch = actionable_findings[batch_idx:batch_idx + ISSUES_BATCH_SIZE]
+        for batch_idx in range(0, len(issues_findings_data), ISSUES_BATCH_SIZE):
+            batch = issues_findings_data[batch_idx:batch_idx + ISSUES_BATCH_SIZE]
             batch_num = (batch_idx // ISSUES_BATCH_SIZE) + 1
             is_first_batch = batch_idx == 0
 
@@ -1399,8 +1462,19 @@ Affected files: {files}
         # ============================================================
         print("\n=== Quality Checks ===")
 
+        # Check for duplicate FINDING IDs in the report
+        all_finding_ids = re.findall(r'#### (FINDING-\d{3}):', consolidated_md)
+        id_counts = {}
+        for fid in all_finding_ids:
+            id_counts[fid] = id_counts.get(fid, 0) + 1
+        duplicate_ids = {fid: cnt for fid, cnt in id_counts.items() if cnt > 1}
+        if duplicate_ids:
+            print(f"WARNING: Duplicate FINDING IDs detected: {duplicate_ids}")
+        else:
+            print(f"FINDING ID uniqueness: OK ({len(all_finding_ids)} unique 
IDs)")
+
         finding_ids_in_report = set(re.findall(r'FINDING-\d{3}', 
consolidated_md))
-        print(f"FINDING IDs in consolidated report: 
{len(finding_ids_in_report)} (expected: {len(all_findings)})")
+        print(f"FINDING IDs mentioned in consolidated report: 
{len(finding_ids_in_report)}")
 
         finding_sections_in_report = set(re.findall(r'#### FINDING-(\d{3})', 
consolidated_md))
         print(f"FINDING sections with full detail: 
{len(finding_sections_in_report)} (expected: {len(all_findings)})")
@@ -1408,8 +1482,17 @@ Affected files: {files}
             missing_sections = set(f"{i:03d}" for i in range(1, 
len(all_findings) + 1)) - finding_sections_in_report
             print(f"  Missing sections for: {', '.join(f'FINDING-{m}' for m in 
sorted(missing_sections)[:10])}{'...' if len(missing_sections) > 10 else ''}")
 
-        issue_ids_in_issues = set(re.findall(r'FINDING-\d{3}', issues_md))
-        print(f"FINDING IDs in issues: {len(issue_ids_in_issues)} (expected: 
{len(actionable_findings)})")
+        # Count actual issue entries (headers), not just FINDING ID mentions
+        issue_headers_in_issues = set(re.findall(r'## Issue: (FINDING-\d{3})', 
issues_md))
+        issue_ids_mentioned = set(re.findall(r'FINDING-\d{3}', issues_md))
+        print(f"Issue entries in issues file: {len(issue_headers_in_issues)} 
(expected: {len(actionable_findings)})")
+        if len(issue_headers_in_issues) != len(actionable_findings):
+            extra = issue_headers_in_issues - set(f.get("global_id", "") for f 
in actionable_findings)
+            missing = set(f.get("global_id", "") for f in actionable_findings) 
- issue_headers_in_issues
+            if extra:
+                print(f"  Extra issues (should not exist): 
{sorted(extra)[:10]}")
+            if missing:
+                print(f"  Missing issues: {sorted(missing)[:10]}")
 
         # Per-level finding counts
         print(f"\n--- Level Analysis ---")


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to