This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow-steward.git
The following commit(s) were added to refs/heads/main by this push:
new 0931bd8 skill-evals: field-aware grading, intersection key matching,
wrap non-JSON output (#370)
0931bd8 is described below
commit 0931bd88857d0f1e4b0076c8fe2700e74a63b277
Author: Justin Mclean <[email protected]>
AuthorDate: Sat May 30 08:42:05 2026 +1000
skill-evals: field-aware grading, intersection key matching, wrap non-JSON
output (#370)
* add grading
* runner improvements so CLI runs work
---
tools/skill-evals/README.md | 76 +++-
tools/skill-evals/src/skill_evals/runner.py | 437 ++++++++++++++++++--
tools/skill-evals/tests/_grader_count.py | 34 ++
tools/skill-evals/tests/_grader_no.py | 28 ++
tools/skill-evals/tests/_grader_yes.py | 30 ++
tools/skill-evals/tests/test_runner.py | 591 +++++++++++++++++++++++++++-
6 files changed, 1164 insertions(+), 32 deletions(-)
diff --git a/tools/skill-evals/README.md b/tools/skill-evals/README.md
index 42fb01f..3360d5d 100644
--- a/tools/skill-evals/README.md
+++ b/tools/skill-evals/README.md
@@ -85,6 +85,18 @@ stdout as JSON, look for the first ```` ```json ```` fenced
block, then
the largest balanced `{...}` (or `[...]`) substring. Models that wrap
output in prose or markdown fences still work.
+If none of those strategies finds JSON, the runner silently wraps the
+raw stdout as `{"raw_output": <stdout>}` and proceeds with normal
+field-aware grading. Under the intersection-only comparator this means
+a model that refused to emit JSON (e.g. a prose-only refusal) will
+PASS any case whose `expected.json` doesn't declare a `raw_output`
+key. A non-zero exit from the CLI is wrapped the same way as
+`{"raw_output": <stdout>, "stderr": <stderr>, "exit_code": <rc>}`, so
+refusals that signal via exit code (some safety filters) also fall
+back to the comparator. Suite authors who want to gate on the prose
+can add `"raw_output": "<expected text>"` to their `expected.json`.
+In `--exact` mode, non-JSON and non-zero exits still ERROR.
+
**Structural cases (composition steps).** When `expected.json` describes
prose properties via boolean flags (`has_security_model_quote`,
`has_bare_issue_numbers`) or membership lists (`mention_handles`),
@@ -92,6 +104,66 @@ automatic JSON-equality comparison is meaningless. Those
cases report
`MANUAL` and the runner skips the CLI call; review them by re-running
without `--cli` (or with `--verbose`).
+### Field-aware grading (default in `--cli` mode)
+
+Pure JSON-equality on `expected.json` is too strict for free-text fields
+like `rationale`, `reason`, `drop_reason`, and `blockers`: a candidate
+answer can carry the right decision but be flagged FAIL on wording
+alone. By default, `--cli` mode now sends those fields to a cheap judge
+model and grades them by meaning instead of by string equality.
+
+```bash
+# Decision fields exact; prose fields graded by the default Haiku judge.
+PYTHONPATH=tools/skill-evals/src python3 -m skill_evals.runner --cli "claude
-p" \
+ tools/skill-evals/evals/issue-triage/
+
+# Use a different judge.
+PYTHONPATH=tools/skill-evals/src python3 -m skill_evals.runner \
+ --cli "claude -p" \
+ --grader-cli "llm -m gpt-4o-mini" \
+ tools/skill-evals/evals/issue-triage/
+
+# Opt out: require verbatim JSON equality on every field (old behaviour).
+PYTHONPATH=tools/skill-evals/src python3 -m skill_evals.runner --cli "claude
-p" --exact \
+ tools/skill-evals/evals/issue-triage/
+```
+
+Decision fields (booleans, enums, counts, ordering, IDs) stay on exact
+equality. `expected.json` is treated as a description of values where
+the model speaks: only keys present in **both** expected and actual
+are asserted. Extra keys in the model's output are ignored, and keys
+declared in expected that the model didn't emit are skipped (not
+failed). Suite authors should keep expected.json focused on the keys
+that actually carry the eval's signal, since a model returning `{}`
+would match any expected. All prose-field mismatches for a single
+case are batched into one rubric prompt and sent to the grader as a
+single call (so a case with N prose-field mismatches costs one Haiku
+call, not N). The grader returns a one-line JSON object mapping each
+field path to `{"match": bool, "reason": str}`. A case passes when
+every asserted decision field matches exactly and every asserted
+prose field returns `match: true`. When a decision field already
+fails, the grader is not called at all for that case.
+
+The default grader is `claude -p --model haiku`. Override with
+`--grader-cli "<command>"` (any shell command that reads stdin and
+writes stdout works). Pass `--exact` to disable grading entirely.
+
+The default prose-field set is `rationale`, `reason`, `reasons`,
+`drop_reason`, `blockers`, `notes`, `summary`, `explanation`,
+`details`, `description`. Override it per fixtures dir by placing a
+`grading-schema.json` next to `step-config.json`:
+
+```json
+{
+ "prose_fields": ["rationale", "drop_reason"]
+}
+```
+
+An empty list (`"prose_fields": []`) makes every field decision-graded,
+so the grader is never called for that fixtures dir. Unlike `--exact`,
+keys are still matched on the intersection only. The grader is called
+fresh on every run; nothing is cached.
+
**Self-eval caveat.** When the model invoked by `--cli` is the same
model (or model class) that just authored the skill change, the
comparison is a self-eval pass — useful as a smoke test for prompt /
@@ -146,7 +218,9 @@ This means:
## Assertion approach
-Most steps assert an exact JSON match against `expected.json`. Composition
steps — where the model writes prose (e.g. a GitHub triage proposal comment) —
use structural assertions instead. The expected JSON contains boolean flags
like `has_security_model_quote` and `has_bare_issue_numbers` and a
`mention_handles` list, rather than requiring prose to match verbatim. This
avoids brittle string comparison while still catching the properties that
matter.
+Most steps assert an exact JSON match against `expected.json`. Composition
steps, where the model writes prose (e.g. a GitHub triage proposal comment),
use structural assertions instead. The expected JSON contains boolean flags
like `has_security_model_quote` and `has_bare_issue_numbers` and a
`mention_handles` list, rather than requiring prose to match verbatim. This
avoids brittle string comparison while still catching the properties that
matter.
+
+For everything in between (decisions wrapped in explanatory prose like
`rationale` or `reason`), `--grader-cli` adds a third mode: decision fields
stay on exact equality, prose fields go to a cheap judge model that scores
"does the candidate support the same conclusion?" See the "Field-aware grading"
section above.
## CI considerations
diff --git a/tools/skill-evals/src/skill_evals/runner.py
b/tools/skill-evals/src/skill_evals/runner.py
index 5da34e9..42ed0eb 100644
--- a/tools/skill-evals/src/skill_evals/runner.py
+++ b/tools/skill-evals/src/skill_evals/runner.py
@@ -33,6 +33,17 @@ Two modes:
``has_*`` flags or ``mention_*`` lists) where automatic comparison
is not meaningful; those still print prompts for manual review.
+ By default, free-text fields (rationale, reason, drop_reason,
+ blockers, etc.) are graded by piping a short rubric prompt to a
+ cheap judge model (``claude -p --model haiku`` by default) and
+ parsing ``{"match": bool, "reason": str}``. Decision fields
+ (booleans, enums, counts, ordering, IDs) are still compared
+ exactly. Override the grader command with ``--grader-cli``, or pass
+ ``--exact`` to disable grading and require verbatim equality on
+ every field. The set of prose fields defaults to a built-in list
+ and can be overridden per fixtures dir via ``grading-schema.json``.
+ No caching: every mismatching prose field is re-sent to the grader on every run.
+
Usage:
# Print prompts for all cases under a fixtures directory
uv run --project tools/skill-evals skill-eval \\
@@ -42,10 +53,20 @@ Usage:
uv run --project tools/skill-evals skill-eval \\
evals/security-issue-import/step-2a-semantic-sweep/fixtures/case-1-clear-duplicate
- # Automated comparison against a CLI (Claude Code shown; any LLM CLI
- # that reads a prompt on stdin and writes the response to stdout works)
+ # Automated comparison against a CLI. Decision fields are graded
+ # exact; prose fields go to the default grader (claude -p --model haiku).
uv run --project tools/skill-evals skill-eval --cli "claude -p" \\
evals/security-issue-import/step-2a-semantic-sweep/fixtures/
+
+ # Override the grader, e.g. to use a different cheap model.
+ uv run --project tools/skill-evals skill-eval \\
+ --cli "claude -p" \\
+ --grader-cli "llm -m gpt-4o-mini" \\
+ evals/security-issue-import/step-2a-semantic-sweep/fixtures/
+
+ # Disable the grader and require verbatim JSON equality on every field.
+ uv run --project tools/skill-evals skill-eval --cli "claude -p" --exact \\
+ evals/security-issue-import/step-2a-semantic-sweep/fixtures/
"""
from __future__ import annotations
@@ -301,6 +322,301 @@ def compare_outputs(actual: object, expected: object) ->
tuple[bool, str]:
return False, _format_diff(actual, expected)
+# ---------------------------------------------------------------------------
+# Field-aware grading (--grader-cli mode)
+# ---------------------------------------------------------------------------
+
+# Default grader shell command. Used when --cli is set and --exact is not.
+# Haiku is the cheapest Claude model; the rubric is small so cost is minimal.
+DEFAULT_GRADER_CLI: str = "claude -p --model haiku"
+
+
+# Keys whose values are treated as prose by default. The runner sends these
+# to the grader CLI for a soft "does the candidate support the same
+# conclusion?" judgement instead of requiring verbatim string equality.
+# A per-fixtures-dir ``grading-schema.json`` can replace this list.
+DEFAULT_PROSE_FIELDS: frozenset[str] = frozenset(
+ {
+ "rationale",
+ "reason",
+ "reasons",
+ "drop_reason",
+ "blockers",
+ "notes",
+ "summary",
+ "explanation",
+ "details",
+ "description",
+ }
+)
+
+
+GRADER_RUBRIC = """\
+You are grading one field of a model's structured answer against a reference
answer.
+
+Field path: {field_path}
+
+Expected value:
+{expected_value}
+
+Candidate value:
+{candidate_value}
+
+Does the candidate value support the same conclusion as the expected value?
Ignore wording differences and reorderings. Reply with one line of JSON only,
no prose: {{"match": true, "reason": "<one-line explanation>"}} or {{"match":
false, "reason": "<one-line explanation>"}}.
+"""
+
+
+BATCH_GRADER_RUBRIC = """\
+You are grading a model's structured answer against a reference answer, field
by field.
+
+For each (Field, Expected, Candidate) triple below, decide whether the
candidate value supports the same conclusion as the expected value. Ignore
wording differences and reorderings.
+
+{fields_block}
+
+Reply with one line of JSON only, no prose. The JSON is an object mapping each
field path string to {{"match": true|false, "reason": "<one-line
explanation>"}}. Include every field listed above. Example:
+{{"$.foo": {{"match": true, "reason": "same conclusion"}}, "$.bar": {{"match":
false, "reason": "different verdict"}}}}
+"""
+
+
+def load_grading_schema(fixtures_dir: Path) -> set[str]:
+ """Return the set of prose field names for cases in this fixtures dir.
+
+ Reads ``fixtures_dir/grading-schema.json`` when present. The file may
+ set ``prose_fields`` to a string list that *replaces* the default set
+ (use ``["rationale", "reason", ...]`` to be explicit, or ``[]`` to
+ grade everything by exact match).
+
+ Falls back to :data:`DEFAULT_PROSE_FIELDS` when no schema file exists.
+ """
+ path = fixtures_dir / "grading-schema.json"
+ if not path.exists():
+ return set(DEFAULT_PROSE_FIELDS)
+ data = json.loads(path.read_text())
+ fields = data.get("prose_fields")
+ if fields is None:
+ return set(DEFAULT_PROSE_FIELDS)
+ if not isinstance(fields, list) or not all(isinstance(f, str) for f in
fields):
+ raise ValueError(f"{path} must contain a string-list 'prose_fields'
field")
+ return set(fields)
+
+
+def _render_field_value(value: object) -> str:
+ """Render an expected/candidate field value for the grader prompt."""
+ if isinstance(value, str):
+ return value
+ return json.dumps(value, indent=2, sort_keys=True, ensure_ascii=False)
+
+
+def grade_prose_field(
+ field_path: str,
+ expected_value: object,
+ actual_value: object,
+ grader_cli: str,
+ timeout: int,
+) -> tuple[bool, str]:
+ """Ask the grader CLI whether the candidate value supports the same
conclusion.
+
+ Returns ``(match, note)``. ``note`` is empty on match and a one-line
+ summary on mismatch (or grader failure).
+ """
+ if expected_value == actual_value:
+ return True, ""
+ prompt = GRADER_RUBRIC.format(
+ field_path=field_path,
+ expected_value=_render_field_value(expected_value),
+ candidate_value=_render_field_value(actual_value),
+ )
+ try:
+ stdout, stderr, rc = run_cli(grader_cli, prompt, timeout=timeout)
+ except subprocess.TimeoutExpired:
+ return False, f"{field_path}: grader CLI timed out after {timeout}s"
+ except OSError as exc:
+ return False, f"{field_path}: grader CLI invocation failed ({exc})"
+ if rc != 0:
+ return False, f"{field_path}: grader CLI exited {rc}
({stderr.strip()[:200]})"
+ verdict, err = extract_json_from_output(stdout)
+ if err is not None or not isinstance(verdict, dict) or "match" not in
verdict:
+ return False, f"{field_path}: grader returned unusable output ({err or
'missing match key'})"
+ match = bool(verdict.get("match"))
+ reason = str(verdict.get("reason", "")).strip()
+ if match:
+ return True, ""
+ return False, f"{field_path}: grader says NO ({reason or 'no reason
given'})"
+
+
+def collect_diffs(
+ actual: object,
+ expected: object,
+ *,
+ prose_fields: set[str],
+ path: str = "$",
+) -> tuple[list[str], list[tuple[str, object, object]]]:
+ """Walk both trees in parallel; return (decision_msgs, prose_pairs).
+
+ ``decision_msgs`` lists structural/decision-field mismatches (type, key
+ set, list length, scalar inequality on non-prose keys). These cannot
+ be resolved by the grader. ``prose_pairs`` lists
+ ``(field_path, expected_value, actual_value)`` for prose-keyed
+ mismatches that the grader should judge. Equal values are omitted from
+ both lists.
+ """
+ if type(actual) is not type(expected):
+ return [
+ f"{path}: type mismatch (actual={type(actual).__name__},
expected={type(expected).__name__})"
+ ], []
+
+ if isinstance(expected, dict):
+ actual_dict = actual # type: ignore[assignment]
+ # Only assert on the intersection of keys. Keys in expected that the
+ # model didn't emit are skipped (not failed), and keys in actual that
+ # expected doesn't declare are ignored. expected.json describes what
+ # the model's answer SHOULD say where it speaks, not what it must
+ # include.
+ decision_msgs: list[str] = []
+ prose_pairs: list[tuple[str, object, object]] = []
+ for key in expected:
+ if key not in actual_dict:
+ continue
+ child_path = f"{path}.{key}" if path else key
+ if key in prose_fields:
+ if expected[key] != actual_dict[key]:
+ prose_pairs.append((child_path, expected[key],
actual_dict[key]))
+ else:
+ sub_d, sub_p = collect_diffs(
+ actual_dict[key],
+ expected[key],
+ prose_fields=prose_fields,
+ path=child_path,
+ )
+ decision_msgs.extend(sub_d)
+ prose_pairs.extend(sub_p)
+ return decision_msgs, prose_pairs
+
+ if isinstance(expected, list):
+ actual_list = actual # type: ignore[assignment]
+ if len(actual_list) != len(expected):
+ return [
+ f"{path}: length mismatch (actual={len(actual_list)},
expected={len(expected)})"
+ ], []
+ decision_msgs = []
+ prose_pairs = []
+ for i, (a_item, e_item) in enumerate(zip(actual_list, expected)):
+ sub_d, sub_p = collect_diffs(
+ a_item,
+ e_item,
+ prose_fields=prose_fields,
+ path=f"{path}[{i}]",
+ )
+ decision_msgs.extend(sub_d)
+ prose_pairs.extend(sub_p)
+ return decision_msgs, prose_pairs
+
+ if actual == expected:
+ return [], []
+ return [f"{path}: expected={expected!r}, actual={actual!r}"], []
+
+
+def _format_batch_fields_block(pairs: list[tuple[str, object, object]]) -> str:
+ chunks = []
+ for path, expected, actual in pairs:
+ chunks.append(
+ f"Field:
{path}\nExpected:\n{_render_field_value(expected)}\nCandidate:\n{_render_field_value(actual)}"
+ )
+ return "\n\n".join(chunks)
+
+
+def batch_grade_prose_fields(
+ pairs: list[tuple[str, object, object]],
+ grader_cli: str,
+ timeout: int,
+) -> dict[str, tuple[bool, str]]:
+ """Send one rubric prompt covering every pair; return path -> (match,
note).
+
+ Returns an empty dict when ``pairs`` is empty (no grader call). On grader
+ failure (timeout, OSError, non-zero exit, unparsable output, missing
+ path in the verdict), every pair without a clean verdict is returned as
+ ``(False, <one-line explanation>)``.
+ """
+ if not pairs:
+ return {}
+ prompt =
BATCH_GRADER_RUBRIC.format(fields_block=_format_batch_fields_block(pairs))
+ try:
+ stdout, stderr, rc = run_cli(grader_cli, prompt, timeout=timeout)
+ except subprocess.TimeoutExpired:
+ return {p: (False, f"grader CLI timed out after {timeout}s") for p, _,
_ in pairs}
+ except OSError as exc:
+ return {p: (False, f"grader CLI invocation failed ({exc})") for p, _,
_ in pairs}
+ if rc != 0:
+ return {
+ p: (False, f"grader CLI exited {rc} ({stderr.strip()[:200]})")
+ for p, _, _ in pairs
+ }
+ verdict, err = extract_json_from_output(stdout)
+ if err is not None or not isinstance(verdict, dict):
+ return {
+ p: (False, f"grader returned unusable output ({err or 'not a
dict'})")
+ for p, _, _ in pairs
+ }
+ result: dict[str, tuple[bool, str]] = {}
+ for path, _, _ in pairs:
+ entry = verdict.get(path)
+ if not isinstance(entry, dict) or "match" not in entry:
+ result[path] = (False, f"grader did not return a verdict for
{path}")
+ continue
+ match = bool(entry.get("match"))
+ reason = str(entry.get("reason", "")).strip()
+ if match:
+ result[path] = (True, "")
+ else:
+ result[path] = (False, f"grader says NO ({reason or 'no reason
given'})")
+ return result
+
+
+def compare_with_grader(
+ actual: object,
+ expected: object,
+ *,
+ prose_fields: set[str],
+ grader_cli: str,
+ timeout: int,
+) -> tuple[bool, list[str]]:
+ """Field-aware comparison: decision keys exact, prose keys judged by
grader.
+
+ Walks both trees once to separate decision-field diffs from prose-field
+ diffs, then makes a single batched grader call covering every prose
+ mismatch. If decision fields already fail the comparison, the grader
+ is skipped entirely (one fewer CLI call per failing case).
+
+ Returns ``(ok, messages)``; ``messages`` is empty when ok and otherwise
+ lists one note per failing field.
+ """
+ decision_msgs, prose_pairs = collect_diffs(
+ actual, expected, prose_fields=prose_fields
+ )
+ if decision_msgs:
+ # Case already fails on a decision field; no need to call the grader.
+ return False, decision_msgs
+ if not prose_pairs:
+ return True, []
+ grades = batch_grade_prose_fields(prose_pairs, grader_cli, timeout)
+ ok = True
+ msgs: list[str] = []
+ for path, _, _ in prose_pairs:
+ match, note = grades.get(
+ path, (False, f"{path}: no verdict returned by grader")
+ )
+ if not match:
+ ok = False
+ # Notes from batch_grade_prose_fields never start with the field path
+ # (only the fallback above does), so prepend the path where it is
+ # missing to keep every message field-attributed.
+ if note.startswith(path):
+ msgs.append(note)
+ else:
+ msgs.append(f"{path}: {note}")
+ return ok, msgs
+
+
def _format_diff(actual: object, expected: object) -> str:
actual_text = json.dumps(actual, indent=2, sort_keys=True)
expected_text = json.dumps(expected, indent=2, sort_keys=True)
@@ -409,6 +725,34 @@ def main(argv: list[str] | None = None) -> int:
default=120,
help="Timeout in seconds for each --cli invocation (default: 120).",
)
+ parser.add_argument(
+ "--grader-cli",
+ type=str,
+ default=DEFAULT_GRADER_CLI,
+ help=(
+ "Shell command for a cheap judge model that grades free-text "
+ "fields (rationale, reason, drop_reason, blockers, etc.). "
+ "Prose fields are compared via a rubric prompt instead of "
+ "exact equality; decision fields stay on exact compare. The set "
+ "of prose fields is the runner's default plus any per-fixtures "
+ "grading-schema.json overrides. Requires --cli. Default: "
+ f"'{DEFAULT_GRADER_CLI}'. Pass --exact to disable grading."
+ ),
+ )
+ parser.add_argument(
+ "--exact",
+ action="store_true",
+ help=(
+ "Disable the field-aware grader and require verbatim JSON "
+ "equality on every field (the runner's pre-grader behaviour)."
+ ),
+ )
+ parser.add_argument(
+ "--grader-timeout",
+ type=int,
+ default=60,
+ help="Timeout in seconds for each --grader-cli invocation (default:
60).",
+ )
parser.add_argument(
"--verbose",
"-v",
@@ -426,6 +770,10 @@ def main(argv: list[str] | None = None) -> int:
)
args = parser.parse_args(argv)
+ grader_explicit = args.grader_cli != DEFAULT_GRADER_CLI
+ if args.cli is None and (grader_explicit or args.exact):
+ parser.error("--grader-cli and --exact require --cli")
+
cases = find_cases(args.path)
if args.tag:
requested_tags = set(args.tag)
@@ -442,6 +790,8 @@ def main(argv: list[str] | None = None) -> int:
# Cache loaded step configs so we don't re-read prompts for every case in
# the same fixtures dir (common when running a whole skill at once).
_step_config_cache: dict[Path, tuple[str, str]] = {}
+ # Cache the prose-field schema per fixtures dir (config only, not grader
results).
+ _grading_schema_cache: dict[Path, set[str]] = {}
passed = failed = manual = errored = 0
@@ -502,30 +852,67 @@ def main(argv: list[str] | None = None) -> int:
continue
if rc != 0:
- print(f"ERROR {case_label} (CLI exited {rc}; stderr:
{stderr.strip()[:200]})")
- errored += 1
- if args.verbose:
- print("--- STDOUT ---")
- print(stdout)
- continue
-
- actual, parse_err = extract_json_from_output(stdout)
- if parse_err is not None:
- print(f"ERROR {case_label} ({parse_err})")
- errored += 1
- if args.verbose:
- print("--- STDOUT ---")
- print(stdout)
- continue
-
- ok, diff = compare_outputs(actual, expected)
- if ok:
- print(f"PASS {case_label}")
- passed += 1
+ if args.exact:
+ print(
+ f"ERROR {case_label} (CLI exited {rc}; stderr:
{stderr.strip()[:200]})"
+ )
+ errored += 1
+ if args.verbose:
+ print("--- STDOUT ---")
+ print(stdout)
+ continue
+ # Field-aware mode: a non-zero exit (often a refusal or a CLI
+ # safety filter) is wrapped just like a no-JSON case. The
+ # intersection-only comparator decides whether this case still
+ # passes based on the keys expected.json declares. Wrap is a
+ # silent implementation detail — the case still reports as
+ # PASS or FAIL like any other.
+ actual = {"raw_output": stdout, "stderr": stderr, "exit_code": rc}
+ else:
+ actual, parse_err = extract_json_from_output(stdout)
+ if parse_err is not None:
+ if args.exact:
+ # Exact mode requires literal JSON; non-JSON is an error.
+ print(f"ERROR {case_label} ({parse_err})")
+ errored += 1
+ if args.verbose:
+ print("--- STDOUT ---")
+ print(stdout)
+ continue
+ # Field-aware mode: wrap the prose as a synthetic object so
+ # the intersection-only comparator can proceed. A model that
+ # produced prose-only output will PASS unless expected.json
+ # asserts on `raw_output`.
+ actual = {"raw_output": stdout}
+
+ if not args.exact:
+ if fixtures_dir not in _grading_schema_cache:
+ _grading_schema_cache[fixtures_dir] =
load_grading_schema(fixtures_dir)
+ prose_fields = _grading_schema_cache[fixtures_dir]
+ ok, notes = compare_with_grader(
+ actual,
+ expected,
+ prose_fields=prose_fields,
+ grader_cli=args.grader_cli,
+ timeout=args.grader_timeout,
+ )
+ if ok:
+ print(f"PASS {case_label}")
+ passed += 1
+ else:
+ print(f"FAIL {case_label}")
+ for note in notes:
+ print(f" {note}")
+ failed += 1
else:
- print(f"FAIL {case_label}")
- print(diff)
- failed += 1
+ ok, diff = compare_outputs(actual, expected)
+ if ok:
+ print(f"PASS {case_label}")
+ passed += 1
+ else:
+ print(f"FAIL {case_label}")
+ print(diff)
+ failed += 1
if args.verbose:
print("--- SYSTEM PROMPT ---")
diff --git a/tools/skill-evals/tests/_grader_count.py
b/tools/skill-evals/tests/_grader_count.py
new file mode 100644
index 0000000..dac5251
--- /dev/null
+++ b/tools/skill-evals/tests/_grader_count.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+"""Mock batched grader that records its invocation count to a file.
+
+Tests set ``GRADER_COUNTER_FILE`` to a temp path, then assert that exactly one
+batched grader call was made per case regardless of how many prose-field
+mismatches it contained.
+"""
+from __future__ import annotations
+
+import json
+import os
+import re
+import sys
+
+
+def main() -> None:
+ counter = os.environ["GRADER_COUNTER_FILE"]
+ with open(counter, "a") as f:
+ f.write("1\n")
+ paths = re.findall(r"^Field: (\S+)$", sys.stdin.read(), flags=re.MULTILINE)
+ print(json.dumps({p: {"match": True, "reason": "ok"} for p in paths}))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/skill-evals/tests/_grader_no.py
b/tools/skill-evals/tests/_grader_no.py
new file mode 100644
index 0000000..9ce1336
--- /dev/null
+++ b/tools/skill-evals/tests/_grader_no.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+"""Mock batched grader: returns match=false for every Field: path in stdin.
+
+Companion to ``_grader_yes.py`` for testing the runner's batched-grader path.
+"""
+from __future__ import annotations
+
+import json
+import re
+import sys
+
+
+def main() -> None:
+ paths = re.findall(r"^Field: (\S+)$", sys.stdin.read(), flags=re.MULTILINE)
+ print(json.dumps({p: {"match": False, "reason": "differs"} for p in
paths}))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/skill-evals/tests/_grader_yes.py
b/tools/skill-evals/tests/_grader_yes.py
new file mode 100644
index 0000000..831628c
--- /dev/null
+++ b/tools/skill-evals/tests/_grader_yes.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+"""Mock batched grader: returns match=true for every Field: path in stdin.
+
+Used by tests as a stand-in for ``claude -p --model haiku``. Reads the runner's
+batched rubric prompt on stdin, extracts every ``Field: <path>`` line, and
+emits a one-line JSON object mapping each path to ``{"match": true, ...}``.
+"""
+from __future__ import annotations
+
+import json
+import re
+import sys
+
+
+def main() -> None:
+ paths = re.findall(r"^Field: (\S+)$", sys.stdin.read(), flags=re.MULTILINE)
+ print(json.dumps({p: {"match": True, "reason": "ok"} for p in paths}))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/skill-evals/tests/test_runner.py
b/tools/skill-evals/tests/test_runner.py
index f6dbc14..dd95a44 100644
--- a/tools/skill-evals/tests/test_runner.py
+++ b/tools/skill-evals/tests/test_runner.py
@@ -25,20 +25,42 @@ from pathlib import Path
import pytest
from skill_evals.runner import (
+ DEFAULT_GRADER_CLI,
+ DEFAULT_PROSE_FIELDS,
+ batch_grade_prose_fields,
build_corpus_text,
build_roster_text,
+ collect_diffs,
compare_outputs,
+ compare_with_grader,
extract_json_from_output,
extract_skill_section,
find_cases,
find_repo_root,
+ grade_prose_field,
is_structural_expected,
load_case,
load_case_tags,
+ load_grading_schema,
load_step_config,
main,
)
+_TESTS_DIR = Path(__file__).resolve().parent
+_GRADER_YES = f"python3 {_TESTS_DIR / '_grader_yes.py'}"
+_GRADER_NO = f"python3 {_TESTS_DIR / '_grader_no.py'}"
+
+
+def _grader_count_cli(counter_path: Path) -> str:
+ """Return a grader-cli string that records each call to
``counter_path``."""
+ return f"GRADER_COUNTER_FILE={counter_path} python3 {_TESTS_DIR /
'_grader_count.py'}"
+
+
+def _count_grader_calls(counter_path: Path) -> int:
+ if not counter_path.exists():
+ return 0
+ return sum(1 for _ in counter_path.read_text().splitlines() if _)
+
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
@@ -745,26 +767,92 @@ def test_cli_mode_manual_skips_structural(tmp_path: Path,
capsys: pytest.Capture
assert "1 manual" in stdout
-def test_cli_mode_error_on_non_json_output(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
- """A CLI that returns prose without any JSON should ERROR and exit
non-zero."""
+def test_cli_mode_non_json_under_exact_errors(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
+ """In --exact mode, prose with no JSON returns ERROR and exits non-zero."""
fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
rc, stdout, _ = _run_main(
capsys,
- ["--cli", "echo 'just prose, no JSON here'", str(fixtures_dir)],
+ [
+ "--cli",
+ "echo 'just prose, no JSON here'",
+ "--exact",
+ str(fixtures_dir),
+ ],
)
assert rc == 1
assert "ERROR" in stdout
assert "1 errored" in stdout
-def test_cli_mode_error_on_non_zero_exit(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
- """A CLI that exits non-zero should ERROR and exit non-zero."""
+def test_cli_mode_non_json_wraps_and_passes_under_field_aware(tmp_path: Path,
capsys: pytest.CaptureFixture[str]):
+ """Default field-aware mode wraps prose as {"raw_output": ...} so the
+ intersection-only comparator can proceed. With expected.json declaring
+ no raw_output key, the case passes."""
+ fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
+ rc, stdout, _ = _run_main(
+ capsys,
+ [
+ "--cli",
+ "echo 'just prose, no JSON here'",
+ "--grader-cli",
+ _GRADER_YES, # not actually invoked; no overlapping keys
+ str(fixtures_dir),
+ ],
+ )
+ assert rc == 0
+ assert "PASS" in stdout
+ assert "1 passed" in stdout
+
+
+def test_cli_mode_non_json_wrap_can_assert_on_raw_output(tmp_path: Path,
capsys: pytest.CaptureFixture[str]):
+ """Suite authors who want to gate on the prose can declare raw_output
+ in expected.json; the wrapped actual carries the model's prose and a
+ mismatch is a real decision-level failure."""
+ fixtures_dir, _ = _make_cli_case(
+ tmp_path, expected={"raw_output": "this exact prose\n"}
+ )
+ rc, stdout, _ = _run_main(
+ capsys,
+ [
+ "--cli",
+ "echo 'different prose'",
+ "--grader-cli",
+ _GRADER_YES,
+ str(fixtures_dir),
+ ],
+ )
+ assert rc == 1
+ assert "FAIL" in stdout
+ assert "raw_output" in stdout
+
+
+def test_cli_mode_non_zero_exit_under_exact_errors(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
+ """In --exact mode, a non-zero CLI exit still returns ERROR and exits
non-zero."""
fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
- rc, stdout, _ = _run_main(capsys, ["--cli", "false", str(fixtures_dir)])
+ rc, stdout, _ = _run_main(capsys, ["--cli", "false", "--exact",
str(fixtures_dir)])
assert rc == 1
assert "ERROR" in stdout
+def test_cli_mode_non_zero_exit_wraps_under_field_aware(tmp_path: Path,
capsys: pytest.CaptureFixture[str]):
+ """In the default field-aware mode, a non-zero CLI exit is wrapped as
+ raw_output (+ stderr + exit_code) and the intersection-only comparator
+ decides whether the case passes."""
+ fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
+ rc, stdout, _ = _run_main(
+ capsys,
+ [
+ "--cli",
+ "false",
+ "--grader-cli",
+ _GRADER_YES,
+ str(fixtures_dir),
+ ],
+ )
+ assert rc == 0
+ assert "PASS" in stdout
+
+
def test_cli_mode_extracts_json_from_fenced_response(tmp_path: Path, capsys:
pytest.CaptureFixture[str]):
"""The runner should find JSON inside a ```json fence in the CLI's
stdout."""
fixtures_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
@@ -810,3 +898,494 @@ def test_tag_filter_runs_only_matching_cases(tmp_path:
Path, capsys: pytest.Capt
assert rc == 0
assert "1 passed" in stdout
assert "case-2-untagged" not in stdout
+
+
+# ---------------------------------------------------------------------------
+# load_grading_schema
+# ---------------------------------------------------------------------------
+
+
+def test_load_grading_schema_defaults_when_no_file(tmp_path: Path):
+    """With no grading-schema.json present, the default prose fields come back."""
+    schema_dir = tmp_path / "fixtures"
+    schema_dir.mkdir()
+    prose_fields = load_grading_schema(schema_dir)
+    assert prose_fields == set(DEFAULT_PROSE_FIELDS)
+
+
+def test_load_grading_schema_override_replaces_default(tmp_path: Path):
+    """A prose_fields list in grading-schema.json is returned as the whole set."""
+    schema_dir = tmp_path / "fixtures"
+    schema_dir.mkdir()
+    payload = json.dumps({"prose_fields": ["why"]})
+    (schema_dir / "grading-schema.json").write_text(payload)
+    prose_fields = load_grading_schema(schema_dir)
+    assert prose_fields == {"why"}
+
+
+def test_load_grading_schema_empty_list_disables_grader(tmp_path: Path):
+    """An explicit empty prose_fields list yields an empty set of prose fields."""
+    schema_dir = tmp_path / "fixtures"
+    schema_dir.mkdir()
+    payload = json.dumps({"prose_fields": []})
+    (schema_dir / "grading-schema.json").write_text(payload)
+    prose_fields = load_grading_schema(schema_dir)
+    assert prose_fields == set()
+
+
+def test_load_grading_schema_rejects_non_string_entries(tmp_path: Path):
+    """A non-string entry in prose_fields raises ValueError naming the key."""
+    schema_dir = tmp_path / "fixtures"
+    schema_dir.mkdir()
+    schema_path = schema_dir / "grading-schema.json"
+    schema_path.write_text(json.dumps({"prose_fields": ["why", 7]}))
+    with pytest.raises(ValueError, match="prose_fields"):
+        load_grading_schema(schema_dir)
+
+
+def test_load_grading_schema_missing_key_falls_back_to_default(tmp_path: Path):
+    """A schema file without a prose_fields key behaves like no file at all."""
+    schema_dir = tmp_path / "fixtures"
+    schema_dir.mkdir()
+    payload = json.dumps({"unrelated": True})
+    (schema_dir / "grading-schema.json").write_text(payload)
+    prose_fields = load_grading_schema(schema_dir)
+    assert prose_fields == set(DEFAULT_PROSE_FIELDS)
+
+
+# ---------------------------------------------------------------------------
+# grade_prose_field (single-field helper retained for callers that want
+# per-field grading; the main runner path uses the batched grader below.)
+# ---------------------------------------------------------------------------
+
+
+def test_grade_prose_field_short_circuits_on_exact_equality():
+    """Identical values pass with an empty note and without invoking any CLI."""
+    # `false` is the grader here: if it were run, its non-zero exit would fail the field.
+    matched, detail = grade_prose_field(
+        "$.reason",
+        "boom",
+        "boom",
+        grader_cli="false",
+        timeout=5,
+    )
+    assert matched is True
+    assert detail == ""
+
+
+def test_grade_prose_field_grader_says_match():
+    """A grader replying ``"match": true`` accepts differently worded values."""
+    grader_cmd = "echo '{\"match\": true, \"reason\": \"same meaning\"}'"
+    matched, detail = grade_prose_field(
+        "$.reason",
+        "the build failed",
+        "build broke",
+        grader_cli=grader_cmd,
+        timeout=5,
+    )
+    assert matched is True
+    assert detail == ""
+
+
+def test_grade_prose_field_grader_says_no():
+    """A ``"match": false`` reply fails the field; the note has the path and reason."""
+    grader_cmd = "echo '{\"match\": false, \"reason\": \"different conclusion\"}'"
+    matched, detail = grade_prose_field(
+        "$.reason",
+        "the build failed",
+        "the build passed",
+        grader_cli=grader_cmd,
+        timeout=5,
+    )
+    assert matched is False
+    assert "$.reason" in detail
+    assert "different conclusion" in detail
+
+
+def test_grade_prose_field_grader_returns_garbage():
+    """Grader output that is not JSON fails the field and names its path."""
+    grader_cmd = "echo 'not json at all'"
+    matched, detail = grade_prose_field("$.reason", "x", "y", grader_cli=grader_cmd, timeout=5)
+    assert matched is False
+    assert "$.reason" in detail
+
+
+def test_grade_prose_field_grader_non_zero_exit():
+    """A grader command that exits non-zero fails the field and names its path."""
+    matched, detail = grade_prose_field("$.reason", "x", "y", grader_cli="false", timeout=5)
+    assert matched is False
+    assert "$.reason" in detail
+
+
+# ---------------------------------------------------------------------------
+# collect_diffs (pure walker; no grader calls)
+# ---------------------------------------------------------------------------
+
+
+def test_collect_diffs_no_diff_when_equal():
+    """Equal documents produce neither decision messages nor prose pairs."""
+    decision_msgs, prose_pairs = collect_diffs(
+        {"verdict": "BUG"},
+        {"verdict": "BUG"},
+        prose_fields=set(),
+    )
+    assert decision_msgs == []
+    assert prose_pairs == []
+
+
+def test_collect_diffs_decision_scalar_mismatch_is_decision_only():
+    """A differing non-prose scalar is a decision message, never a prose pair."""
+    actual = {"verdict": "X"}
+    expected = {"verdict": "Y"}
+    decision_msgs, prose_pairs = collect_diffs(actual, expected, prose_fields={"reason"})
+    assert any("verdict" in msg for msg in decision_msgs)
+    assert prose_pairs == []
+
+
+def test_collect_diffs_prose_mismatch_yields_pair():
+    """A differing prose field yields one (path, expected, actual) pair and no decision message."""
+    actual = {"verdict": "BUG", "reason": "wording A"}
+    expected = {"verdict": "BUG", "reason": "wording B"}
+    decision_msgs, prose_pairs = collect_diffs(actual, expected, prose_fields={"reason"})
+    assert decision_msgs == []
+    assert len(prose_pairs) == 1
+    # The pair carries the expected value before the actual one.
+    field_path, expected_text, actual_text = prose_pairs[0]
+    assert field_path == "$.reason"
+    assert expected_text == "wording B"
+    assert actual_text == "wording A"
+
+
+def test_collect_diffs_nested_list_of_prose_fields_yields_multiple_pairs():
+    """Prose fields inside list items each yield a pair with an indexed path, in order."""
+    actual = {"items": [{"reason": "a"}, {"reason": "c"}]}
+    expected = {"items": [{"reason": "b"}, {"reason": "d"}]}
+    decision_msgs, prose_pairs = collect_diffs(actual, expected, prose_fields={"reason"})
+    assert decision_msgs == []
+    reported_paths = [pair[0] for pair in prose_pairs]
+    assert reported_paths == ["$.items[0].reason", "$.items[1].reason"]
+
+
+def test_collect_diffs_missing_key_in_actual_is_skipped():
+    """A key that only expected declares is skipped rather than failed.
+
+    expected.json describes values where the model speaks; it is not a
+    required-keys schema.
+    """
+    actual = {"a": 1}
+    expected = {"a": 1, "b": 2}
+    decision_msgs, prose_pairs = collect_diffs(actual, expected, prose_fields=set())
+    assert decision_msgs == []
+    assert prose_pairs == []
+
+
+def test_collect_diffs_extra_keys_in_actual_are_ignored():
+    """Keys that only actual carries are not asserted on; only the intersection is."""
+    actual = {"a": 1, "extra": "anything"}
+    expected = {"a": 1}
+    decision_msgs, prose_pairs = collect_diffs(actual, expected, prose_fields=set())
+    assert decision_msgs == []
+    assert prose_pairs == []
+
+
+def test_collect_diffs_intersection_value_mismatch_still_fails():
+    """A key present on both sides with different values still produces a decision message."""
+    actual = {"a": 2, "extra": "x"}
+    expected = {"a": 1, "b": 2}
+    decision_msgs, _ = collect_diffs(actual, expected, prose_fields=set())
+    # NOTE(review): "a" is a very loose substring check and would also match
+    # unrelated wording in a message; consider asserting on the full path once
+    # the message format is confirmed.
+    assert any("a" in msg for msg in decision_msgs)
+
+
+def test_collect_diffs_empty_actual_passes_against_any_expected():
+    """Pin the known trade-off: an empty actual document matches any expected.
+
+    Suite authors should keep expected.json focused on the keys that actually
+    carry the eval's signal.
+    """
+    expected = {"a": 1, "b": 2, "c": [1, 2]}
+    decision_msgs, prose_pairs = collect_diffs({}, expected, prose_fields=set())
+    assert decision_msgs == []
+    assert prose_pairs == []
+
+
+def test_collect_diffs_length_mismatch_is_decision():
+    """Lists of different length produce a "length mismatch" decision message."""
+    actual = {"items": [1, 2]}
+    expected = {"items": [1, 2, 3]}
+    decision_msgs, _ = collect_diffs(actual, expected, prose_fields=set())
+    assert any("length mismatch" in msg for msg in decision_msgs)
+
+
+def test_collect_diffs_equal_prose_does_not_emit_pair():
+    """A prose field whose values are already equal is not queued for grading."""
+    decision_msgs, prose_pairs = collect_diffs(
+        {"reason": "same"},
+        {"reason": "same"},
+        prose_fields={"reason"},
+    )
+    assert decision_msgs == []
+    assert prose_pairs == []
+
+
+# ---------------------------------------------------------------------------
+# batch_grade_prose_fields
+# ---------------------------------------------------------------------------
+
+
+def test_batch_grade_empty_pairs_makes_no_grader_call(tmp_path: Path):
+    """With no pairs to grade, the result is empty and the grader is never run."""
+    call_log = tmp_path / "calls"
+    grader_cmd = _grader_count_cli(call_log)
+    verdicts = batch_grade_prose_fields([], grader_cmd, timeout=5)
+    assert verdicts == {}
+    assert _count_grader_calls(call_log) == 0
+
+
+def test_batch_grade_single_pair_one_call(tmp_path: Path):
+    """One pair costs exactly one grader call and is reported under its path."""
+    call_log = tmp_path / "calls"
+    grader_cmd = _grader_count_cli(call_log)
+    verdicts = batch_grade_prose_fields(
+        [("$.reason", "expected", "actual")],
+        grader_cmd,
+        timeout=5,
+    )
+    assert _count_grader_calls(call_log) == 1
+    assert verdicts["$.reason"] == (True, "")
+
+
+def test_batch_grade_many_pairs_one_call(tmp_path: Path):
+    """Headline guarantee: N prose mismatches are graded with a single grader call."""
+    call_log = tmp_path / "calls"
+    field_paths = ["$.a", "$.b", "$.c.d", "$.list[0].reason"]
+    pairs = [(field_path, "x", "y") for field_path in field_paths]
+    verdicts = batch_grade_prose_fields(pairs, _grader_count_cli(call_log), timeout=5)
+    assert _count_grader_calls(call_log) == 1
+    # Every submitted path must come back as a pass with an empty note.
+    for field_path in field_paths:
+        assert verdicts[field_path] == (True, "")
+
+
+def test_batch_grade_grader_says_no():
+    """A rejecting grader fails the pair and the note mentions "differs"."""
+    verdicts = batch_grade_prose_fields(
+        [("$.reason", "expected", "actual")],
+        _GRADER_NO,
+        timeout=5,
+    )
+    passed, detail = verdicts["$.reason"]
+    assert passed is False
+    assert "differs" in detail
+
+
+def test_batch_grade_grader_failure_marks_all_fail():
+    """When the grader command itself fails, every submitted pair is marked failed."""
+    pairs = [("$.a", "x", "y"), ("$.b", "x", "y")]
+    # `false` exits non-zero without producing any verdicts.
+    verdicts = batch_grade_prose_fields(pairs, "false", timeout=5)
+    for field_path, _expected, _actual in pairs:
+        passed, _detail = verdicts[field_path]
+        assert passed is False
+
+
+# ---------------------------------------------------------------------------
+# compare_with_grader (uses the batched grader path)
+# ---------------------------------------------------------------------------
+
+
+def test_compare_with_grader_passes_when_decision_fields_match_and_prose_judged_match():
+    """Matching decision fields plus a grader-approved prose diff pass with no messages."""
+    model_output = {"verdict": "BUG", "reason": "the system crashes on a null record"}
+    reference = {"verdict": "BUG", "reason": "crashes on null input"}
+    passed, messages = compare_with_grader(
+        model_output,
+        reference,
+        prose_fields={"reason"},
+        grader_cli=_GRADER_YES,
+        timeout=5,
+    )
+    assert passed is True
+    assert messages == []
+
+
+def test_compare_with_grader_skips_grader_when_decision_field_differs(tmp_path: Path):
+    """A decision-field failure must not invoke the grader, even with a prose diff pending."""
+    call_log = tmp_path / "calls"
+    model_output = {"verdict": "INVALID", "reason": "wording A"}
+    reference = {"verdict": "BUG", "reason": "wording B"}
+    passed, messages = compare_with_grader(
+        model_output,
+        reference,
+        prose_fields={"reason"},
+        grader_cli=_grader_count_cli(call_log),
+        timeout=5,
+    )
+    assert passed is False
+    assert _count_grader_calls(call_log) == 0
+    assert any("verdict" in msg for msg in messages)
+
+
+def test_compare_with_grader_multiple_prose_mismatches_one_call(tmp_path: Path):
+    """Several prose diffs, top-level and nested, are settled by one grader call."""
+    call_log = tmp_path / "calls"
+    model_follow_up = [{"reason": "c"}, {"reason": "e"}]
+    reference_follow_up = [{"reason": "d"}, {"reason": "f"}]
+    model_output = {"verdict": "BUG", "reason": "a", "follow_up": model_follow_up}
+    reference = {"verdict": "BUG", "reason": "b", "follow_up": reference_follow_up}
+    passed, messages = compare_with_grader(
+        model_output,
+        reference,
+        prose_fields={"reason"},
+        grader_cli=_grader_count_cli(call_log),
+        timeout=5,
+    )
+    assert passed is True, messages
+    assert _count_grader_calls(call_log) == 1
+
+
+def test_compare_with_grader_fails_when_grader_says_no():
+    """Decision fields agree, but a rejecting grader fails the case on the prose field."""
+    model_output = {"verdict": "BUG", "reason": "crashes on overflow"}
+    reference = {"verdict": "BUG", "reason": "null pointer on init"}
+    passed, messages = compare_with_grader(
+        model_output,
+        reference,
+        prose_fields={"reason"},
+        grader_cli=_GRADER_NO,
+        timeout=5,
+    )
+    assert passed is False
+    assert any("reason" in msg for msg in messages)
+
+
+def test_compare_with_grader_handles_nested_list_of_dicts():
+    """Prose fields nested in a list of dicts are graded; equal decision fields pass."""
+    model_follow_up = [
+        {"skill": "install", "reason": "missing hook script"},
+        {"skill": "update", "reason": "stale claude-code version"},
+    ]
+    reference_follow_up = [
+        {"skill": "install", "reason": "hooks/scripts not installed"},
+        {"skill": "update", "reason": "claude-code is older than pinned version"},
+    ]
+    model_output = {"overall": "fail", "follow_up": model_follow_up}
+    reference = {"overall": "fail", "follow_up": reference_follow_up}
+    passed, messages = compare_with_grader(
+        model_output,
+        reference,
+        prose_fields={"reason"},
+        grader_cli=_GRADER_YES,
+        timeout=5,
+    )
+    assert passed is True
+    assert messages == []
+
+
+def test_compare_with_grader_no_prose_diff_no_grader_call(tmp_path: Path):
+    """Documents that are already equal pass without a single grader call."""
+    call_log = tmp_path / "calls"
+    model_output = {"verdict": "BUG", "reason": "same"}
+    reference = {"verdict": "BUG", "reason": "same"}
+    passed, _ = compare_with_grader(
+        model_output,
+        reference,
+        prose_fields={"reason"},
+        grader_cli=_grader_count_cli(call_log),
+        timeout=5,
+    )
+    assert passed is True
+    assert _count_grader_calls(call_log) == 0
+
+
+# ---------------------------------------------------------------------------
+# --grader-cli end-to-end
+# ---------------------------------------------------------------------------
+
+
+def test_cli_grader_mode_passes_on_wording_difference(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+    """Same verdict, different wording in `reason`: with an approving grader the case PASSes."""
+    reference = {"verdict": "BUG", "reason": "crashes on null input"}
+    model_output = {"verdict": "BUG", "reason": "null pointer on first call"}
+    case_dir, _ = _make_cli_case(tmp_path, expected=reference)
+    cli_cmd = f"echo '{json.dumps(model_output)}'"
+    argv = ["--cli", cli_cmd, "--grader-cli", _GRADER_YES, str(case_dir)]
+    exit_code, out, _ = _run_main(capsys, argv)
+    assert exit_code == 0, out
+    assert "PASS" in out
+    assert "1 passed" in out
+
+
+def test_cli_grader_mode_fails_on_decision_field_difference(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+    """A differing decision field (verdict) must FAIL even though the grader would say YES."""
+    reference = {"verdict": "BUG", "reason": "same"}
+    model_output = {"verdict": "INVALID", "reason": "same"}
+    case_dir, _ = _make_cli_case(tmp_path, expected=reference)
+    cli_cmd = f"echo '{json.dumps(model_output)}'"
+    argv = ["--cli", cli_cmd, "--grader-cli", _GRADER_YES, str(case_dir)]
+    exit_code, out, _ = _run_main(capsys, argv)
+    assert exit_code == 1
+    assert "FAIL" in out
+    assert "verdict" in out
+
+
+def test_cli_grader_mode_fails_when_grader_rejects_prose(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+    """Decision fields match, but the grader rejects the prose, so the case FAILs on `reason`."""
+    reference = {"verdict": "BUG", "reason": "crashes on null input"}
+    model_output = {"verdict": "BUG", "reason": "totally unrelated text"}
+    case_dir, _ = _make_cli_case(tmp_path, expected=reference)
+    cli_cmd = f"echo '{json.dumps(model_output)}'"
+    argv = ["--cli", cli_cmd, "--grader-cli", _GRADER_NO, str(case_dir)]
+    exit_code, out, _ = _run_main(capsys, argv)
+    assert exit_code == 1
+    assert "FAIL" in out
+    assert "reason" in out
+
+
+def test_cli_grader_mode_respects_grading_schema_override(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+    """A grading-schema.json with prose_fields=[] forces an exact compare on `reason`."""
+    reference = {"verdict": "BUG", "reason": "crashes on null input"}
+    model_output = {"verdict": "BUG", "reason": "null pointer on first call"}
+    case_dir, _ = _make_cli_case(tmp_path, expected=reference)
+    schema_path = case_dir / "grading-schema.json"
+    schema_path.write_text(json.dumps({"prose_fields": []}))
+    cli_cmd = f"echo '{json.dumps(model_output)}'"
+    # The grader would say YES, but `reason` should be graded exactly now.
+    argv = ["--cli", cli_cmd, "--grader-cli", _GRADER_YES, str(case_dir)]
+    exit_code, out, _ = _run_main(capsys, argv)
+    assert exit_code == 1
+    assert "FAIL" in out
+    assert "reason" in out
+
+
+def test_grader_cli_requires_cli_flag(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+    """--grader-cli without --cli exits, and stderr says the flag requires --cli."""
+    case_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
+    argv = ["--grader-cli", _GRADER_YES, str(case_dir)]
+    with pytest.raises(SystemExit):
+        main(argv)
+    captured = capsys.readouterr()
+    assert "require --cli" in captured.err
+
+
+def test_exact_requires_cli_flag(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+    """--exact without --cli exits, and stderr says the flag requires --cli."""
+    case_dir, _ = _make_cli_case(tmp_path, expected={"verdict": "ok"})
+    argv = ["--exact", str(case_dir)]
+    with pytest.raises(SystemExit):
+        main(argv)
+    captured = capsys.readouterr()
+    assert "require --cli" in captured.err
+
+
+def test_default_grader_constant_is_haiku():
+    """Guard the documented default grader.
+
+    A future rename of the constant's value should not silently change cost
+    characteristics for users.
+    """
+    assert "haiku" in DEFAULT_GRADER_CLI
+
+
+def test_exact_mode_falls_back_to_verbatim_comparison(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+    """With --exact, a wording-only difference on a prose field is a FAIL."""
+    reference = {"verdict": "BUG", "reason": "null input crash"}
+    model_output = {"verdict": "BUG", "reason": "different wording"}
+    case_dir, _ = _make_cli_case(tmp_path, expected=reference)
+    cli_cmd = f"echo '{json.dumps(model_output)}'"
+    exit_code, out, _ = _run_main(capsys, ["--cli", cli_cmd, "--exact", str(case_dir)])
+    assert exit_code == 1
+    assert "FAIL" in out
+
+
+def test_default_grader_not_invoked_when_decision_field_differs(tmp_path: Path, capsys: pytest.CaptureFixture[str]):
+    """When only decision fields differ, the grader is never called.
+
+    The default grader (claude -p --model haiku) therefore does not need to
+    exist on PATH for this case to be reported as FAIL.
+    """
+    reference = {"verdict": "BUG"}
+    model_output = {"verdict": "INVALID"}
+    case_dir, _ = _make_cli_case(tmp_path, expected=reference)
+    # Neither --grader-cli nor --exact is passed on purpose: the test relies on
+    # the default grader staying un-invoked. If it were invoked, the test would
+    # error (claude not on PATH).
+    cli_cmd = f"echo '{json.dumps(model_output)}'"
+    exit_code, out, _ = _run_main(capsys, ["--cli", cli_cmd, str(case_dir)])
+    assert exit_code == 1
+    assert "FAIL" in out
+    assert "verdict" in out