Copilot commented on code in PR #64008:
URL: https://github.com/apache/airflow/pull/64008#discussion_r3025358363


##########
scripts/ci/pre_commit/skill_graph.py:
##########
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Build dependency graph from skills.json (prereqs) and output tree + JSON 
graph."""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def parse_prereq_skill_ids(prereqs_str: str, skill_ids: set[str]) -> list[str]:
+    """Return list of skill IDs that appear in prereqs string 
(comma-separated)."""
+    if not prereqs_str or not prereqs_str.strip():
+        return []
+    return [x.strip() for x in prereqs_str.split(",") if x.strip() in 
skill_ids]
+
+
+def find_cycle(skill_ids: set[str], edges: list[tuple[str, str]]) -> list[str] 
| None:
+    """edges are (from_id, to_id) meaning from_id requires to_id. Return cycle 
as list or None."""
+    # Build adjacency: node -> list of nodes it points to (dependencies)
+    adj: dict[str, list[str]] = {sid: [] for sid in skill_ids}
+    for from_id, to_id in edges:
+        adj[from_id].append(to_id)
+    path: list[str] = []
+    path_set: set[str] = set()
+    in_stack: set[str] = set()
+
+    def visit(n: str) -> list[str] | None:
+        path.append(n)
+        path_set.add(n)
+        in_stack.add(n)
+        for m in adj.get(n, []):
+            if m not in path_set:
+                cycle = visit(m)
+                if cycle is not None:
+                    return cycle
+            elif m in in_stack:
+                # cycle: from m back to m
+                start = path.index(m)
+                return path[start:] + [m]
+        path.pop()
+        path_set.remove(n)
+        in_stack.remove(n)
+        return None
+
+    for sid in skill_ids:
+        if sid not in path_set:
+            cycle = visit(sid)
+            if cycle is not None:
+                return cycle
+    return None
+
+
+def build_graph(skills: list[dict]) -> tuple[list[dict], list[dict], dict[str, 
dict]]:
+    """
+    Build nodes (for JSON), edges (for JSON), and id->skill for tree.
+    Edges: from skill_id to prereq_id (skill_id requires prereq_id).
+    """
+    skill_ids = {s["id"] for s in skills}
+    id_to_skill = {s["id"]: s for s in skills}
+    nodes = [
+        {"id": s["id"], "category": s.get("category", ""), "context": 
s.get("context", "")}
+        for s in skills
+    ]
+    edges = []
+    for s in skills:
+        sid = s["id"]
+        prereqs = parse_prereq_skill_ids(s.get("prereqs", "") or "", skill_ids)
+        for p in prereqs:
+            edges.append({"from": sid, "to": p, "type": "requires"})
+    return nodes, edges, id_to_skill
+
+
+def build_tree_edges(id_to_skill: dict[str, dict], edges: list[dict]) -> 
dict[str, list[str]]:
+    """Given edges (from->to = requires), return for each node the list of 
children (dependents)."""
+    # edges: from A to B means A requires B. So B is dependency of A. Children 
of B = skills that require B.
+    children: dict[str, list[str]] = {sid: [] for sid in id_to_skill}
+    for e in edges:
+        child_id = e["from"]
+        parent_id = e["to"]
+        children[parent_id].append(child_id)
+    return children
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Build skill dependency graph 
from skills.json")
+    parser.add_argument(
+        "--skills-json",
+        default=None,
+        help="Path to skills.json (default: 
contributing-docs/agent_skills/skills.json under repo root)",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Path to write skill_graph.json (default: 
contributing-docs/agent_skills/skill_graph.json)",
+    )
+    args = parser.parse_args()
+    root = Path(__file__).resolve().parents[3]
+    skills_path = Path(args.skills_json) if args.skills_json else root / 
"contributing-docs/agent_skills/skills.json"
+    if not skills_path.exists():
+        print("skills.json not found", file=sys.stderr)
+        return 1
+    out_path = Path(args.output) if args.output else root / 
"contributing-docs/agent_skills/skill_graph.json"
+    data = json.loads(skills_path.read_text(encoding="utf-8"))
+    skills = data.get("skills", [])
+    if not skills:

Review Comment:
   `skill_graph.py` reads `skills.json` and proceeds even if the manifest 
contains invalid/duplicate skills; it only checks for cycles. Since this file 
is generated from docs, a malformed `skills.json` will cause confusing 
failures. Consider reusing the same validation logic as 
`extract_agent_skills.py`/`validate_skills.py` (or importing it) so graph 
generation fails with a clear validation error when the manifest is invalid.



##########
scripts/ci/agent_skills/breeze_context.py:
##########
@@ -0,0 +1,170 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Apache Airflow Agent Skills — Breeze Context Detection.
+
+Provides runtime host vs. container detection and command routing for
+AI coding agents contributing to Apache Airflow.
+
+Detection priority chain:
+  1. AIRFLOW_BREEZE_CONTAINER env var (explicit)
+  2. /.dockerenv file (Docker marker)
+  2b. /.containerenv file (Podman marker)
+  3. /opt/airflow path (Breeze mount)
+  4. Default: host
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import pathlib
+
+SKILLS_JSON = pathlib.Path("contributing-docs/agent_skills/skills.json")
+
+
+def get_context(force: str | None = None) -> str:
+    """
+    Return "breeze" if running inside Breeze container, else "host".
+
+    Detection priority:
+      1. AIRFLOW_BREEZE_CONTAINER env var
+      2. /.dockerenv exists
+      2b. /.containerenv exists (Podman)
+      3. /opt/airflow exists
+      4. default: host
+
+    Args:
+        force: If set, bypass automatic detection and return this value
+            directly. Expected values: "host" or "breeze".
+    """
+    if force is not None:
+        if force not in {"host", "breeze"}:
+            raise ValueError(f"force must be 'host' or 'breeze', got 
{force!r}")
+        return force
+
+    if os.environ.get("AIRFLOW_BREEZE_CONTAINER"):
+        return "breeze"
+    if pathlib.Path("/.dockerenv").exists():
+        return "breeze"
+    if pathlib.Path("/.containerenv").exists():
+        return "breeze"
+    if pathlib.Path("/opt/airflow").exists():
+        return "breeze"
+    return "host"
+
+
+def load_skills() -> list[dict]:
+    """Load skills from the generated skills.json manifest."""
+    if not SKILLS_JSON.exists():
+        raise FileNotFoundError(
+            f"skills.json not found at {SKILLS_JSON}. Run 
extract_agent_skills.py first.",
+        )
+    with SKILLS_JSON.open() as f:
+        data = json.load(f)
+    return data["skills"]
+
+
+def get_skill(skill_id: str) -> dict:
+    """Return a skill definition by id."""
+    skills = load_skills()
+    for skill in skills:
+        if skill.get("id") == skill_id:
+            return skill
+    raise KeyError(f"Skill '{skill_id}' not found. Available: {[s.get('id') 
for s in skills]}")

Review Comment:
   `get_skill()` raises `KeyError` with a message that includes the full list 
of available skill IDs. As the skill set grows, this can produce very large 
error output and be noisy in CI/logs. Consider including only a count plus a 
short hint (e.g., suggest `query_skills.py`) or truncating the list to the 
first N IDs.
   ```suggestion
       available_ids = [s.get("id") for s in skills]
       total = len(available_ids)
       preview_limit = 10
       preview_ids = available_ids[:preview_limit]
       preview_text = ", ".join(str(i) for i in preview_ids)
       if total > preview_limit:
           preview_text += ", ..."
       raise KeyError(
           f"Skill '{skill_id}' not found among {total} skills. "
           f"First {min(total, preview_limit)} IDs: [{preview_text}]. "
           "Use scripts/ci/agent_skills/query_skills.py or inspect skills.json 
for the full list.",
       )
   ```



##########
scripts/ci/agent_skills/validate_skills.py:
##########
@@ -0,0 +1,136 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Validate the Agent Skills manifest against a JSON-schema-like contract."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import pathlib
+import re
+import sys
+
+SKILLS_JSON = pathlib.Path("contributing-docs/agent_skills/skills.json")

Review Comment:
   `SKILLS_JSON` is a relative path, so running `validate_skills.py` from 
outside the repo root will fail. Consider resolving the default path relative 
to the repo root (via `Path(__file__)`) and/or adding a `--skills-json` option 
to make the CLI usable from any working directory.



##########
scripts/ci/pre_commit/extract_agent_skills.py:
##########
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Extract and validate .. agent-skill:: blocks from AGENTS.md and 
contributing-docs/*.rst into skills.json."""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def collect_source_texts(root: Path, docs_file: str, docs_dir: str) -> 
list[tuple[Path, str]]:
+    """Return list of (path, text) for AGENTS.md and all .rst under 
docs_dir."""
+    result: list[tuple[Path, str]] = []
+    agents_path = Path(docs_file) if Path(docs_file).is_absolute() else root / 
docs_file
+    if agents_path.exists():
+        result.append((agents_path, agents_path.read_text(encoding="utf-8")))
+    contrib_dir = Path(docs_dir) if Path(docs_dir).is_absolute() else root / 
docs_dir
+    if contrib_dir.is_dir():
+        for rst_path in sorted(contrib_dir.rglob("*.rst")):
+            result.append((rst_path, rst_path.read_text(encoding="utf-8")))
+    return result
+
+VALID_CONTEXTS = {"host", "breeze", "either"}
+VALID_CATEGORIES = {"environment", "testing", "linting", "providers", "dags", 
"documentation"}
+ID_RE = re.compile(r"^[a-z][a-z0-9-]*$")
+
+
+def parse_skills(text: str) -> list[dict]:
+    """Extract agent-skill blocks from one file's text; return list of skill 
dicts."""
+    blocks = re.split(r"\n\.\. agent-skill::\s*\n", text)
+    skills = []
+    for block in blocks[1:]:  # skip content before first block
+        skill = {}
+        # Options: :key: value (value can continue on next line if indented)
+        for m in re.finditer(r"^   :(\w+):\s*(.*?)(?=^   :\w+:|^   \.\.|\Z)", 
block, re.M | re.S):
+            key, val = m.group(1), m.group(2)
+            skill[key] = re.sub(r"\n\s+", " ", val).strip()
+        # code-block content
+        cb = re.search(r"\.\. code-block::\s*\w+\s*\n(.*?)(?=\n   \.\.|\Z)", 
block, re.S)
+        skill["command"] = re.sub(r"^      ", "", 
cb.group(1).strip()).replace("\n      ", "\n") if cb else ""
+        # expected output
+        eo = re.search(r"\.\. agent-skill-expected-output::\s*\n(.*?)(?=\n\.\. 
agent-skill|\Z)", block, re.S)
+        skill["expected_output"] = re.sub(r"^      ", "", 
eo.group(1).strip()).replace("\n      ", "\n") if eo else ""
+        skills.append(skill)
+    return skills
+
+
+def validate_skill(skill: dict, index: int) -> None:
+    """Raise ValueError if skill is invalid."""
+    sid = skill.get("id", "")
+    if not sid:
+        raise ValueError(f"Skill at index {index}: missing required field 
'id'")
+    if not ID_RE.match(sid):
+        raise ValueError(f"Skill '{sid}': id must match ^[a-z][a-z0-9-]*$")
+    ctx = skill.get("context", "")
+    if not ctx:
+        raise ValueError(f"Skill '{sid}': missing required field 'context'")
+    if ctx not in VALID_CONTEXTS:
+        raise ValueError(f"Skill '{sid}': context must be one of 
{sorted(VALID_CONTEXTS)}, got '{ctx}'")
+    cat = skill.get("category", "")
+    if not cat:
+        raise ValueError(f"Skill '{sid}': missing required field 'category'")
+    if cat not in VALID_CATEGORIES:
+        raise ValueError(f"Skill '{sid}': category must be one of 
{sorted(VALID_CATEGORIES)}, got '{cat}'")
+    desc = (skill.get("description") or "").strip()
+    if not desc:
+        raise ValueError(f"Skill '{sid}': missing or empty required field 
'description'")
+    cmd = (skill.get("command") or "").strip()
+    if not cmd:
+        raise ValueError(f"Skill '{sid}': missing or empty required field 
'command'")
+
+
+def _skill_key_set(skill: dict) -> set[str]:
+    """Return set of keys that define skill identity and content for 
comparison."""
+    return {"id", "context", "category", "prereqs", "validates", 
"description", "command", "expected_output"}
+
+
+def _normalize_skill_for_compare(skill: dict) -> dict:
+    """Return a comparable dict with only the fields we care about, sorted 
keys."""
+    keys = _skill_key_set(skill)
+    return {k: skill.get(k, "") for k in sorted(keys) if k in skill or k in 
keys}
+

Review Comment:
   Drift detection normalizes skills using a fixed key set that excludes fields 
like `fallback` / `fallback_condition`, even though those fields are present in 
the extracted output (see `contributing-docs/agent_skills/skills.json` for 
`run-single-test`). As a result, changes to these fields will not be detected 
by `--check`. Consider either including all schema-supported keys in 
`_skill_key_set()` (or comparing the full dict after normalizing formatting) so 
drift checking is complete for the manifest contract.



##########
scripts/ci/pre_commit/tests/test_e2e.py:
##########
@@ -0,0 +1,108 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""End-to-end tests for agent-skills extraction and command routing."""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+PRE_COMMIT_DIR = Path(__file__).resolve().parents[1]
+CI_DIR = Path(__file__).resolve().parents[2]
+REPO_ROOT = Path(__file__).resolve().parents[4]
+sys.path.insert(0, str(CI_DIR))
+
+from agent_skills import breeze_context  # noqa: E402
+
+
+def _extract_skills_to_file(skills_file: Path) -> 
subprocess.CompletedProcess[str]:

Review Comment:
   This test mutates `sys.path` to import `agent_skills` from `scripts/ci`. 
This can mask real import/package issues and makes behavior depend on how 
pytest is invoked. Prefer setting up tests so imports work via normal module 
resolution (e.g., add the repo root to `PYTHONPATH` in the test runner) rather 
than per-test `sys.path.insert`.
   ```suggestion
   REPO_ROOT = Path(__file__).resolve().parents[4]

   from scripts.ci.agent_skills import breeze_context  # noqa: E402


   def _extract_skills_to_file(skills_file: Path) -> subprocess.CompletedProcess[str]:
   ```



##########
scripts/ci/pre_commit/extract_agent_skills.py:
##########
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Extract and validate .. agent-skill:: blocks from AGENTS.md and 
contributing-docs/*.rst into skills.json."""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def collect_source_texts(root: Path, docs_file: str, docs_dir: str) -> 
list[tuple[Path, str]]:
+    """Return list of (path, text) for AGENTS.md and all .rst under 
docs_dir."""
+    result: list[tuple[Path, str]] = []
+    agents_path = Path(docs_file) if Path(docs_file).is_absolute() else root / 
docs_file
+    if agents_path.exists():
+        result.append((agents_path, agents_path.read_text(encoding="utf-8")))
+    contrib_dir = Path(docs_dir) if Path(docs_dir).is_absolute() else root / 
docs_dir
+    if contrib_dir.is_dir():
+        for rst_path in sorted(contrib_dir.rglob("*.rst")):
+            result.append((rst_path, rst_path.read_text(encoding="utf-8")))
+    return result
+
+VALID_CONTEXTS = {"host", "breeze", "either"}
+VALID_CATEGORIES = {"environment", "testing", "linting", "providers", "dags", 
"documentation"}
+ID_RE = re.compile(r"^[a-z][a-z0-9-]*$")
+
+
+def parse_skills(text: str) -> list[dict]:
+    """Extract agent-skill blocks from one file's text; return list of skill 
dicts."""
+    blocks = re.split(r"\n\.\. agent-skill::\s*\n", text)
+    skills = []
+    for block in blocks[1:]:  # skip content before first block
+        skill = {}
+        # Options: :key: value (value can continue on next line if indented)
+        for m in re.finditer(r"^   :(\w+):\s*(.*?)(?=^   :\w+:|^   \.\.|\Z)", 
block, re.M | re.S):
+            key, val = m.group(1), m.group(2)
+            skill[key] = re.sub(r"\n\s+", " ", val).strip()
+        # code-block content
+        cb = re.search(r"\.\. code-block::\s*\w+\s*\n(.*?)(?=\n   \.\.|\Z)", 
block, re.S)
+        skill["command"] = re.sub(r"^      ", "", 
cb.group(1).strip()).replace("\n      ", "\n") if cb else ""
+        # expected output
+        eo = re.search(r"\.\. agent-skill-expected-output::\s*\n(.*?)(?=\n\.\. 
agent-skill|\Z)", block, re.S)

Review Comment:
   The `agent-skill-expected-output` capture regex is too broad: it stops only 
at the next top-level `.. agent-skill` or EOF, so for the last skill block in a 
file it will swallow the remainder of the document. This is already visible in 
the generated `contributing-docs/agent_skills/skills.json` where 
`run-static-checks-prek.expected_output` contains large unrelated sections. 
Please change the stop condition to end the expected-output block on dedent / 
next directive within the same skill block (e.g., first non-indented line) so 
only the intended expected output is captured.
   ```suggestion
           eo = re.search(
               r"\.\. agent-skill-expected-output::\s*\n(.*?)(?=\n   
\.\.|\n\S|\Z)",
               block,
               re.S,
           )
   ```



##########
scripts/ci/pre_commit/tests/test_extract_agent_skills.py:
##########
@@ -0,0 +1,474 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use it except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for extract_agent_skills.py."""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+# Add parent so we can import extract_agent_skills
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(SCRIPT_DIR))
+
+from extract_agent_skills import extract_all_skills, parse_skills, 
validate_skill
+
+VALID_SKILL_RST = """
+.. agent-skill::
+   :id: setup-breeze-environment
+   :context: host
+   :category: environment
+   :prereqs: docker, python>=3.9
+   :validates: breeze-env-ready
+   :description: Start the Airflow Breeze development environment
+
+   .. code-block:: bash
+
+      breeze start-airflow
+
+   .. agent-skill-expected-output::
+
+      Airflow webserver at http://localhost:28080
+"""
+
+
+def test_valid_skill_parsed():
+    """Sample RST string with one valid skill block, assert all fields 
extracted correctly."""
+    skills = parse_skills(VALID_SKILL_RST)
+    assert len(skills) == 1
+    skill = skills[0]
+    assert skill["id"] == "setup-breeze-environment"
+    assert skill["context"] == "host"
+    assert skill["category"] == "environment"
+    assert skill["prereqs"] == "docker, python>=3.9"
+    assert skill["validates"] == "breeze-env-ready"
+    assert "Breeze development environment" in skill["description"]
+    assert "breeze start-airflow" in skill["command"]
+    assert "localhost:28080" in skill["expected_output"]
+    validate_skill(skill, 0)
+
+
+def test_invalid_context_fails(tmp_path):
+    """context: 'container' should raise SystemExit with code 1."""
+    rst = """
+.. agent-skill::
+   :id: bad-context
+   :context: container
+   :category: environment
+   :description: Bad context skill
+
+   .. code-block:: bash
+
+      echo ok
+
+   .. agent-skill-expected-output::
+
+      ok
+"""
+    docs_file = tmp_path / "AGENTS.md"
+    docs_file.write_text(rst)
+    out_file = tmp_path / "skills.json"
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(SCRIPT_DIR / "extract_agent_skills.py"),
+            "--docs-file",
+            str(docs_file),
+            "--output",
+            str(out_file),
+        ],
+        capture_output=True,
+        text=True,
+    )

Review Comment:
   Several tests invoke `subprocess.run(...)` without a timeout (e.g., this 
block and others below). If the script blocks for any reason, pytest can hang. 
Please add a small `timeout=` to these subprocess calls so failures are bounded 
and CI is more reliable.



##########
contributing-docs/agent_skills/skills.json:
##########
@@ -0,0 +1,100 @@
+{
+  "version": "1.0",
+  "generated_by": "extract_agent_skills.py",
+  "generated_from": "AGENTS.md, contributing-docs/*.rst",
+  "skill_count": 9,
+  "skills": [
+    {
+      "id": "setup-breeze-environment",
+      "context": "host",
+      "category": "environment",
+      "prereqs": "docker, python>=3.9",
+      "validates": "breeze-env-ready",
+      "description": "Start the Airflow Breeze development environment",
+      "command": "breeze start-airflow",
+      "expected_output": "Airflow webserver at http://localhost:28080\nLogin: 
admin / admin"
+    },
+    {
+      "id": "run-single-test",
+      "context": "host",
+      "category": "testing",
+      "prereqs": "setup-breeze-environment",
+      "validates": "tests-pass",
+      "fallback": "breeze run pytest {test_path} -xvs",
+      "fallback_condition": "missing_system_deps",
+      "description": "Run a single test with uv, falling back to breeze if 
system deps are missing",
+      "command": "# Primary: uv (no Docker needed, faster)\nuv run --project 
{project} pytest {test_path} -xvs\n# Fallback: only when system deps missing\n# 
breeze run pytest {test_path} -xvs",
+      "expected_output": "PASSED"
+    },
+    {
+      "id": "run-static-checks",
+      "context": "host",
+      "category": "linting",
+      "prereqs": "setup-breeze-environment",
+      "validates": "static-checks-pass",
+      "description": "Run fast static checks with prek",
+      "command": "prek run --from-ref main --stage pre-commit",
+      "expected_output": "All checks passed."
+    },
+    {
+      "id": "run-manual-checks",
+      "context": "host",
+      "category": "linting",
+      "prereqs": "run-static-checks",
+      "validates": "manual-checks-pass",
+      "description": "Run slower manual checks with prek",
+      "command": "prek run --from-ref main --stage manual",
+      "expected_output": "All checks passed."
+    },
+    {
+      "id": "build-docs",
+      "context": "host",
+      "category": "documentation",
+      "prereqs": "setup-breeze-environment",
+      "validates": "docs-build-clean",
+      "description": "Build Airflow documentation locally",
+      "command": "breeze build-docs",
+      "expected_output": "Build finished. The HTML pages are in _build/html."
+    },
+    {
+      "id": "run-provider-tests",
+      "context": "host",
+      "category": "testing",
+      "prereqs": "setup-breeze-environment",
+      "validates": "provider-tests-pass",
+      "description": "Run complete test suite for a specific provider",
+      "command": "breeze testing providers-tests\n  --test-type 
\"Providers[{provider}]\"",
+      "expected_output": "All tests passed."
+    },
+    {
+      "id": "run-type-check",
+      "context": "host",
+      "category": "linting",
+      "prereqs": "setup-breeze-environment",
+      "validates": "type-check-pass",
+      "description": "Run mypy type checking on changed files",
+      "command": "breeze run mypy {path}",
+      "expected_output": "Success: no issues found"
+    },
+    {
+      "id": "run-tests-uv-first",
+      "context": "host",
+      "category": "testing",
+      "prereqs": "setup-breeze-environment",
+      "validates": "tests-pass",
+      "description": "Run tests with uv, fall back to Breeze if system deps 
missing",
+      "command": "uv run --project {project} pytest {test_path} -xvs\n# 
fallback: breeze run pytest {test_path} -xvs",
+      "expected_output": "PASSED"
+    },
+    {
+      "id": "run-static-checks-prek",
+      "context": "host",
+      "category": "linting",
+      "prereqs": "setup-breeze-environment",
+      "validates": "static-checks-pass",
+      "description": "Run fast pre-commit static checks on changed files using 
prek",
+      "command": "prek run --from-ref main --stage pre-commit",
+      "expected_output": "All checks passed.\n\n- Running Unit tests inside 
Breeze environment.\n\n  Just run ``pytest filepath+filename`` to run the 
tests.\n\n.. code-block:: bash\n\n   [Breeze:3.10.19] 
root@63528318c8b1:/opt/airflow# pytest tests/utils/test_dates.py\n   
============================================================= test session 
starts ==============================================================\n   
platform linux -- Python 3.10.20, pytest-8.3.3, pluggy-1.5.0 -- 
/usr/python/bin/python\n   cachedir: .pytest_cache\n   rootdir: /opt/airflow\n  
 configfile: pyproject.toml\n   plugins: anyio-4.6.0, time-machine-2.15.0, 
icdiff-0.9, rerunfailures-14.0, instafail-0.5.0, custom-exit-code-0.3.0, 
xdist-3.6.1, mock-3.14.0, cov-5.0.0, asyncio-0.24.0, requests-mock-1.12.1, 
timeouts-1.2.1\n   asyncio: mode=strict, default_loop_scope=None\n   setup 
timeout: 0.0s, execution timeout: 0.0s, teardown timeout: 0.0s\n   collected 4 
items\n\n   tests/utils/test_dates.py::TestDates::test
 _parse_execution_date PASSED                                                   
                        [ 25%]\n   
tests/utils/test_dates.py::TestDates::test_round_time PASSED                    
                                                                 [ 50%]\n   
tests/utils/test_dates.py::TestDates::test_infer_time_unit PASSED               
                                                                 [ 75%]\n   
tests/utils/test_dates.py::TestDates::test_scale_time_units PASSED              
                                                                 [100%]\n\n   
================================================================== 4 passed in 
3.30s ===================================================================\n\n- 
Running All the tests with Breeze by specifying the required python version, 
backend, backend version\n\n.. code-block:: bash\n\n   breeze --backend 
postgres --postgres-version 15 --python 3.10 --db-reset testing tests 
--test-type All\n\n- Running specifi
 c type of test\n\n  .. code-block:: bash\n\n    breeze --backend postgres 
--postgres-version 15 --python 3.10 --db-reset testing tests --test-type 
Core\n\n\n- Running Integration test for specific test type\n\n  .. 
code-block:: bash\n\n   breeze --backend postgres --postgres-version 15 
--python 3.10 --db-reset testing tests --test-type All --integration mongo\n\n- 
For more information on Testing visit |09_testing.rst|\n\n  .. |09_testing.rst| 
raw:: html\n\n   <a 
href=\"https://github.com/apache/airflow/blob/main/contributing-docs/09_testing.rst\";
 target=\"_blank\">09_testing.rst</a>\n\n- Similarly to regular development, 
you can also debug while testing using your IDE, for more information, you may 
refer to\n\n  |Local and Remote Debugging in IDE|\n\n  .. |Local and Remote 
Debugging in IDE| raw:: html\n\n   <a 
href=\"https://github.com/apache/airflow/blob/main/contributing-docs/07_local_virtualenv.rst#local-and-remote-debugging-in-ide\"\n
   target=\"_blank\">Local and Remote Debuggi
 ng in IDE</a>\n\nContribution guide\n##################\n\n- To know how to 
contribute to the project visit |README.rst|\n\n.. |README.rst| raw:: html\n\n  
 <a 
href=\"https://github.com/apache/airflow/blob/main/contributing-docs/README.rst\";
 target=\"_blank\">README.rst</a>\n\n- Following are some of the important 
links of Contribution documentation\n\n  - |Types of contributions|\n\n  .. 
|Types of contributions| raw:: html\n\n   <a 
href=\"https://github.com/apache/airflow/blob/main/contributing-docs/04_how_to_contribute.rst\";
 target=\"_blank\">\n   Types of contributions</a>\n\n  - |Roles of 
contributor|\n\n  .. |Roles of contributor| raw:: html\n\n   <a 
href=\"https://github.com/apache/airflow/blob/main/contributing-docs/01_roles_in_airflow_project.rst\";
 target=\"_blank\">Roles of\n   contributor</a>\n\n\n  - |Workflow for a 
contribution|\n\n  .. |Workflow for a contribution| raw:: html\n\n   <a 
href=\"https://github.com/apache/airflow/blob/main/contributing-docs/18_contribution_w
 orkflow.rst\" target=\"_blank\">\n   Workflow for a 
contribution</a>\n\n\n\nRaising Pull Request\n--------------------\n\n1. Go to 
your GitHub account and open your fork project and click on Branches\n\n   .. 
raw:: html\n\n    <div align=\"center\" style=\"padding-bottom:10px\">\n<img 
src=\"images/quick_start/pr1.png\"\n     alt=\"Go to fork and select 
branches\">\n    </div>\n\n2. Click on ``New pull request`` button on branch 
from which you want to raise a pull request\n\n   .. raw:: html\n\n<div 
align=\"center\" style=\"padding-bottom:10px\">\n  <img 
src=\"images/quick_start/pr2.png\"\n       alt=\"Accessing local 
airflow\">\n</div>\n\n3. Add title and description as per Contributing 
guidelines and click on ``Create pull request``\n\n   .. raw:: html\n\n<div 
align=\"center\" style=\"padding-bottom:10px\">\n  <img 
src=\"images/quick_start/pr3.png\"\n       alt=\"Accessing local 
airflow\">\n</div>\n\n\nSyncing Fork and rebasing Pull 
request\n--------------------------------------\n\nOften it takes several days or weeks to discuss and iterate with the PR 
until it is ready to merge.\nIn the meantime new commits are merged, and you 
might run into conflicts, therefore you should periodically\nsynchronize main 
in your fork with the ``apache/airflow`` main and rebase your PR on top of it. 
Following\ndescribes how to do it.\n\n* `Update new changes made to 
apache:airflow project to your fork 
<10_working_with_git.rst#how-to-sync-your-fork>`__\n* `Rebasing pull request 
<10_working_with_git.rst#how-to-rebase-pr>`__\n\n\nUsing your 
IDE\n##############\n\nIf you are familiar with Python development and use your 
favourite editors, Airflow can be setup\nsimilarly to other projects of yours. 
However, if you need specific instructions for your IDE you\nwill find more 
detailed instructions here:\n\n* `Pycharm/IntelliJ 
<quick-start-ide/contributors_quick_start_pycharm.rst>`_\n* `Visual Studio Code 
<quick-start-ide/contributors_quick_start_vscode.rst>`_\n\n\nUsing Remote development environments\n#####################################\n\nIn order to use 
remote development environment, you usually need a paid account, but you do not 
have to\nsetup local machine for development.\n\n* `GitPod 
<quick-start-ide/contributors_quick_start_gitpod.rst>`_\n* `GitHub Codespaces 
<quick-start-ide/contributors_quick_start_codespaces.rst>`_\n\n\n----------------\n\nOnce you have your environment set up, you can start contributing to Airflow. You 
can find more\nabout ways you can contribute in the `How to contribute 
<04_how_to_contribute.rst>`_ document."
+    }

Review Comment:
   The committed generated `skills.json` contains an incorrect 
`expected_output` for `run-static-checks-prek` (it includes a large chunk of 
unrelated RST content after the intended 'All checks passed.'). This likely 
comes from the extractor’s expected-output parsing and makes the manifest 
unusable for consumers. Regenerate `skills.json` after fixing the extractor so 
that `expected_output` only contains the expected-output block contents.



##########
scripts/ci/pre_commit/skill_graph.py:
##########
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use it except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Build dependency graph from skills.json (prereqs) and output tree + JSON 
graph."""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def parse_prereq_skill_ids(prereqs_str: str, skill_ids: set[str]) -> list[str]:
+    """Return list of skill IDs that appear in prereqs string 
(comma-separated)."""
+    if not prereqs_str or not prereqs_str.strip():
+        return []
+    return [x.strip() for x in prereqs_str.split(",") if x.strip() in 
skill_ids]
+
+
+def find_cycle(skill_ids: set[str], edges: list[tuple[str, str]]) -> list[str] 
| None:
+    """edges are (from_id, to_id) meaning from_id requires to_id. Return cycle 
as list or None."""
+    # Build adjacency: node -> list of nodes it points to (dependencies)
+    adj: dict[str, list[str]] = {sid: [] for sid in skill_ids}
+    for from_id, to_id in edges:
+        adj[from_id].append(to_id)
+    path: list[str] = []
+    path_set: set[str] = set()
+    in_stack: set[str] = set()
+
+    def visit(n: str) -> list[str] | None:
+        path.append(n)
+        path_set.add(n)
+        in_stack.add(n)
+        for m in adj.get(n, []):
+            if m not in path_set:
+                cycle = visit(m)
+                if cycle is not None:
+                    return cycle
+            elif m in in_stack:
+                # cycle: from m back to m
+                start = path.index(m)
+                return path[start:] + [m]
+        path.pop()
+        path_set.remove(n)
+        in_stack.remove(n)
+        return None
+
+    for sid in skill_ids:
+        if sid not in path_set:
+            cycle = visit(sid)
+            if cycle is not None:
+                return cycle
+    return None
+
+
+def build_graph(skills: list[dict]) -> tuple[list[dict], list[dict], dict[str, 
dict]]:
+    """
+    Build nodes (for JSON), edges (for JSON), and id->skill for tree.
+    Edges: from skill_id to prereq_id (skill_id requires prereq_id).
+    """

Review Comment:
   `build_graph()` assumes every skill dict has an `id` key (`{s['id'] for s in 
skills}`), which will raise `KeyError` if the manifest contains an invalid 
entry. Consider validating entries (or skipping malformed ones with a clear 
error) before building sets/dicts so the CLI fails with a helpful message 
rather than a traceback.
   ```suggestion
       """
       # Validate that every skill entry has an "id" key before building 
sets/dicts.
       for index, s in enumerate(skills):
           if not isinstance(s, dict):
               raise ValueError(f"Skill entry at index {index} is not a 
mapping: {s!r}")
           if "id" not in s:
               raise ValueError(f"Skill entry at index {index} is missing 
required 'id' field: {s!r}")
   ```



##########
contributing-docs/agent_skills/skill_graph.json:
##########
@@ -0,0 +1,71 @@
+{
+  "nodes": [
+    {
+      "id": "setup-breeze-environment",
+      "category": "environment",
+      "context": "host"
+    },
+    {
+      "id": "run-single-test",
+      "category": "testing",
+      "context": "host"
+    },
+    {
+      "id": "run-static-checks",
+      "category": "linting",
+      "context": "host"
+    },
+    {
+      "id": "run-manual-checks",
+      "category": "linting",
+      "context": "host"
+    },
+    {
+      "id": "build-docs",
+      "category": "documentation",
+      "context": "host"
+    },
+    {
+      "id": "run-provider-tests",
+      "category": "testing",
+      "context": "host"
+    },
+    {
+      "id": "run-type-check",
+      "category": "linting",
+      "context": "host"
+    }
+  ],

Review Comment:
   `skill_graph.json` is out of sync with the committed `skills.json`: it only 
includes 7 nodes (missing the 2 RST-defined skills `run-tests-uv-first` and 
`run-static-checks-prek`) while `skills.json` reports 9 skills. This suggests 
the graph was generated from a different input or is stale. Please regenerate 
`skill_graph.json` from the current `skills.json` (or adjust the workflow so 
this derived artifact cannot drift).



##########
scripts/ci/pre_commit/extract_agent_skills.py:
##########
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use it except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Extract and validate .. agent-skill:: blocks from AGENTS.md and 
contributing-docs/*.rst into skills.json."""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def collect_source_texts(root: Path, docs_file: str, docs_dir: str) -> 
list[tuple[Path, str]]:
+    """Return list of (path, text) for AGENTS.md and all .rst under 
docs_dir."""
+    result: list[tuple[Path, str]] = []
+    agents_path = Path(docs_file) if Path(docs_file).is_absolute() else root / 
docs_file
+    if agents_path.exists():
+        result.append((agents_path, agents_path.read_text(encoding="utf-8")))
+    contrib_dir = Path(docs_dir) if Path(docs_dir).is_absolute() else root / 
docs_dir
+    if contrib_dir.is_dir():
+        for rst_path in sorted(contrib_dir.rglob("*.rst")):
+            result.append((rst_path, rst_path.read_text(encoding="utf-8")))
+    return result
+
+VALID_CONTEXTS = {"host", "breeze", "either"}
+VALID_CATEGORIES = {"environment", "testing", "linting", "providers", "dags", 
"documentation"}
+ID_RE = re.compile(r"^[a-z][a-z0-9-]*$")
+
+
+def parse_skills(text: str) -> list[dict]:
+    """Extract agent-skill blocks from one file's text; return list of skill 
dicts."""
+    blocks = re.split(r"\n\.\. agent-skill::\s*\n", text)
+    skills = []
+    for block in blocks[1:]:  # skip content before first block
+        skill = {}
+        # Options: :key: value (value can continue on next line if indented)
+        for m in re.finditer(r"^   :(\w+):\s*(.*?)(?=^   :\w+:|^   \.\.|\Z)", 
block, re.M | re.S):
+            key, val = m.group(1), m.group(2)
+            skill[key] = re.sub(r"\n\s+", " ", val).strip()
+        # code-block content
+        cb = re.search(r"\.\. code-block::\s*\w+\s*\n(.*?)(?=\n   \.\.|\Z)", 
block, re.S)
+        skill["command"] = re.sub(r"^      ", "", 
cb.group(1).strip()).replace("\n      ", "\n") if cb else ""
+        # expected output
+        eo = re.search(r"\.\. agent-skill-expected-output::\s*\n(.*?)(?=\n\.\. 
agent-skill|\Z)", block, re.S)
+        skill["expected_output"] = re.sub(r"^      ", "", 
eo.group(1).strip()).replace("\n      ", "\n") if eo else ""
+        skills.append(skill)
+    return skills

Review Comment:
   `parse_skills()` relies on regexes that are sensitive to exact indentation 
(e.g., options must start with exactly three spaces, and 
command/expected-output unindent assumes exactly six spaces). This is brittle 
for RST and will be hard to maintain as docs evolve. Consider parsing based on 
“indented block” rules (detect indentation level dynamically) or using docutils 
to parse directives, so extraction is robust to minor formatting changes.



##########
scripts/ci/pre_commit/tests/test_validate_skills.py:
##########
@@ -0,0 +1,130 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for validate_skills.py."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+
+SCRIPT_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(SCRIPT_ROOT))
+
+from agent_skills import validate_skills  # noqa: E402
+
+

Review Comment:
   These tests modify `sys.path` at runtime to import `agent_skills`. This can 
mask import/package issues and make tests order-dependent. Prefer importing via 
the repo’s normal Python packaging/test configuration (or use `PYTHONPATH` in 
the test runner) so imports work the same way in CI and locally.
   ```suggestion
   import importlib.util
   from pathlib import Path
   
   import pytest
   
   SCRIPT_ROOT = Path(__file__).resolve().parents[2]
   _VALIDATE_SKILLS_PATH = SCRIPT_ROOT / "agent_skills" / "validate_skills.py"
   _spec = 
importlib.util.spec_from_file_location("agent_skills.validate_skills", 
_VALIDATE_SKILLS_PATH)
   if _spec is None or _spec.loader is None:
       raise ImportError(f"Cannot load validate_skills from 
{_VALIDATE_SKILLS_PATH}")
   validate_skills = importlib.util.module_from_spec(_spec)
   _spec.loader.exec_module(validate_skills)  # type: ignore[arg-type]
   ```



##########
scripts/ci/pre_commit/tests/test_extract_agent_skills.py:
##########
@@ -0,0 +1,474 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use it except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for extract_agent_skills.py."""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+# Add parent so we can import extract_agent_skills
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(SCRIPT_DIR))
+
+from extract_agent_skills import extract_all_skills, parse_skills, 
validate_skill
+

Review Comment:
   This test file mutates `sys.path` to import the script module directly. This 
makes imports dependent on execution location and can hide real packaging 
issues. Prefer importing via a proper module path (e.g., 
`scripts.ci.pre_commit...` if available) or configuring tests to include the 
repo root on `PYTHONPATH` rather than doing per-test `sys.path.insert`.
   ```suggestion
   import importlib.util
   import json
   import subprocess
   import sys
   from pathlib import Path
   
   # Load extract_agent_skills module directly from its file path without 
mutating sys.path
   SCRIPT_DIR = Path(__file__).resolve().parent.parent
   _extract_agent_skills_path = SCRIPT_DIR / "extract_agent_skills.py"
   _spec = importlib.util.spec_from_file_location("extract_agent_skills", 
_extract_agent_skills_path)
   _extract_agent_skills_module = importlib.util.module_from_spec(_spec)
   assert _spec is not None and _spec.loader is not None
   _spec.loader.exec_module(_extract_agent_skills_module)
   extract_all_skills = _extract_agent_skills_module.extract_all_skills
   parse_skills = _extract_agent_skills_module.parse_skills
   validate_skill = _extract_agent_skills_module.validate_skill
   ```



##########
scripts/ci/pre_commit/tests/test_e2e.py:
##########
@@ -0,0 +1,108 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""End-to-end tests for agent-skills extraction and command routing."""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+PRE_COMMIT_DIR = Path(__file__).resolve().parents[1]
+CI_DIR = Path(__file__).resolve().parents[2]
+REPO_ROOT = Path(__file__).resolve().parents[4]
+sys.path.insert(0, str(CI_DIR))
+
+from agent_skills import breeze_context  # noqa: E402
+
+
+def _extract_skills_to_file(skills_file: Path) -> 
subprocess.CompletedProcess[str]:
+    extract_script = PRE_COMMIT_DIR / "extract_agent_skills.py"
+    return subprocess.run(
+        [
+            sys.executable,
+            str(extract_script),
+            "--docs-file",
+            str(REPO_ROOT / "AGENTS.md"),
+            "--docs-dir",
+            str(REPO_ROOT / "contributing-docs"),
+            "--output",
+            str(skills_file),
+        ],
+        capture_output=True,
+        text=True,
+        cwd=REPO_ROOT,
+    )

Review Comment:
   These end-to-end tests run the extractor/check scripts via 
`subprocess.run(...)` without any timeout. If the script blocks (e.g., due to 
unexpected parsing edge cases), pytest can hang. Please add a reasonable 
`timeout=` to these subprocess calls to make CI failures bounded.



##########
scripts/ci/pre_commit/tests/test_skill_graph.py:
##########
@@ -0,0 +1,98 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use it except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for skill_graph.py."""
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+
+
+def _run_skill_graph(skills_json: Path, output_json: Path | None = None) -> 
subprocess.CompletedProcess:
+    cmd = [sys.executable, str(SCRIPT_DIR / "skill_graph.py"), 
"--skills-json", str(skills_json)]
+    if output_json is not None:
+        cmd.extend(["--output", str(output_json)])
+    return subprocess.run(cmd, capture_output=True, text=True)

Review Comment:
   `subprocess.run(..., capture_output=True, text=True)` here should set 
`check=False/True` explicitly and include a `timeout=` to prevent the test from 
hanging if the script blocks. Adding a timeout also makes CI failure modes 
clearer.
   ```suggestion
       return subprocess.run(cmd, capture_output=True, text=True, check=False, 
timeout=30)
   ```



##########
scripts/ci/agent_skills/query_skills.py:
##########
@@ -0,0 +1,118 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Query the Agent Skills manifest for contributors and AI agents."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import pathlib
+import sys
+
+SKILLS_JSON = pathlib.Path("contributing-docs/agent_skills/skills.json")
+GRAPH_JSON = pathlib.Path("contributing-docs/agent_skills/skill_graph.json")
+
+
+def load_skills() -> list[dict]:
+    with SKILLS_JSON.open() as f:
+        return json.load(f)["skills"]
+
+
+def load_graph() -> dict:
+    with GRAPH_JSON.open() as f:
+        return json.load(f)

Review Comment:
   `load_skills()` / `load_graph()` open `skills.json` / `skill_graph.json` 
relative to the current working directory. Invoking this CLI from outside the 
repo root will fail with FileNotFoundError. Consider resolving paths relative 
to the repo root (e.g., `Path(__file__).resolve().parents[...]`) or accepting a 
`--skills-json` / `--graph-json` argument (with current paths as defaults).



##########
scripts/ci/agent_skills/breeze_context.py:
##########
@@ -0,0 +1,170 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Apache Airflow Agent Skills — Breeze Context Detection.
+
+Provides runtime host vs. container detection and command routing for
+AI coding agents contributing to Apache Airflow.
+
+Detection priority chain:
+  1. AIRFLOW_BREEZE_CONTAINER env var (explicit)
+  2. /.dockerenv file (Docker marker)
+  2b. /.containerenv file (Podman marker)
+  3. /opt/airflow path (Breeze mount)
+  4. Default: host
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import pathlib
+
+SKILLS_JSON = pathlib.Path("contributing-docs/agent_skills/skills.json")

Review Comment:
   `SKILLS_JSON` is a relative path, which means `breeze_context` will fail to 
find the manifest if imported/used from a working directory other than the repo 
root. Consider resolving the default path relative to the repo root (e.g., 
based on `__file__`) or providing an environment variable/parameter override so 
consumers can locate the manifest reliably.
   ```suggestion
   # Resolve skills manifest path relative to the repository root, not the CWD.
   # This makes imports/usage from arbitrary working directories reliable.
   _REPO_ROOT = pathlib.Path(__file__).resolve().parents[3]
   _DEFAULT_SKILLS_JSON = _REPO_ROOT / 
"contributing-docs/agent_skills/skills.json"
   _SKILLS_JSON_OVERRIDE = os.environ.get("AIRFLOW_AGENT_SKILLS_JSON")
   SKILLS_JSON = pathlib.Path(_SKILLS_JSON_OVERRIDE) if _SKILLS_JSON_OVERRIDE 
else _DEFAULT_SKILLS_JSON
   ```



##########
scripts/ci/pre_commit/tests/test_breeze_context.py:
##########
@@ -0,0 +1,165 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for breeze_context.py."""
+
+from __future__ import annotations
+
+import json
+import pathlib
+import sys
+from pathlib import Path
+
+import pytest
+
+SCRIPT_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(SCRIPT_ROOT))
+
+from agent_skills import breeze_context  # noqa: E402

Review Comment:
   This test module modifies `sys.path` at runtime to import `agent_skills`. 
This can make tests order-dependent and hide packaging/import issues. Prefer 
configuring test execution so the repo root / relevant package is importable 
without per-test `sys.path.insert`.
   ```suggestion
   import importlib.util
   import json
   import pathlib
   from pathlib import Path
   
   import pytest
   
   SCRIPT_ROOT = Path(__file__).resolve().parents[2]
   BREEZE_CONTEXT_PATH = SCRIPT_ROOT / "agent_skills" / "breeze_context.py"
   
   _spec = 
importlib.util.spec_from_file_location("agent_skills.breeze_context", 
BREEZE_CONTEXT_PATH)
   breeze_context = importlib.util.module_from_spec(_spec)  # type: 
ignore[assignment]
   assert _spec is not None and _spec.loader is not None
   _spec.loader.exec_module(breeze_context)  # type: ignore[union-attr]
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to