alnzng commented on code in PR #596: URL: https://github.com/apache/flink-agents/pull/596#discussion_r3071146044
########## python/flink_agents/runtime/skill/skill_parser.py: ########## @@ -0,0 +1,146 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import re +from dataclasses import dataclass +from typing import Dict + +import yaml + +from flink_agents.runtime.skill.agent_skill import AgentSkill + + +@dataclass +class ParsedMarkdown: + """Result of parsing markdown with frontmatter. + + Contains both the extracted metadata and the markdown content. + + Attributes: + ---------- + metadata : Dict[str, str] + YAML metadata extracted from frontmatter. + content : str + Markdown content (without frontmatter). + """ + + metadata: Dict[str, str] + content: str + + +class MarkdownSkillParser: + """Utility for parsing Markdown files with YAML frontmatter.""" + + # Pattern to match frontmatter: starts with ---, ends with --- + # Group 1: frontmatter content (non-greedy) + # Group 2: remaining content + FRONTMATTER_PATTERN = re.compile( + r"^---\s*[\r\n]+(.*?)[\r\n]*---(?:\s*[\r\n]+)?(.*)", re.DOTALL + ) + + # Pattern to match key: value format + KEY_VALUE_PATTERN = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$") + + @classmethod + def parse(cls, markdown: str) -> ParsedMarkdown: + """Parse markdown content with YAML frontmatter. + + Extracts both the YAML metadata and the markdown content. + + Args: + markdown: Markdown content. + + Returns: + ParsedMarkdown containing metadata and content. + + Raises: + ValueError: If YAML syntax is invalid. + """ + if not markdown: + return ParsedMarkdown(metadata={}, content="") + + matcher = cls.FRONTMATTER_PATTERN.match(markdown) + + if not matcher: + # No frontmatter found, treat entire content as markdown + return ParsedMarkdown(metadata={}, content=markdown) + + yaml_content = matcher.group(1).strip() + markdown_content = matcher.group(2) + + if not yaml_content: + return ParsedMarkdown(metadata={}, content=markdown_content) + + try: + metadata = yaml.safe_load(yaml_content) Review Comment: does markdown frontmatter always follow YAML syntax? ########## python/flink_agents/runtime/skill/skill_parser.py: ########## @@ -0,0 +1,146 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import re +from dataclasses import dataclass +from typing import Dict + +import yaml + +from flink_agents.runtime.skill.agent_skill import AgentSkill + + +@dataclass +class ParsedMarkdown: + """Result of parsing markdown with frontmatter. + + Contains both the extracted metadata and the markdown content. + + Attributes: + ---------- + metadata : Dict[str, str] + YAML metadata extracted from frontmatter. + content : str + Markdown content (without frontmatter). + """ + + metadata: Dict[str, str] + content: str + + +class MarkdownSkillParser: + """Utility for parsing Markdown files with YAML frontmatter.""" + + # Pattern to match frontmatter: starts with ---, ends with --- + # Group 1: frontmatter content (non-greedy) + # Group 2: remaining content + FRONTMATTER_PATTERN = re.compile( + r"^---\s*[\r\n]+(.*?)[\r\n]*---(?:\s*[\r\n]+)?(.*)", re.DOTALL + ) + + # Pattern to match key: value format + KEY_VALUE_PATTERN = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$") Review Comment: Looks like we can remove this? I didn't see it is used in any other place. ########## python/flink_agents/runtime/skill/skill_manager.py: ########## @@ -0,0 +1,285 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################# +import shlex +from pathlib import Path +from typing import ClassVar, Dict, List, Set + +from flink_agents.api.skills import Skills +from flink_agents.runtime.skill.agent_skill import AgentSkill +from flink_agents.runtime.skill.repository.filesystem_repository import ( + FileSystemSkillRepository, +) +from flink_agents.runtime.skill.skill_prompt_provider import SkillPromptProvider +from flink_agents.runtime.skill.skill_repository import SkillRepository + + +class RegisteredSkill: + """A wrapper that associates an AgentSkill with its source repository. + + Provides lazy activation for skill resources, loading them only when + first needed (e.g., when get_resource or get_resource_paths is called). + """ + + def __init__(self, agent_skill: AgentSkill, repo: SkillRepository) -> None: + """Initialize a new registered skill.""" + self.agent_skill = agent_skill + self.repo = repo + self.active = False + + @property + def name(self) -> str: + """Get the name of the skill.""" + return self.agent_skill.name + + @property + def description(self) -> str: + """Get the description of the skill.""" + return self.agent_skill.description + + @property + def content(self) -> str: + """Get the content of the skill.""" + return self.agent_skill.content + + def get_resource(self, resource_path: str) -> str: + """Get the resource content by relative path.""" + self._activate() + return self.agent_skill.get_resource(resource_path) + + def get_resource_paths(self) -> List[str]: + """Get all the resource relative paths of the skill.""" + self._activate() + return self.agent_skill.get_resource_paths() + + def _activate(self) -> None: + if not self.active: + self.agent_skill.resources = self.repo.get_resources(self.agent_skill.name) + self.active = True + + +class SkillManager: + """Internal runtime component for loading, parsing, and managing skills. + + Created by the runtime from a :class:`Skills` configuration resource. + Never exposed to users directly. + + Progressive Disclosure: + - Discovery: Load only name/description at startup (~100 tokens) + - Activation: Load full SKILL.md when skill matches task + - Execution: Load resources/scripts only when needed + """ + + # Mapping from script type name to (file extensions, known interpreters) + SCRIPT_TYPE_REGISTRY: ClassVar[Dict[str, tuple[set[str], set[str]]]] = { + "shell": ({".sh", ".bash"}, {"sh", "bash"}), + "python": ({".py"}, {"python", "python3"}), + } + + def __init__(self, skills_config: Skills) -> None: + """Initialize the SkillManager from a Skills configuration.""" + self._skills: Dict[str, RegisteredSkill] = {} + self._config = skills_config + self._allowed_commands: Set[str] = set(skills_config.allowed_commands) + # Build allowed extensions and interpreters from configured script types + self._allowed_extensions: Set[str] = set() + self._allowed_interpreters: Set[str] = set() + for script_type in skills_config.allowed_script_types: + entry = self.SCRIPT_TYPE_REGISTRY.get(script_type) + if entry: + self._allowed_extensions.update(entry[0]) + self._allowed_interpreters.update(entry[1]) + self._load_skills_from_paths() + self._load_skills_from_urls() + self._load_skills_from_resources() + + @property + def size(self) -> int: + """Get the number of registered skills.""" + return len(self._skills) + + def get_skill(self, name: str) -> RegisteredSkill: + """Get a registered skill by name.""" + if name not in self._skills: + msg = f"Skill {name} not found, available skill names are: {list(self._skills.keys())}" + raise ValueError(msg) + return self._skills[name] + + def get_all_skill_names(self) -> List[str]: + """Get the names of all registered skills.""" + return list(self._skills.keys()) + + def load_skill_resource(self, skill_name: str, resource_path: str) -> str | None: + """Load a specified resource of a skill.""" + skill = self.get_skill(skill_name) + return skill.get_resource(resource_path) + + def generate_discovery_prompt(self, *names: str) -> str: + """Generate a system prompt for skill discovery.""" + if self.size == 0: + return "" + + skill_list = [] + for name in names: + skill = self.get_skill(name) + skill_list.append( + SkillPromptProvider.AVAILABLE_SKILL_TEMPLATE.format( + name=skill.name, description=skill.description + ) + ) + + return ( + SkillPromptProvider.SKILL_DISCOVERY_PROMPT.format() + + ("".join(skill_list)) + + SkillPromptProvider.AVAILABLE_SKILLS_TAG_END + ) + + def validate_command(self, skill_name: str, command: str) -> str | None: + """Validate a command before execution. + + Returns None if the command is allowed, or an error message if rejected. + + If the command contains shell operators (``|``, ``&&``, ``;``, etc.), + it is split into sub-commands and each is validated individually. + + For each sub-command: + 1. If it executes a skill script (directly or via interpreter), + check that the script is a skill resource with an allowed extension. + 2. Otherwise, check the executable against the allowed_commands whitelist. + """ + sub_commands = [s.strip() for s in self._split_commands(command) if s.strip()] + if not sub_commands: + return "Empty command." + + for sub_cmd in sub_commands: + error = self._validate_single_command(skill_name, sub_cmd) + if error is not None: + return error + return None + + def _validate_single_command(self, skill_name: str, command: str) -> str | None: + """Validate a single command (no shell operators).""" + tokens = shlex.split(command) + if not tokens: + return "Empty command." + + executable = tokens[0] + + # Case 1a: direct script execution — e.g. "scripts/run.sh arg1" + if self.is_skill_resource(skill_name, executable): + return self._validate_script_extension(executable) + + # Case 1b: interpreter + script — e.g. "python scripts/calc.py arg1" + if executable in self._allowed_interpreters and len(tokens) > 1: + script = tokens[1] + if self.is_skill_resource(skill_name, script): + return self._validate_script_extension(script) + + # Case 2: not a script — check allowed_commands whitelist + if executable in self._allowed_commands: + return None + + return ( + f"Command '{executable}' is not allowed. " + f"Allowed commands: {sorted(self._allowed_commands)}." + ) + + def _validate_script_extension(self, script_path: str) -> str | None: + """Check that a script has an allowed extension.""" + if any(script_path.endswith(ext) for ext in self._allowed_extensions): + return None + return ( + f"Script '{script_path}' has an unsupported type. " + f"Allowed extensions: {sorted(self._allowed_extensions)}." + ) + + @staticmethod + def _split_commands(command: str) -> List[str]: + """Split a command string by shell operators, respecting quotes. Review Comment: IIUC, The intention of this method is to extract individual executable commands from a compound command string, not to perform a full shell/bash tokenization. Shell has many operators (e.g. redirection >, >>, <), but this method only targets command-separating operators, those that connect multiple executables. Can you confirm? If yes, let's make it clear in the comment. Also can we confirm all command-separating operators are covered? For example, are && and || handled correctly? Out of curiosity, is there any native Python lib or thirdparty lib could provide this kind of functionality? I am not sure if we covered all the necessary shell / bash operators, so I'm thinking if there is a such kind of lib then we could just use it. ########## python/flink_agents/runtime/skill/repository/url_repository.py: ########## @@ -0,0 +1,46 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +from typing import Dict, List + +from typing_extensions import override + +from flink_agents.runtime.skill.agent_skill import AgentSkill +from flink_agents.runtime.skill.skill_repository import ( + SkillRepository, +) + + +# TODO: Implement Review Comment: I would suggest to remove this unimplemented class to avoid confusion. ########## python/flink_agents/runtime/skill/repository/filesystem_repository.py: ########## @@ -0,0 +1,210 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import logging +import os +from pathlib import Path +from typing import Dict, List + +from typing_extensions import override + +from flink_agents.runtime.skill.agent_skill import AgentSkill +from flink_agents.runtime.skill.skill_parser import SkillParser +from flink_agents.runtime.skill.skill_repository import ( + SkillRepository, +) + +logger = logging.getLogger(__name__) + + +class FileSystemSkillRepository(SkillRepository): + """File system based implementation of SkillRepository. + + This repository stores skills in a local file system directory structure + where each skill is stored in its own subdirectory containing a SKILL.md + file and optional resource files. + + Directory structure: + + baseDir/ + ├── skill-name-1/ + │ ├── SKILL.md # Required: Entry file with YAML frontmatter + │ ├── references/ # Optional: Reference documentation + │ ├── examples/ # Optional: Example files + │ └── scripts/ # Optional: Script files + └── skill-name-2/ + └── SKILL.md + """ + + SKILL_MD_FILE = "SKILL.md" + + def __init__( + self, + base_dir: Path | str, + ) -> None: + """Create a FileSystemSkillRepository. + + Args: + base_dir: The base directory containing skill subdirectories. + skip_dirs: Optional set of directory names to skip. + skip_patterns: Optional set of file patterns to skip. + + Raises: + ValueError: If base_dir is None, doesn't exist, or is not a directory. + """ + if base_dir is None: + msg = "Base directory cannot be None" + raise ValueError(msg) + + # Convert to Path and normalize + self._base_dir = Path(base_dir).resolve() + + # Validate directory exists + if not self._base_dir.exists(): + msg = f"Base directory does not exist: {self._base_dir}" + raise ValueError(msg) + + # Validate it's a directory + if not self._base_dir.is_dir(): + msg = f"Base directory is not a directory: {self._base_dir}" + raise ValueError(msg) + + @property + def base_dir(self) -> Path: + """Get the base directory. + + Returns: + The base directory path. + """ + return self._base_dir + + @override + def get_skill(self, name: str) -> AgentSkill | None: + """Get a skill by name. + + Args: + name: The skill name. + + Returns: + The skill, or None if not found. + """ + skill_dir = self._base_dir / name + skill_md_path = skill_dir / self.SKILL_MD_FILE + + if not skill_md_path.exists(): + return None + + return self._load_skill(skill_dir) + + @override + def get_resources(self, name: str) -> Dict[str, str]: + skill_dir = self._base_dir / name + return self._load_resources(skill_dir) + + @override + def get_skills(self) -> List[AgentSkill]: + """Get all skills in this repository. + + Returns: + List of all skills. + """ + skills = [] + for skill_name in self._get_all_skill_names(): + skill = self.get_skill(skill_name) + if skill is not None: + skills.append(skill) + return skills + + def _get_all_skill_names(self) -> List[str]: + """Get all skill names in this repository. + + Returns: + List of skill names. + """ + return sorted( + [ + entry.name + for entry in self._base_dir.iterdir() + if entry.is_dir() and (entry / self.SKILL_MD_FILE).exists() + ] + ) + + def _load_skill(self, skill_dir: Path) -> AgentSkill | None: + """Load a skill from a directory. + + Args: + skill_dir: Path to the skill directory. + + Returns: + The loaded skill, or None if loading failed. + """ + skill_md_path = skill_dir / self.SKILL_MD_FILE + + if not skill_md_path.exists(): + return None + + try: + skill_md_content = skill_md_path.read_text() + + skill = SkillParser.parse_skill(skill_md_content) + + if skill.name != skill_dir.name: + logger.warning( + f"The skill name {skill.name}is different from the base directory {skill_dir.name}." Review Comment: `{skill.name}is` -> `{skill.name} is` ########## python/flink_agents/runtime/skill/skill_repository.py: ########## @@ -0,0 +1,84 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################# +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Dict, List + +from flink_agents.runtime.skill.agent_skill import AgentSkill + + +@dataclass +class SkillRepositoryInfo: + """Information about a skill repository. + + Attributes: + ---------- + repo_type : str + The type of repository (e.g., "filesystem", "classpath", "url"). + location : str + The location of the repository (e.g., path, URL). + writeable : bool + Whether the repository supports write operations. + """ + + repo_type: str + location: str + writeable: bool + + +class SkillRepository(ABC): + """Abstract interface for skill repositories. + + A SkillRepository is responsible for loading and optionally storing skills + from a specific source (filesystem, classpath, URL, etc.). + + Each skill is stored in its own subdirectory containing a SKILL.md file + and optional resource files: + + baseDir/ + ├── skill-name-1/ Review Comment: What will happen if the users define folder with other different names? For example, this Claude official skill: https://github.com/anthropics/skills/tree/main/skills/skill-creator ########## python/flink_agents/runtime/skill/skill_parser.py: ########## @@ -0,0 +1,146 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import re +from dataclasses import dataclass +from typing import Dict + +import yaml + +from flink_agents.runtime.skill.agent_skill import AgentSkill + + +@dataclass +class ParsedMarkdown: + """Result of parsing markdown with frontmatter. + + Contains both the extracted metadata and the markdown content. + + Attributes: + ---------- + metadata : Dict[str, str] + YAML metadata extracted from frontmatter. + content : str + Markdown content (without frontmatter). + """ + + metadata: Dict[str, str] + content: str + + +class MarkdownSkillParser: + """Utility for parsing Markdown files with YAML frontmatter.""" + + # Pattern to match frontmatter: starts with ---, ends with --- Review Comment: is below a valid frontmatter? ``` --- name: tricky description: | This has a --- inside --- ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
