This is an automated email from the ASF dual-hosted git repository.
imbajin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new f6e08f94 fix(llm): strip markdown code fences in keyword extraction
parser (#338)
f6e08f94 is described below
commit f6e08f9482aa5810c5595197cbbc60fce782c894
Author: mengmeng.lin <[email protected]>
AuthorDate: Tue May 19 12:53:06 2026 +0800
fix(llm): strip markdown code fences in keyword extraction parser (#338)
## Summary
- Strip markdown code fences (` ``` `) from the LLM response before keyword
extraction parsing
- Consistent with how other operators (`gremlin_generate`,
`schema_build`, `property_graph_extract`) already handle LLM output
Fixes #333
## Test plan
- [x] `ruff format --check .` and `ruff check .` pass
- [ ] Verify keyword extraction works when the LLM returns responses wrapped
in markdown code blocks
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: linmm <[email protected]>
Co-authored-by: Claude Opus 4.6 <[email protected]>
Co-authored-by: imbajin <[email protected]>
---
.../operators/llm_op/keyword_extract.py | 5 +++
.../tests/operators/llm_op/test_keyword_extract.py | 46 ++++++++++++++++++++++
2 files changed, 51 insertions(+)
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
index 0d8b4a17..5b322751 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
@@ -151,6 +151,11 @@ class KeywordExtract:
) -> Dict[str, float]:
results = {}
+ # Strip markdown code fences before parsing
+ response = re.sub(r"```\w*\n?", "", response)
+ response = re.sub(r"```", "", response)
+ response = response.strip()
+
# use re.escape(start_token) if start_token contains special chars like */&/^ etc.
matches = re.findall(rf"{start_token}([^\n]+\n?)", response)
diff --git a/hugegraph-llm/src/tests/operators/llm_op/test_keyword_extract.py b/hugegraph-llm/src/tests/operators/llm_op/test_keyword_extract.py
index e5672954..a2cc1bb5 100644
--- a/hugegraph-llm/src/tests/operators/llm_op/test_keyword_extract.py
+++ b/hugegraph-llm/src/tests/operators/llm_op/test_keyword_extract.py
@@ -206,6 +206,52 @@ class TestKeywordExtract(unittest.TestCase):
self.assertIn("machine learning", keywords)
self.assertIn("neural networks", keywords)
+ def test_extract_keywords_from_markdown_code_block(self):
+ """Test _extract_keywords_from_response strips markdown code fences."""
+ response = """```text
+KEYWORDS: artificial intelligence:0.9, machine learning:0.8, neural networks:0.7
+```"""
+ keywords = self.extractor._extract_keywords_from_response(response, lowercase=False, start_token="KEYWORDS:")
+
+ self.assertEqual(keywords["artificial intelligence"], 0.9)
+ self.assertEqual(keywords["machine learning"], 0.8)
+ self.assertEqual(keywords["neural networks"], 0.7)
+
+ def test_extract_keywords_from_common_markdown_code_fences(self):
+ """Test common fenced output variants from LLMs."""
+ responses = [
+ """```json
+KEYWORDS: artificial intelligence:0.9, machine learning:0.8
+```""",
+ """```
+KEYWORDS: artificial intelligence:0.9, machine learning:0.8
+```""",
+ "Here are the keywords:\n```text\r\nKEYWORDS: artificial intelligence:0.9, machine learning:0.8\r\n```\nDone.",
+ ]
+
+ for response in responses:
+ with self.subTest(response=response):
+ keywords = self.extractor._extract_keywords_from_response(
+ response, lowercase=False, start_token="KEYWORDS:"
+ )
+
+ self.assertEqual(keywords["artificial intelligence"], 0.9)
+ self.assertEqual(keywords["machine learning"], 0.8)
+
+ def test_extract_keywords_from_fenced_output_skips_malformed_items(self):
+ """Test fenced output keeps valid keywords while skipping malformed items."""
+ response = """```markdown
+KEYWORDS: artificial intelligence:0.9, missing-score, bad score:not-a-number, graph:1.2, machine learning:0.8
+```"""
+
+ keywords = self.extractor._extract_keywords_from_response(response, lowercase=False, start_token="KEYWORDS:")
+
+ self.assertEqual(keywords["artificial intelligence"], 0.9)
+ self.assertEqual(keywords["graph"], 1.0)
+ self.assertEqual(keywords["machine learning"], 0.8)
+ self.assertNotIn("missing-score", keywords)
+ self.assertNotIn("bad score", keywords)
+
def test_extract_keywords_from_response_without_start_token(self):
"""Test _extract_keywords_from_response method without start token."""
response = "artificial intelligence:0.9, machine learning:0.8, neural networks:0.7"