This is an automated email from the ASF dual-hosted git repository.
imbajin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new 016158f1 fix(llm): improve graph JSON parsing robustness for LLM outputs (#332)
016158f1 is described below
commit 016158f1a3ee19f880a9e9cd7bb1e641234b20d3
Author: mengmeng.lin <[email protected]>
AuthorDate: Tue May 19 15:53:36 2026 +0800
fix(llm): improve graph JSON parsing robustness for LLM outputs (#332)
## Summary
- Improve `_extract_and_filter_label` to handle varying LLM output formats
- Strip markdown code blocks before JSON extraction
- Support both `{"vertices":[...], "edges":[...]}` (object) and flat array formats
- Auto-convert flat arrays to the expected object structure
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: linmm <[email protected]>
Co-authored-by: imbajin <[email protected]>
---
.../operators/llm_op/property_graph_extract.py | 18 ++-
.../llm_op/test_property_graph_extract.py | 154 +++++++++++++++++++++
2 files changed, 167 insertions(+), 5 deletions(-)
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
index bf363019..31411d96 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
@@ -121,18 +121,26 @@ class PropertyGraphExtract:
return self.llm.generate(prompt=prompt)
def _extract_and_filter_label(self, schema, text) -> List[Dict[str, Any]]:
- # Use regex to extract a JSON object with curly braces
- json_match = re.search(r"({.*})", text, re.DOTALL)
+ # Strip markdown code blocks (e.g. ```json ... ```)
+ text = re.sub(r"```\w*\n?", "", text)
+ text = re.sub(r"```", "", text)
+ text = text.strip()
+
+ # Try to extract JSON (object or array)
+ json_match = re.search(r"(\{.*\}|\[.*\])", text, re.DOTALL)
if not json_match:
- log.critical(
- "Invalid property graph! No JSON object found, please check
the output format example in prompt."
- )
+ log.critical("Invalid property graph! No JSON found, please check
the output format example in prompt.")
return []
json_str = json_match.group(1).strip()
items = []
try:
property_graph = json.loads(json_str)
+ # Handle flat array format: convert to {"vertices": [...],
"edges": [...]}
+ if isinstance(property_graph, list):
+ vertices = [item for item in property_graph if
isinstance(item, dict) and item.get("type") == "vertex"]
+ edges = [item for item in property_graph if isinstance(item,
dict) and item.get("type") == "edge"]
+ property_graph = {"vertices": vertices, "edges": edges}
# Expect property_graph to be a dict with keys "vertices" and
"edges"
if not (isinstance(property_graph, dict) and "vertices" in
property_graph and "edges" in property_graph):
log.critical("Invalid property graph format; expecting
'vertices' and 'edges'.")
diff --git
a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
index 5a2dee09..7c84de15 100644
--- a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
+++ b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
@@ -200,6 +200,160 @@ class TestPropertyGraphExtract(unittest.TestCase):
self.assertEqual(result[1]["type"], "edge")
self.assertEqual(result[1]["label"], "acted_in")
+ def test_extract_and_filter_label_markdown_json(self):
+ """Test _extract_and_filter_label with JSON wrapped in markdown
fences."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = f"""```json
+{self.llm_responses[1]}
+```"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(len(result), 2)
+ self.assertEqual(result[0]["type"], "vertex")
+ self.assertEqual(result[0]["label"], "movie")
+ self.assertEqual(result[1]["type"], "edge")
+ self.assertEqual(result[1]["label"], "acted_in")
+
+ def test_extract_and_filter_label_markdown_json_with_prose(self):
+ """Test fenced JSON can be parsed when the LLM adds prose."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = f"""Here is the extracted graph:
+```
+{self.llm_responses[1]}
+```
+Hope this helps."""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(len(result), 2)
+ self.assertEqual(result[0]["type"], "vertex")
+ self.assertEqual(result[0]["label"], "movie")
+ self.assertEqual(result[1]["type"], "edge")
+ self.assertEqual(result[1]["label"], "acted_in")
+
+ def test_extract_and_filter_label_flat_array_json(self):
+ """Test _extract_and_filter_label converts flat arrays to vertices and
edges."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """```json
+ [
+ {
+ "type": "vertex",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "type": "edge",
+ "label": "acted_in",
+ "properties": {
+ "role": "Forrest Gump"
+ },
+ "source": {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ "target": {
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ }
+ ]
+ ```"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(len(result), 2)
+ self.assertEqual(result[0]["type"], "vertex")
+ self.assertEqual(result[0]["label"], "person")
+ self.assertEqual(result[1]["type"], "edge")
+ self.assertEqual(result[1]["label"], "acted_in")
+
+ def test_extract_and_filter_label_flat_array_filters_invalid_items(self):
+ """Test flat arrays keep valid graph items and drop invalid ones."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """[
+ {
+ "type": "vertex",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "type": "vertex",
+ "label": "unknown_label",
+ "properties": {
+ "name": "Unknown"
+ }
+ },
+ {
+ "type": "edge",
+ "label": "acted_in",
+ "properties": {
+ "role": "Forrest Gump"
+ },
+ "source": {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ "target": {
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ },
+ {
+ "type": "edge",
+ "label": "unknown_edge",
+ "properties": {}
+ },
+ {
+ "type": "note",
+ "label": "person",
+ "properties": {}
+ },
+ "not-a-dict"
+ ]"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(len(result), 2)
+ self.assertEqual(result[0]["type"], "vertex")
+ self.assertEqual(result[0]["label"], "person")
+ self.assertEqual(result[1]["type"], "edge")
+ self.assertEqual(result[1]["label"], "acted_in")
+
+ def test_extract_and_filter_label_malformed_fenced_json(self):
+ """Test malformed fenced JSON returns no graph items."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """```json
+ {
+ "vertices": [
+ {
+ "type": "vertex",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ }
+ ],
+ "edges": []
+ ```
+ """
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(result, [])
+
def test_extract_and_filter_label_invalid_json(self):
"""Test the _extract_and_filter_label method with invalid JSON."""
extractor = PropertyGraphExtract(llm=self.mock_llm)