(hugegraph-ai) branch main updated: fix(llm): improve graph JSON parsing robustness for LLM outputs (#332)

jin Tue, 19 May 2026 00:55:52 -0700

This is an automated email from the ASF dual-hosted git repository.

imbajin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/hugegraph-ai.git



The following commit(s) were added to refs/heads/main by this push:
     new 016158f1 fix(llm): improve graph JSON parsing robustness for LLM outputs (#332)
016158f1 is described below

commit 016158f1a3ee19f880a9e9cd7bb1e641234b20d3
Author: mengmeng.lin <[email protected]>
AuthorDate: Tue May 19 15:53:36 2026 +0800

    fix(llm): improve graph JSON parsing robustness for LLM outputs (#332)
    
    ## Summary
    
    - Improve `_extract_and_filter_label` to handle varying LLM output
    formats
    - Strip markdown code blocks before JSON extraction
    - Support both `{"vertices":[...], "edges":[...]}` (object) and flat
    array formats
    - Auto-convert flat arrays to the expected object structure
    
    🤖 Generated with [Claude Code](https://claude.com/claude-code)
    
    ---------
    
    Co-authored-by: linmm <[email protected]>
    Co-authored-by: imbajin <[email protected]>
---
 .../operators/llm_op/property_graph_extract.py     |  18 ++-
 .../llm_op/test_property_graph_extract.py          | 154 +++++++++++++++++++++
 2 files changed, 167 insertions(+), 5 deletions(-)

diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
index bf363019..31411d96 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
@@ -121,18 +121,26 @@ class PropertyGraphExtract:
         return self.llm.generate(prompt=prompt)
 
     def _extract_and_filter_label(self, schema, text) -> List[Dict[str, Any]]:
-        # Use regex to extract a JSON object with curly braces
-        json_match = re.search(r"({.*})", text, re.DOTALL)
+        # Strip markdown code blocks (e.g. ```json ... ```)
+        text = re.sub(r"```\w*\n?", "", text)
+        text = re.sub(r"```", "", text)
+        text = text.strip()
+
+        # Try to extract JSON (object or array)
+        json_match = re.search(r"(\{.*\}|\[.*\])", text, re.DOTALL)
         if not json_match:
-            log.critical(
-                "Invalid property graph! No JSON object found, please check 
the output format example in prompt."
-            )
+            log.critical("Invalid property graph! No JSON found, please check 
the output format example in prompt.")
             return []
         json_str = json_match.group(1).strip()
 
         items = []
         try:
             property_graph = json.loads(json_str)
+            # Handle flat array format: convert to {"vertices": [...], 
"edges": [...]}
+            if isinstance(property_graph, list):
+                vertices = [item for item in property_graph if 
isinstance(item, dict) and item.get("type") == "vertex"]
+                edges = [item for item in property_graph if isinstance(item, 
dict) and item.get("type") == "edge"]
+                property_graph = {"vertices": vertices, "edges": edges}
             # Expect property_graph to be a dict with keys "vertices" and 
"edges"
             if not (isinstance(property_graph, dict) and "vertices" in 
property_graph and "edges" in property_graph):
                 log.critical("Invalid property graph format; expecting 
'vertices' and 'edges'.")
diff --git a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
index 5a2dee09..7c84de15 100644
--- a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
+++ b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
@@ -200,6 +200,160 @@ class TestPropertyGraphExtract(unittest.TestCase):
         self.assertEqual(result[1]["type"], "edge")
         self.assertEqual(result[1]["label"], "acted_in")
 
+    def test_extract_and_filter_label_markdown_json(self):
+        """Test _extract_and_filter_label with JSON wrapped in markdown 
fences."""
+        extractor = PropertyGraphExtract(llm=self.mock_llm)
+        text = f"""```json
+{self.llm_responses[1]}
+```"""
+
+        result = extractor._extract_and_filter_label(self.schema, text)
+
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[0]["type"], "vertex")
+        self.assertEqual(result[0]["label"], "movie")
+        self.assertEqual(result[1]["type"], "edge")
+        self.assertEqual(result[1]["label"], "acted_in")
+
+    def test_extract_and_filter_label_markdown_json_with_prose(self):
+        """Test fenced JSON can be parsed when the LLM adds prose."""
+        extractor = PropertyGraphExtract(llm=self.mock_llm)
+        text = f"""Here is the extracted graph:
+```
+{self.llm_responses[1]}
+```
+Hope this helps."""
+
+        result = extractor._extract_and_filter_label(self.schema, text)
+
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[0]["type"], "vertex")
+        self.assertEqual(result[0]["label"], "movie")
+        self.assertEqual(result[1]["type"], "edge")
+        self.assertEqual(result[1]["label"], "acted_in")
+
+    def test_extract_and_filter_label_flat_array_json(self):
+        """Test _extract_and_filter_label converts flat arrays to vertices and 
edges."""
+        extractor = PropertyGraphExtract(llm=self.mock_llm)
+        text = """```json
+        [
+            {
+                "type": "vertex",
+                "label": "person",
+                "properties": {
+                    "name": "Tom Hanks"
+                }
+            },
+            {
+                "type": "edge",
+                "label": "acted_in",
+                "properties": {
+                    "role": "Forrest Gump"
+                },
+                "source": {
+                    "label": "person",
+                    "properties": {
+                        "name": "Tom Hanks"
+                    }
+                },
+                "target": {
+                    "label": "movie",
+                    "properties": {
+                        "title": "Forrest Gump"
+                    }
+                }
+            }
+        ]
+        ```"""
+
+        result = extractor._extract_and_filter_label(self.schema, text)
+
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[0]["type"], "vertex")
+        self.assertEqual(result[0]["label"], "person")
+        self.assertEqual(result[1]["type"], "edge")
+        self.assertEqual(result[1]["label"], "acted_in")
+
+    def test_extract_and_filter_label_flat_array_filters_invalid_items(self):
+        """Test flat arrays keep valid graph items and drop invalid ones."""
+        extractor = PropertyGraphExtract(llm=self.mock_llm)
+        text = """[
+            {
+                "type": "vertex",
+                "label": "person",
+                "properties": {
+                    "name": "Tom Hanks"
+                }
+            },
+            {
+                "type": "vertex",
+                "label": "unknown_label",
+                "properties": {
+                    "name": "Unknown"
+                }
+            },
+            {
+                "type": "edge",
+                "label": "acted_in",
+                "properties": {
+                    "role": "Forrest Gump"
+                },
+                "source": {
+                    "label": "person",
+                    "properties": {
+                        "name": "Tom Hanks"
+                    }
+                },
+                "target": {
+                    "label": "movie",
+                    "properties": {
+                        "title": "Forrest Gump"
+                    }
+                }
+            },
+            {
+                "type": "edge",
+                "label": "unknown_edge",
+                "properties": {}
+            },
+            {
+                "type": "note",
+                "label": "person",
+                "properties": {}
+            },
+            "not-a-dict"
+        ]"""
+
+        result = extractor._extract_and_filter_label(self.schema, text)
+
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[0]["type"], "vertex")
+        self.assertEqual(result[0]["label"], "person")
+        self.assertEqual(result[1]["type"], "edge")
+        self.assertEqual(result[1]["label"], "acted_in")
+
+    def test_extract_and_filter_label_malformed_fenced_json(self):
+        """Test malformed fenced JSON returns no graph items."""
+        extractor = PropertyGraphExtract(llm=self.mock_llm)
+        text = """```json
+        {
+            "vertices": [
+                {
+                    "type": "vertex",
+                    "label": "person",
+                    "properties": {
+                        "name": "Tom Hanks"
+                    }
+                }
+            ],
+            "edges": []
+        ```
+        """
+
+        result = extractor._extract_and_filter_label(self.schema, text)
+
+        self.assertEqual(result, [])
+
     def test_extract_and_filter_label_invalid_json(self):
         """Test the _extract_and_filter_label method with invalid JSON."""
         extractor = PropertyGraphExtract(llm=self.mock_llm)

(hugegraph-ai) branch main updated: fix(llm): improve graph JSON parsing robustness for LLM outputs (#332)

Reply via email to