(incubator-hugegraph-ai) branch main updated: fix(llm): correct e_cache type (#117)

jin Wed, 20 Nov 2024 22:55:10 -0800

This is an automated email from the ASF dual-hosted git repository.

jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git



The following commit(s) were added to refs/heads/main by this push:
     new f8b4582  fix(llm): correct e_cache type (#117)
f8b4582 is described below

commit f8b4582ee39f13e5f8734caa2e608082a685d91c
Author: chenzihong <[email protected]>
AuthorDate: Thu Nov 21 14:46:49 2024 +0800

    fix(llm): correct e_cache type (#117)
    
    - [X] fix edge cache to use (inV, label, outV) as the key
    - [X] limit node and edge property length
    - [X] handle isolated node problem
    
    ---------
    
    Co-authored-by: imbajin <[email protected]>
---
 .../src/hugegraph_llm/config/config_data.py        |  1 +
 .../src/hugegraph_llm/operators/graph_rag_task.py  |  7 ++-
 .../operators/hugegraph_op/graph_rag_query.py      | 69 +++++++++++++---------
 3 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/hugegraph-llm/src/hugegraph_llm/config/config_data.py 
b/hugegraph-llm/src/hugegraph_llm/config/config_data.py
index 004a29e..52b41dd 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/config_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/config_data.py
@@ -70,6 +70,7 @@ class ConfigData:
     graph_user: Optional[str] = "admin"
     graph_pwd: Optional[str] = "xxx"
     graph_space: Optional[str] = None
+    limit_property: Optional[str] = "False"
 
     """Admin settings"""
     enable_login: Optional[str] = "False"
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py 
b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
index e6da8e0..789ec20 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
@@ -117,6 +117,8 @@ class RAGPipeline:
         self,
         max_deep: int = 2,
         max_items: int = 30,
+        max_v_prop_len: int = 2048,
+        max_e_prop_len: int = 256,
         prop_to_match: Optional[str] = None,
     ):
         """
@@ -124,11 +126,14 @@ class RAGPipeline:
 
         :param max_deep: Maximum depth for the graph query.
         :param max_items: Maximum number of items to retrieve.
+        :param max_v_prop_len: Maximum length of vertex properties.
+        :param max_e_prop_len: Maximum length of edge properties.
         :param prop_to_match: Property to match in the graph.
         :return: Self-instance for chaining.
         """
         self._operators.append(
-            GraphRAGQuery(max_deep=max_deep, max_items=max_items, 
prop_to_match=prop_to_match)
+            GraphRAGQuery(max_deep=max_deep, max_items=max_items, 
max_v_prop_len=max_v_prop_len,
+                          max_e_prop_len=max_e_prop_len, 
prop_to_match=prop_to_match)
         )
         return self
 
diff --git 
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py 
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index 3f11c91..a3dc1ad 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -71,7 +71,8 @@ g.V().has('{prop}', within({keywords}))
 
 class GraphRAGQuery:
 
-    def __init__(self, max_deep: int = 2, max_items: int = 20, prop_to_match: 
Optional[str] = None):
+    def __init__(self, max_deep: int = 2, max_items: int = 20, max_v_prop_len: 
int = 2048,
+                 max_e_prop_len: int = 256, prop_to_match: Optional[str] = 
None):
         self._client = PyHugeClient(
             settings.graph_ip,
             settings.graph_port,
@@ -84,6 +85,9 @@ class GraphRAGQuery:
         self._max_items = max_items
         self._prop_to_match = prop_to_match
         self._schema = ""
+        self._limit_property = settings.limit_property.lower() == "true"
+        self._max_v_prop_len = max_v_prop_len
+        self._max_e_prop_len = max_e_prop_len
 
     def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
         # pylint: disable=R0915 (too-many-statements)
@@ -138,6 +142,10 @@ class GraphRAGQuery:
             graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = 
self._format_graph_query_result(
                 query_paths=paths
             )
+
+            # TODO: we may need to optimize the logic here with global 
deduplication (may lack some single vertex)
+            if not graph_chain_knowledge:
+                graph_chain_knowledge.update(vertex_knowledge)
             if vertex_degree_list:
                 vertex_degree_list[0].update(vertex_knowledge)
             else:
@@ -171,8 +179,7 @@ class GraphRAGQuery:
             "extracted based on key entities as subject:\n"
         )
         # TODO: set color for ↓ "\033[93mKnowledge from Graph:\033[0m"
-        log.debug("Knowledge from Graph:")
-        log.debug("\n".join(context["graph_result"]))
+        log.debug("Knowledge from Graph:\n%s", 
"\n".join(context["graph_result"]))
         return context
 
     def _format_graph_from_vertex(self, query_result: List[Any]) -> Set[str]:
@@ -189,20 +196,20 @@ class GraphRAGQuery:
         subgraph_with_degree = {}
         vertex_degree_list: List[Set[str]] = []
         v_cache: Set[str] = set()
-        e_cache: Set[str] = set()
+        e_cache: Set[Tuple[str, str, str]] = set()
 
         for path in query_paths:
             # 1. Process each path
-            flat_rel, nodes_with_degree = self._process_path(path, 
use_id_to_match, v_cache, e_cache)
-            subgraph.add(flat_rel)
-            subgraph_with_degree[flat_rel] = nodes_with_degree
+            path_str, vertex_with_degree = self._process_path(path, 
use_id_to_match, v_cache, e_cache)
+            subgraph.add(path_str)
+            subgraph_with_degree[path_str] = vertex_with_degree
             # 2. Update vertex degree list
-            self._update_vertex_degree_list(vertex_degree_list, 
nodes_with_degree)
+            self._update_vertex_degree_list(vertex_degree_list, 
vertex_with_degree)
 
         return subgraph, vertex_degree_list, subgraph_with_degree
 
     def _process_path(self, path: Any, use_id_to_match: bool, v_cache: 
Set[str],
-                      e_cache: Set[str]) -> Tuple[str, List[str]]:
+                      e_cache: Set[Tuple[str, str, str]]) -> Tuple[str, 
List[str]]:
         flat_rel = ""
         raw_flat_rel = path["objects"]
         assert len(raw_flat_rel) % 2 == 1, "The length of raw_flat_rel should 
be odd."
@@ -222,7 +229,7 @@ class GraphRAGQuery:
             else:
                 # Process each edge
                 flat_rel, prior_edge_str_len = self._process_edge(
-                    item, flat_rel, prior_edge_str_len, raw_flat_rel, 
i,use_id_to_match, e_cache
+                    item, flat_rel, raw_flat_rel, i, use_id_to_match, e_cache
                 )
 
         return flat_rel, nodes_with_degree
@@ -236,40 +243,40 @@ class GraphRAGQuery:
             return flat_rel, prior_edge_str_len, depth
 
         node_cache.add(matched_str)
-        props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items() if 
v)
+        props_str = ", ".join(f"{k}: {self._limit_property_query(v, 'v')}"
+                              for k, v in item["props"].items() if v)
+
         # TODO: we may remove label id or replace with label name
         if matched_str in v_cache:
             node_str = matched_str
         else:
             v_cache.add(matched_str)
             node_str = f"{item['id']}{{{props_str}}}"
+
         flat_rel += node_str
         nodes_with_degree.append(node_str)
         depth += 1
-
         return flat_rel, prior_edge_str_len, depth
 
-    def _process_edge(self, item: Any, flat_rel: str, prior_edge_str_len: int,
-                      raw_flat_rel: List[Any], i: int, use_id_to_match: bool, 
e_cache: Set[str]) -> Tuple[str, int]:
-        props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items() if 
v)
-        props_str = f"{{{props_str}}}" if len(props_str) > 0 else ""
+    def _process_edge(self, item: Any, path_str: str, raw_flat_rel: List[Any], 
i: int, use_id_to_match: bool,
+                      e_cache: Set[Tuple[str, str, str]]) -> Tuple[str, int]:
+        props_str = ", ".join(f"{k}: {self._limit_property_query(v, 'e')}"
+                              for k, v in item["props"].items() if v)
+        props_str = f"{{{props_str}}}" if props_str else ""
         prev_matched_str = raw_flat_rel[i - 1]["id"] if use_id_to_match else (
             raw_flat_rel)[i - 1]["props"][self._prop_to_match]
 
-        if item["label"] in e_cache:
-            edge_str = f"{item['label']}"
+        edge_key = (item['inV'], item['label'], item['outV'])
+        if edge_key not in e_cache:
+            e_cache.add(edge_key)
+            edge_label = f"{item['label']}{props_str}"
         else:
-            e_cache.add(item["label"])
-            edge_str = f"{item['label']}{props_str}"
+            edge_label = item['label']
 
-        if item["outV"] == prev_matched_str:
-            edge_str = f" --[{edge_str}]--> "
-        else:
-            edge_str = f" <--[{edge_str}]-- "
-
-        flat_rel += edge_str
+        edge_str = f"--[{edge_label}]-->" if item["outV"] == prev_matched_str 
else f"<--[{edge_label}]--"
+        path_str += edge_str
         prior_edge_str_len = len(edge_str)
-        return flat_rel, prior_edge_str_len
+        return path_str, prior_edge_str_len
 
     def _update_vertex_degree_list(self, vertex_degree_list: List[Set[str]], 
nodes_with_degree: List[str]) -> None:
         for depth, node_str in enumerate(nodes_with_degree):
@@ -313,3 +320,11 @@ class GraphRAGQuery:
         )
         log.debug("Link(Relation): %s", relationships)
         return self._schema
+
+    def _limit_property_query(self, value: Optional[str], item_type: str) -> 
Optional[str]:
+        # NOTE: we skip the filter for list/set type (e.g., list of string, 
add it if needed)
+        if not self._limit_property or not isinstance(value, str):
+            return value
+
+        max_len = self._max_v_prop_len if item_type == "v" else 
self._max_e_prop_len
+        return value[:max_len] if value else value

(incubator-hugegraph-ai) branch main updated: fix(llm): correct e_cache type (#117)

Reply via email to