(incubator-hugegraph-ai) branch main updated: fix(llm): multi vid k-neighbor query only return the data of first vid

jin Tue, 10 Dec 2024 06:48:29 -0800

This is an automated email from the ASF dual-hosted git repository.

jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git



The following commit(s) were added to refs/heads/main by this push:
     new 9512459  fix(llm): multi vid k-neighbor query only return the data of first vid
9512459 is described below

commit 95124598c4f8d81f73f37d04aae3f9fad26963ac
Author: yc319 <[email protected]>
AuthorDate: Tue Dec 10 22:44:29 2024 +0800

    fix(llm): multi vid k-neighbor query only return the data of first vid
    
    Currently, the gremlin:
    ```groovy
    g.V(v1, v2, ..., vn....)
    .repeat(
       bothE({edge_labels}).limit({edge_limit}).otherV().dedup()
    ).times({max_deep}).emit()
    .simplePath()
    .path()
    .by(project('label', 'id', 'props')
       .by(label())
       .by(id())
       .by(valueMap().by(unfold()))
    )
    .by(project('label', 'inV', 'outV', 'props')
       .by(label())
       .by(inV().id())
       .by(outV().id())
       .by(valueMap().by(unfold()))
    )
    .limit({max_items})
    .toList()
    ```
    will only fetch the data of the first vid (when path() is used to search multiple vids together).
    
    So fall back to a for-loop over the vids to make the logic correct for now (to be enhanced later).
    
    ---------
    
    Co-authored-by: yucui <[email protected]>
    Co-authored-by: imbajin <[email protected]>
---
 .../src/hugegraph_llm/config/config_data.py        |  4 ++
 .../demo/rag_demo/text2gremlin_block.py            |  2 +-
 .../src/hugegraph_llm/indices/vector_index.py      |  3 +-
 .../operators/hugegraph_op/graph_rag_query.py      | 57 ++++++++++++----------
 .../operators/index_op/semantic_id_query.py        |  3 +-
 5 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/hugegraph-llm/src/hugegraph_llm/config/config_data.py b/hugegraph-llm/src/hugegraph_llm/config/config_data.py
index f69e04b..a3d887e 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/config_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/config_data.py
@@ -105,6 +105,10 @@ class ConfigData:
     graph_pwd: Optional[str] = "xxx"
     graph_space: Optional[str] = None
     limit_property: Optional[str] = "False"
+    max_graph_path: Optional[int] = 10
+    max_items: Optional[int] = 30
+    edge_limit_pre_label: Optional[int] = 8
+    vector_dis_threshold: Optional[float] = 0.9
 
     """Admin settings"""
     enable_login: Optional[str] = "False"
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/text2gremlin_block.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/text2gremlin_block.py
index f2fb6fb..797ec73 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/text2gremlin_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/text2gremlin_block.py
@@ -62,7 +62,7 @@ def build_example_vector_index(temp_file) -> dict:
 
 def gremlin_generate(inp, example_num, schema, gremlin_prompt) -> tuple[str, str] | tuple[str, Any, Any, Any, Any]:
     generator = GremlinGenerator(llm=LLMs().get_text2gql_llm(), embedding=Embeddings().get_embedding())
-    sm = SchemaManager(graph_name="schema")
+    sm = SchemaManager(graph_name=schema)
     short_schema = False
 
     if schema:
diff --git a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
index 4ba1983..7f93c3d 100644
--- a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
@@ -31,6 +31,7 @@ PROPERTIES_FILE_NAME = "properties.pkl"
 
 class VectorIndex:
     """Comment"""
+
     def __init__(self, embed_dim: int = 1024):
         self.index = faiss.IndexFlatL2(embed_dim)
         self.properties = []
@@ -95,7 +96,7 @@ class VectorIndex:
         distances, indices = self.index.search(np.array([query_vector]), top_k)
         results = []
         for dist, i in zip(distances[0], indices[0]):
-            if dist < dis_threshold: # Smaller distances indicate higher similarity
+            if dist < dis_threshold:  # Smaller distances indicate higher similarity
                 results.append(deepcopy(self.properties[i]))
                 log.debug("[✓] Add valid distance %s to results.", dist)
             else:
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index cd759ef..2f5291a 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -75,17 +75,16 @@ g.V().has('{prop}', within({keywords}))
 
 
 class GraphRAGQuery:
-
     def __init__(
             self,
             max_deep: int = 2,
-            max_items: int = 20,
+            max_items: int = int(settings.max_items),
             prop_to_match: Optional[str] = None,
-            with_gremlin_template: bool = True,
             llm: Optional[BaseLLM] = None,
             embedding: Optional[BaseEmbedding] = None,
             max_v_prop_len: int = 2048,
             max_e_prop_len: int = 256,
+            with_gremlin_template: bool = True,
             num_gremlin_generate_example: int = 1
     ):
         self._client = PyHugeClient(
@@ -111,19 +110,7 @@ class GraphRAGQuery:
         self._with_gremlin_template = with_gremlin_template
 
     def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
-        # pylint: disable=R0915 (too-many-statements)
-        if self._client is None:
-            if isinstance(context.get("graph_client"), PyHugeClient):
-                self._client = context["graph_client"]
-            else:
-                ip = context.get("ip") or "localhost"
-                port = context.get("port") or "8080"
-                graph = context.get("graph") or "hugegraph"
-                user = context.get("user") or "admin"
-                pwd = context.get("pwd") or "admin"
-                gs = context.get("graphspace") or None
-                self._client = PyHugeClient(ip, port, graph, user, pwd, gs)
-        assert self._client is not None, "No valid graph to search."
+        self._init_client(context)
 
         # initial flag: -1 means no result, 0 means subgraph query, 1 means gremlin query
         context["graph_result_flag"] = -1
@@ -189,7 +176,7 @@ class GraphRAGQuery:
         _, edge_labels = self._extract_labels_from_schema()
         edge_labels_str = ",".join("'" + label + "'" for label in edge_labels)
         # TODO: enhance the limit logic later
-        edge_limit_amount = len(edge_labels) * 10
+        edge_limit_amount = len(edge_labels) * settings.edge_limit_pre_label
 
         use_id_to_match = self._prop_to_match is None
         if use_id_to_match:
@@ -201,15 +188,18 @@ class GraphRAGQuery:
             log.debug("Vids gremlin query: %s", gremlin_query)
 
             vertex_knowledge = self._format_graph_from_vertex(query_result=vertexes)
-            gremlin_query = VID_QUERY_NEIGHBOR_TPL.format(
-                keywords=matched_vids,
-                max_deep=self._max_deep,
-                edge_labels=edge_labels_str,
-                edge_limit=edge_limit_amount,
-                max_items=self._max_items,
-            )
-            log.debug("Kneighbor gremlin query: %s", gremlin_query.replace("\n", "").replace(" ", ""))
-            paths = self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+            paths: List[Any] = []
+            # TODO: use generator or asyncio to speed up the query logic
+            for matched_vid in matched_vids:
+                gremlin_query = VID_QUERY_NEIGHBOR_TPL.format(
+                    keywords="'{}'".format(matched_vid),
+                    max_deep=self._max_deep,
+                    edge_labels=edge_labels_str,
+                    edge_limit=edge_limit_amount,
+                    max_items=self._max_items,
+                )
+                log.debug("Kneighbor gremlin query: %s", gremlin_query)
+                paths.extend(self._client.gremlin().exec(gremlin=gremlin_query)["data"])
 
             graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = self._format_graph_query_result(
                 query_paths=paths
@@ -254,6 +244,21 @@ class GraphRAGQuery:
             )
         return context
 
+    def _init_client(self, context):
+        # pylint: disable=R0915 (too-many-statements)
+        if self._client is None:
+            if isinstance(context.get("graph_client"), PyHugeClient):
+                self._client = context["graph_client"]
+            else:
+                ip = context.get("ip") or "localhost"
+                port = context.get("port") or "8080"
+                graph = context.get("graph") or "hugegraph"
+                user = context.get("user") or "admin"
+                pwd = context.get("pwd") or "admin"
+                gs = context.get("graphspace") or None
+                self._client = PyHugeClient(ip, port, graph, user, pwd, gs)
+        assert self._client is not None, "No valid graph to search."
+
     def _format_graph_from_vertex(self, query_result: List[Any]) -> Set[str]:
         knowledge = set()
         for item in query_result:
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
index f5dd8de..51a6769 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
@@ -75,7 +75,8 @@ class SemanticIdQuery:
         fuzzy_match_result = []
         for keyword in keywords:
             keyword_vector = self.embedding.get_text_embedding(keyword)
-            results = self.vector_index.search(keyword_vector, top_k=self.topk_per_keyword)
+            results = self.vector_index.search(keyword_vector, top_k=self.topk_per_keyword,
+                                               dis_threshold=float(settings.vector_dis_threshold))
             if results:
                 fuzzy_match_result.extend(results[:self.topk_per_keyword])
         return fuzzy_match_result

(incubator-hugegraph-ai) branch main updated: fix(llm): multi vid k-neighbor query only return the data of first vid

Reply via email to