This is an automated email from the ASF dual-hosted git repository.
jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new 9512459 fix(llm): multi vid k-neighbor query only return the data of
first vid
9512459 is described below
commit 95124598c4f8d81f73f37d04aae3f9fad26963ac
Author: yc319 <[email protected]>
AuthorDate: Tue Dec 10 22:44:29 2024 +0800
fix(llm): multi vid k-neighbor query only return the data of first vid
Currently, the following Gremlin query:
```groovy
g.V(v1, v2, ..., vn....)
.repeat(
bothE({edge_labels}).limit({edge_limit}).otherV().dedup()
).times({max_deep}).emit()
.simplePath()
.path()
.by(project('label', 'id', 'props')
.by(label())
.by(id())
.by(valueMap().by(unfold()))
)
.by(project('label', 'inV', 'outV', 'props')
.by(label())
.by(inV().id())
.by(outV().id())
.by(valueMap().by(unfold()))
)
.limit({max_items})
.toList()
```
will only fetch the data of the first vid (when path() is used while
searching multiple vids together).
So downgrade to a for-loop that queries each vid separately to make the
logic correct for now (enhance it later).
---------
Co-authored-by: yucui <[email protected]>
Co-authored-by: imbajin <[email protected]>
---
.../src/hugegraph_llm/config/config_data.py | 4 ++
.../demo/rag_demo/text2gremlin_block.py | 2 +-
.../src/hugegraph_llm/indices/vector_index.py | 3 +-
.../operators/hugegraph_op/graph_rag_query.py | 57 ++++++++++++----------
.../operators/index_op/semantic_id_query.py | 3 +-
5 files changed, 40 insertions(+), 29 deletions(-)
diff --git a/hugegraph-llm/src/hugegraph_llm/config/config_data.py
b/hugegraph-llm/src/hugegraph_llm/config/config_data.py
index f69e04b..a3d887e 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/config_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/config_data.py
@@ -105,6 +105,10 @@ class ConfigData:
graph_pwd: Optional[str] = "xxx"
graph_space: Optional[str] = None
limit_property: Optional[str] = "False"
+ max_graph_path: Optional[int] = 10
+ max_items: Optional[int] = 30
+ edge_limit_pre_label: Optional[int] = 8
+ vector_dis_threshold: Optional[float] = 0.9
"""Admin settings"""
enable_login: Optional[str] = "False"
diff --git
a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/text2gremlin_block.py
b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/text2gremlin_block.py
index f2fb6fb..797ec73 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/text2gremlin_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/text2gremlin_block.py
@@ -62,7 +62,7 @@ def build_example_vector_index(temp_file) -> dict:
def gremlin_generate(inp, example_num, schema, gremlin_prompt) -> tuple[str,
str] | tuple[str, Any, Any, Any, Any]:
generator = GremlinGenerator(llm=LLMs().get_text2gql_llm(),
embedding=Embeddings().get_embedding())
- sm = SchemaManager(graph_name="schema")
+ sm = SchemaManager(graph_name=schema)
short_schema = False
if schema:
diff --git a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
index 4ba1983..7f93c3d 100644
--- a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
@@ -31,6 +31,7 @@ PROPERTIES_FILE_NAME = "properties.pkl"
class VectorIndex:
"""Comment"""
+
def __init__(self, embed_dim: int = 1024):
self.index = faiss.IndexFlatL2(embed_dim)
self.properties = []
@@ -95,7 +96,7 @@ class VectorIndex:
distances, indices = self.index.search(np.array([query_vector]), top_k)
results = []
for dist, i in zip(distances[0], indices[0]):
- if dist < dis_threshold: # Smaller distances indicate higher
similarity
+ if dist < dis_threshold: # Smaller distances indicate higher
similarity
results.append(deepcopy(self.properties[i]))
log.debug("[✓] Add valid distance %s to results.", dist)
else:
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index cd759ef..2f5291a 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -75,17 +75,16 @@ g.V().has('{prop}', within({keywords}))
class GraphRAGQuery:
-
def __init__(
self,
max_deep: int = 2,
- max_items: int = 20,
+ max_items: int = int(settings.max_items),
prop_to_match: Optional[str] = None,
- with_gremlin_template: bool = True,
llm: Optional[BaseLLM] = None,
embedding: Optional[BaseEmbedding] = None,
max_v_prop_len: int = 2048,
max_e_prop_len: int = 256,
+ with_gremlin_template: bool = True,
num_gremlin_generate_example: int = 1
):
self._client = PyHugeClient(
@@ -111,19 +110,7 @@ class GraphRAGQuery:
self._with_gremlin_template = with_gremlin_template
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
- # pylint: disable=R0915 (too-many-statements)
- if self._client is None:
- if isinstance(context.get("graph_client"), PyHugeClient):
- self._client = context["graph_client"]
- else:
- ip = context.get("ip") or "localhost"
- port = context.get("port") or "8080"
- graph = context.get("graph") or "hugegraph"
- user = context.get("user") or "admin"
- pwd = context.get("pwd") or "admin"
- gs = context.get("graphspace") or None
- self._client = PyHugeClient(ip, port, graph, user, pwd, gs)
- assert self._client is not None, "No valid graph to search."
+ self._init_client(context)
# initial flag: -1 means no result, 0 means subgraph query, 1 means
gremlin query
context["graph_result_flag"] = -1
@@ -189,7 +176,7 @@ class GraphRAGQuery:
_, edge_labels = self._extract_labels_from_schema()
edge_labels_str = ",".join("'" + label + "'" for label in edge_labels)
# TODO: enhance the limit logic later
- edge_limit_amount = len(edge_labels) * 10
+ edge_limit_amount = len(edge_labels) * settings.edge_limit_pre_label
use_id_to_match = self._prop_to_match is None
if use_id_to_match:
@@ -201,15 +188,18 @@ class GraphRAGQuery:
log.debug("Vids gremlin query: %s", gremlin_query)
vertex_knowledge =
self._format_graph_from_vertex(query_result=vertexes)
- gremlin_query = VID_QUERY_NEIGHBOR_TPL.format(
- keywords=matched_vids,
- max_deep=self._max_deep,
- edge_labels=edge_labels_str,
- edge_limit=edge_limit_amount,
- max_items=self._max_items,
- )
- log.debug("Kneighbor gremlin query: %s",
gremlin_query.replace("\n", "").replace(" ", ""))
- paths = self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+ paths: List[Any] = []
+ # TODO: use generator or asyncio to speed up the query logic
+ for matched_vid in matched_vids:
+ gremlin_query = VID_QUERY_NEIGHBOR_TPL.format(
+ keywords="'{}'".format(matched_vid),
+ max_deep=self._max_deep,
+ edge_labels=edge_labels_str,
+ edge_limit=edge_limit_amount,
+ max_items=self._max_items,
+ )
+ log.debug("Kneighbor gremlin query: %s", gremlin_query)
+
paths.extend(self._client.gremlin().exec(gremlin=gremlin_query)["data"])
graph_chain_knowledge, vertex_degree_list, knowledge_with_degree =
self._format_graph_query_result(
query_paths=paths
@@ -254,6 +244,21 @@ class GraphRAGQuery:
)
return context
+ def _init_client(self, context):
+ # pylint: disable=R0915 (too-many-statements)
+ if self._client is None:
+ if isinstance(context.get("graph_client"), PyHugeClient):
+ self._client = context["graph_client"]
+ else:
+ ip = context.get("ip") or "localhost"
+ port = context.get("port") or "8080"
+ graph = context.get("graph") or "hugegraph"
+ user = context.get("user") or "admin"
+ pwd = context.get("pwd") or "admin"
+ gs = context.get("graphspace") or None
+ self._client = PyHugeClient(ip, port, graph, user, pwd, gs)
+ assert self._client is not None, "No valid graph to search."
+
def _format_graph_from_vertex(self, query_result: List[Any]) -> Set[str]:
knowledge = set()
for item in query_result:
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
index f5dd8de..51a6769 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
@@ -75,7 +75,8 @@ class SemanticIdQuery:
fuzzy_match_result = []
for keyword in keywords:
keyword_vector = self.embedding.get_text_embedding(keyword)
- results = self.vector_index.search(keyword_vector,
top_k=self.topk_per_keyword)
+ results = self.vector_index.search(keyword_vector,
top_k=self.topk_per_keyword,
+
dis_threshold=float(settings.vector_dis_threshold))
if results:
fuzzy_match_result.extend(results[:self.topk_per_keyword])
return fuzzy_match_result