This is an automated email from the ASF dual-hosted git repository.

jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git


The following commit(s) were added to refs/heads/main by this push:
     new d1b5814  feat(llm): support word segmt extraction component & multi-files uploading (#69)
d1b5814 is described below

commit d1b5814f9d3de307abae4b4bacb2fa6a2caf5aaa
Author: vichayturen <[email protected]>
AuthorDate: Thu Aug 22 23:41:00 2024 +0800

    feat(llm): support word segmt extraction component & multi-files uploading (#69)
    
    1. Change LLM keyword extraction to word tokenization extraction and change the vid matching method.
    2. Change file uploading in the import stage to support multiple file uploads.
    3. Fix the API connection test for Ollama.
    4. Fix Graph RAG running twice when clicking the RAG button.
    5. feat(llm): support graphspace & unify the port to str
    6. chore: update gitignore files
    7. Select keyword as the default extraction scheme and add a full-sentence matching method for queries.
    
    ---------
    
    Co-authored-by: imbajin <[email protected]>
---
 .asf.yaml                                          |  3 +-
 .gitignore                                         |  3 +
 .../hugegraph_llm/api/exceptions/rag_exceptions.py |  2 +-
 .../src/hugegraph_llm/api/models/rag_requests.py   |  3 +-
 hugegraph-llm/src/hugegraph_llm/api/rag_api.py     |  4 +-
 hugegraph-llm/src/hugegraph_llm/config/config.py   |  4 +-
 .../src/hugegraph_llm/demo/rag_web_demo.py         | 91 ++++++++++++----------
 .../src/hugegraph_llm/indices/graph_index.py       |  5 +-
 .../operators/common_op/merge_dedup_rerank.py      | 30 +++++--
 .../operators/document_op/chunk_split.py           | 17 ++--
 .../operators/document_op/word_extract.py          | 88 +++++++++++++++++++++
 .../src/hugegraph_llm/operators/graph_rag_task.py  | 35 +++++++--
 .../operators/hugegraph_op/commit_to_hugegraph.py  |  1 +
 .../operators/hugegraph_op/graph_rag_query.py      | 15 +++-
 .../operators/hugegraph_op/schema_manager.py       |  3 +-
 .../operators/index_op/semantic_id_query.py        | 31 ++++++--
 .../operators/kg_construction_task.py              |  4 +-
 .../operators/llm_op/answer_synthesize.py          |  4 +
 .../src/hugegraph_llm/utils/hugegraph_utils.py     |  5 +-
 .../src/pyhugegraph/example/hugegraph_example.py   |  2 +-
 .../src/pyhugegraph/example/hugegraph_test.py      |  4 +-
 hugegraph-python-client/src/tests/client_utils.py  |  5 +-
 22 files changed, 268 insertions(+), 91 deletions(-)

diff --git a/.asf.yaml b/.asf.yaml
index a5308d0..c113d64 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -34,8 +34,7 @@ github:
     - graph-embedding
     - knowledge-graph
   enabled_merge_buttons:
-    # TODO: disable it soon
-    merge: true
+    merge: false
     rebase: true
     squash: true
   protected_branches:
diff --git a/.gitignore b/.gitignore
index 786b5e1..f39daf9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,3 +165,6 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 *.DS_Store
+*.out
+*.zip
+*.tar.gz
diff --git a/hugegraph-llm/src/hugegraph_llm/api/exceptions/rag_exceptions.py b/hugegraph-llm/src/hugegraph_llm/api/exceptions/rag_exceptions.py
index 24ef7c1..75eb14c 100644
--- a/hugegraph-llm/src/hugegraph_llm/api/exceptions/rag_exceptions.py
+++ b/hugegraph-llm/src/hugegraph_llm/api/exceptions/rag_exceptions.py
@@ -32,6 +32,6 @@ class ConnectionFailedException(HTTPException):
 def generate_response(response: RAGResponse) -> dict:
     if response.status_code == -1:
         raise ExternalException()
-    elif not (200 <= response.status_code < 300):
+    if not 200 <= response.status_code < 300:
         raise ConnectionFailedException(response.status_code, response.message)
     return {"message": "Connection successful. Configured finished."}
diff --git a/hugegraph-llm/src/hugegraph_llm/api/models/rag_requests.py b/hugegraph-llm/src/hugegraph_llm/api/models/rag_requests.py
index d12a1b8..47610f5 100644
--- a/hugegraph-llm/src/hugegraph_llm/api/models/rag_requests.py
+++ b/hugegraph-llm/src/hugegraph_llm/api/models/rag_requests.py
@@ -15,9 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from pydantic import BaseModel
 from typing import Optional
 
+from pydantic import BaseModel
+
 
 class RAGRequest(BaseModel):
     query: str
diff --git a/hugegraph-llm/src/hugegraph_llm/api/rag_api.py b/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
index b6d8068..e583619 100644
--- a/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
+++ b/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
@@ -17,10 +17,10 @@
 
 from fastapi import status, APIRouter
 
+from hugegraph_llm.api.exceptions.rag_exceptions import generate_response
+from hugegraph_llm.api.models.rag_requests import RAGRequest, GraphConfigRequest, LLMConfigRequest
 from hugegraph_llm.api.models.rag_response import RAGResponse
 from hugegraph_llm.config import settings
-from hugegraph_llm.api.models.rag_requests import RAGRequest, GraphConfigRequest, LLMConfigRequest
-from hugegraph_llm.api.exceptions.rag_exceptions import generate_response
 
 
 def rag_http_api(router: APIRouter, rag_answer_func, apply_graph_conf, apply_llm_conf, apply_embedding_conf):
diff --git a/hugegraph-llm/src/hugegraph_llm/config/config.py b/hugegraph-llm/src/hugegraph_llm/config/config.py
index 3659cc1..2fd8262 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/config.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/config.py
@@ -66,11 +66,11 @@ class Config:
 
     """HugeGraph settings"""
     graph_ip: Optional[str] = "127.0.0.1"
-    graph_port: Optional[int] = 8080
-    graph_space: Optional[str] = None
+    graph_port: Optional[str] = "8080"
     graph_name: Optional[str] = "hugegraph"
     graph_user: Optional[str] = "admin"
     graph_pwd: Optional[str] = "xxx"
+    graph_space: Optional[str] = None
 
     def from_env(self):
         if os.path.exists(env_path):
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py
index c924186..74704ba 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py
@@ -19,6 +19,7 @@
 import argparse
 import json
 import os
+from typing import List, Union
 
 import docx
 import gradio as gr
@@ -26,6 +27,7 @@ import requests
 import uvicorn
 from fastapi import FastAPI, Depends, APIRouter
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from gradio.utils import NamedString
 from requests.auth import HTTPBasicAuth
 
 from hugegraph_llm.api.rag_api import rag_http_api
@@ -50,7 +52,7 @@ def authenticate(credentials: HTTPAuthorizationCredentials = Depends(sec)):
         from fastapi import HTTPException
         raise HTTPException(
             status_code=401,
-            detail="Invalid token, please contact the admin",
+            detail=f"Invalid token {credentials.credentials}, please contact 
the admin",
             headers={"WWW-Authenticate": "Bearer"},
         )
 
@@ -74,7 +76,7 @@ def rag_answer(
         vector_only_answer=vector_only_answer,
         graph_only_answer=graph_only_answer,
         graph_vector_answer=graph_vector_answer,
-    ).run(verbose=True, query=text)
+    )
 
     try:
         context = searcher.run(verbose=True, query=text)
@@ -87,27 +89,41 @@ def rag_answer(
     except ValueError as e:
         log.error(e)
         raise gr.Error(str(e))
-    except Exception as e:  # pylint: disable=broad-exception-caught
+    except Exception as e:
         log.error(e)
         raise gr.Error(f"An unexpected error occurred: {str(e)}")
 
 
-def build_kg(file, schema, example_prompt, build_mode) -> str:  # pylint: disable=too-many-branches
-    full_path = file.name
-    if full_path.endswith(".txt"):
-        with open(full_path, "r", encoding="utf-8") as f:
-            text = f.read()
-    elif full_path.endswith(".docx"):
-        text = ""
-        doc = docx.Document(full_path)
-        for para in doc.paragraphs:
-            text += para.text
-            text += "\n"
-    elif full_path.endswith(".pdf"):
-        # TODO: support PDF file
-        raise gr.Error("PDF will be supported later! Try to upload text/docx 
now")
-    else:
-        raise gr.Error("Please input txt or docx file.")
+def build_kg(  # pylint: disable=too-many-branches
+        files: Union[NamedString, List[NamedString]],
+        schema: str,
+        example_prompt: str,
+        build_mode: str
+) -> str:
+    if isinstance(files, NamedString):
+        files = [files]
+    texts = []
+    for file in files:
+        full_path = file.name
+        if full_path.endswith(".txt"):
+            with open(full_path, "r", encoding="utf-8") as f:
+                texts.append(f.read())
+        elif full_path.endswith(".docx"):
+            text = ""
+            doc = docx.Document(full_path)
+            for para in doc.paragraphs:
+                text += para.text
+                text += "\n"
+            texts.append(text)
+        elif full_path.endswith(".pdf"):
+            # TODO: support PDF file
+            raise gr.Error("PDF will be supported later! Try to upload 
text/docx now")
+        else:
+            raise gr.Error("Please input txt or docx file.")
+    if build_mode in (BuildMode.CLEAR_AND_IMPORT.value, BuildMode.REBUILD_VECTOR.value):
+        clean_vector_index()
+    if build_mode == BuildMode.CLEAR_AND_IMPORT.value:
+        clean_hg_data()
     builder = KgBuilder(LLMs().get_llm(), Embeddings().get_embedding(), get_hg_client())
 
     if build_mode != BuildMode.REBUILD_VERTEX_INDEX.value:
@@ -120,7 +136,7 @@ def build_kg(file, schema, example_prompt, build_mode) -> str:  # pylint: disabl
                 builder.import_schema(from_hugegraph=schema)
         else:
             return "ERROR: please input schema."
-    builder.chunk_split(text, "paragraph", "zh")
+    builder.chunk_split(texts, "paragraph", "zh")
 
     if build_mode == BuildMode.REBUILD_VECTOR.value:
         builder.fetch_graph_data()
@@ -128,11 +144,7 @@ def build_kg(file, schema, example_prompt, build_mode) -> str:  # pylint: disabl
         builder.extract_info(example_prompt, "property_graph")
     # "Test Mode", "Import Mode", "Clear and Import", "Rebuild Vector"
     if build_mode != BuildMode.TEST_MODE.value:
-        if build_mode in (BuildMode.CLEAR_AND_IMPORT.value, BuildMode.REBUILD_VECTOR.value):
-            clean_vector_index()
         builder.build_vector_index()
-    if build_mode == BuildMode.CLEAR_AND_IMPORT.value:
-        clean_hg_data()
     if build_mode in (BuildMode.CLEAR_AND_IMPORT.value, BuildMode.IMPORT_MODE.value):
         builder.commit_to_hugegraph()
     if build_mode != BuildMode.TEST_MODE.value:
@@ -208,8 +220,7 @@ def apply_embedding_config(arg1, arg2, arg3, origin_call=None) -> int:
         settings.ollama_host = arg1
         settings.ollama_port = int(arg2)
         settings.ollama_embedding_model = arg3
-        # TODO: right way to test ollama conn?
-        status_code = test_api_connection(f"http://{arg1}:{arg2}/status";, 
origin_call=origin_call)
+        status_code = test_api_connection(f"http://{arg1}:{arg2}";, 
origin_call=origin_call)
     settings.update_env()
     gr.Info("Configured!")
     return status_code
@@ -217,7 +228,7 @@ def apply_embedding_config(arg1, arg2, arg3, origin_call=None) -> int:
 
 def apply_graph_config(ip, port, name, user, pwd, gs, origin_call=None) -> int:
     settings.graph_ip = ip
-    settings.graph_port = int(port)
+    settings.graph_port = port
     settings.graph_name = name
     settings.graph_user = user
     settings.graph_pwd = pwd
@@ -253,15 +264,16 @@ def apply_llm_config(arg1, arg2, arg3, arg4, origin_call=None) -> int:
         settings.ollama_host = arg1
         settings.ollama_port = int(arg2)
         settings.ollama_language_model = arg3
-        # TODO: right way to test ollama conn?
-        status_code = test_api_connection(f"http://{arg1}:{arg2}/status";, 
origin_call=origin_call)
+        status_code = test_api_connection(f"http://{arg1}:{arg2}";, 
origin_call=origin_call)
     gr.Info("Configured!")
     settings.update_env()
     return status_code
 
 
 def init_rag_ui() -> gr.Interface:
-    with gr.Blocks() as hugegraph_llm_ui:
+    with gr.Blocks(theme='default',
+                   title="HugeGraph RAG Platform",
+                   css="footer {visibility: hidden}") as hugegraph_llm_ui:
         gr.Markdown(
             """# HugeGraph LLM RAG Demo
         1. Set up the HugeGraph server."""
@@ -269,13 +281,11 @@ def init_rag_ui() -> gr.Interface:
         with gr.Row():
             graph_config_input = [
                 gr.Textbox(value=settings.graph_ip, label="ip"),
-                gr.Textbox(value=str(settings.graph_port), label="port"),
+                gr.Textbox(value=settings.graph_port, label="port"),
                 gr.Textbox(value=settings.graph_name, label="graph"),
                 gr.Textbox(value=settings.graph_user, label="user"),
                gr.Textbox(value=settings.graph_pwd, label="pwd", type="password"),
-                # gr.Textbox(value=settings.graph_space, label="graphspace (None)"),
-                # wip: graph_space issue pending
-                gr.Textbox(value="", label="graphspace (None)"),
+                gr.Textbox(value=settings.graph_space, label="graphspace(Optional)"),
             ]
         graph_config_button = gr.Button("apply configuration")
 
@@ -353,13 +363,13 @@ def init_rag_ui() -> gr.Interface:
             embedding_config_button = gr.Button("apply configuration")
 
             # Call the separate apply_embedding_configuration function here
-            embedding_config_button.click(
+            embedding_config_button.click(  # pylint: disable=no-member
                apply_embedding_config, inputs=embedding_config_input  # pylint: disable=no-member
             )
 
         gr.Markdown(
             """## 1. Build vector/graph RAG (💡)
-- Document: Input document file which should be TXT or DOCX.
+- Doc(s): Upload document file(s) which should be TXT or DOCX. (Multiple files can be selected together)
 - Schema: Accepts two types of text as below:
     - User-defined JSON format Schema.
    - Specify the name of the HugeGraph graph instance, it will automatically get the schema from it.
@@ -409,7 +419,10 @@ def init_rag_ui() -> gr.Interface:
 }"""
 
         with gr.Row():
-            input_file = gr.File(value=os.path.join(resource_path, "demo", "test.txt"), label="Document")
+            input_file = gr.File(
+                value=[os.path.join(resource_path, "demo", "test.txt")],
+                label="Doc(s) (multi-files can be selected together)",
+                file_count="multiple")
             input_schema = gr.Textbox(value=schema, label="Schema")
            info_extract_template = gr.Textbox(value=SCHEMA_EXAMPLE_PROMPT, label="Info extract head")
             with gr.Column():
@@ -439,13 +452,13 @@ def init_rag_ui() -> gr.Interface:
                graph_only_radio = gr.Radio(choices=[True, False], value=False, label="Graph-only Answer")
                graph_vector_radio = gr.Radio(choices=[True, False], value=False, label="Graph-Vector Answer")
                 btn = gr.Button("Answer Question")
-        btn.click(
+        btn.click(  # pylint: disable=no-member
             fn=rag_answer,
             inputs=[
                 inp,
                 raw_radio,
                 vector_only_radio,
-                graph_only_radio,  # pylint: disable=no-member
+                graph_only_radio,
                 graph_vector_radio,
             ],
            outputs=[raw_out, vector_only_out, graph_only_out, graph_vector_out],
diff --git a/hugegraph-llm/src/hugegraph_llm/indices/graph_index.py b/hugegraph-llm/src/hugegraph_llm/indices/graph_index.py
index 74269fc..996c25c 100644
--- a/hugegraph-llm/src/hugegraph_llm/indices/graph_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/indices/graph_index.py
@@ -26,12 +26,13 @@ class GraphIndex:
     def __init__(
             self,
             graph_ip: Optional[str] = settings.graph_ip,
-            graph_port: Optional[int] = settings.graph_port,
+            graph_port: Optional[str] = settings.graph_port,
             graph_name: Optional[str] = settings.graph_name,
             graph_user: Optional[str] = settings.graph_user,
             graph_pwd: Optional[str] = settings.graph_pwd,
+            graph_space: Optional[str] = settings.graph_space,
     ):
-        self.client = PyHugeClient(graph_ip, graph_port, graph_name, graph_user, graph_pwd)
+        self.client = PyHugeClient(graph_ip, graph_port, graph_name, graph_user, graph_pwd, graph_space)
 
     def clear_graph(self):
         self.client.gremlin().exec("g.V().drop()")
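
As a usage note, the extended constructor can be exercised like this (a hedged sketch: the address, credentials, and graphspace value are placeholders; the positional order follows the call above):

    from pyhugegraph.client import PyHugeClient

    # graph_space may be None when the server has no graphspace configured
    client = PyHugeClient("127.0.0.1", "8080", "hugegraph", "admin", "xxx", None)
    client.gremlin().exec("g.V().limit(1)")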
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/common_op/merge_dedup_rerank.py b/hugegraph-llm/src/hugegraph_llm/operators/common_op/merge_dedup_rerank.py
index 19ad4e4..e012479 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/common_op/merge_dedup_rerank.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/common_op/merge_dedup_rerank.py
@@ -16,7 +16,7 @@
 # under the License.
 
 
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Literal
 
 import jieba
 from hugegraph_llm.models.embeddings.base import BaseEmbedding
@@ -30,27 +30,45 @@ def get_score(query: str, content: str) -> float:
 
 
 class MergeDedupRerank:
-    def __init__(self, embedding: BaseEmbedding, topk: int = 10):
+    def __init__(
+            self,
+            embedding: BaseEmbedding,
+            topk: int = 10,
+            policy: Literal["bleu", "priority"] = "bleu"
+    ):
         self.embedding = embedding
         self.topk = topk
+        if policy == "bleu":
+            self.rerank_func = self._bleu_rerank
+        elif policy == "priority":
+            self.rerank_func = self._priority_rerank
+        else:
+            raise ValueError(f"Unimplemented policy {policy}.")
 
     def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
-        # TODO: exact > fuzzy; vertex > 1-depth-neighbour > 2-depth-neighbour; priority vertices
         query = context.get("query")
 
         vector_result = context.get("vector_result", [])
-        vector_result = self._dedup_and_rerank(query, vector_result)[:self.topk]
+        vector_result = self.rerank_func(query, vector_result)[:self.topk]
 
         graph_result = context.get("graph_result", [])
-        graph_result = self._dedup_and_rerank(query, graph_result)[:self.topk]
+        graph_result = self.rerank_func(query, graph_result)[:self.topk]
 
         context["vector_result"] = vector_result
         context["graph_result"] = graph_result
 
         return context
 
-    def _dedup_and_rerank(self, query: str, results: List[str]):
+    def _bleu_rerank(self, query: str, results: List[str]):
         results = list(set(results))
         result_score_list = [[res, get_score(query, res)] for res in results]
         result_score_list.sort(key=lambda x: x[1], reverse=True)
         return [res[0] for res in result_score_list]
+
+    def _priority_rerank(self, query: str, results: List[str]):
+        # TODO: implement
+        # 1. Precise recall > Fuzzy recall
+        # 2. 1-degree neighbors > 2-degree neighbors
+        # 3. The priority of a certain type of point is higher than others,
+        # such as Law being higher than vehicles/people/locations
+        raise NotImplementedError()
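
The "bleu" policy relies on get_score() from this module; its body is not shown in the hunk, so the following is only a plausible sketch of such a scorer. jieba is already imported here, but the use of NLTK's sentence_bleu and the smoothing choice are assumptions:

    import jieba
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    def bleu_score(query: str, content: str) -> float:
        # Segment both strings into words, then score the candidate against the query
        reference = [jieba.lcut(query)]
        candidate = jieba.lcut(content)
        return sentence_bleu(reference, candidate, smoothing_function=SmoothingFunction().method1)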
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
index 6aaaaed..e450b3c 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
@@ -16,18 +16,20 @@
 # under the License.
 
 
-from typing import Literal, Dict, Any, Optional
+from typing import Literal, Dict, Any, Optional, Union, List
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 
 class ChunkSplit:
     def __init__(
             self,
-            text: str,
+            texts: Union[str, List[str]],
             split_type: Literal["paragraph", "sentence"] = "paragraph",
             language: Literal["zh", "en"] = "zh"
     ):
-        self.text = text
+        if isinstance(texts, str):
+            texts = [texts]
+        self.texts = texts
         if language == "zh":
             separators = ["\n\n", "\n", "。", ",", ""]
         elif language == "en":
@@ -50,8 +52,11 @@ class ChunkSplit:
             raise ValueError("type must be paragraph, sentence, html or 
markdown")
 
     def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
-        chunks = self.text_splitter.split_text(self.text)
+        all_chunks = []
+        for text in self.texts:
+            chunks = self.text_splitter.split_text(text)
+            all_chunks.extend(chunks)
         if context is None:
-            return {"chunks": chunks}
-        context["chunks"] = chunks
+            return {"chunks": all_chunks}
+        context["chunks"] = all_chunks
         return context
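
A short usage sketch of the widened interface (the sample strings are illustrative):

    from hugegraph_llm.operators.document_op.chunk_split import ChunkSplit

    # A plain string still works; a list of per-file texts is now accepted too
    splitter = ChunkSplit(["text of file one ...", "text of file two ..."],
                          split_type="paragraph", language="en")
    result = splitter.run(None)  # -> {"chunks": [...]}, merged across all inputs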
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
new file mode 100644
index 0000000..43d0a29
--- /dev/null
+++ b/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
@@ -0,0 +1,88 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import re
+from typing import Dict, Any, Optional, List
+
+import jieba
+
+from hugegraph_llm.models.llms.base import BaseLLM
+from hugegraph_llm.models.llms.init_llm import LLMs
+from hugegraph_llm.operators.common_op.nltk_helper import NLTKHelper
+
+
+class WordExtract:
+    def __init__(
+        self,
+        text: Optional[str] = None,
+        llm: Optional[BaseLLM] = None,
+        language: str = "english",
+    ):
+        self._llm = llm
+        self._query = text
+        self._language = language.lower()
+
+    def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
+        if self._query is None:
+            self._query = context.get("query")
+            assert self._query is not None, "No query for keywords extraction."
+        else:
+            context["query"] = self._query
+
+        if self._llm is None:
+            self._llm = context.get("llm") or LLMs().get_llm()
+            assert isinstance(self._llm, BaseLLM), "Invalid LLM Object."
+        if context.get("llm") is None:
+            context["llm"] = self._llm
+
+        if isinstance(context.get("language"), str):
+            self._language = context["language"].lower()
+        else:
+            context["language"] = self._language
+
+        keywords = jieba.lcut(self._query)
+        keywords = self._filter_keywords(keywords, lowercase=False)
+
+        context["keywords"] = keywords
+
+        verbose = context.get("verbose") or False
+        if verbose:
+            print(f"\033[92mKEYWORDS: {context['keywords']}\033[0m")
+
+        return context
+
+    def _filter_keywords(
+        self,
+        keywords: List[str],
+        lowercase: bool = True,
+    ) -> List[str]:
+        if lowercase:
+            keywords = [w.lower() for w in keywords]
+
+        # if the keyword consists of multiple words, split into sub-words
+        # (removing stopwords)
+        results = set()
+        for token in keywords:
+            results.add(token)
+            sub_tokens = re.findall(r"\w+", token)
+            if len(sub_tokens) > 1:
+                results.update(
+                    {w for w in sub_tokens if w not in NLTKHelper().stopwords(lang=self._language)}
+                )
+
+        return list(results)
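
A hedged usage sketch of the new operator (the sample query is illustrative; run() falls back to LLMs().get_llm() when no LLM is supplied, so a configured LLM backend is assumed):

    from hugegraph_llm.operators.document_op.word_extract import WordExtract

    extractor = WordExtract(text="Where was Al Pacino born?", language="english")
    context = extractor.run({"verbose": True})
    print(context["keywords"])  # jieba word segmentation of the query, deduplicated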
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
index e3dcd99..cacac61 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
@@ -17,7 +17,7 @@
 
 
 import time
-from typing import Dict, Any, Optional, List
+from typing import Dict, Any, Optional, List, Literal
 
 from hugegraph_llm.models.llms.base import BaseLLM
 from hugegraph_llm.models.embeddings.base import BaseEmbedding
@@ -25,6 +25,7 @@ from hugegraph_llm.models.llms.init_llm import LLMs
 from hugegraph_llm.models.embeddings.init_embedding import Embeddings
 from hugegraph_llm.operators.common_op.print_result import PrintResult
 from hugegraph_llm.operators.common_op.merge_dedup_rerank import MergeDedupRerank
+from hugegraph_llm.operators.document_op.word_extract import WordExtract
 from hugegraph_llm.operators.hugegraph_op.graph_rag_query import GraphRAGQuery
 from hugegraph_llm.operators.index_op.semantic_id_query import SemanticIdQuery
 from hugegraph_llm.operators.index_op.vector_index_query import VectorIndexQuery
@@ -39,6 +40,19 @@ class GraphRAG:
         self._embedding = embedding or Embeddings().get_embedding()
         self._operators: List[Any] = []
 
+    def extract_word(
+        self,
+        text: Optional[str] = None,
+        language: str = "english",
+    ):
+        self._operators.append(
+            WordExtract(
+                text=text,
+                language=language,
+            )
+        )
+        return self
+
     def extract_keyword(
         self,
         text: Optional[str] = None,
@@ -58,20 +72,27 @@ class GraphRAG:
         )
         return self
 
-    def match_keyword_to_id(self, topk_per_keyword: int = 1):
+    def match_keyword_to_id(
+            self,
+            by: Literal["query", "keywords"] = "keywords",
+            topk_per_keyword: int = 1,
+            topk_per_query: int = 10
+    ):
         self._operators.append(
             SemanticIdQuery(
                 embedding=self._embedding,
-                topk_per_keyword=topk_per_keyword
+                by=by,
+                topk_per_keyword=topk_per_keyword,
+                topk_per_query=topk_per_query
             )
         )
         return self
 
     def query_graph_for_rag(
-        self,
-        max_deep: int = 2,
-        max_items: int = 30,
-        prop_to_match: Optional[str] = None,
+            self,
+            max_deep: int = 2,
+            max_items: int = 30,
+            prop_to_match: Optional[str] = None,
     ):
         self._operators.append(
             GraphRAGQuery(
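
Putting the new pieces together, a retrieval pipeline might be chained as below. This is a sketch following the usage in rag_web_demo.py above; the query string is illustrative and run(**kwargs) is assumed to behave as it does in the demo:

    from hugegraph_llm.operators.graph_rag_task import GraphRAG

    rag = (
        GraphRAG()
        .extract_word(language="english")                 # jieba-based WordExtract
        .match_keyword_to_id(by="keywords", topk_per_keyword=1)
        .query_graph_for_rag(max_deep=2, max_items=30)
    )
    context = rag.run(verbose=True, query="Tell me about Al Pacino.")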
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
index b4ad640..cf252e2 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
@@ -32,6 +32,7 @@ class CommitToKg:
             settings.graph_name,
             settings.graph_user,
             settings.graph_pwd,
+            settings.graph_space,
         )
         self.schema = self.client.schema()
 
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index 2c81ee0..5f18d3c 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -27,6 +27,11 @@ class GraphRAGQuery:
     VERTEX_GREMLIN_QUERY_TEMPL = (
         "g.V().hasId({keywords}).as('subj').toList()"
     )
+    # ID_RAG_GREMLIN_QUERY_TEMPL = "g.V().hasId({keywords}).as('subj').repeat(bothE({edge_labels}).as('rel').otherV(
+    # ).as('obj')).times({max_deep}).path().by(project('label', 'id', 'props').by(label()).by(id()).by(valueMap().by(
+    # unfold()))).by(project('label', 'inV', 'outV', 'props').by(label()).by(inV().id()).by(outV().id()).by(valueMap(
+    # ).by(unfold()))).limit({max_items}).toList()"
+
     # TODO: we could use a simpler query (like kneighbor-api to get the edges)
     ID_RAG_GREMLIN_QUERY_TEMPL = """
     g.V().hasId({keywords}).as('subj')
@@ -81,6 +86,7 @@ class GraphRAGQuery:
             settings.graph_name,
             settings.graph_user,
             settings.graph_pwd,
+            settings.graph_space,
         )
         self._max_deep = max_deep
         self._max_items = max_items
@@ -93,17 +99,16 @@ class GraphRAGQuery:
                 self._client = context["graph_client"]
             else:
                 ip = context.get("ip") or "localhost"
-                port = context.get("port") or 8080
+                port = context.get("port") or "8080"
                 graph = context.get("graph") or "hugegraph"
                 user = context.get("user") or "admin"
                 pwd = context.get("pwd") or "admin"
-                self._client = PyHugeClient(ip=ip, port=port, graph=graph, user=user, pwd=pwd)
+                gs = context.get("graphspace") or None
+                self._client = PyHugeClient(ip, port, graph, user, pwd, gs)
         assert self._client is not None, "No valid graph to search."
 
         keywords = context.get("keywords")
-        assert keywords is not None, "No keywords for graph query."
         entrance_vids = context.get("entrance_vids")
-        assert entrance_vids is not None, "No entrance vertices for query."
 
         if isinstance(context.get("max_deep"), int):
             self._max_deep = context["max_deep"]
@@ -118,6 +123,7 @@ class GraphRAGQuery:
         use_id_to_match = self._prop_to_match is None
 
         if not use_id_to_match:
+            assert keywords is not None, "No keywords for graph query."
             keywords_str = ",".join("'" + kw + "'" for kw in keywords)
             rag_gremlin_query = self.PROP_RAG_GREMLIN_QUERY_TEMPL.format(
                 prop=self._prop_to_match,
@@ -129,6 +135,7 @@ class GraphRAGQuery:
            result: List[Any] = self._client.gremlin().exec(gremlin=rag_gremlin_query)["data"]
            knowledge: Set[str] = self._format_knowledge_from_query_result(query_result=result)
         else:
+            assert entrance_vids is not None, "No entrance vertices for query."
             rag_gremlin_query = self.VERTEX_GREMLIN_QUERY_TEMPL.format(
                 keywords=entrance_vids,
             )
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
index 78c06a5..5c002ae 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
@@ -29,10 +29,11 @@ class SchemaManager:
             self.graph_name,
             settings.graph_user,
             settings.graph_pwd,
+            settings.graph_space,
         )
         self.schema = self.client.schema()
 
-    def run(self, data: dict):
+    def run(self):
         schema = self.schema.getSchema()
         vertices = []
         for vl in schema["vertexlabels"]:
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
index 4eaef4e..042d140 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
@@ -17,7 +17,7 @@
 
 
 import os
-from typing import Dict, Any
+from typing import Dict, Any, Literal
 
 from hugegraph_llm.config import resource_path, settings
 from hugegraph_llm.indices.vector_index import VectorIndex
@@ -25,20 +25,35 @@ from hugegraph_llm.models.embeddings.base import BaseEmbedding
 
 
 class SemanticIdQuery:
-    def __init__(self, embedding: BaseEmbedding, topk_per_keyword: int = 1):
+    def __init__(
+            self,
+            embedding: BaseEmbedding,
+            by: Literal["query", "keywords"] = "keywords",
+            topk_per_query: int = 10,
+            topk_per_keyword: int = 1
+    ):
        index_file = str(os.path.join(resource_path, settings.graph_name, "vid.faiss"))
        content_file = str(os.path.join(resource_path, settings.graph_name, "vid.pkl"))
        self.vector_index = VectorIndex.from_index_file(index_file, content_file)
         self.embedding = embedding
-        self._topk_per_keyword = topk_per_keyword
+        self.by = by
+        self.topk_per_query = topk_per_query
+        self.topk_per_keyword = topk_per_keyword
 
     def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
-        keywords = context["keywords"]
         graph_query_entrance = []
-        for keyword in keywords:
-            query_vector = self.embedding.get_text_embedding(keyword)
-            results = self.vector_index.search(query_vector, top_k=self._topk_per_keyword)
+        if self.by == "query":
+            query = context["query"]
+            query_vector = self.embedding.get_text_embedding(query)
+            results = self.vector_index.search(query_vector, top_k=self.topk_per_query)
             if results:
-                graph_query_entrance.extend(results[:self._topk_per_keyword])
+                graph_query_entrance.extend(results[:self.topk_per_query])
+        else:  # by keywords
+            keywords = context["keywords"]
+            for keyword in keywords:
+                keyword_vector = self.embedding.get_text_embedding(keyword)
+                results = self.vector_index.search(keyword_vector, top_k=self.topk_per_keyword)
+                if results:
+                    graph_query_entrance.extend(results[:self.topk_per_keyword])
         context["entrance_vids"] = list(set(graph_query_entrance))
         return context
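
To contrast the two matching modes (a sketch: the embedding backend and the sample query are placeholders, and the vid index files must already exist under resource_path):

    from hugegraph_llm.models.embeddings.init_embedding import Embeddings
    from hugegraph_llm.operators.index_op.semantic_id_query import SemanticIdQuery

    embedding = Embeddings().get_embedding()
    # Keyword mode (default): up to topk_per_keyword vids per extracted keyword
    by_keywords = SemanticIdQuery(embedding, by="keywords", topk_per_keyword=1)
    # Query mode: match the full sentence against the vid index instead
    by_query = SemanticIdQuery(embedding, by="query", topk_per_query=10)
    context = by_query.run({"query": "Where was Al Pacino born?"})
    print(context["entrance_vids"])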
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py b/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
index b9d6a6a..876d1c8 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
@@ -17,7 +17,7 @@
 
 
 import time
-from typing import Dict, Any, Optional, Literal
+from typing import Dict, Any, Optional, Literal, Union, List
 
 from pyhugegraph.client import PyHugeClient
 
@@ -63,7 +63,7 @@ class KgBuilder:
 
     def chunk_split(
             self,
-            text: str,  # text to be split
+            text: Union[str, List[str]],  # text to be split
             split_type: Literal["paragraph", "sentence"] = "paragraph",
             language: Literal["zh", "en"] = "zh"
     ):
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
index f885182..f3803c7 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
@@ -28,6 +28,10 @@ DEFAULT_ANSWER_SYNTHESIZE_TEMPLATE_TMPL = (
     "---------------------\n"
     "{context_str}\n"
     "---------------------\n"
+    "You need to refer to the context based on the following priority:\n"
+    "1. Graph recall > vector recall\n"
+    "2. Exact recall > Fuzzy recall\n"
+    "3. Independent vertex > 1-depth neighbor> 2-depth neighbors\n"
     "Given the context information and not prior knowledge, answer the 
query.\n"
     "Query: {query_str}\n"
     "Answer: "
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/hugegraph_utils.py b/hugegraph-llm/src/hugegraph_llm/utils/hugegraph_utils.py
index d942bf9..3320efa 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/hugegraph_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/hugegraph_utils.py
@@ -32,6 +32,7 @@ def get_hg_client():
         settings.graph_name,
         settings.graph_user,
         settings.graph_pwd,
+        settings.graph_space,
     )
 
 
@@ -41,9 +42,7 @@ def init_hg_test_data():
     schema = client.schema()
     schema.propertyKey("name").asText().ifNotExist().create()
     schema.propertyKey("birthDate").asText().ifNotExist().create()
-    schema.vertexLabel("Person").properties(
-        "name", "birthDate"
-    ).useCustomizeStringId().ifNotExist().create()
+    schema.vertexLabel("Person").properties("name", 
"birthDate").useCustomizeStringId().ifNotExist().create()
     
schema.vertexLabel("Movie").properties("name").useCustomizeStringId().ifNotExist().create()
     
schema.edgeLabel("ActedIn").sourceLabel("Person").targetLabel("Movie").ifNotExist().create()
 
diff --git a/hugegraph-python-client/src/pyhugegraph/example/hugegraph_example.py b/hugegraph-python-client/src/pyhugegraph/example/hugegraph_example.py
index d58190e..15ce868 100644
--- a/hugegraph-python-client/src/pyhugegraph/example/hugegraph_example.py
+++ b/hugegraph-python-client/src/pyhugegraph/example/hugegraph_example.py
@@ -19,7 +19,7 @@ from pyhugegraph.client import PyHugeClient
 
 if __name__ == "__main__":
     client = PyHugeClient(
-        "127.0.0.1", "8080", user="admin", pwd="admin", graph="hugegraph"
+        "127.0.0.1", "8080", user="admin", pwd="admin", graph="hugegraph", 
graphspace=None
     )
 
     """schema"""
diff --git a/hugegraph-python-client/src/pyhugegraph/example/hugegraph_test.py b/hugegraph-python-client/src/pyhugegraph/example/hugegraph_test.py
index 739862d..e58a98d 100644
--- a/hugegraph-python-client/src/pyhugegraph/example/hugegraph_test.py
+++ b/hugegraph-python-client/src/pyhugegraph/example/hugegraph_test.py
@@ -24,7 +24,7 @@ class HugeGraph:
         username: str = "default",
         password: str = "default",
         address: str = "127.0.0.1",
-        port: int = 8081,
+        port: str = "8081",
         graph: str = "hugegraph",
     ) -> None:
         """Create a new HugeGraph wrapper instance."""
@@ -42,7 +42,7 @@ class HugeGraph:
         self.port = port
         self.graph = graph
         self.client = PyHugeClient(
-            address, port, user=username, pwd=password, graph=graph
+            address, port, user=username, pwd=password, graph=graph, graphspace=None
         )
         self.schema = ""
 
diff --git a/hugegraph-python-client/src/tests/client_utils.py b/hugegraph-python-client/src/tests/client_utils.py
index 12b35db..6a75ab1 100644
--- a/hugegraph-python-client/src/tests/client_utils.py
+++ b/hugegraph-python-client/src/tests/client_utils.py
@@ -20,15 +20,16 @@ from pyhugegraph.client import PyHugeClient
 
 class ClientUtils:
     IP = "127.0.0.1"
-    PORT = 8080
+    PORT = "8080"
     GRAPH = "hugegraph"
     USERNAME = "admin"
     PASSWORD = "admin"
+    GRAPHSPACE = None
     TIMEOUT = 10
 
     def __init__(self):
         self.client = PyHugeClient(
-            self.IP, self.PORT, user=self.USERNAME, pwd=self.PASSWORD, graph=self.GRAPH
+            self.IP, self.PORT, user=self.USERNAME, pwd=self.PASSWORD, graph=self.GRAPH, graphspace=self.GRAPHSPACE
         )
         assert self.client is not None
 

