This is an automated email from the ASF dual-hosted git repository.
jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new 3ad572f  feat(llm): modify the summary info and enhance the request logic (#147)
3ad572f is described below
commit 3ad572fb79927326b03d4c78d6e51746f983d8d0
Author: SoJGooo <[email protected]>
AuthorDate: Thu Jan 2 17:06:26 2025 +0800
feat(llm): modify the summary info and enhance the request logic (#147)
1. **Modify the return message of `Get Vector Index Info`**: display the element counts of the **"chunk vector"** and **"graph vid vector"** indexes separately.

```json
{
  "embed_dim": 1024,
  "vector_info": {
    "chunk_vector_num": 8,
    "graph_vid_vector_num": 483,
    "graph_properties_vector_num": 8
  }
}
```
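For reference, a minimal sketch of consuming this summary from Python (the helper is the one touched in this commit; the parsing code itself is illustrative, not part of the diff):

```python
import json

from hugegraph_llm.utils.vector_index_utils import get_vector_index_info

# get_vector_index_info() returns a pretty-printed JSON string (json.dumps, indent=2)
info = json.loads(get_vector_index_info())
print(info["embed_dim"])                            # embedding dimension, e.g. 1024
print(info["vector_info"]["chunk_vector_num"])      # vectors built from doc chunks
print(info["vector_info"]["graph_vid_vector_num"])  # vectors built from graph vids
```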
2. **Modify the return message of `Get Graph Index Info`**: display the total number of vertices and edges, plus a truncated sample of their IDs for a brief overview.

```json
{
  "num_vertices": [483],
  "num_edges": [692],
  "vertices": [
    "12:James",
    "12:Sarah",
    ...
  ],
  "edges": [
    "S9:徐起>12>12>>S10:冀RM78Y7!小型轿车!徐起",
    "S9:刘雪峰>11>11>>S11:未与前车保持足以采取紧急制动措施的安全距离",
    ...
  ],
  "vid_index": {
    "embed_dim": 1024,
    "num_vectors": 483,
    "num_vids": 483
  }
}
```
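The graph-side summary can be consumed the same way (again an illustrative sketch; only the `vid_index` keys are shown here, since the top-level keys come from the Gremlin summary query in `fetch_graph_data.py`):

```python
import json

from hugegraph_llm.utils.graph_index_utils import get_graph_index_info

summary = json.loads(get_graph_index_info())
# vid_index describes the vector index kept for graph vertex ids
print(summary["vid_index"]["embed_dim"], summary["vid_index"]["num_vectors"])
```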
3. **When a user supplies both text and file input at the same time, an error is raised** (a minimal sketch follows).
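The guard lives in `build_vector_index` (see the diff in `vector_index_utils.py` below); in sketch form:

```python
import gradio as gr

def build_vector_index(input_file, input_text):
    # Text and file input are mutually exclusive now; fail fast instead of
    # silently preferring one source over the other
    if input_file and input_text:
        raise gr.Error("Please only choose one between file and text.")
    # ... chunk the documents and build the vector index as before
```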
4. Add `doc_input_text` to `prompt_config.py` to **persist the text in the Doc(s) input**.
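In sketch form, the persistence step mirrors the other prompt fields in `base_prompt_config.py`: the text is indented and written as a YAML block scalar (the sample text and exact indent width here are illustrative):

```python
doc_input_text = "Meet Sarah, a 30-year-old attorney, and her roommate, James."

# Indent each line so the value nests under the YAML "|" block scalar
indented_doc_input_text = "\n".join(f"  {line}" for line in doc_input_text.splitlines())
yaml_fragment = f"doc_input_text: |\n{indented_doc_input_text}\n"
print(yaml_fragment)
```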
---------
Co-authored-by: imbajin <[email protected]>
---
.../config/models/base_prompt_config.py | 5 +++
.../src/hugegraph_llm/config/prompt_config.py | 8 ++++
.../src/hugegraph_llm/demo/rag_demo/app.py | 8 ++--
.../demo/rag_demo/vector_graph_block.py | 43 ++++++++++++----------
.../operators/hugegraph_op/fetch_graph_data.py | 31 +++++++++++++---
.../operators/index_op/build_semantic_index.py | 3 +-
.../src/hugegraph_llm/utils/graph_index_utils.py | 20 +++++++---
.../src/hugegraph_llm/utils/vector_index_utils.py | 21 +++++++----
8 files changed, 97 insertions(+), 42 deletions(-)
diff --git a/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py
index a03b4df..1557272 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py
@@ -35,6 +35,7 @@ class BasePromptConfig:
keywords_extract_prompt: str = ''
text2gql_graph_schema: str = ''
gremlin_generate_prompt: str = ''
+ doc_input_text: str = ''
def ensure_yaml_file_exists(self):
if os.path.exists(yaml_file_path):
@@ -61,6 +62,7 @@ class BasePromptConfig:
indented_keywords_extract_template = (
"\n".join([f" {line}" for line in
self.keywords_extract_prompt.splitlines()])
)
+ indented_doc_input_text = "\n".join([f" {line}" for line in self.doc_input_text.splitlines()])
# This can be extended to add storage fields according to the data needs to be stored
yaml_content = f"""graph_schema: |
@@ -87,6 +89,9 @@ keywords_extract_prompt: |
gremlin_generate_prompt: |
{indented_gremlin_prompt}
+doc_input_text: |
+{indented_doc_input_text}
+
"""
with open(yaml_file_path, "w", encoding="utf-8") as file:
file.write(yaml_content)
diff --git a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
index f6aeef5..8f55fa6 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
@@ -203,4 +203,12 @@ g.V().limit(10)
{query}
The generated gremlin is:
+"""
+
+ doc_input_text: str = """Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010.
+James, in his professional life, works as a journalist. Additionally, Sarah is the proud owner of the website
+www.sarahsplace.com, while James manages his own webpage, though the specific URL is not mentioned here.
+These two individuals, Sarah and James, have not only forged a strong personal bond as roommates but have also
+carved out their distinctive digital presence through their respective webpages, showcasing their varied interests
+and experiences.
+"""
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
index 1d94bd9..e0508dd 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
@@ -57,7 +57,7 @@ def authenticate(credentials: HTTPAuthorizationCredentials = Depends(sec)):
headers={"WWW-Authenticate": "Bearer"},
)
-
+ # TODO: move the logic to a separate file
async def timely_update_vid_embedding():
while True:
try:
@@ -120,7 +120,7 @@ def init_rag_ui() -> gr.Interface:
textbox_array_graph_config = create_configs_block()
with gr.Tab(label="1. Build RAG Index 💡"):
- textbox_input_schema, textbox_info_extract_template = create_vector_graph_block()
+ textbox_input_text, textbox_input_schema, textbox_info_extract_template = create_vector_graph_block()
with gr.Tab(label="2. (Graph)RAG & User Functions 📖"):
(
textbox_inp,
@@ -147,6 +147,7 @@ def init_rag_ui() -> gr.Interface:
huge_settings.graph_user,
huge_settings.graph_pwd,
huge_settings.graph_space,
+ prompt.doc_input_text,
prompt.graph_schema,
prompt.extract_graph_prompt,
prompt.default_question,
@@ -155,7 +156,7 @@ def init_rag_ui() -> gr.Interface:
prompt.custom_rerank_info,
prompt.default_question,
huge_settings.graph_name,
- prompt.gremlin_generate_prompt,
+ prompt.gremlin_generate_prompt
)
hugegraph_llm_ui.load( # pylint: disable=E1101
@@ -167,6 +168,7 @@ def init_rag_ui() -> gr.Interface:
textbox_array_graph_config[3],
textbox_array_graph_config[4],
textbox_array_graph_config[5],
+ textbox_input_text,
textbox_input_schema,
textbox_info_extract_template,
textbox_inp,
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py
index 4bb49e3..1e48e21 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py
@@ -17,11 +17,9 @@
# pylint: disable=E1101
-import os
-
import gradio as gr
-from hugegraph_llm.config import resource_path, prompt
+from hugegraph_llm.config import prompt
from hugegraph_llm.utils.graph_index_utils import (
get_graph_index_info,
clean_all_graph_index,
@@ -31,10 +29,10 @@ from hugegraph_llm.utils.graph_index_utils import (
)
from hugegraph_llm.utils.vector_index_utils import clean_vector_index, build_vector_index, get_vector_index_info
-
-def store_prompt(schema, example_prompt):
- # update env variables: schema and example_prompt
- if prompt.graph_schema != schema or prompt.extract_graph_prompt != example_prompt:
+def store_prompt(doc, schema, example_prompt):
+ # update env variables: doc, schema and example_prompt
+ if prompt.doc_input_text != doc or prompt.graph_schema != schema or prompt.extract_graph_prompt != example_prompt:
+ prompt.doc_input_text = doc
prompt.graph_schema = schema
prompt.extract_graph_prompt = example_prompt
prompt.update_yaml_file()
@@ -53,7 +51,7 @@ def create_vector_graph_block():
to modify it)
- Specify the name of the HugeGraph graph instance, it will automatically get the schema from it (like
**"hugegraph"**)
-- Graph extract head: The user-defined prompt of graph extracting
+- Graph Extract Prompt Header: The user-defined prompt of graph extracting
- If already exist the graph data, you should click "**Rebuild vid Index**" to update the index
"""
)
@@ -61,16 +59,21 @@ def create_vector_graph_block():
with gr.Row():
with gr.Column():
with gr.Tab("text") as tab_upload_text:
- input_text = gr.Textbox(value="", label="Doc(s)", lines=20, show_copy_button=True)
+ input_text = gr.Textbox(
+ value=prompt.doc_input_text,
+ label="Doc(s)",
+ lines=20,
+ show_copy_button=True
+ )
with gr.Tab("file") as tab_upload_file:
input_file = gr.File(
- value=[os.path.join(resource_path, "demo", "test.txt")],
+ value=None,
label="Docs (multi-files can be selected together)",
file_count="multiple",
)
input_schema = gr.Textbox(value=prompt.graph_schema, label="Schema", lines=15, show_copy_button=True)
info_extract_template = gr.Textbox(
- value=prompt.extract_graph_prompt, label="Graph extract head", lines=15, show_copy_button=True
+ value=prompt.extract_graph_prompt, label="Graph Extract Prompt Header", lines=15, show_copy_button=True
)
out = gr.Code(label="Output", language="json", elem_classes="code-container-edit")
@@ -91,37 +94,37 @@ def create_vector_graph_block():
vector_index_btn0.click(get_vector_index_info, outputs=out).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
vector_index_btn1.click(clean_vector_index).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
vector_import_bt.click(build_vector_index, inputs=[input_file, input_text], outputs=out).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
graph_index_btn0.click(get_graph_index_info, outputs=out).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
graph_index_btn1.click(clean_all_graph_index).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
graph_index_rebuild_bt.click(update_vid_embedding, outputs=out).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
# origin_out = gr.Textbox(visible=False)
graph_extract_bt.click(
extract_graph, inputs=[input_file, input_text, input_schema, info_extract_template], outputs=[out]
- ).then(store_prompt, inputs=[input_schema, info_extract_template], )
+ ).then(store_prompt, inputs=[input_text, input_schema, info_extract_template], )
graph_loading_bt.click(import_graph_data, inputs=[out, input_schema], outputs=[out]).then(update_vid_embedding).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
def on_tab_select(input_f, input_t, evt: gr.SelectData):
@@ -135,4 +138,4 @@ def create_vector_graph_block():
tab_upload_file.select(fn=on_tab_select, inputs=[input_file, input_text], outputs=[input_file, input_text])
tab_upload_text.select(fn=on_tab_select, inputs=[input_file, input_text], outputs=[input_file, input_text])
- return input_schema, info_extract_template
+ return input_text, input_schema, info_extract_template
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
index 1428a3a..e93d916 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
@@ -22,12 +22,31 @@ from pyhugegraph.client import PyHugeClient
class FetchGraphData:
+
def __init__(self, graph: PyHugeClient):
self.graph = graph
- def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
- if context is None:
- context = {}
- if "vertices" not in context:
- context["vertices"] =
self.graph.gremlin().exec("g.V().id().limit(10000)")["data"]
- return context
+ def run(self, graph_summary: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+ if graph_summary is None:
+ graph_summary = {}
+
+ # TODO: v_limit will influence the vid embedding logic in build_semantic_index.py
+ v_limit = 10000
+ e_limit = 200
+ keys = ["vertex_num", "edge_num", "vertices", "edges", "note"]
+
+ groovy_code = f"""
+ def res = [:];
+ res.{keys[0]} = g.V().count().next();
+ res.{keys[1]} = g.E().count().next();
+ res.{keys[2]} = g.V().id().limit({v_limit}).toList();
+ res.{keys[3]} = g.E().id().limit({e_limit}).toList();
+ res.{keys[4]} = "Only ≤{v_limit} VIDs and ≤ {e_limit} EIDs for brief
overview .";
+ return res;
+ """
+
+ result = self.graph.gremlin().exec(groovy_code)["data"]
+
+ if isinstance(result, list) and len(result) > 0:
+ graph_summary.update({key: result[i].get(key) for i, key in enumerate(keys)})
+ return graph_summary
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py b/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
index a69be81..c8ce907 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
@@ -34,7 +34,8 @@ class BuildSemanticIndex:
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
past_vids = self.vid_index.properties
- present_vids = context["vertices"]
+ # TODO: We should build vid vector index separately, especially when the vertices may be very large
+ present_vids = context["vertices"] # Warning: data truncated by fetch_graph_data.py
removed_vids = set(past_vids) - set(present_vids)
removed_num = self.vid_index.remove(removed_vids)
added_vids = list(set(present_vids) - set(past_vids))
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
index ace61c0..a9ebc05 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
@@ -35,14 +35,14 @@ from ..operators.kg_construction_task import KgBuilder
def get_graph_index_info():
builder = KgBuilder(LLMs().get_chat_llm(), Embeddings().get_embedding(), get_hg_client())
- context = builder.fetch_graph_data().run()
+ graph_summary_info = builder.fetch_graph_data().run()
vector_index = VectorIndex.from_index_file(str(os.path.join(resource_path, huge_settings.graph_name, "graph_vids")))
- context["vid_index"] = {
+ graph_summary_info["vid_index"] = {
"embed_dim": vector_index.index.d,
"num_vectors": vector_index.index.ntotal,
"num_vids": len(vector_index.properties),
}
- return json.dumps(context, ensure_ascii=False, indent=2)
+ return json.dumps(graph_summary_info, ensure_ascii=False, indent=2)
def clean_all_graph_index():
@@ -80,8 +80,18 @@ def extract_graph(input_file, input_text, schema, example_prompt) -> str:
try:
context = builder.run()
- graph_elements = {"vertices": context["vertices"], "edges": context["edges"]}
- return json.dumps(graph_elements, ensure_ascii=False, indent=2)
+ if not context["vertices"] and not context["edges"]:
+ log.info("Please check the schema.(The schema may not match the
Doc)")
+ return json.dumps(
+ {
+ "vertices": context["vertices"],
+ "edges": context["edges"],
+ "warning": "The schema may not match the Doc"
+ },
+ ensure_ascii=False,
+ indent=2
+ )
+ return json.dumps({"vertices": context["vertices"], "edges":
context["edges"]}, ensure_ascii=False, indent=2)
except Exception as e: # pylint: disable=broad-exception-caught
log.error(e)
raise gr.Error(str(e))
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/vector_index_utils.py b/hugegraph-llm/src/hugegraph_llm/utils/vector_index_utils.py
index 0911549..ef2b5e9 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/vector_index_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/vector_index_utils.py
@@ -29,7 +29,9 @@ from hugegraph_llm.utils.hugegraph_utils import get_hg_client
def read_documents(input_file, input_text):
- if input_file:
+ if input_text:
+ texts = [input_text]
+ elif input_file:
texts = []
for file in input_file:
full_path = file.name
@@ -48,19 +50,22 @@ def read_documents(input_file, input_text):
raise gr.Error("PDF will be supported later! Try to upload
text/docx now")
else:
raise gr.Error("Please input txt or docx file.")
- elif input_text:
- texts = [input_text]
else:
raise gr.Error("Please input text or upload file.")
return texts
+#pylint: disable=C0301
def get_vector_index_info():
- vector_index = VectorIndex.from_index_file(str(os.path.join(resource_path, huge_settings.graph_name, "chunks")))
+ chunk_vector_index = VectorIndex.from_index_file(str(os.path.join(resource_path, huge_settings.graph_name, "chunks")))
+ graph_vid_vector_index = VectorIndex.from_index_file(str(os.path.join(resource_path, huge_settings.graph_name, "graph_vids")))
return json.dumps({
- "embed_dim": vector_index.index.d,
- "num_vectors": vector_index.index.ntotal,
- "num_properties": len(vector_index.properties)
+ "embed_dim": chunk_vector_index.index.d,
+ "vector_info": {
+ "chunk_vector_num": chunk_vector_index.index.ntotal,
+ "graph_vid_vector_num": graph_vid_vector_index.index.ntotal,
+ "graph_properties_vector_num": len(chunk_vector_index.properties)
+ }
}, ensure_ascii=False, indent=2)
@@ -70,6 +75,8 @@ def clean_vector_index():
def build_vector_index(input_file, input_text):
+ if input_file and input_text:
+ raise gr.Error("Please only choose one between file and text.")
texts = read_documents(input_file, input_text)
builder = KgBuilder(LLMs().get_chat_llm(), Embeddings().get_embedding(), get_hg_client())
context = builder.chunk_split(texts, "paragraph", "zh").build_vector_index().run()