This is an automated email from the ASF dual-hosted git repository.
jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new 3ad572f  feat(llm): modify the summary info and enhance the request logic (#147)
3ad572f is described below
commit 3ad572fb79927326b03d4c78d6e51746f983d8d0
Author: SoJGooo <[email protected]>
AuthorDate: Thu Jan 2 17:06:26 2025 +0800
feat(llm): modify the summary info and enhance the request logic (#147)
1. **Modify the return message of `Get Vector Index Info`**: display the element counts of the **"chunk vector"** and **"graph vid vector"** indexes separately.

```json
{
  "embed_dim": 1024,
  "vector_info": {
    "chunk_vector_num": 8,
    "graph_vid_vector_num": 483,
    "graph_properties_vector_num": 8
  }
}
```
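For reference, a minimal sketch of consuming this summary from Python (the helper is the one touched in this commit; the parsing code itself is illustrative, not part of the diff):

```python
import json

from hugegraph_llm.utils.vector_index_utils import get_vector_index_info

# get_vector_index_info() returns a pretty-printed JSON string (json.dumps, indent=2)
info = json.loads(get_vector_index_info())
print(info["embed_dim"])                            # embedding dimension, e.g. 1024
print(info["vector_info"]["chunk_vector_num"])      # vectors built from doc chunks
print(info["vector_info"]["graph_vid_vector_num"])  # vectors built from graph vids
```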
2. **Modify the return message of `Get Graph Index Info`**: display the total number of vertices and edges, plus a truncated sample of their IDs for a brief overview.

```json
{
  "num_vertices": [483],
  "num_edges": [692],
  "vertices": [
    "12:James",
    "12:Sarah",
    ...
  ],
  "edges": [
    "S9:徐起>12>12>>S10:冀RM78Y7!小型轿车!徐起",
    "S9:刘雪峰>11>11>>S11:未与前车保持足以采取紧急制动措施的安全距离",
    ...
  ],
  "vid_index": {
    "embed_dim": 1024,
    "num_vectors": 483,
    "num_vids": 483
  }
}
```
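The graph-side summary can be consumed the same way (again an illustrative sketch; only the `vid_index` keys are shown here, since the top-level keys come from the Gremlin summary query in `fetch_graph_data.py`):

```python
import json

from hugegraph_llm.utils.graph_index_utils import get_graph_index_info

summary = json.loads(get_graph_index_info())
# vid_index describes the vector index kept for graph vertex ids
print(summary["vid_index"]["embed_dim"], summary["vid_index"]["num_vectors"])
```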
3. **When a user supplies both text and file input at the same time, an error is raised** (a minimal sketch follows).
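The guard lives in `build_vector_index` (see the diff in `vector_index_utils.py` below); in sketch form:

```python
import gradio as gr

def build_vector_index(input_file, input_text):
    # Text and file input are mutually exclusive now; fail fast instead of
    # silently preferring one source over the other
    if input_file and input_text:
        raise gr.Error("Please only choose one between file and text.")
    # ... chunk the documents and build the vector index as before
```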
4. Add `doc_input_text` to `prompt_config.py` to **persist the text in the Doc(s) input**.
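In sketch form, the persistence step mirrors the other prompt fields in `base_prompt_config.py`: the text is indented and written as a YAML block scalar (the sample text and exact indent width here are illustrative):

```python
doc_input_text = "Meet Sarah, a 30-year-old attorney, and her roommate, James."

# Indent each line so the value nests under the YAML "|" block scalar
indented_doc_input_text = "\n".join(f"  {line}" for line in doc_input_text.splitlines())
yaml_fragment = f"doc_input_text: |\n{indented_doc_input_text}\n"
print(yaml_fragment)
```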
---------
Co-authored-by: imbajin <[email protected]>
---
.../config/models/base_prompt_config.py | 5 +++
.../src/hugegraph_llm/config/prompt_config.py | 8 ++++
.../src/hugegraph_llm/demo/rag_demo/app.py | 8 ++--
.../demo/rag_demo/vector_graph_block.py | 43 ++++++++++++----------
.../operators/hugegraph_op/fetch_graph_data.py | 31 +++++++++++++---
.../operators/index_op/build_semantic_index.py | 3 +-
.../src/hugegraph_llm/utils/graph_index_utils.py | 20 +++++++---
.../src/hugegraph_llm/utils/vector_index_utils.py | 21 +++++++----
8 files changed, 97 insertions(+), 42 deletions(-)
diff --git a/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py
index a03b4df..1557272 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py
@@ -35,6 +35,7 @@ class BasePromptConfig:
keywords_extract_prompt: str = ''
text2gql_graph_schema: str = ''
gremlin_generate_prompt: str = ''
+ doc_input_text: str = ''
def ensure_yaml_file_exists(self):
if os.path.exists(yaml_file_path):
@@ -61,6 +62,7 @@ class BasePromptConfig:
indented_keywords_extract_template = (
"\n".join([f" {line}" for line in
self.keywords_extract_prompt.splitlines()])
)
+ indented_doc_input_text = "\n".join([f" {line}" for line in self.doc_input_text.splitlines()])
# This can be extended to add storage fields according to the data needs to be stored
yaml_content = f"""graph_schema: |
@@ -87,6 +89,9 @@ keywords_extract_prompt: |
gremlin_generate_prompt: |
{indented_gremlin_prompt}
+doc_input_text: |
+{indented_doc_input_text}
+
"""
with open(yaml_file_path, "w", encoding="utf-8") as file:
file.write(yaml_content)
diff --git a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
index f6aeef5..8f55fa6 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
@@ -203,4 +203,12 @@ g.V().limit(10)
{query}
The generated gremlin is:
+"""
+
+ doc_input_text: str = """Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010.
+James, in his professional life, works as a journalist. Additionally, Sarah is the proud owner of the website
+www.sarahsplace.com, while James manages his own webpage, though the specific URL is not mentioned here.
+These two individuals, Sarah and James, have not only forged a strong personal bond as roommates but have also
+carved out their distinctive digital presence through their respective webpages, showcasing their varied interests
+and experiences.
+"""
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
index 1d94bd9..e0508dd 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
@@ -57,7 +57,7 @@ def authenticate(credentials: HTTPAuthorizationCredentials = Depends(sec)):
headers={"WWW-Authenticate": "Bearer"},
)
-
+ # TODO: move the logic to a separate file
async def timely_update_vid_embedding():
while True:
try:
@@ -120,7 +120,7 @@ def init_rag_ui() -> gr.Interface:
textbox_array_graph_config = create_configs_block()
with gr.Tab(label="1. Build RAG Index 💡"):
- textbox_input_schema, textbox_info_extract_template = create_vector_graph_block()
+ textbox_input_text, textbox_input_schema, textbox_info_extract_template = create_vector_graph_block()
with gr.Tab(label="2. (Graph)RAG & User Functions 📖"):
(
textbox_inp,
@@ -147,6 +147,7 @@ def init_rag_ui() -> gr.Interface:
huge_settings.graph_user,
huge_settings.graph_pwd,
huge_settings.graph_space,
+ prompt.doc_input_text,
prompt.graph_schema,
prompt.extract_graph_prompt,
prompt.default_question,
@@ -155,7 +156,7 @@ def init_rag_ui() -> gr.Interface:
prompt.custom_rerank_info,
prompt.default_question,
huge_settings.graph_name,
- prompt.gremlin_generate_prompt,
+ prompt.gremlin_generate_prompt
)
hugegraph_llm_ui.load( # pylint: disable=E1101
@@ -167,6 +168,7 @@ def init_rag_ui() -> gr.Interface:
textbox_array_graph_config[3],
textbox_array_graph_config[4],
textbox_array_graph_config[5],
+ textbox_input_text,
textbox_input_schema,
textbox_info_extract_template,
textbox_inp,
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py
index 4bb49e3..1e48e21 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py
@@ -17,11 +17,9 @@
# pylint: disable=E1101
-import os
-
import gradio as gr
-from hugegraph_llm.config import resource_path, prompt
+from hugegraph_llm.config import prompt
from hugegraph_llm.utils.graph_index_utils import (
get_graph_index_info,
clean_all_graph_index,
@@ -31,10 +29,10 @@ from hugegraph_llm.utils.graph_index_utils import (
)
from hugegraph_llm.utils.vector_index_utils import clean_vector_index, build_vector_index, get_vector_index_info
-
-def store_prompt(schema, example_prompt):
- # update env variables: schema and example_prompt
- if prompt.graph_schema != schema or prompt.extract_graph_prompt != example_prompt:
+def store_prompt(doc, schema, example_prompt):
+ # update env variables: doc, schema and example_prompt
+ if prompt.doc_input_text != doc or prompt.graph_schema != schema or prompt.extract_graph_prompt != example_prompt:
+ prompt.doc_input_text = doc
prompt.graph_schema = schema
prompt.extract_graph_prompt = example_prompt
prompt.update_yaml_file()
@@ -53,7 +51,7 @@ def create_vector_graph_block():
to modify it)
- Specify the name of the HugeGraph graph instance, it will automatically get the schema from it (like
**"hugegraph"**)
-- Graph extract head: The user-defined prompt of graph extracting
+- Graph Extract Prompt Header: The user-defined prompt of graph extracting
- If already exist the graph data, you should click "**Rebuild vid Index**" to update the index
"""
)
@@ -61,16 +59,21 @@ def create_vector_graph_block():
with gr.Row():
with gr.Column():
with gr.Tab("text") as tab_upload_text:
- input_text = gr.Textbox(value="", label="Doc(s)", lines=20, show_copy_button=True)
+ input_text = gr.Textbox(
+ value=prompt.doc_input_text,
+ label="Doc(s)",
+ lines=20,
+ show_copy_button=True
+ )
with gr.Tab("file") as tab_upload_file:
input_file = gr.File(
- value=[os.path.join(resource_path, "demo", "test.txt")],
+ value=None,
label="Docs (multi-files can be selected together)",
file_count="multiple",
)
input_schema = gr.Textbox(value=prompt.graph_schema, label="Schema", lines=15, show_copy_button=True)
info_extract_template = gr.Textbox(
- value=prompt.extract_graph_prompt, label="Graph extract head", lines=15, show_copy_button=True
+ value=prompt.extract_graph_prompt, label="Graph Extract Prompt Header", lines=15, show_copy_button=True
)
out = gr.Code(label="Output", language="json", elem_classes="code-container-edit")
@@ -91,37 +94,37 @@ def create_vector_graph_block():
vector_index_btn0.click(get_vector_index_info, outputs=out).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
vector_index_btn1.click(clean_vector_index).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
vector_import_bt.click(build_vector_index, inputs=[input_file, input_text], outputs=out).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
graph_index_btn0.click(get_graph_index_info, outputs=out).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
graph_index_btn1.click(clean_all_graph_index).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
graph_index_rebuild_bt.click(update_vid_embedding, outputs=out).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
# origin_out = gr.Textbox(visible=False)
graph_extract_bt.click(
extract_graph, inputs=[input_file, input_text, input_schema, info_extract_template], outputs=[out]
- ).then(store_prompt, inputs=[input_schema, info_extract_template], )
+ ).then(store_prompt, inputs=[input_text, input_schema, info_extract_template], )
graph_loading_bt.click(import_graph_data, inputs=[out, input_schema], outputs=[out]).then(update_vid_embedding).then(
store_prompt,
- inputs=[input_schema, info_extract_template],
+ inputs=[input_text, input_schema, info_extract_template],
)
def on_tab_select(input_f, input_t, evt: gr.SelectData):
@@ -135,4 +138,4 @@ def create_vector_graph_block():
tab_upload_file.select(fn=on_tab_select, inputs=[input_file, input_text], outputs=[input_file, input_text])
tab_upload_text.select(fn=on_tab_select, inputs=[input_file, input_text], outputs=[input_file, input_text])
- return input_schema, info_extract_template
+ return input_text, input_schema, info_extract_template
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
index 1428a3a..e93d916 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
@@ -22,12 +22,31 @@ from pyhugegraph.client import PyHugeClient
class FetchGraphData:
+
def __init__(self, graph: PyHugeClient):
self.graph = graph
- def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
- if context is None:
- context = {}
- if "vertices" not in context:
- context["vertices"] =
self.graph.gremlin().exec("g.V().id().limit(10000)")["data"]
- return context
+ def run(self, graph_summary: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+ if graph_summary is None:
+ graph_summary = {}
+
+ # TODO: v_limit will influence the vid embedding logic in build_semantic_index.py
+ v_limit = 10000
+ e_limit = 200
+ keys = ["vertex_num", "edge_num", "vertices", "edges", "note"]
+
+ groovy_code = f"""
+ def res = [:];
+ res.{keys[0]} = g.V().count().next();
+ res.{keys[1]} = g.E().count().next();
+ res.{keys[2]} = g.V().id().limit({v_limit}).toList();
+ res.{keys[3]} = g.E().id().limit({e_limit}).toList();
+ res.{keys[4]} = "Only ≤{v_limit} VIDs and ≤ {e_limit} EIDs for brief
overview .";
+ return res;
+ """
+
+ result = self.graph.gremlin().exec(groovy_code)["data"]
+
+ if isinstance(result, list) and len(result) > 0:
+ graph_summary.update({key: result[i].get(key) for i, key in enumerate(keys)})
+ return graph_summary
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py b/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
index a69be81..c8ce907 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
@@ -34,7 +34,8 @@ class BuildSemanticIndex:
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
past_vids = self.vid_index.properties
- present_vids = context["vertices"]
+ # TODO: We should build vid vector index separately, especially when the vertices may be very large
+ present_vids = context["vertices"] # Warning: data truncated by fetch_graph_data.py
removed_vids = set(past_vids) - set(present_vids)
removed_num = self.vid_index.remove(removed_vids)
added_vids = list(set(present_vids) - set(past_vids))
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
index ace61c0..a9ebc05 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
@@ -35,14 +35,14 @@ from ..operators.kg_construction_task import KgBuilder
def get_graph_index_info():
builder = KgBuilder(LLMs().get_chat_llm(), Embeddings().get_embedding(), get_hg_client())
- context = builder.fetch_graph_data().run()
+ graph_summary_info = builder.fetch_graph_data().run()
vector_index = VectorIndex.from_index_file(str(os.path.join(resource_path, huge_settings.graph_name, "graph_vids")))
- context["vid_index"] = {
+ graph_summary_info["vid_index"] = {
"embed_dim": vector_index.index.d,
"num_vectors": vector_index.index.ntotal,
"num_vids": len(vector_index.properties),
}
- return json.dumps(context, ensure_ascii=False, indent=2)
+ return json.dumps(graph_summary_info, ensure_ascii=False, indent=2)
def clean_all_graph_index():
@@ -80,8 +80,18 @@ def extract_graph(input_file, input_text, schema, example_prompt) -> str:
try:
context = builder.run()
- graph_elements = {"vertices": context["vertices"], "edges": context["edges"]}
- return json.dumps(graph_elements, ensure_ascii=False, indent=2)
+ if not context["vertices"] and not context["edges"]:
+ log.info("Please check the schema.(The schema may not match the
Doc)")
+ return json.dumps(
+ {
+ "vertices": context["vertices"],
+ "edges": context["edges"],
+ "warning": "The schema may not match the Doc"
+ },
+ ensure_ascii=False,
+ indent=2
+ )
+ return json.dumps({"vertices": context["vertices"], "edges":
context["edges"]}, ensure_ascii=False, indent=2)
except Exception as e: # pylint: disable=broad-exception-caught
log.error(e)
raise gr.Error(str(e))
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/vector_index_utils.py b/hugegraph-llm/src/hugegraph_llm/utils/vector_index_utils.py
index 0911549..ef2b5e9 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/vector_index_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/vector_index_utils.py
@@ -29,7 +29,9 @@ from hugegraph_llm.utils.hugegraph_utils import get_hg_client
def read_documents(input_file, input_text):
- if input_file:
+ if input_text:
+ texts = [input_text]
+ elif input_file:
texts = []
for file in input_file:
full_path = file.name
@@ -48,19 +50,22 @@ def read_documents(input_file, input_text):
raise gr.Error("PDF will be supported later! Try to upload
text/docx now")
else:
raise gr.Error("Please input txt or docx file.")
- elif input_text:
- texts = [input_text]
else:
raise gr.Error("Please input text or upload file.")
return texts
+#pylint: disable=C0301
def get_vector_index_info():
- vector_index = VectorIndex.from_index_file(str(os.path.join(resource_path, huge_settings.graph_name, "chunks")))
+ chunk_vector_index = VectorIndex.from_index_file(str(os.path.join(resource_path, huge_settings.graph_name, "chunks")))
+ graph_vid_vector_index = VectorIndex.from_index_file(str(os.path.join(resource_path, huge_settings.graph_name, "graph_vids")))
return json.dumps({
- "embed_dim": vector_index.index.d,
- "num_vectors": vector_index.index.ntotal,
- "num_properties": len(vector_index.properties)
+ "embed_dim": chunk_vector_index.index.d,
+ "vector_info": {
+ "chunk_vector_num": chunk_vector_index.index.ntotal,
+ "graph_vid_vector_num": graph_vid_vector_index.index.ntotal,
+ "graph_properties_vector_num": len(chunk_vector_index.properties)
+ }
}, ensure_ascii=False, indent=2)
@@ -70,6 +75,8 @@ def clean_vector_index():
def build_vector_index(input_file, input_text):
+ if input_file and input_text:
+ raise gr.Error("Please only choose one between file and text.")
texts = read_documents(input_file, input_text)
builder = KgBuilder(LLMs().get_chat_llm(), Embeddings().get_embedding(), get_hg_client())
context = builder.chunk_split(texts, "paragraph", "zh").build_vector_index().run()