This is an automated email from the ASF dual-hosted git repository.
jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new f7fc02f feat(llm): support multiple property_type & importing graph
from the entire doc (#84)
f7fc02f is described below
commit f7fc02f4091c2c64c4f1c19fbb69d6c5b84dca52
Author: vichayturen <[email protected]>
AuthorDate: Thu Oct 10 16:19:00 2024 +0800
feat(llm): support multiple property_type & importing graph from the entire
doc (#84)
Done:
1. Fixed a bug in configuring the Qianfan LLM API
2. Added two enums for properties settings
3. Supported different property types
4. Supported list/set cardinality for property_key
TODO:
- [ ] enhance the chunk-splitting logic for vector & graph (maybe we should
split them separately)
---------
Co-authored-by: imbajin <[email protected]>
---
hugegraph-llm/src/hugegraph_llm/api/rag_api.py | 6 +-
.../hugegraph_llm/demo/rag_demo/configs_block.py | 3 +-
.../src/hugegraph_llm/enums/id_strategy.py | 10 +-
.../{build_mode.py => property_cardinality.py} | 11 +-
.../enums/{build_mode.py => property_data_type.py} | 33 +++-
.../src/hugegraph_llm/models/llms/openai.py | 5 +-
.../operators/common_op/check_schema.py | 159 +++++++++++--------
.../operators/document_op/chunk_split.py | 56 ++++---
.../operators/document_op/word_extract.py | 4 +-
.../src/hugegraph_llm/operators/graph_rag_task.py | 1 -
.../operators/hugegraph_op/commit_to_hugegraph.py | 172 +++++++++++++++++----
.../operators/hugegraph_op/fetch_graph_data.py | 9 +-
.../operators/hugegraph_op/schema_manager.py | 15 +-
.../operators/index_op/build_semantic_index.py | 2 +-
.../operators/kg_construction_task.py | 26 ++--
.../operators/llm_op/answer_synthesize.py | 4 +-
.../operators/llm_op/keyword_extract.py | 4 +-
.../src/hugegraph_llm/utils/graph_index_utils.py | 12 +-
18 files changed, 335 insertions(+), 197 deletions(-)
diff --git a/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
b/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
index 893c559..8379550 100644
--- a/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
+++ b/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
@@ -70,13 +70,9 @@ def rag_http_api(
near_neighbor_first=req.near_neighbor_first,
custom_related_information=req.custom_related_information
)
- # TODO/FIXME: handle QianFanClient error (not dict..critical)
- # log.critical(f"## {type(result)}, {json.dumps(result)}")
+
if isinstance(result, dict):
- log.critical("##1. %s", type(result))
return {"graph_recall": result}
-
- log.critical("##2. %s", type(result))
return {"graph_recall": json.dumps(result)}
except TypeError as e:
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
index 8de30c8..60ad0fe 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
@@ -63,7 +63,8 @@ def test_api_connection(url, method="GET", headers=None,
params=None, body=None,
def config_qianfan_model(arg1, arg2, arg3=None, origin_call=None) -> int:
settings.qianfan_api_key = arg1
settings.qianfan_secret_key = arg2
- settings.qianfan_language_model = arg3
+ if arg3:
+ settings.qianfan_language_model = arg3
params = {
"grant_type": "client_credentials",
"client_id": arg1,
diff --git a/hugegraph-llm/src/hugegraph_llm/enums/id_strategy.py
b/hugegraph-llm/src/hugegraph_llm/enums/id_strategy.py
index 5f3cadb..f792c1e 100644
--- a/hugegraph-llm/src/hugegraph_llm/enums/id_strategy.py
+++ b/hugegraph-llm/src/hugegraph_llm/enums/id_strategy.py
@@ -19,9 +19,9 @@
from enum import Enum
-# Note: we don't support the "UUID" strategy for now
class IdStrategy(Enum):
- AUTOMATIC = "AUTOMATIC"
- CUSTOMIZE_NUMBER = "CUSTOMIZE_NUMBER"
- CUSTOMIZE_STRING = "CUSTOMIZE_STRING"
- PRIMARY_KEY = "PRIMARY_KEY"
+ AUTOMATIC = "automatic"
+ PRIMARY_KEY = "primary_key"
+ CUSTOMIZE_STRING = "customize_string"
+ CUSTOMIZE_NUMBER = "customize_number"
+ CUSTOMIZE_UUID = "customize_uuid"
diff --git a/hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
b/hugegraph-llm/src/hugegraph_llm/enums/property_cardinality.py
similarity index 79%
copy from hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
copy to hugegraph-llm/src/hugegraph_llm/enums/property_cardinality.py
index 50db4c8..7ad161b 100644
--- a/hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
+++ b/hugegraph-llm/src/hugegraph_llm/enums/property_cardinality.py
@@ -19,9 +19,8 @@
from enum import Enum
-class BuildMode(Enum):
- REBUILD_VECTOR = "Rebuild Vector"
- TEST_MODE = "Test Mode"
- IMPORT_MODE = "Import Mode"
- CLEAR_AND_IMPORT = "Clear and Import"
- REBUILD_VERTEX_INDEX = "Rebuild vertex index"
+class PropertyCardinality(Enum):
+ SINGLE = "SINGLE"
+ LIST = "LIST"
+ SET = "SET"
+ DEFAULT = SINGLE
diff --git a/hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
b/hugegraph-llm/src/hugegraph_llm/enums/property_data_type.py
similarity index 60%
rename from hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
rename to hugegraph-llm/src/hugegraph_llm/enums/property_data_type.py
index 50db4c8..683bf45 100644
--- a/hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
+++ b/hugegraph-llm/src/hugegraph_llm/enums/property_data_type.py
@@ -19,9 +19,30 @@
from enum import Enum
-class BuildMode(Enum):
- REBUILD_VECTOR = "Rebuild Vector"
- TEST_MODE = "Test Mode"
- IMPORT_MODE = "Import Mode"
- CLEAR_AND_IMPORT = "Clear and Import"
- REBUILD_VERTEX_INDEX = "Rebuild vertex index"
+class PropertyDataType(Enum):
+ BOOLEAN = "BOOLEAN"
+ BYTE = "BYTE"
+ INT = "INT"
+ LONG = "LONG"
+ FLOAT = "FLOAT"
+ DOUBLE = "DOUBLE"
+ TEXT = "TEXT"
+ BLOB = "BLOB"
+ DATE = "DATE"
+ UUID = "UUID"
+ DEFAULT = TEXT
+
+
+def default_value_map(data_type: str):
+ return {
+ "BOOLEAN": False,
+ "BYTE": 0,
+ "INT": 0,
+ "LONG": 0,
+ "FLOAT": 0.0,
+ "DOUBLE": 0.0,
+ "TEXT": "",
+ "BLOB": "",
+ "DATE": "2000-01-01",
+ "UUID": "",
+ }[data_type]
diff --git a/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
b/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
index 8a6ab8f..be1d1f7 100644
--- a/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
+++ b/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
@@ -15,12 +15,11 @@
# specific language governing permissions and limitations
# under the License.
-import json
from typing import Callable, List, Optional, Dict, Any
import openai
-from openai import OpenAI, AsyncOpenAI
import tiktoken
+from openai import OpenAI, AsyncOpenAI
from retry import retry
from hugegraph_llm.models.llms.base import BaseLLM
@@ -141,7 +140,7 @@ class OpenAIClient(BaseLLM):
def max_allowed_token_length(self) -> int:
"""Get max-allowed token length"""
# TODO: list all models and their max tokens from api
- return 2049
+ return 8192
def get_llm_type(self) -> str:
return "openai"
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py
b/hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py
index 10ed2dc..3220d9f 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py
@@ -17,83 +17,106 @@
from typing import Any, Optional, Dict
+
+from hugegraph_llm.enums.property_cardinality import PropertyCardinality
+from hugegraph_llm.enums.property_data_type import PropertyDataType
from hugegraph_llm.utils.log import log
+def log_and_raise(message: str) -> None:
+ log.warning(message)
+ raise ValueError(message)
+
+
+def check_type(value: Any, expected_type: type, message: str) -> None:
+ if not isinstance(value, expected_type):
+ log_and_raise(message)
+
+
class CheckSchema:
- def __init__(self, data):
+ def __init__(self, data: Dict[str, Any]):
self.result = None
self.data = data
- def run(self, context: Optional[Dict[str, Any]] = None) -> Any: # pylint:
disable=too-many-branches
+ def run(self, context: Optional[Dict[str, Any]] = None) -> Any:
if context is None:
context = {}
+
+ # 1. Validate the schema structure
schema = self.data or context.get("schema")
- if not isinstance(schema, dict):
- raise ValueError("Input data is not a dictionary.")
- if "vertexlabels" not in schema or "edgelabels" not in schema:
- raise ValueError("Input data does not contain 'vertexlabels' or
'edgelabels'.")
- if not isinstance(schema["vertexlabels"], list) or not
isinstance(schema["edgelabels"],
-
list):
- raise ValueError("'vertexlabels' or 'edgelabels' in input data is
not a list.")
- for vertex in schema["vertexlabels"]:
- if not isinstance(vertex, dict):
- raise ValueError("Vertex in input data is not a dictionary.")
- if "name" not in vertex:
- raise ValueError("Vertex in input data does not contain
'name'.")
- if not isinstance(vertex["name"], str):
- raise ValueError("'name' in vertex is not of correct type.")
- if "properties" not in vertex:
- raise ValueError("Vertex in input data does not contain
'properties'.")
- properties = vertex["properties"]
- if not isinstance(properties, list):
- raise ValueError("'properties' in vertex is not of correct
type.")
- if len(properties) == 0:
- raise ValueError("'properties' in vertex is empty.")
- primary_keys = vertex.get("primary_keys", properties[:1])
- if not isinstance(primary_keys, list):
- raise ValueError("'primary_keys' in vertex is not of correct
type.")
- new_primary_keys = []
- for key in primary_keys:
- if key not in properties:
- log.waring("Primary key '%s' not found in properties has
been auto removed.",
- key)
- else:
- new_primary_keys.append(key)
- if len(new_primary_keys) == 0:
- raise ValueError(f"primary keys of vertexLabel
{vertex['vertex_label']} is empty.")
- vertex["primary_keys"] = new_primary_keys
- nullable_keys = vertex.get("nullable_keys", properties[1:])
- if not isinstance(nullable_keys, list):
- raise ValueError("'nullable_keys' in vertex is not of correct
type.")
- new_nullable_keys = []
- for key in nullable_keys:
- if key not in properties:
- log.warning("Nullable key '%s' not found in properties has
been auto removed.",
- key)
- else:
- new_nullable_keys.append(key)
- vertex["nullable_keys"] = new_nullable_keys
- for edge in schema["edgelabels"]:
- if not isinstance(edge, dict):
- raise ValueError("Edge in input data is not a dictionary.")
- if (
- "name" not in edge
- or "source_label" not in edge
- or "target_label" not in edge
- ):
- raise ValueError(
- "Edge in input data does not contain "
- "'name', 'source_label', 'target_label'."
- )
- if (
- not isinstance(edge["name"], str)
- or not isinstance(edge["source_label"], str)
- or not isinstance(edge["target_label"], str)
- ):
- raise ValueError(
- "'name', 'source_label', 'target_label' "
- "in edge is not of correct type."
- )
+ self._validate_schema(schema)
+ # 2. Process property labels and also create a set for it
+ property_labels, property_label_set =
self._process_property_labels(schema)
+ # 3. Process properties in given vertex/edge labels
+ self._process_vertex_labels(schema, property_labels,
property_label_set)
+ self._process_edge_labels(schema, property_labels, property_label_set)
+ # 4. Update schema with processed pks
+ schema["propertykeys"] = property_labels
context.update({"schema": schema})
return context
+
+ def _validate_schema(self, schema: Dict[str, Any]) -> None:
+ check_type(schema, dict, "Input data is not a dictionary.")
+ if "vertexlabels" not in schema or "edgelabels" not in schema:
+ log_and_raise("Input data does not contain 'vertexlabels' or
'edgelabels'.")
+ check_type(schema["vertexlabels"], list, "'vertexlabels' in input data
is not a list.")
+ check_type(schema["edgelabels"], list, "'edgelabels' in input data is
not a list.")
+
+ def _process_property_labels(self, schema: Dict[str, Any]) -> (list, set):
+ property_labels = schema.get("propertykeys", [])
+ check_type(property_labels, list, "'propertykeys' in input data is not
of correct type.")
+ property_label_set = {label["name"] for label in property_labels}
+ return property_labels, property_label_set
+
+ def _process_vertex_labels(self, schema: Dict[str, Any], property_labels:
list, property_label_set: set) -> None:
+ for vertex_label in schema["vertexlabels"]:
+ self._validate_vertex_label(vertex_label)
+ properties = vertex_label["properties"]
+ primary_keys = self._process_keys(vertex_label, "primary_keys",
properties[:1])
+ if len(primary_keys) == 0:
+ log_and_raise(f"'primary_keys' of {vertex_label['name']} is
empty.")
+ vertex_label["primary_keys"] = primary_keys
+ nullable_keys = self._process_keys(vertex_label, "nullable_keys",
properties[1:])
+ vertex_label["nullable_keys"] = nullable_keys
+ self._add_missing_properties(properties, property_labels,
property_label_set)
+
+ def _process_edge_labels(self, schema: Dict[str, Any], property_labels:
list, property_label_set: set) -> None:
+ for edge_label in schema["edgelabels"]:
+ self._validate_edge_label(edge_label)
+ properties = edge_label.get("properties", [])
+ self._add_missing_properties(properties, property_labels,
property_label_set)
+
+ def _validate_vertex_label(self, vertex_label: Dict[str, Any]) -> None:
+ check_type(vertex_label, dict, "VertexLabel in input data is not a
dictionary.")
+ if "name" not in vertex_label:
+ log_and_raise("VertexLabel in input data does not contain 'name'.")
+ check_type(vertex_label["name"], str, "'name' in vertex_label is not
of correct type.")
+ if "properties" not in vertex_label:
+ log_and_raise("VertexLabel in input data does not contain
'properties'.")
+ check_type(vertex_label["properties"], list, "'properties' in
vertex_label is not of correct type.")
+ if len(vertex_label["properties"]) == 0:
+ log_and_raise("'properties' in vertex_label is empty.")
+
+ def _validate_edge_label(self, edge_label: Dict[str, Any]) -> None:
+ check_type(edge_label, dict, "EdgeLabel in input data is not a
dictionary.")
+ if "name" not in edge_label or "source_label" not in edge_label or
"target_label" not in edge_label:
+ log_and_raise("EdgeLabel in input data does not contain 'name',
'source_label', 'target_label'.")
+ check_type(edge_label["name"], str, "'name' in edge_label is not of
correct type.")
+ check_type(edge_label["source_label"], str, "'source_label' in
edge_label is not of correct type.")
+ check_type(edge_label["target_label"], str, "'target_label' in
edge_label is not of correct type.")
+
+ def _process_keys(self, label: Dict[str, Any], key_type: str,
default_keys: list) -> list:
+ keys = label.get(key_type, default_keys)
+ check_type(keys, list, f"'{key_type}' in {label['name']} is not of
correct type.")
+ new_keys = [key for key in keys if key in label["properties"]]
+ return new_keys
+
+ def _add_missing_properties(self, properties: list, property_labels: list,
property_label_set: set) -> None:
+ for prop in properties:
+ if prop not in property_label_set:
+ property_labels.append({
+ "name": prop,
+ "data_type": PropertyDataType.DEFAULT.value,
+ "cardinality": PropertyCardinality.DEFAULT.value,
+ })
+ property_label_set.add(prop)
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
index e450b3c..b26d094 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
@@ -17,45 +17,57 @@
from typing import Literal, Dict, Any, Optional, Union, List
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
+# Constants
+LANGUAGE_ZH = "zh"
+LANGUAGE_EN = "en"
+SPLIT_TYPE_DOCUMENT = "document"
+SPLIT_TYPE_PARAGRAPH = "paragraph"
+SPLIT_TYPE_SENTENCE = "sentence"
class ChunkSplit:
def __init__(
- self,
- texts: Union[str, List[str]],
- split_type: Literal["paragraph", "sentence"] = "paragraph",
- language: Literal["zh", "en"] = "zh"
+ self,
+ texts: Union[str, List[str]],
+ split_type: Literal["document", "paragraph", "sentence"] =
SPLIT_TYPE_DOCUMENT,
+ language: Literal["zh", "en"] = LANGUAGE_ZH,
):
if isinstance(texts, str):
texts = [texts]
self.texts = texts
- if language == "zh":
- separators = ["\n\n", "\n", "。", ",", ""]
- elif language == "en":
- separators = ["\n\n", "\n", ".", ",", " ", ""]
+ self.separators = self._get_separators(language)
+ self.text_splitter = self._get_text_splitter(split_type)
+
+ def _get_separators(self, language: str) -> List[str]:
+ if language == LANGUAGE_ZH:
+ return ["\n\n", "\n", "。", ",", ""]
+ elif language == LANGUAGE_EN:
+ return ["\n\n", "\n", ".", ",", " ", ""]
else:
raise ValueError("language must be zh or en")
- if split_type == "paragraph":
- self.text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=500,
- chunk_overlap=30,
- separators=separators
- )
- elif split_type == "sentence":
- self.text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=50,
- chunk_overlap=0,
- separators=separators
- )
+
+ def _get_text_splitter(self, split_type: str):
+ if split_type == SPLIT_TYPE_DOCUMENT:
+ return lambda text: [text]
+ elif split_type == SPLIT_TYPE_PARAGRAPH:
+ return RecursiveCharacterTextSplitter(
+ chunk_size=500, chunk_overlap=30, separators=self.separators
+ ).split_text
+ elif split_type == SPLIT_TYPE_SENTENCE:
+ return RecursiveCharacterTextSplitter(
+ chunk_size=50, chunk_overlap=0, separators=self.separators
+ ).split_text
else:
- raise ValueError("type must be paragraph, sentence, html or
markdown")
+ raise ValueError("Type must be paragraph, sentence, html or
markdown")
def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
all_chunks = []
for text in self.texts:
- chunks = self.text_splitter.split_text(text)
+ chunks = self.text_splitter(text)
all_chunks.extend(chunks)
+
if context is None:
return {"chunks": all_chunks}
context["chunks"] = all_chunks
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
index 630e5b0..546d561 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
@@ -45,10 +45,8 @@ class WordExtract:
context["query"] = self._query
if self._llm is None:
- self._llm = context.get("llm") or LLMs().get_llm()
+ self._llm = LLMs().get_llm()
assert isinstance(self._llm, BaseLLM), "Invalid LLM Object."
- if context.get("llm") is None:
- context["llm"] = self._llm
if isinstance(context.get("language"), str):
self._language = context["language"].lower()
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
index dd75b18..e6da8e0 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
@@ -218,7 +218,6 @@ class RAGPipeline:
self.extract_keywords().query_graphdb().synthesize_answer()
context = kwargs
- context["llm"] = self._llm
for operator in self._operators:
context = self._run_operator(operator, context)
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
index 87129b2..1e44786 100644
---
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
+++
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
@@ -15,16 +15,17 @@
# specific language governing permissions and limitations
# under the License.
-
from typing import Dict, Any
from hugegraph_llm.config import settings
+from hugegraph_llm.enums.property_cardinality import PropertyCardinality
+from hugegraph_llm.enums.property_data_type import PropertyDataType,
default_value_map
from hugegraph_llm.utils.log import log
from pyhugegraph.client import PyHugeClient
from pyhugegraph.utils.exceptions import NotFoundError, CreateError
-class CommitToKg:
+class Commit2Graph:
def __init__(self):
self.client = PyHugeClient(
settings.graph_ip,
@@ -54,10 +55,31 @@ class CommitToKg:
self.load_into_graph(vertices, edges, schema)
return data
- def load_into_graph(self, vertices, edges, schema):
+ def _set_default_property(self, key, input_properties, property_label_map):
+ data_type = property_label_map[key]["data_type"]
+ cardinality = property_label_map[key]["cardinality"]
+ if cardinality == PropertyCardinality.SINGLE.value:
+ default_value = default_value_map(data_type)
+ input_properties[key] = default_value
+ else:
+ # list or set
+ default_value = []
+ input_properties[key] = default_value
+ log.warning("Property '%s' missing in vertex, set to '%s' for now",
key, default_value)
+
+ def _handle_graph_creation(self, func, *args, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except NotFoundError as e:
+ log.error(e)
+ except CreateError as e:
+ log.error("Error on creating: %s, %s", args, e)
+
+ def load_into_graph(self, vertices, edges, schema): # pylint:
disable=too-many-statements
# pylint: disable=R0912 (too-many-branches)
vertex_label_map = {v_label["name"]: v_label for v_label in
schema["vertexlabels"]}
edge_label_map = {e_label["name"]: e_label for e_label in
schema["edgelabels"]}
+ property_label_map = {p_label["name"]: p_label for p_label in
schema["propertykeys"]}
for vertex in vertices:
input_label = vertex["label"]
@@ -72,28 +94,45 @@ class CommitToKg:
nullable_keys = vertex_label.get("nullable_keys", [])
non_null_keys = [key for key in vertex_label["properties"] if key
not in nullable_keys]
+ has_problem = False
# 2. Handle primary-keys mode vertex
for pk in primary_keys:
if not input_properties.get(pk):
if len(primary_keys) == 1:
log.error("Primary-key '%s' missing in vertex %s, skip
it & need check it again", pk, vertex)
- continue
- input_properties[pk] = "null" # FIXME: handle
bool/number/date type
+ has_problem = True
+ break
+ # TODO: transform to Enum first (better in earlier step)
+ data_type = property_label_map[pk]["data_type"]
+ cardinality = property_label_map[pk]["cardinality"]
+ if cardinality == PropertyCardinality.SINGLE.value:
+ input_properties[pk] = default_value_map(data_type)
+ else:
+ input_properties[pk] = []
log.warning("Primary-key '%s' missing in vertex %s, mark
empty & need check it again!", pk, vertex)
+ if has_problem:
+ continue
# 3. Ensure all non-nullable props are set
for key in non_null_keys:
if key not in input_properties:
- input_properties[key] = "" # FIXME: handle
bool/number/date type
- log.warning("Property '%s' missing in vertex %s, set to ''
for now", key, vertex)
- try:
- # TODO: we could try batch add vertices first, setback to
single-mode if failed
- vid = self.client.graph().addVertex(input_label,
input_properties).id
- vertex["id"] = vid
- except NotFoundError as e:
- log.error(e)
- except CreateError as e:
- log.error("Error on creating vertex: %s, %s", vertex, e)
+ self._set_default_property(key, input_properties,
property_label_map)
+
+ # 4. Check all data type value is right
+ for key, value in input_properties.items():
+ # TODO: transform to Enum first (better in earlier step)
+ data_type = property_label_map[key]["data_type"]
+ cardinality = property_label_map[key]["cardinality"]
+ if not self._check_property_data_type(data_type, cardinality,
value):
+ log.error("Property type/format '%s' is not correct, skip
it & need check it again", key)
+ has_problem = True
+ break
+ if has_problem:
+ continue
+
+ # TODO: we could try batch add vertices first, setback to
single-mode if failed
+ vid = self._handle_graph_creation(self.client.graph().addVertex,
input_label, input_properties).id
+ vertex["id"] = vid
for edge in edges:
start = edge["outV"]
@@ -104,25 +143,23 @@ class CommitToKg:
if label not in edge_label_map:
log.critical("(Input) EdgeLabel %s not found in schema, skip &
need check it!", label)
continue
- try:
- # TODO: we could try batch add edges first, setback to
single-mode if failed
- self.client.graph().addEdge(label, start, end, properties)
- except NotFoundError as e:
- log.error(e)
- except CreateError as e:
- log.error("Error on creating edge: %s, %s", edge, e)
-
- def init_schema_if_need(self, schema: object):
+
+ # TODO: we could try batch add edges first, setback to single-mode
if failed
+ self._handle_graph_creation(self.client.graph().addEdge, label,
start, end, properties)
+
+ def init_schema_if_need(self, schema: dict):
+ properties = schema["propertykeys"]
vertices = schema["vertexlabels"]
edges = schema["edgelabels"]
+ for prop in properties:
+ self._create_property(prop)
+
for vertex in vertices:
vertex_label = vertex["name"]
properties = vertex["properties"]
nullable_keys = vertex["nullable_keys"]
primary_keys = vertex["primary_keys"]
- for prop in properties:
- self.schema.propertyKey(prop).asText().ifNotExist().create()
self.schema.vertexLabel(vertex_label).properties(*properties).nullableKeys(
*nullable_keys
).usePrimaryKeyId().primaryKeys(*primary_keys).ifNotExist().create()
@@ -132,8 +169,6 @@ class CommitToKg:
source_vertex_label = edge["source_label"]
target_vertex_label = edge["target_label"]
properties = edge["properties"]
- for prop in properties:
- self.schema.propertyKey(prop).asText().ifNotExist().create()
self.schema.edgeLabel(edge_label).sourceLabel(source_vertex_label).targetLabel(
target_vertex_label
).properties(*properties).nullableKeys(*properties).ifNotExist().create()
@@ -153,3 +188,84 @@ class CommitToKg:
s_id = self.client.graph().addVertex("vertex", {"name": s},
id=s).id
t_id = self.client.graph().addVertex("vertex", {"name": o},
id=o).id
self.client.graph().addEdge("edge", s_id, t_id, {"name": p})
+
+ def _create_property(self, prop: dict):
+ name = prop["name"]
+ try:
+ data_type = PropertyDataType(prop["data_type"])
+ cardinality = PropertyCardinality(prop["cardinality"])
+ except ValueError:
+ log.critical("Invalid data type %s / cardinality %s for property
%s, skip & should check it again",
+ prop["data_type"], prop["cardinality"], name)
+ return
+
+ property_key = self.schema.propertyKey(name)
+ self._set_property_data_type(property_key, data_type)
+ self._set_property_cardinality(property_key, cardinality)
+ property_key.ifNotExist().create()
+
+ def _set_property_data_type(self, property_key, data_type):
+ if data_type == PropertyDataType.BOOLEAN:
+ log.error("Boolean type is not supported")
+ elif data_type == PropertyDataType.BYTE:
+ log.warning("Byte type is not supported, use int instead")
+ property_key.asInt()
+ elif data_type == PropertyDataType.INT:
+ property_key.asInt()
+ elif data_type == PropertyDataType.LONG:
+ property_key.asLong()
+ elif data_type == PropertyDataType.FLOAT:
+ log.warning("Float type is not supported, use double instead")
+ property_key.asDouble()
+ elif data_type == PropertyDataType.DOUBLE:
+ property_key.asDouble()
+ elif data_type == PropertyDataType.TEXT:
+ property_key.asText()
+ elif data_type == PropertyDataType.BLOB:
+ log.warning("Blob type is not supported, use text instead")
+ property_key.asText()
+ elif data_type == PropertyDataType.DATE:
+ property_key.asDate()
+ elif data_type == PropertyDataType.UUID:
+ log.warning("UUID type is not supported, use text instead")
+ property_key.asText()
+ else:
+ log.error("Unknown data type %s for property_key %s", data_type,
property_key)
+
+ def _set_property_cardinality(self, property_key, cardinality):
+ if cardinality == PropertyCardinality.SINGLE:
+ property_key.valueSingle()
+ elif cardinality == PropertyCardinality.LIST:
+ property_key.valueList()
+ elif cardinality == PropertyCardinality.SET:
+ property_key.valueSet()
+ else:
+ log.error("Unknown cardinality %s for property_key %s",
cardinality, property_key)
+
+ def _check_property_data_type(self, data_type: str, cardinality: str,
value) -> bool:
+ if cardinality in (PropertyCardinality.LIST.value,
PropertyCardinality.SET.value):
+ return self._check_collection_data_type(data_type, value)
+ return self._check_single_data_type(data_type, value)
+
+ def _check_collection_data_type(self, data_type: str, value) -> bool:
+ if not isinstance(value, list):
+ return False
+ for item in value:
+ if not self._check_single_data_type(data_type, item):
+ return False
+ return True
+
+ def _check_single_data_type(self, data_type: str, value) -> bool:
+ if data_type == PropertyDataType.BOOLEAN.value:
+ return isinstance(value, bool)
+ if data_type in (PropertyDataType.BYTE.value,
PropertyDataType.INT.value, PropertyDataType.LONG.value):
+ return isinstance(value, int)
+ if data_type in (PropertyDataType.FLOAT.value,
PropertyDataType.DOUBLE.value):
+ return isinstance(value, float)
+ if data_type in (PropertyDataType.TEXT.value,
PropertyDataType.UUID.value):
+ return isinstance(value, str)
+ # TODO: check ok below
+ if data_type == PropertyDataType.DATE.value: # the format should be
"yyyy-MM-dd"
+ import re
+ return isinstance(value, str) and re.match(r'^\d{4}-\d{2}-\d{2}$',
value)
+ raise ValueError(f"Unknown/Unsupported data type: {data_type}")
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
index 77233f0..9cb5729 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
@@ -29,12 +29,5 @@ class FetchGraphData:
if context is None:
context = {}
if "vertices" not in context:
- context["vertices"] = []
- vertices = self.graph.gremlin().exec("g.V()")["data"]
- for vertex in vertices:
- context["vertices"].append({
- "id": vertex["id"],
- "label": vertex["label"],
- "properties": vertex["properties"]
- })
+ context["vertices"] =
self.graph.gremlin().exec("g.V().id()")["data"]
return context
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
index 57da3ef..ab50abf 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
@@ -37,20 +37,7 @@ class SchemaManager:
if context is None:
context = {}
schema = self.schema.getSchema()
- vertices = []
- for vl in schema["vertexlabels"]:
- vertex = {"vertex_label": vl["name"], "properties":
vl["properties"]}
- vertices.append(vertex)
- edges = []
- for el in schema["edgelabels"]:
- edge = {
- "edge_label": el["name"],
- "source_vertex_label": el["source_label"],
- "target_vertex_label": el["target_label"],
- "properties": el["properties"],
- }
- edges.append(edge)
- if not vertices and not edges:
+ if not schema["vertexlabels"] and not schema["edgelabels"]:
raise Exception(f"Can not get {self.graph_name}'s schema from
HugeGraph!")
context.update({"schema": schema})
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
b/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
index 135ce9c..4c19cd6 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
@@ -34,7 +34,7 @@ class BuildSemanticIndex:
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
past_vids = self.vid_index.properties
- present_vids = [v["id"] for v in context["vertices"]]
+ present_vids = context["vertices"]
removed_vids = set(past_vids) - set(present_vids)
removed_num = self.vid_index.remove(removed_vids)
added_vids = list(set(present_vids) - set(past_vids))
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
b/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
index 921895f..2fb5966 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
@@ -23,7 +23,7 @@ from hugegraph_llm.models.llms.base import BaseLLM
from hugegraph_llm.operators.common_op.check_schema import CheckSchema
from hugegraph_llm.operators.common_op.print_result import PrintResult
from hugegraph_llm.operators.document_op.chunk_split import ChunkSplit
-from hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph import CommitToKg
+from hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph import
Commit2Graph
from hugegraph_llm.operators.hugegraph_op.fetch_graph_data import
FetchGraphData
from hugegraph_llm.operators.hugegraph_op.schema_manager import SchemaManager
from hugegraph_llm.operators.index_op.build_semantic_index import
BuildSemanticIndex
@@ -36,8 +36,7 @@ from pyhugegraph.client import PyHugeClient
class KgBuilder:
- def __init__(self, llm: BaseLLM, embedding: Optional[BaseEmbedding] = None,
- graph: Optional[PyHugeClient] = None):
+ def __init__(self, llm: BaseLLM, embedding: Optional[BaseEmbedding] =
None, graph: Optional[PyHugeClient] = None):
self.operators = []
self.llm = llm
self.embedding = embedding
@@ -50,9 +49,9 @@ class KgBuilder:
elif from_user_defined:
self.operators.append(CheckSchema(from_user_defined))
elif from_extraction:
- raise Exception("Not implemented yet")
+ raise NotImplementedError("Not implemented yet")
else:
- raise Exception("No input data / invalid schema type")
+ raise ValueError("No input data / invalid schema type")
return self
def fetch_graph_data(self):
@@ -60,16 +59,17 @@ class KgBuilder:
return self
def chunk_split(
- self,
- text: Union[str, List[str]], # text to be split
- split_type: Literal["paragraph", "sentence"] = "paragraph",
- language: Literal["zh", "en"] = "zh"
+ self,
+ text: Union[str, List[str]], # text to be split
+ split_type: Literal["document", "paragraph", "sentence"] = "document",
+ language: Literal["zh", "en"] = "zh",
):
self.operators.append(ChunkSplit(text, split_type, language))
return self
- def extract_info(self, example_prompt: Optional[str] = None,
- extract_type: Literal["triples", "property_graph"] =
"triples"):
+ def extract_info(
+ self, example_prompt: Optional[str] = None, extract_type:
Literal["triples", "property_graph"] = "triples"
+ ):
if extract_type == "triples":
self.operators.append(InfoExtract(self.llm, example_prompt))
elif extract_type == "property_graph":
@@ -81,7 +81,7 @@ class KgBuilder:
return self
def commit_to_hugegraph(self):
- self.operators.append(CommitToKg())
+ self.operators.append(Commit2Graph())
return self
def build_vertex_id_semantic_index(self):
@@ -104,5 +104,5 @@ class KgBuilder:
return context
@log_operator_time
- def _run_operator(self, operator, context):
+ def _run_operator(self, operator, context) -> Dict[str, Any]:
return operator.run(context)
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
index 5272f90..2771cef 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
@@ -55,9 +55,7 @@ class AnswerSynthesize:
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
if self._llm is None:
- self._llm = context.get("llm") or LLMs().get_llm()
- if context.get("llm") is None:
- context["llm"] = self._llm
+ self._llm = LLMs().get_llm()
if self._question is None:
self._question = context.get("query") or None
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
index 0d62121..0df4051 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
@@ -62,10 +62,8 @@ class KeywordExtract:
context["query"] = self._query
if self._llm is None:
- self._llm = context.get("llm") or LLMs().get_llm()
+ self._llm = LLMs().get_llm()
assert isinstance(self._llm, BaseLLM), "Invalid LLM Object."
- if context.get("llm") is None:
- context["llm"] = self._llm
if isinstance(context.get("language"), str):
self._language = context["language"].lower()
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
index 4b8c7ce..5735eae 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
@@ -40,7 +40,7 @@ def get_graph_index_info():
context["vid_index"] = {
"embed_dim": vector_index.index.d,
"num_vectors": vector_index.index.ntotal,
- "num_vids": len(vector_index.properties)
+ "num_vids": len(vector_index.properties),
}
return json.dumps(context, ensure_ascii=False, indent=2)
@@ -70,14 +70,11 @@ def extract_graph(input_file, input_text, schema,
example_prompt) -> str:
builder.import_schema(from_hugegraph=schema)
else:
return "ERROR: please input with correct schema/format."
- builder.chunk_split(texts, "paragraph", "zh").extract_info(example_prompt,
"property_graph")
+ builder.chunk_split(texts, "document", "zh").extract_info(example_prompt,
"property_graph")
try:
context = builder.run()
- graph_elements = {
- "vertices": context["vertices"],
- "edges": context["edges"]
- }
+ graph_elements = {"vertices": context["vertices"], "edges":
context["edges"]}
return json.dumps(graph_elements, ensure_ascii=False, indent=2)
except Exception as e: # pylint: disable=broad-exception-caught
log.error(e)
@@ -113,8 +110,9 @@ def import_graph_data(data: str, schema: str) -> Union[str,
Dict[str, Any]]:
context = builder.commit_to_hugegraph().run(data_json)
gr.Info("Import graph data successfully!")
+ print(context)
return json.dumps(context, ensure_ascii=False, indent=2)
- except Exception as e: # pylint: disable=W0718
+ except Exception as e: # pylint: disable=W0718
log.error(e)
traceback.print_exc()
# Note: can't use gr.Error here