This is an automated email from the ASF dual-hosted git repository.
jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new f7fc02f feat(llm): support multiple property_type & importing graph
from the entire doc (#84)
f7fc02f is described below
commit f7fc02f4091c2c64c4f1c19fbb69d6c5b84dca52
Author: vichayturen <[email protected]>
AuthorDate: Thu Oct 10 16:19:00 2024 +0800
feat(llm): support multiple property_type & importing graph from the entire
doc (#84)
Done:
1. Fixed a bug in configuring the Qianfan LLM API
2. Added two enums for properties settings
3. Supported different property types
4. Supported list/set cardinality for property_key
TODO:
- [ ] enhance the chunk-splitting logic for vector & graph (maybe we should
split them separately)
---------
Co-authored-by: imbajin <[email protected]>
---
hugegraph-llm/src/hugegraph_llm/api/rag_api.py | 6 +-
.../hugegraph_llm/demo/rag_demo/configs_block.py | 3 +-
.../src/hugegraph_llm/enums/id_strategy.py | 10 +-
.../{build_mode.py => property_cardinality.py} | 11 +-
.../enums/{build_mode.py => property_data_type.py} | 33 +++-
.../src/hugegraph_llm/models/llms/openai.py | 5 +-
.../operators/common_op/check_schema.py | 159 +++++++++++--------
.../operators/document_op/chunk_split.py | 56 ++++---
.../operators/document_op/word_extract.py | 4 +-
.../src/hugegraph_llm/operators/graph_rag_task.py | 1 -
.../operators/hugegraph_op/commit_to_hugegraph.py | 172 +++++++++++++++++----
.../operators/hugegraph_op/fetch_graph_data.py | 9 +-
.../operators/hugegraph_op/schema_manager.py | 15 +-
.../operators/index_op/build_semantic_index.py | 2 +-
.../operators/kg_construction_task.py | 26 ++--
.../operators/llm_op/answer_synthesize.py | 4 +-
.../operators/llm_op/keyword_extract.py | 4 +-
.../src/hugegraph_llm/utils/graph_index_utils.py | 12 +-
18 files changed, 335 insertions(+), 197 deletions(-)
diff --git a/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
b/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
index 893c559..8379550 100644
--- a/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
+++ b/hugegraph-llm/src/hugegraph_llm/api/rag_api.py
@@ -70,13 +70,9 @@ def rag_http_api(
near_neighbor_first=req.near_neighbor_first,
custom_related_information=req.custom_related_information
)
- # TODO/FIXME: handle QianFanClient error (not dict..critical)
- # log.critical(f"## {type(result)}, {json.dumps(result)}")
+
if isinstance(result, dict):
- log.critical("##1. %s", type(result))
return {"graph_recall": result}
-
- log.critical("##2. %s", type(result))
return {"graph_recall": json.dumps(result)}
except TypeError as e:
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
index 8de30c8..60ad0fe 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
@@ -63,7 +63,8 @@ def test_api_connection(url, method="GET", headers=None,
params=None, body=None,
def config_qianfan_model(arg1, arg2, arg3=None, origin_call=None) -> int:
settings.qianfan_api_key = arg1
settings.qianfan_secret_key = arg2
- settings.qianfan_language_model = arg3
+ if arg3:
+ settings.qianfan_language_model = arg3
params = {
"grant_type": "client_credentials",
"client_id": arg1,
diff --git a/hugegraph-llm/src/hugegraph_llm/enums/id_strategy.py
b/hugegraph-llm/src/hugegraph_llm/enums/id_strategy.py
index 5f3cadb..f792c1e 100644
--- a/hugegraph-llm/src/hugegraph_llm/enums/id_strategy.py
+++ b/hugegraph-llm/src/hugegraph_llm/enums/id_strategy.py
@@ -19,9 +19,9 @@
from enum import Enum
-# Note: we don't support the "UUID" strategy for now
class IdStrategy(Enum):
- AUTOMATIC = "AUTOMATIC"
- CUSTOMIZE_NUMBER = "CUSTOMIZE_NUMBER"
- CUSTOMIZE_STRING = "CUSTOMIZE_STRING"
- PRIMARY_KEY = "PRIMARY_KEY"
+ AUTOMATIC = "automatic"
+ PRIMARY_KEY = "primary_key"
+ CUSTOMIZE_STRING = "customize_string"
+ CUSTOMIZE_NUMBER = "customize_number"
+ CUSTOMIZE_UUID = "customize_uuid"
diff --git a/hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
b/hugegraph-llm/src/hugegraph_llm/enums/property_cardinality.py
similarity index 79%
copy from hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
copy to hugegraph-llm/src/hugegraph_llm/enums/property_cardinality.py
index 50db4c8..7ad161b 100644
--- a/hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
+++ b/hugegraph-llm/src/hugegraph_llm/enums/property_cardinality.py
@@ -19,9 +19,8 @@
from enum import Enum
-class BuildMode(Enum):
- REBUILD_VECTOR = "Rebuild Vector"
- TEST_MODE = "Test Mode"
- IMPORT_MODE = "Import Mode"
- CLEAR_AND_IMPORT = "Clear and Import"
- REBUILD_VERTEX_INDEX = "Rebuild vertex index"
+class PropertyCardinality(Enum):
+ SINGLE = "SINGLE"
+ LIST = "LIST"
+ SET = "SET"
+ DEFAULT = SINGLE
diff --git a/hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
b/hugegraph-llm/src/hugegraph_llm/enums/property_data_type.py
similarity index 60%
rename from hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
rename to hugegraph-llm/src/hugegraph_llm/enums/property_data_type.py
index 50db4c8..683bf45 100644
--- a/hugegraph-llm/src/hugegraph_llm/enums/build_mode.py
+++ b/hugegraph-llm/src/hugegraph_llm/enums/property_data_type.py
@@ -19,9 +19,30 @@
from enum import Enum
-class BuildMode(Enum):
- REBUILD_VECTOR = "Rebuild Vector"
- TEST_MODE = "Test Mode"
- IMPORT_MODE = "Import Mode"
- CLEAR_AND_IMPORT = "Clear and Import"
- REBUILD_VERTEX_INDEX = "Rebuild vertex index"
+class PropertyDataType(Enum):
+ BOOLEAN = "BOOLEAN"
+ BYTE = "BYTE"
+ INT = "INT"
+ LONG = "LONG"
+ FLOAT = "FLOAT"
+ DOUBLE = "DOUBLE"
+ TEXT = "TEXT"
+ BLOB = "BLOB"
+ DATE = "DATE"
+ UUID = "UUID"
+ DEFAULT = TEXT
+
+
+def default_value_map(data_type: str):
+ return {
+ "BOOLEAN": False,
+ "BYTE": 0,
+ "INT": 0,
+ "LONG": 0,
+ "FLOAT": 0.0,
+ "DOUBLE": 0.0,
+ "TEXT": "",
+ "BLOB": "",
+ "DATE": "2000-01-01",
+ "UUID": "",
+ }[data_type]
diff --git a/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
b/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
index 8a6ab8f..be1d1f7 100644
--- a/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
+++ b/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
@@ -15,12 +15,11 @@
# specific language governing permissions and limitations
# under the License.
-import json
from typing import Callable, List, Optional, Dict, Any
import openai
-from openai import OpenAI, AsyncOpenAI
import tiktoken
+from openai import OpenAI, AsyncOpenAI
from retry import retry
from hugegraph_llm.models.llms.base import BaseLLM
@@ -141,7 +140,7 @@ class OpenAIClient(BaseLLM):
def max_allowed_token_length(self) -> int:
"""Get max-allowed token length"""
# TODO: list all models and their max tokens from api
- return 2049
+ return 8192
def get_llm_type(self) -> str:
return "openai"
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py
b/hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py
index 10ed2dc..3220d9f 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py
@@ -17,83 +17,106 @@
from typing import Any, Optional, Dict
+
+from hugegraph_llm.enums.property_cardinality import PropertyCardinality
+from hugegraph_llm.enums.property_data_type import PropertyDataType
from hugegraph_llm.utils.log import log
+def log_and_raise(message: str) -> None:
+ log.warning(message)
+ raise ValueError(message)
+
+
+def check_type(value: Any, expected_type: type, message: str) -> None:
+ if not isinstance(value, expected_type):
+ log_and_raise(message)
+
+
class CheckSchema:
- def __init__(self, data):
+ def __init__(self, data: Dict[str, Any]):
self.result = None
self.data = data
- def run(self, context: Optional[Dict[str, Any]] = None) -> Any: # pylint:
disable=too-many-branches
+ def run(self, context: Optional[Dict[str, Any]] = None) -> Any:
if context is None:
context = {}
+
+ # 1. Validate the schema structure
schema = self.data or context.get("schema")
- if not isinstance(schema, dict):
- raise ValueError("Input data is not a dictionary.")
- if "vertexlabels" not in schema or "edgelabels" not in schema:
- raise ValueError("Input data does not contain 'vertexlabels' or
'edgelabels'.")
- if not isinstance(schema["vertexlabels"], list) or not
isinstance(schema["edgelabels"],
-
list):
- raise ValueError("'vertexlabels' or 'edgelabels' in input data is
not a list.")
- for vertex in schema["vertexlabels"]:
- if not isinstance(vertex, dict):
- raise ValueError("Vertex in input data is not a dictionary.")
- if "name" not in vertex:
- raise ValueError("Vertex in input data does not contain
'name'.")
- if not isinstance(vertex["name"], str):
- raise ValueError("'name' in vertex is not of correct type.")
- if "properties" not in vertex:
- raise ValueError("Vertex in input data does not contain
'properties'.")
- properties = vertex["properties"]
- if not isinstance(properties, list):
- raise ValueError("'properties' in vertex is not of correct
type.")
- if len(properties) == 0:
- raise ValueError("'properties' in vertex is empty.")
- primary_keys = vertex.get("primary_keys", properties[:1])
- if not isinstance(primary_keys, list):
- raise ValueError("'primary_keys' in vertex is not of correct
type.")
- new_primary_keys = []
- for key in primary_keys:
- if key not in properties:
- log.waring("Primary key '%s' not found in properties has
been auto removed.",
- key)
- else:
- new_primary_keys.append(key)
- if len(new_primary_keys) == 0:
- raise ValueError(f"primary keys of vertexLabel
{vertex['vertex_label']} is empty.")
- vertex["primary_keys"] = new_primary_keys
- nullable_keys = vertex.get("nullable_keys", properties[1:])
- if not isinstance(nullable_keys, list):
- raise ValueError("'nullable_keys' in vertex is not of correct
type.")
- new_nullable_keys = []
- for key in nullable_keys:
- if key not in properties:
- log.warning("Nullable key '%s' not found in properties has
been auto removed.",
- key)
- else:
- new_nullable_keys.append(key)
- vertex["nullable_keys"] = new_nullable_keys
- for edge in schema["edgelabels"]:
- if not isinstance(edge, dict):
- raise ValueError("Edge in input data is not a dictionary.")
- if (
- "name" not in edge
- or "source_label" not in edge
- or "target_label" not in edge
- ):
- raise ValueError(
- "Edge in input data does not contain "
- "'name', 'source_label', 'target_label'."
- )
- if (
- not isinstance(edge["name"], str)
- or not isinstance(edge["source_label"], str)
- or not isinstance(edge["target_label"], str)
- ):
- raise ValueError(
- "'name', 'source_label', 'target_label' "
- "in edge is not of correct type."
- )
+ self._validate_schema(schema)
+ # 2. Process property labels and also create a set for it
+ property_labels, property_label_set =
self._process_property_labels(schema)
+ # 3. Process properties in given vertex/edge labels
+ self._process_vertex_labels(schema, property_labels,
property_label_set)
+ self._process_edge_labels(schema, property_labels, property_label_set)
+ # 4. Update schema with processed pks
+ schema["propertykeys"] = property_labels
context.update({"schema": schema})
return context
+
+ def _validate_schema(self, schema: Dict[str, Any]) -> None:
+ check_type(schema, dict, "Input data is not a dictionary.")
+ if "vertexlabels" not in schema or "edgelabels" not in schema:
+ log_and_raise("Input data does not contain 'vertexlabels' or
'edgelabels'.")
+ check_type(schema["vertexlabels"], list, "'vertexlabels' in input data
is not a list.")
+ check_type(schema["edgelabels"], list, "'edgelabels' in input data is
not a list.")
+
+ def _process_property_labels(self, schema: Dict[str, Any]) -> (list, set):
+ property_labels = schema.get("propertykeys", [])
+ check_type(property_labels, list, "'propertykeys' in input data is not
of correct type.")
+ property_label_set = {label["name"] for label in property_labels}
+ return property_labels, property_label_set
+
+ def _process_vertex_labels(self, schema: Dict[str, Any], property_labels:
list, property_label_set: set) -> None:
+ for vertex_label in schema["vertexlabels"]:
+ self._validate_vertex_label(vertex_label)
+ properties = vertex_label["properties"]
+ primary_keys = self._process_keys(vertex_label, "primary_keys",
properties[:1])
+ if len(primary_keys) == 0:
+ log_and_raise(f"'primary_keys' of {vertex_label['name']} is
empty.")
+ vertex_label["primary_keys"] = primary_keys
+ nullable_keys = self._process_keys(vertex_label, "nullable_keys",
properties[1:])
+ vertex_label["nullable_keys"] = nullable_keys
+ self._add_missing_properties(properties, property_labels,
property_label_set)
+
+ def _process_edge_labels(self, schema: Dict[str, Any], property_labels:
list, property_label_set: set) -> None:
+ for edge_label in schema["edgelabels"]:
+ self._validate_edge_label(edge_label)
+ properties = edge_label.get("properties", [])
+ self._add_missing_properties(properties, property_labels,
property_label_set)
+
+ def _validate_vertex_label(self, vertex_label: Dict[str, Any]) -> None:
+ check_type(vertex_label, dict, "VertexLabel in input data is not a
dictionary.")
+ if "name" not in vertex_label:
+ log_and_raise("VertexLabel in input data does not contain 'name'.")
+ check_type(vertex_label["name"], str, "'name' in vertex_label is not
of correct type.")
+ if "properties" not in vertex_label:
+ log_and_raise("VertexLabel in input data does not contain
'properties'.")
+ check_type(vertex_label["properties"], list, "'properties' in
vertex_label is not of correct type.")
+ if len(vertex_label["properties"]) == 0:
+ log_and_raise("'properties' in vertex_label is empty.")
+
+ def _validate_edge_label(self, edge_label: Dict[str, Any]) -> None:
+ check_type(edge_label, dict, "EdgeLabel in input data is not a
dictionary.")
+ if "name" not in edge_label or "source_label" not in edge_label or
"target_label" not in edge_label:
+ log_and_raise("EdgeLabel in input data does not contain 'name',
'source_label', 'target_label'.")
+ check_type(edge_label["name"], str, "'name' in edge_label is not of
correct type.")
+ check_type(edge_label["source_label"], str, "'source_label' in
edge_label is not of correct type.")
+ check_type(edge_label["target_label"], str, "'target_label' in
edge_label is not of correct type.")
+
+ def _process_keys(self, label: Dict[str, Any], key_type: str,
default_keys: list) -> list:
+ keys = label.get(key_type, default_keys)
+ check_type(keys, list, f"'{key_type}' in {label['name']} is not of
correct type.")
+ new_keys = [key for key in keys if key in label["properties"]]
+ return new_keys
+
+ def _add_missing_properties(self, properties: list, property_labels: list,
property_label_set: set) -> None:
+ for prop in properties:
+ if prop not in property_label_set:
+ property_labels.append({
+ "name": prop,
+ "data_type": PropertyDataType.DEFAULT.value,
+ "cardinality": PropertyCardinality.DEFAULT.value,
+ })
+ property_label_set.add(prop)
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
index e450b3c..b26d094 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
@@ -17,45 +17,57 @@
from typing import Literal, Dict, Any, Optional, Union, List
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
+# Constants
+LANGUAGE_ZH = "zh"
+LANGUAGE_EN = "en"
+SPLIT_TYPE_DOCUMENT = "document"
+SPLIT_TYPE_PARAGRAPH = "paragraph"
+SPLIT_TYPE_SENTENCE = "sentence"
class ChunkSplit:
def __init__(
- self,
- texts: Union[str, List[str]],
- split_type: Literal["paragraph", "sentence"] = "paragraph",
- language: Literal["zh", "en"] = "zh"
+ self,
+ texts: Union[str, List[str]],
+ split_type: Literal["document", "paragraph", "sentence"] =
SPLIT_TYPE_DOCUMENT,
+ language: Literal["zh", "en"] = LANGUAGE_ZH,
):
if isinstance(texts, str):
texts = [texts]
self.texts = texts
- if language == "zh":
- separators = ["\n\n", "\n", "。", ",", ""]
- elif language == "en":
- separators = ["\n\n", "\n", ".", ",", " ", ""]
+ self.separators = self._get_separators(language)
+ self.text_splitter = self._get_text_splitter(split_type)
+
+ def _get_separators(self, language: str) -> List[str]:
+ if language == LANGUAGE_ZH:
+ return ["\n\n", "\n", "。", ",", ""]
+ elif language == LANGUAGE_EN:
+ return ["\n\n", "\n", ".", ",", " ", ""]
else:
raise ValueError("language must be zh or en")
- if split_type == "paragraph":
- self.text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=500,
- chunk_overlap=30,
- separators=separators
- )
- elif split_type == "sentence":
- self.text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=50,
- chunk_overlap=0,
- separators=separators
- )
+
+ def _get_text_splitter(self, split_type: str):
+ if split_type == SPLIT_TYPE_DOCUMENT:
+ return lambda text: [text]
+ elif split_type == SPLIT_TYPE_PARAGRAPH:
+ return RecursiveCharacterTextSplitter(
+ chunk_size=500, chunk_overlap=30, separators=self.separators
+ ).split_text
+ elif split_type == SPLIT_TYPE_SENTENCE:
+ return RecursiveCharacterTextSplitter(
+ chunk_size=50, chunk_overlap=0, separators=self.separators
+ ).split_text
else:
- raise ValueError("type must be paragraph, sentence, html or
markdown")
+ raise ValueError("Type must be paragraph, sentence, html or
markdown")
def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
all_chunks = []
for text in self.texts:
- chunks = self.text_splitter.split_text(text)
+ chunks = self.text_splitter(text)
all_chunks.extend(chunks)
+
if context is None:
return {"chunks": all_chunks}
context["chunks"] = all_chunks
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
index 630e5b0..546d561 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
@@ -45,10 +45,8 @@ class WordExtract:
context["query"] = self._query
if self._llm is None:
- self._llm = context.get("llm") or LLMs().get_llm()
+ self._llm = LLMs().get_llm()
assert isinstance(self._llm, BaseLLM), "Invalid LLM Object."
- if context.get("llm") is None:
- context["llm"] = self._llm
if isinstance(context.get("language"), str):
self._language = context["language"].lower()
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
index dd75b18..e6da8e0 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
@@ -218,7 +218,6 @@ class RAGPipeline:
self.extract_keywords().query_graphdb().synthesize_answer()
context = kwargs
- context["llm"] = self._llm
for operator in self._operators:
context = self._run_operator(operator, context)
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
index 87129b2..1e44786 100644
---
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
+++
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
@@ -15,16 +15,17 @@
# specific language governing permissions and limitations
# under the License.
-
from typing import Dict, Any
from hugegraph_llm.config import settings
+from hugegraph_llm.enums.property_cardinality import PropertyCardinality
+from hugegraph_llm.enums.property_data_type import PropertyDataType,
default_value_map
from hugegraph_llm.utils.log import log
from pyhugegraph.client import PyHugeClient
from pyhugegraph.utils.exceptions import NotFoundError, CreateError
-class CommitToKg:
+class Commit2Graph:
def __init__(self):
self.client = PyHugeClient(
settings.graph_ip,
@@ -54,10 +55,31 @@ class CommitToKg:
self.load_into_graph(vertices, edges, schema)
return data
- def load_into_graph(self, vertices, edges, schema):
+ def _set_default_property(self, key, input_properties, property_label_map):
+ data_type = property_label_map[key]["data_type"]
+ cardinality = property_label_map[key]["cardinality"]
+ if cardinality == PropertyCardinality.SINGLE.value:
+ default_value = default_value_map(data_type)
+ input_properties[key] = default_value
+ else:
+ # list or set
+ default_value = []
+ input_properties[key] = default_value
+ log.warning("Property '%s' missing in vertex, set to '%s' for now",
key, default_value)
+
+ def _handle_graph_creation(self, func, *args, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except NotFoundError as e:
+ log.error(e)
+ except CreateError as e:
+ log.error("Error on creating: %s, %s", args, e)
+
+ def load_into_graph(self, vertices, edges, schema): # pylint:
disable=too-many-statements
# pylint: disable=R0912 (too-many-branches)
vertex_label_map = {v_label["name"]: v_label for v_label in
schema["vertexlabels"]}
edge_label_map = {e_label["name"]: e_label for e_label in
schema["edgelabels"]}
+ property_label_map = {p_label["name"]: p_label for p_label in
schema["propertykeys"]}
for vertex in vertices:
input_label = vertex["label"]
@@ -72,28 +94,45 @@ class CommitToKg:
nullable_keys = vertex_label.get("nullable_keys", [])
non_null_keys = [key for key in vertex_label["properties"] if key
not in nullable_keys]
+ has_problem = False
# 2. Handle primary-keys mode vertex
for pk in primary_keys:
if not input_properties.get(pk):
if len(primary_keys) == 1:
log.error("Primary-key '%s' missing in vertex %s, skip
it & need check it again", pk, vertex)
- continue
- input_properties[pk] = "null" # FIXME: handle
bool/number/date type
+ has_problem = True
+ break
+ # TODO: transform to Enum first (better in earlier step)
+ data_type = property_label_map[pk]["data_type"]
+ cardinality = property_label_map[pk]["cardinality"]
+ if cardinality == PropertyCardinality.SINGLE.value:
+ input_properties[pk] = default_value_map(data_type)
+ else:
+ input_properties[pk] = []
log.warning("Primary-key '%s' missing in vertex %s, mark
empty & need check it again!", pk, vertex)
+ if has_problem:
+ continue
# 3. Ensure all non-nullable props are set
for key in non_null_keys:
if key not in input_properties:
- input_properties[key] = "" # FIXME: handle
bool/number/date type
- log.warning("Property '%s' missing in vertex %s, set to ''
for now", key, vertex)
- try:
- # TODO: we could try batch add vertices first, setback to
single-mode if failed
- vid = self.client.graph().addVertex(input_label,
input_properties).id
- vertex["id"] = vid
- except NotFoundError as e:
- log.error(e)
- except CreateError as e:
- log.error("Error on creating vertex: %s, %s", vertex, e)
+ self._set_default_property(key, input_properties,
property_label_map)
+
+ # 4. Check all data type value is right
+ for key, value in input_properties.items():
+ # TODO: transform to Enum first (better in earlier step)
+ data_type = property_label_map[key]["data_type"]
+ cardinality = property_label_map[key]["cardinality"]
+ if not self._check_property_data_type(data_type, cardinality,
value):
+ log.error("Property type/format '%s' is not correct, skip
it & need check it again", key)
+ has_problem = True
+ break
+ if has_problem:
+ continue
+
+ # TODO: we could try batch add vertices first, setback to
single-mode if failed
+ vid = self._handle_graph_creation(self.client.graph().addVertex,
input_label, input_properties).id
+ vertex["id"] = vid
for edge in edges:
start = edge["outV"]
@@ -104,25 +143,23 @@ class CommitToKg:
if label not in edge_label_map:
log.critical("(Input) EdgeLabel %s not found in schema, skip &
need check it!", label)
continue
- try:
- # TODO: we could try batch add edges first, setback to
single-mode if failed
- self.client.graph().addEdge(label, start, end, properties)
- except NotFoundError as e:
- log.error(e)
- except CreateError as e:
- log.error("Error on creating edge: %s, %s", edge, e)
-
- def init_schema_if_need(self, schema: object):
+
+ # TODO: we could try batch add edges first, setback to single-mode
if failed
+ self._handle_graph_creation(self.client.graph().addEdge, label,
start, end, properties)
+
+ def init_schema_if_need(self, schema: dict):
+ properties = schema["propertykeys"]
vertices = schema["vertexlabels"]
edges = schema["edgelabels"]
+ for prop in properties:
+ self._create_property(prop)
+
for vertex in vertices:
vertex_label = vertex["name"]
properties = vertex["properties"]
nullable_keys = vertex["nullable_keys"]
primary_keys = vertex["primary_keys"]
- for prop in properties:
- self.schema.propertyKey(prop).asText().ifNotExist().create()
self.schema.vertexLabel(vertex_label).properties(*properties).nullableKeys(
*nullable_keys
).usePrimaryKeyId().primaryKeys(*primary_keys).ifNotExist().create()
@@ -132,8 +169,6 @@ class CommitToKg:
source_vertex_label = edge["source_label"]
target_vertex_label = edge["target_label"]
properties = edge["properties"]
- for prop in properties:
- self.schema.propertyKey(prop).asText().ifNotExist().create()
self.schema.edgeLabel(edge_label).sourceLabel(source_vertex_label).targetLabel(
target_vertex_label
).properties(*properties).nullableKeys(*properties).ifNotExist().create()
@@ -153,3 +188,84 @@ class CommitToKg:
s_id = self.client.graph().addVertex("vertex", {"name": s},
id=s).id
t_id = self.client.graph().addVertex("vertex", {"name": o},
id=o).id
self.client.graph().addEdge("edge", s_id, t_id, {"name": p})
+
+ def _create_property(self, prop: dict):
+ name = prop["name"]
+ try:
+ data_type = PropertyDataType(prop["data_type"])
+ cardinality = PropertyCardinality(prop["cardinality"])
+ except ValueError:
+ log.critical("Invalid data type %s / cardinality %s for property
%s, skip & should check it again",
+ prop["data_type"], prop["cardinality"], name)
+ return
+
+ property_key = self.schema.propertyKey(name)
+ self._set_property_data_type(property_key, data_type)
+ self._set_property_cardinality(property_key, cardinality)
+ property_key.ifNotExist().create()
+
+ def _set_property_data_type(self, property_key, data_type):
+ if data_type == PropertyDataType.BOOLEAN:
+ log.error("Boolean type is not supported")
+ elif data_type == PropertyDataType.BYTE:
+ log.warning("Byte type is not supported, use int instead")
+ property_key.asInt()
+ elif data_type == PropertyDataType.INT:
+ property_key.asInt()
+ elif data_type == PropertyDataType.LONG:
+ property_key.asLong()
+ elif data_type == PropertyDataType.FLOAT:
+ log.warning("Float type is not supported, use double instead")
+ property_key.asDouble()
+ elif data_type == PropertyDataType.DOUBLE:
+ property_key.asDouble()
+ elif data_type == PropertyDataType.TEXT:
+ property_key.asText()
+ elif data_type == PropertyDataType.BLOB:
+ log.warning("Blob type is not supported, use text instead")
+ property_key.asText()
+ elif data_type == PropertyDataType.DATE:
+ property_key.asDate()
+ elif data_type == PropertyDataType.UUID:
+ log.warning("UUID type is not supported, use text instead")
+ property_key.asText()
+ else:
+ log.error("Unknown data type %s for property_key %s", data_type,
property_key)
+
+ def _set_property_cardinality(self, property_key, cardinality):
+ if cardinality == PropertyCardinality.SINGLE:
+ property_key.valueSingle()
+ elif cardinality == PropertyCardinality.LIST:
+ property_key.valueList()
+ elif cardinality == PropertyCardinality.SET:
+ property_key.valueSet()
+ else:
+ log.error("Unknown cardinality %s for property_key %s",
cardinality, property_key)
+
+ def _check_property_data_type(self, data_type: str, cardinality: str,
value) -> bool:
+ if cardinality in (PropertyCardinality.LIST.value,
PropertyCardinality.SET.value):
+ return self._check_collection_data_type(data_type, value)
+ return self._check_single_data_type(data_type, value)
+
+ def _check_collection_data_type(self, data_type: str, value) -> bool:
+ if not isinstance(value, list):
+ return False
+ for item in value:
+ if not self._check_single_data_type(data_type, item):
+ return False
+ return True
+
+ def _check_single_data_type(self, data_type: str, value) -> bool:
+ if data_type == PropertyDataType.BOOLEAN.value:
+ return isinstance(value, bool)
+ if data_type in (PropertyDataType.BYTE.value,
PropertyDataType.INT.value, PropertyDataType.LONG.value):
+ return isinstance(value, int)
+ if data_type in (PropertyDataType.FLOAT.value,
PropertyDataType.DOUBLE.value):
+ return isinstance(value, float)
+ if data_type in (PropertyDataType.TEXT.value,
PropertyDataType.UUID.value):
+ return isinstance(value, str)
+ # TODO: check ok below
+ if data_type == PropertyDataType.DATE.value: # the format should be
"yyyy-MM-dd"
+ import re
+ return isinstance(value, str) and re.match(r'^\d{4}-\d{2}-\d{2}$',
value)
+ raise ValueError(f"Unknown/Unsupported data type: {data_type}")
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
index 77233f0..9cb5729 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/fetch_graph_data.py
@@ -29,12 +29,5 @@ class FetchGraphData:
if context is None:
context = {}
if "vertices" not in context:
- context["vertices"] = []
- vertices = self.graph.gremlin().exec("g.V()")["data"]
- for vertex in vertices:
- context["vertices"].append({
- "id": vertex["id"],
- "label": vertex["label"],
- "properties": vertex["properties"]
- })
+ context["vertices"] =
self.graph.gremlin().exec("g.V().id()")["data"]
return context
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
index 57da3ef..ab50abf 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
@@ -37,20 +37,7 @@ class SchemaManager:
if context is None:
context = {}
schema = self.schema.getSchema()
- vertices = []
- for vl in schema["vertexlabels"]:
- vertex = {"vertex_label": vl["name"], "properties":
vl["properties"]}
- vertices.append(vertex)
- edges = []
- for el in schema["edgelabels"]:
- edge = {
- "edge_label": el["name"],
- "source_vertex_label": el["source_label"],
- "target_vertex_label": el["target_label"],
- "properties": el["properties"],
- }
- edges.append(edge)
- if not vertices and not edges:
+ if not schema["vertexlabels"] and not schema["edgelabels"]:
raise Exception(f"Can not get {self.graph_name}'s schema from
HugeGraph!")
context.update({"schema": schema})
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
b/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
index 135ce9c..4c19cd6 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/build_semantic_index.py
@@ -34,7 +34,7 @@ class BuildSemanticIndex:
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
past_vids = self.vid_index.properties
- present_vids = [v["id"] for v in context["vertices"]]
+ present_vids = context["vertices"]
removed_vids = set(past_vids) - set(present_vids)
removed_num = self.vid_index.remove(removed_vids)
added_vids = list(set(present_vids) - set(past_vids))
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
b/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
index 921895f..2fb5966 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
@@ -23,7 +23,7 @@ from hugegraph_llm.models.llms.base import BaseLLM
from hugegraph_llm.operators.common_op.check_schema import CheckSchema
from hugegraph_llm.operators.common_op.print_result import PrintResult
from hugegraph_llm.operators.document_op.chunk_split import ChunkSplit
-from hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph import CommitToKg
+from hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph import
Commit2Graph
from hugegraph_llm.operators.hugegraph_op.fetch_graph_data import
FetchGraphData
from hugegraph_llm.operators.hugegraph_op.schema_manager import SchemaManager
from hugegraph_llm.operators.index_op.build_semantic_index import
BuildSemanticIndex
@@ -36,8 +36,7 @@ from pyhugegraph.client import PyHugeClient
class KgBuilder:
- def __init__(self, llm: BaseLLM, embedding: Optional[BaseEmbedding] = None,
- graph: Optional[PyHugeClient] = None):
+ def __init__(self, llm: BaseLLM, embedding: Optional[BaseEmbedding] =
None, graph: Optional[PyHugeClient] = None):
self.operators = []
self.llm = llm
self.embedding = embedding
@@ -50,9 +49,9 @@ class KgBuilder:
elif from_user_defined:
self.operators.append(CheckSchema(from_user_defined))
elif from_extraction:
- raise Exception("Not implemented yet")
+ raise NotImplementedError("Not implemented yet")
else:
- raise Exception("No input data / invalid schema type")
+ raise ValueError("No input data / invalid schema type")
return self
def fetch_graph_data(self):
@@ -60,16 +59,17 @@ class KgBuilder:
return self
def chunk_split(
- self,
- text: Union[str, List[str]], # text to be split
- split_type: Literal["paragraph", "sentence"] = "paragraph",
- language: Literal["zh", "en"] = "zh"
+ self,
+ text: Union[str, List[str]], # text to be split
+ split_type: Literal["document", "paragraph", "sentence"] = "document",
+ language: Literal["zh", "en"] = "zh",
):
self.operators.append(ChunkSplit(text, split_type, language))
return self
- def extract_info(self, example_prompt: Optional[str] = None,
- extract_type: Literal["triples", "property_graph"] =
"triples"):
+ def extract_info(
+ self, example_prompt: Optional[str] = None, extract_type:
Literal["triples", "property_graph"] = "triples"
+ ):
if extract_type == "triples":
self.operators.append(InfoExtract(self.llm, example_prompt))
elif extract_type == "property_graph":
@@ -81,7 +81,7 @@ class KgBuilder:
return self
def commit_to_hugegraph(self):
- self.operators.append(CommitToKg())
+ self.operators.append(Commit2Graph())
return self
def build_vertex_id_semantic_index(self):
@@ -104,5 +104,5 @@ class KgBuilder:
return context
@log_operator_time
- def _run_operator(self, operator, context):
+ def _run_operator(self, operator, context) -> Dict[str, Any]:
return operator.run(context)
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
index 5272f90..2771cef 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
@@ -55,9 +55,7 @@ class AnswerSynthesize:
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
if self._llm is None:
- self._llm = context.get("llm") or LLMs().get_llm()
- if context.get("llm") is None:
- context["llm"] = self._llm
+ self._llm = LLMs().get_llm()
if self._question is None:
self._question = context.get("query") or None
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
index 0d62121..0df4051 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
@@ -62,10 +62,8 @@ class KeywordExtract:
context["query"] = self._query
if self._llm is None:
- self._llm = context.get("llm") or LLMs().get_llm()
+ self._llm = LLMs().get_llm()
assert isinstance(self._llm, BaseLLM), "Invalid LLM Object."
- if context.get("llm") is None:
- context["llm"] = self._llm
if isinstance(context.get("language"), str):
self._language = context["language"].lower()
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
index 4b8c7ce..5735eae 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
@@ -40,7 +40,7 @@ def get_graph_index_info():
context["vid_index"] = {
"embed_dim": vector_index.index.d,
"num_vectors": vector_index.index.ntotal,
- "num_vids": len(vector_index.properties)
+ "num_vids": len(vector_index.properties),
}
return json.dumps(context, ensure_ascii=False, indent=2)
@@ -70,14 +70,11 @@ def extract_graph(input_file, input_text, schema,
example_prompt) -> str:
builder.import_schema(from_hugegraph=schema)
else:
return "ERROR: please input with correct schema/format."
- builder.chunk_split(texts, "paragraph", "zh").extract_info(example_prompt,
"property_graph")
+ builder.chunk_split(texts, "document", "zh").extract_info(example_prompt,
"property_graph")
try:
context = builder.run()
- graph_elements = {
- "vertices": context["vertices"],
- "edges": context["edges"]
- }
+ graph_elements = {"vertices": context["vertices"], "edges":
context["edges"]}
return json.dumps(graph_elements, ensure_ascii=False, indent=2)
except Exception as e: # pylint: disable=broad-exception-caught
log.error(e)
@@ -113,8 +110,9 @@ def import_graph_data(data: str, schema: str) -> Union[str,
Dict[str, Any]]:
context = builder.commit_to_hugegraph().run(data_json)
gr.Info("Import graph data successfully!")
+ print(context)
return json.dumps(context, ensure_ascii=False, indent=2)
- except Exception as e: # pylint: disable=W0718
+ except Exception as e: # pylint: disable=W0718
log.error(e)
traceback.print_exc()
# Note: can't use gr.Error here