ephraimbuddy commented on code in PR #35919:
URL: https://github.com/apache/airflow/pull/35919#discussion_r1419355948


##########
airflow/providers/weaviate/hooks/weaviate.py:
##########
@@ -133,20 +140,201 @@ def create_class(self, class_json: dict[str, Any]) -> 
None:
         client = self.conn
         client.schema.create_class(class_json)
 
-    def create_schema(self, schema_json: dict[str, Any]) -> None:
+    @retry(reraise=True, stop=stop_after_attempt(3), 
retry=retry_if_exception_type(requests.ConnectionError))
+    def create_schema(self, schema_json: dict[str, Any] | str) -> None:
         """
         Create a new Schema.
 
         Instead of adding classes one by one , you can upload a full schema in 
JSON format at once.
 
-        :param schema_json: The schema to create
+        :param schema_json: Schema as a Python dict or the path to a JSON 
file, or the URL of a JSON file.
         """
         client = self.conn
         client.schema.create(schema_json)
 
+    @retry(reraise=True, stop=stop_after_attempt(3), 
retry=retry_if_exception_type(requests.ConnectionError))
+    def get_schema(self, class_name: str | None = None):
+        """Get the schema from Weaviate.
+
+        :param class_name: The class for which to return the schema. If NOT 
provided the whole schema is
+            returned, otherwise only the schema of this class is returned. By 
default None.
+        """
+        client = self.get_client()
+        return client.schema.get(class_name)
+
+    def delete_classes(self, class_names: list[str] | str, if_error: str = 
"stop") -> list[str] | None:
+        """Deletes all or specific classes if class_names are provided.
+
+        :param class_names: list of class names to be deleted.
+        :param if_error: define the actions to be taken if there is an error 
while deleting a class, possible
+         options are `stop` and `continue`
+        :return: if `if_error=continue` return list of classes which we failed 
to delete.
+            if `if_error=stop` returns None.
+        """
+        client = self.get_client()
+        class_names = [class_names] if class_names and isinstance(class_names, 
str) else class_names
+
+        failed_class_list = []
+        for class_name in class_names:
+            try:
+                for attempt in Retrying(
+                    stop=stop_after_attempt(3),
+                    retry=retry_if_exception(lambda exc: 
self.check_http_error_is_retryable(exc)),
+                ):
+                    with attempt:
+                        client.schema.delete_class(class_name)
+            except Exception as e:
+                if if_error == "continue":
+                    self.log.error(e)
+                    failed_class_list.append(class_name)
+                elif if_error == "stop":
+                    raise e
+
+        if if_error == "continue":
+            return failed_class_list
+        return None
+
+    def delete_all_schema(self):
+        """Remove the entire schema from the Weaviate instance and all data 
associated with it."""
+        client = self.get_client()
+        return client.schema.delete_all()
+
+    def update_config(self, class_name: str, config: dict):
+        """Update a schema configuration for a specific class."""
+        client = self.get_client()
+        client.schema.update_config(class_name=class_name, config=config)
+
+    def upsert_classes(self, schema_json: dict[str, Any] | str, existing: 
ExitingSchemaOptions = "ignore"):
+        """
+        Create or update the classes in schema of Weaviate database.
+
+        :param schema_json: Json containing the schema. Format {"class_name": 
"class_dict"}
+            .. seealso:: `example of class_dict 
<https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.create>`_.
+        :param existing: Options to handle the case when the classes exist, 
possible options
+            'replace', 'fail', 'ignore'.
+        """
+        existing_schema_options = ["replace", "fail", "ignore"]
+        if existing not in existing_schema_options:
+            raise ValueError(f"Param 'existing' should be one of the 
{existing_schema_options} values.")
+        if isinstance(schema_json, str):
+            schema_json = cast(dict, json.load(open(schema_json)))
+        set__exiting_classes = {class_object["class"] for class_object in 
self.get_schema()["classes"]}
+        set__to_be_added_classes = {key for key, _ in schema_json.items()}
+        intersection_classes = 
set__exiting_classes.intersection(set__to_be_added_classes)
+        classes_to_create = set()
+        if existing == "fail" and intersection_classes:
+            raise ValueError(
+                f"Trying to create class {intersection_classes}" f" but this 
class already exists."
+            )
+        elif existing == "ignore":
+            classes_to_create = set__to_be_added_classes - set__exiting_classes
+        elif existing == "replace":
+            error_list = 
self.delete_classes(class_names=list(intersection_classes))

Review Comment:
   I still don't like this upsert method especially the fact that we are 
deleting data before updating. This has the potential to create data 
inconsistency especially if two tasks are doing this upsert at the same time. 
What if after deleting the object, the update fails?
   
   Regarding data integrity too, deleting the object and creating a new one 
means that the UUID would change. That's not what most users would expect from 
an upsert method. upsert is to create or update not delete then create. 
   
   It might be better to name this method `create_or_replace_classes` 



##########
airflow/providers/weaviate/hooks/weaviate.py:
##########
@@ -17,15 +17,20 @@
 
 from __future__ import annotations
 
+import json
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING, Literal, Sequence, cast

Review Comment:
   When you import TYPE_CHECKING, other imports from typing should be inside 
its `if` statement



##########
airflow/providers/weaviate/hooks/weaviate.py:
##########
@@ -119,38 +125,204 @@ def test_connection(self) -> tuple[bool, str]:
             self.log.error("Error testing Weaviate connection: %s", e)
             return False, str(e)
 
+    @retry(reraise=True, stop=stop_after_attempt(3), 
retry=retry_if_exception_type(requests.ConnectionError))
     def create_class(self, class_json: dict[str, Any]) -> None:
         """Create a new class."""
         client = self.conn
         client.schema.create_class(class_json)
 
-    def create_schema(self, schema_json: dict[str, Any]) -> None:
+    @retry(reraise=True, stop=stop_after_attempt(3), 
retry=retry_if_exception_type(requests.ConnectionError))
+    def create_schema(self, schema_json: dict[str, Any] | str) -> None:
         """
         Create a new Schema.
 
         Instead of adding classes one by one , you can upload a full schema in 
JSON format at once.
 
-        :param schema_json: The schema to create
+        :param schema_json: The schema to create or path to the json file 
holding the schema
         """
         client = self.conn
         client.schema.create(schema_json)
 
+    @retry(reraise=True, stop=stop_after_attempt(3), 
retry=retry_if_exception_type(requests.ConnectionError))
+    def get_schema(self, class_name: str | None = None):
+        """Get the schema from Weaviate.
+
+        get schema of a class or all classes.
+        """
+        client = self.get_client()
+        return client.schema.get(class_name)
+
+    @retry(reraise=True, stop=stop_after_attempt(3), 
retry=retry_if_exception_type(requests.ConnectionError))
+    def delete_class(self, class_name: str):
+        """Delete a schema class from Weaviate. This deletes all associated 
data."""
+        client = self.get_client()
+        client.schema.delete_class(class_name)
+
+    def delete_schema(
+        self, class_names: list[str] | str | None = None
+    ) -> list[UnexpectedStatusCodeException] | None:
+        """
+        Deletes all or specific class if class_names are provided.
+
+        If no class_name is given remove the entire schema from the Weaviate 
instance and all data associated
+        with it. If class_names are given, delete schema classes from 
Weaviate. This deletes all associated data.
+        :return: list of error object if any
+        """
+        client = self.get_client()
+        class_names = [class_names] if class_names and isinstance(class_names, 
str) else class_names
+        if class_names:
+            error_list = []
+            for class_name in class_names:
+                try:
+                    self.delete_class(class_name=class_name)
+                except UnexpectedStatusCodeException as e:
+                    error_list.append(e)
+            return error_list
+        else:
+            return client.schema.delete_all()
+
+    @retry(reraise=True, stop=stop_after_attempt(3), 
retry=retry_if_exception_type(requests.ConnectionError))
+    def update_class(self, class_name: str, config: dict):
+        """Update schema's class."""
+        client = self.get_client()
+        client.schema.update_config(class_name=class_name, config=config)
+
+    def update_multiple_classes(self, schema_json: list[dict]) -> 
list[UnexpectedStatusCodeException] | None:
+        """Updated multiple classes.
+
+        :param schema_json: list of class_config objects
+            .. seealso:: `example of class_config 
<https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.update_config>`_.
+        :return: list of error object if any
+        """
+        error_list = []
+        for config in schema_json:
+            try:
+                self.update_class(class_name=config.pop("class"), 
config=config)
+            except UnexpectedStatusCodeException as e:
+                error_list.append(e)
+        return error_list
+
+    def upsert_classes(self, schema_json: dict[str, Any] | str, existing: 
ExitingSchemaOptions = "ignore"):
+        """
+        Create or update the classes in schema of Weaviate database.
+
+        :param schema_json: Json containing the schema. Format {"class_name": 
"class_dict"}
+            .. seealso:: `example of class_dict 
<https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.create>`_.
+        :param existing: Options to handle the case when the classes exist, 
possible options
+            'replace', 'fail', 'ignore'.
+        """
+        existing_schema_options = ["replace", "fail", "ignore"]
+        if existing not in existing_schema_options:
+            raise ValueError(f"Param 'existing' should be one of the 
{existing_schema_options} values.")
+        if isinstance(schema_json, str):
+            schema_json = cast(dict, json.load(open(schema_json)))
+        set__exiting_classes = {class_object["class"] for class_object in 
self.get_schema()["classes"]}
+        set__to_be_added_classes = {key for key, _ in schema_json.items()}
+        intersection_classes = 
set__exiting_classes.intersection(set__to_be_added_classes)
+        classes_to_create = set()
+        if existing == "fail" and intersection_classes:
+            raise ValueError(
+                f"Trying to create class {intersection_classes}" f" but this 
class already exists."
+            )
+        elif existing == "ignore":
+            classes_to_create = set__to_be_added_classes - set__exiting_classes
+        elif existing == "replace":
+            error_list = 
self.delete_schema(class_names=list(intersection_classes))
+            if error_list:
+                raise ValueError(error_list)
+            classes_to_create = 
intersection_classes.union(set__to_be_added_classes)
+        classes_to_create_list = [schema_json[item] for item in 
sorted(list(classes_to_create))]
+        self.create_schema({"classes": classes_to_create_list})
+
+    def _compare_schema_subset(self, class_object: Any, class_schema: Any) -> 
bool:
+        """
+        Recursively check if requested schema/object is a subset of the 
current schema.
+
+        :param class_object: The class object to check against current schema
+        :param class_schema: The current schema class object
+        """
+        # Direct equality check
+        if class_object == class_schema:
+            return True
+
+        # Type mismatch early return
+        if type(class_object) != type(class_schema):
+            return False
+
+        # Dictionary comparison
+        if isinstance(class_object, dict):
+            for k, v in class_object.items():
+                if (k not in class_schema) or (not 
self._compare_schema_subset(v, class_schema[k])):
+                    return False
+            return True
+
+        # List or Tuple comparison
+        if isinstance(class_object, (list, tuple)):
+            for obj, sch in zip(class_object, class_schema):
+                if len(class_object) > len(class_schema) or not 
self._compare_schema_subset(obj, sch):
+                    return False
+            return True
+
+        # Default case for non-matching types or unsupported types
+        return False
+
+    @staticmethod
+    def _convert_properties_to_dict(classes_objects, key_property: str = 
"name"):
+        for cls in classes_objects:
+            cls["properties"] = {p[key_property]: p for p in cls["properties"]}
+        return classes_objects
+
+    def check_subset_of_schema(self, classes_objects: list) -> bool:
+        """Check if the class_objects is a subset of existing schema."""
+        # When the class properties are not in same order or not the same 
length. We convert them to dicts
+        # with property `name` as the key. This way we ensure, the subset is 
checked.
+        classes_objects = self._convert_properties_to_dict(classes_objects)
+        exiting_classes_list = 
self._convert_properties_to_dict(self.get_schema()["classes"])
+
+        exiting_classes = {cls["class"]: cls for cls in exiting_classes_list}
+        exiting_classes_set = set(exiting_classes.keys())
+        input_classes_set = {cls["class"] for cls in classes_objects}
+        if not input_classes_set.issubset(exiting_classes_set):
+            return False
+        for cls in classes_objects:
+            if not self._compare_schema_subset(cls, 
exiting_classes[cls["class"]]):
+                return False
+        return True
+
+    @staticmethod
+    def check_http_error_is_retryable(exc: BaseException):
+        return isinstance(exc, requests.HTTPError) and not exc.response.ok
+
     def batch_data(
-        self, class_name: str, data: list[dict[str, Any]], 
batch_config_params: dict[str, Any] | None = None
+        self,
+        class_name: str,

Review Comment:
   Seems the change didn't move completely



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@airflow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to