ephraimbuddy commented on code in PR #35919: URL: https://github.com/apache/airflow/pull/35919#discussion_r1412232659
########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -119,38 +125,204 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) Review Comment: Should we have this retry as a separate PR? So we can discuss whether they are needed for the methods or not ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -119,38 +125,204 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.conn client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema Review Comment: Can we use `Schema as a Python dict, or the path to a JSON file, or the URL of a JSON file.`. Same way it is in the client doc ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -119,38 +125,204 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.conn client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema """ client = self.conn client.schema.create(schema_json) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def get_schema(self, class_name: str | None = None): + """Get the schema from Weaviate. + + get schema of a class or all classes. + """ + client = self.get_client() + return client.schema.get(class_name) + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def delete_class(self, class_name: str): + """Delete a schema class from Weaviate. This deletes all associated data.""" + client = self.get_client() + client.schema.delete_class(class_name) + + def delete_schema( + self, class_names: list[str] | str | None = None + ) -> list[UnexpectedStatusCodeException] | None: + """ + Deletes all or specific class if class_names are provided. + + If no class_name is given remove the entire schema from the Weaviate instance and all data associated + with it. If class_names are given, delete schema classes from Weaviate. This deletes all associated data. + :return: list of error object if any + """ + client = self.get_client() + class_names = [class_names] if class_names and isinstance(class_names, str) else class_names + if class_names: + error_list = [] + for class_name in class_names: + try: + self.delete_class(class_name=class_name) + except UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + else: + return client.schema.delete_all() + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def update_class(self, class_name: str, config: dict): + """Update schema's class.""" + client = self.get_client() + client.schema.update_config(class_name=class_name, config=config) + + def update_multiple_classes(self, schema_json: list[dict]) -> list[UnexpectedStatusCodeException] | None: + """Updated multiple classes. + + :param schema_json: list of class_config objects + .. seealso:: `example of class_config <https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.update_config>`_. + :return: list of error object if any + """ + error_list = [] + for config in schema_json: + try: + self.update_class(class_name=config.pop("class"), config=config) + except UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + + def upsert_classes(self, schema_json: dict[str, Any] | str, existing: ExitingSchemaOptions = "ignore"): + """ + Create or update the classes in schema of Weaviate database. + + :param schema_json: Json containing the schema. Format {"class_name": "class_dict"} + .. seealso:: `example of class_dict <https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.create>`_. + :param existing: Options to handle the case when the classes exist, possible options + 'replace', 'fail', 'ignore'. + """ + existing_schema_options = ["replace", "fail", "ignore"] + if existing not in existing_schema_options: + raise ValueError(f"Param 'existing' should be one of the {existing_schema_options} values.") + if isinstance(schema_json, str): + schema_json = cast(dict, json.load(open(schema_json))) + set__exiting_classes = {class_object["class"] for class_object in self.get_schema()["classes"]} + set__to_be_added_classes = {key for key, _ in schema_json.items()} + intersection_classes = set__exiting_classes.intersection(set__to_be_added_classes) + classes_to_create = set() + if existing == "fail" and intersection_classes: + raise ValueError( + f"Trying to create class {intersection_classes}" f" but this class already exists." + ) + elif existing == "ignore": + classes_to_create = set__to_be_added_classes - set__exiting_classes + elif existing == "replace": + error_list = self.delete_schema(class_names=list(intersection_classes)) + if error_list: + raise ValueError(error_list) + classes_to_create = intersection_classes.union(set__to_be_added_classes) + classes_to_create_list = [schema_json[item] for item in sorted(list(classes_to_create))] + self.create_schema({"classes": classes_to_create_list}) + + def _compare_schema_subset(self, class_object: Any, class_schema: Any) -> bool: + """ + Recursively check if requested schema/object is a subset of the current schema. + + :param class_object: The class object to check against current schema + :param class_schema: The current schema class object + """ + # Direct equality check + if class_object == class_schema: + return True + + # Type mismatch early return + if type(class_object) != type(class_schema): + return False + + # Dictionary comparison + if isinstance(class_object, dict): + for k, v in class_object.items(): + if (k not in class_schema) or (not self._compare_schema_subset(v, class_schema[k])): + return False + return True + + # List or Tuple comparison + if isinstance(class_object, (list, tuple)): + for obj, sch in zip(class_object, class_schema): + if len(class_object) > len(class_schema) or not self._compare_schema_subset(obj, sch): + return False + return True + + # Default case for non-matching types or unsupported types + return False + + @staticmethod + def _convert_properties_to_dict(classes_objects, key_property: str = "name"): + for cls in classes_objects: + cls["properties"] = {p[key_property]: p for p in cls["properties"]} + return classes_objects + + def check_subset_of_schema(self, classes_objects: list) -> bool: + """Check if the class_objects is a subset of existing schema.""" + # When the class properties are not in same order or not the same length. We convert them to dicts + # with property `name` as the key. This way we ensure, the subset is checked. + classes_objects = self._convert_properties_to_dict(classes_objects) + exiting_classes_list = self._convert_properties_to_dict(self.get_schema()["classes"]) + + exiting_classes = {cls["class"]: cls for cls in exiting_classes_list} + exiting_classes_set = set(exiting_classes.keys()) + input_classes_set = {cls["class"] for cls in classes_objects} + if not input_classes_set.issubset(exiting_classes_set): + return False + for cls in classes_objects: + if not self._compare_schema_subset(cls, exiting_classes[cls["class"]]): + return False + return True + + @staticmethod + def check_http_error_is_retryable(exc: BaseException): + return isinstance(exc, requests.HTTPError) and not exc.response.ok + def batch_data( - self, class_name: str, data: list[dict[str, Any]], batch_config_params: dict[str, Any] | None = None + self, + class_name: str, Review Comment: I prefer we work on this batch_data in a separate PR and state what we are doing ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -119,38 +125,204 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.conn client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema """ client = self.conn client.schema.create(schema_json) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def get_schema(self, class_name: str | None = None): + """Get the schema from Weaviate. + + get schema of a class or all classes. + """ + client = self.get_client() + return client.schema.get(class_name) + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def delete_class(self, class_name: str): + """Delete a schema class from Weaviate. This deletes all associated data.""" + client = self.get_client() + client.schema.delete_class(class_name) + + def delete_schema( + self, class_names: list[str] | str | None = None + ) -> list[UnexpectedStatusCodeException] | None: + """ + Deletes all or specific class if class_names are provided. + + If no class_name is given remove the entire schema from the Weaviate instance and all data associated + with it. If class_names are given, delete schema classes from Weaviate. This deletes all associated data. + :return: list of error object if any + """ + client = self.get_client() + class_names = [class_names] if class_names and isinstance(class_names, str) else class_names + if class_names: + error_list = [] + for class_name in class_names: + try: + self.delete_class(class_name=class_name) + except UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + else: + return client.schema.delete_all() + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def update_class(self, class_name: str, config: dict): + """Update schema's class.""" + client = self.get_client() + client.schema.update_config(class_name=class_name, config=config) + + def update_multiple_classes(self, schema_json: list[dict]) -> list[UnexpectedStatusCodeException] | None: + """Updated multiple classes. + + :param schema_json: list of class_config objects + .. seealso:: `example of class_config <https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.update_config>`_. + :return: list of error object if any + """ + error_list = [] + for config in schema_json: + try: + self.update_class(class_name=config.pop("class"), config=config) + except UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + + def upsert_classes(self, schema_json: dict[str, Any] | str, existing: ExitingSchemaOptions = "ignore"): + """ + Create or update the classes in schema of Weaviate database. + + :param schema_json: Json containing the schema. Format {"class_name": "class_dict"} + .. seealso:: `example of class_dict <https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.create>`_. + :param existing: Options to handle the case when the classes exist, possible options + 'replace', 'fail', 'ignore'. + """ + existing_schema_options = ["replace", "fail", "ignore"] + if existing not in existing_schema_options: + raise ValueError(f"Param 'existing' should be one of the {existing_schema_options} values.") + if isinstance(schema_json, str): + schema_json = cast(dict, json.load(open(schema_json))) + set__exiting_classes = {class_object["class"] for class_object in self.get_schema()["classes"]} + set__to_be_added_classes = {key for key, _ in schema_json.items()} + intersection_classes = set__exiting_classes.intersection(set__to_be_added_classes) + classes_to_create = set() + if existing == "fail" and intersection_classes: + raise ValueError( + f"Trying to create class {intersection_classes}" f" but this class already exists." + ) + elif existing == "ignore": + classes_to_create = set__to_be_added_classes - set__exiting_classes + elif existing == "replace": + error_list = self.delete_schema(class_names=list(intersection_classes)) + if error_list: + raise ValueError(error_list) Review Comment: It would be better for the delete_schema to raise these errors since we want it raised. No need packing them to raise them here ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -112,22 +117,170 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.get_client() client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema """ client = self.get_client() client.schema.create(schema_json) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def get_schema(self, class_name: str | None = None): + """Get the schema from Weaviate. + + get schema of a class or all classes. Review Comment: Since this calls the client.schema.get, it should have the same description. I think it's more understandable in the client doc ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -119,38 +125,204 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.conn client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema """ client = self.conn client.schema.create(schema_json) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def get_schema(self, class_name: str | None = None): + """Get the schema from Weaviate. + + get schema of a class or all classes. + """ + client = self.get_client() + return client.schema.get(class_name) + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def delete_class(self, class_name: str): + """Delete a schema class from Weaviate. This deletes all associated data.""" + client = self.get_client() + client.schema.delete_class(class_name) + + def delete_schema( + self, class_names: list[str] | str | None = None + ) -> list[UnexpectedStatusCodeException] | None: + """ + Deletes all or specific class if class_names are provided. + + If no class_name is given remove the entire schema from the Weaviate instance and all data associated + with it. If class_names are given, delete schema classes from Weaviate. This deletes all associated data. + :return: list of error object if any + """ + client = self.get_client() + class_names = [class_names] if class_names and isinstance(class_names, str) else class_names + if class_names: + error_list = [] + for class_name in class_names: + try: + self.delete_class(class_name=class_name) + except UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + else: + return client.schema.delete_all() + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def update_class(self, class_name: str, config: dict): + """Update schema's class.""" + client = self.get_client() + client.schema.update_config(class_name=class_name, config=config) + + def update_multiple_classes(self, schema_json: list[dict]) -> list[UnexpectedStatusCodeException] | None: + """Updated multiple classes. + + :param schema_json: list of class_config objects + .. seealso:: `example of class_config <https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.update_config>`_. + :return: list of error object if any + """ + error_list = [] + for config in schema_json: + try: + self.update_class(class_name=config.pop("class"), config=config) + except UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + + def upsert_classes(self, schema_json: dict[str, Any] | str, existing: ExitingSchemaOptions = "ignore"): + """ + Create or update the classes in schema of Weaviate database. + + :param schema_json: Json containing the schema. Format {"class_name": "class_dict"} + .. seealso:: `example of class_dict <https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.create>`_. + :param existing: Options to handle the case when the classes exist, possible options + 'replace', 'fail', 'ignore'. + """ + existing_schema_options = ["replace", "fail", "ignore"] + if existing not in existing_schema_options: + raise ValueError(f"Param 'existing' should be one of the {existing_schema_options} values.") + if isinstance(schema_json, str): + schema_json = cast(dict, json.load(open(schema_json))) Review Comment: `mypy` I guess? Didn't json.load solve it without the cast? ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -17,16 +17,22 @@ from __future__ import annotations +import json import warnings from functools import cached_property -from typing import Any +from typing import Any, Literal, cast -from weaviate import Client as WeaviateClient +import pandas as pd Review Comment: We should import this at the point of use ########## airflow/providers/weaviate/operators/weaviate.py: ########## @@ -51,21 +54,25 @@ def __init__( self, conn_id: str, class_name: str, - input_json: list[dict[str, Any]], + input_data: list[dict[str, Any]] | pd.DataFrame, + vector_col: str = "Vector", Review Comment: We shouldn't edit this operator in a separate PR ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -94,22 +99,117 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.get_client() client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema """ client = self.get_client() client.schema.create(schema_json) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def get_schema(self, class_name: str | None = None): + """Get the schema from Weaviate. + + get schema of a class or all classes. + """ + client = self.get_client() + return client.schema.get(class_name) + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def delete_class(self, class_name: str): + """Delete a schema class from Weaviate. This deletes all associated data.""" + client = self.get_client() + client.schema.delete_class(class_name) + + def delete_schema( + self, class_names: list[str] | str | None = None + ) -> list[weaviate.UnexpectedStatusCodeException] | None: + """ + Deletes all or specific class if class_names are provided. + + If no class_name is given remove the entire schema from the Weaviate instance and all data associated + with it. If class_names are given, delete schema classes from Weaviate. This deletes all associated data. + :return: list of error object if any + """ + client = self.get_client() + class_names = [class_names] if class_names and isinstance(class_names, str) else class_names + if class_names: + error_list = [] + for class_name in class_names: + try: + self.delete_class(class_name=class_name) + except weaviate.UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + else: + return client.schema.delete_all() + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def update_class(self, class_name: str, config: dict): Review Comment: Might be better to match what's in the client since there's a direct call to the client method without code changes ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -112,22 +117,170 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.get_client() client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema """ client = self.get_client() client.schema.create(schema_json) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def get_schema(self, class_name: str | None = None): + """Get the schema from Weaviate. + + get schema of a class or all classes. + """ + client = self.get_client() + return client.schema.get(class_name) + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def delete_class(self, class_name: str): + """Delete a schema class from Weaviate. This deletes all associated data.""" + client = self.get_client() + client.schema.delete_class(class_name) + + def delete_schema( Review Comment: `delete_classes` seems better and it's what is in the code ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -94,22 +99,117 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.get_client() client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema """ client = self.get_client() client.schema.create(schema_json) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def get_schema(self, class_name: str | None = None): + """Get the schema from Weaviate. + + get schema of a class or all classes. + """ + client = self.get_client() + return client.schema.get(class_name) + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def delete_class(self, class_name: str): + """Delete a schema class from Weaviate. This deletes all associated data.""" + client = self.get_client() + client.schema.delete_class(class_name) + + def delete_schema( + self, class_names: list[str] | str | None = None + ) -> list[weaviate.UnexpectedStatusCodeException] | None: + """ + Deletes all or specific class if class_names are provided. + + If no class_name is given remove the entire schema from the Weaviate instance and all data associated + with it. If class_names are given, delete schema classes from Weaviate. This deletes all associated data. + :return: list of error object if any + """ + client = self.get_client() + class_names = [class_names] if class_names and isinstance(class_names, str) else class_names + if class_names: + error_list = [] + for class_name in class_names: + try: + self.delete_class(class_name=class_name) + except weaviate.UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list Review Comment: +1 to that @utkarsharma2 . Also, we can have `all=False` to indicate if we should delete all classes instead of deleting all without the users permission ########## airflow/providers/weaviate/hooks/weaviate.py: ########## @@ -119,38 +125,204 @@ def test_connection(self) -> tuple[bool, str]: self.log.error("Error testing Weaviate connection: %s", e) return False, str(e) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) def create_class(self, class_json: dict[str, Any]) -> None: """Create a new class.""" client = self.conn client.schema.create_class(class_json) - def create_schema(self, schema_json: dict[str, Any]) -> None: + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def create_schema(self, schema_json: dict[str, Any] | str) -> None: """ Create a new Schema. Instead of adding classes one by one , you can upload a full schema in JSON format at once. - :param schema_json: The schema to create + :param schema_json: The schema to create or path to the json file holding the schema """ client = self.conn client.schema.create(schema_json) + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def get_schema(self, class_name: str | None = None): + """Get the schema from Weaviate. + + get schema of a class or all classes. + """ + client = self.get_client() + return client.schema.get(class_name) + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def delete_class(self, class_name: str): + """Delete a schema class from Weaviate. This deletes all associated data.""" + client = self.get_client() + client.schema.delete_class(class_name) + + def delete_schema( + self, class_names: list[str] | str | None = None + ) -> list[UnexpectedStatusCodeException] | None: + """ + Deletes all or specific class if class_names are provided. + + If no class_name is given remove the entire schema from the Weaviate instance and all data associated + with it. If class_names are given, delete schema classes from Weaviate. This deletes all associated data. + :return: list of error object if any + """ + client = self.get_client() + class_names = [class_names] if class_names and isinstance(class_names, str) else class_names + if class_names: + error_list = [] + for class_name in class_names: + try: + self.delete_class(class_name=class_name) + except UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + else: + return client.schema.delete_all() + + @retry(reraise=True, stop=stop_after_attempt(3), retry=retry_if_exception_type(requests.ConnectionError)) + def update_class(self, class_name: str, config: dict): + """Update schema's class.""" + client = self.get_client() + client.schema.update_config(class_name=class_name, config=config) + + def update_multiple_classes(self, schema_json: list[dict]) -> list[UnexpectedStatusCodeException] | None: + """Updated multiple classes. + + :param schema_json: list of class_config objects + .. seealso:: `example of class_config <https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.update_config>`_. + :return: list of error object if any + """ + error_list = [] + for config in schema_json: + try: + self.update_class(class_name=config.pop("class"), config=config) + except UnexpectedStatusCodeException as e: + error_list.append(e) + return error_list + + def upsert_classes(self, schema_json: dict[str, Any] | str, existing: ExitingSchemaOptions = "ignore"): + """ + Create or update the classes in schema of Weaviate database. + + :param schema_json: Json containing the schema. Format {"class_name": "class_dict"} + .. seealso:: `example of class_dict <https://weaviate-python-client.readthedocs.io/en/v3.25.2/weaviate.schema.html#weaviate.schema.Schema.create>`_. + :param existing: Options to handle the case when the classes exist, possible options + 'replace', 'fail', 'ignore'. + """ + existing_schema_options = ["replace", "fail", "ignore"] + if existing not in existing_schema_options: + raise ValueError(f"Param 'existing' should be one of the {existing_schema_options} values.") + if isinstance(schema_json, str): + schema_json = cast(dict, json.load(open(schema_json))) + set__exiting_classes = {class_object["class"] for class_object in self.get_schema()["classes"]} + set__to_be_added_classes = {key for key, _ in schema_json.items()} + intersection_classes = set__exiting_classes.intersection(set__to_be_added_classes) + classes_to_create = set() + if existing == "fail" and intersection_classes: + raise ValueError( + f"Trying to create class {intersection_classes}" f" but this class already exists." + ) + elif existing == "ignore": + classes_to_create = set__to_be_added_classes - set__exiting_classes + elif existing == "replace": + error_list = self.delete_schema(class_names=list(intersection_classes)) + if error_list: + raise ValueError(error_list) + classes_to_create = intersection_classes.union(set__to_be_added_classes) + classes_to_create_list = [schema_json[item] for item in sorted(list(classes_to_create))] + self.create_schema({"classes": classes_to_create_list}) + + def _compare_schema_subset(self, class_object: Any, class_schema: Any) -> bool: + """ + Recursively check if requested schema/object is a subset of the current schema. + + :param class_object: The class object to check against current schema + :param class_schema: The current schema class object + """ + # Direct equality check + if class_object == class_schema: + return True + + # Type mismatch early return + if type(class_object) != type(class_schema): + return False + + # Dictionary comparison + if isinstance(class_object, dict): + for k, v in class_object.items(): + if (k not in class_schema) or (not self._compare_schema_subset(v, class_schema[k])): + return False + return True + + # List or Tuple comparison + if isinstance(class_object, (list, tuple)): + for obj, sch in zip(class_object, class_schema): + if len(class_object) > len(class_schema) or not self._compare_schema_subset(obj, sch): + return False + return True + + # Default case for non-matching types or unsupported types + return False + + @staticmethod + def _convert_properties_to_dict(classes_objects, key_property: str = "name"): + for cls in classes_objects: + cls["properties"] = {p[key_property]: p for p in cls["properties"]} + return classes_objects + + def check_subset_of_schema(self, classes_objects: list) -> bool: + """Check if the class_objects is a subset of existing schema.""" + # When the class properties are not in same order or not the same length. We convert them to dicts + # with property `name` as the key. This way we ensure, the subset is checked. Review Comment: This matches the existing `contains` https://weaviate-python-client.readthedocs.io/en/stable/weaviate.schema.html#weaviate.schema.Schema.contains or am I missing something? The docstring says: If the given schema is a subset of the loaded schema it will still return true, by default None. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@airflow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org