pvillard31 commented on code in PR #8590: URL: https://github.com/apache/nifi/pull/8590#discussion_r1546012716
########## nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryQdrant.py: ########## @@ -0,0 +1,192 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from langchain.vectorstores.qdrant import Qdrant +from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult +from nifiapi.properties import ( + PropertyDescriptor, + StandardValidators, + ExpressionLanguageScope, +) +import QueryUtils +import json +from EmbeddingUtils import ( + create_embedding_service, +) + +from nifiapi.documentation import use_case + +from qdrant_client import QdrantClient + +import QdrantUtils + + +@use_case( + description="Semantically search for documents stored in Qdrant - https://qdrant.tech/", + keywords=["qdrant", "embedding", "vector", "text", "vectorstore", "search"], + configuration=""" + Configure 'Collection Name' to the name of the Qdrant collection to use. + Configure 'Qdrant URL' to the fully qualified URL of the Qdrant instance. + Configure 'Qdrant API Key' to the API Key to use in order to authenticate with Qdrant. + Configure 'Prefer gRPC' to True if you want to use gRPC for interfacing with Qdrant. + Configure 'Use HTTPS' to True if you want to use TLS(HTTPS) while interfacing with Qdrant. 
+ Configure 'Embedding Model' to indicate whether OpenAI embeddings should be used or a HuggingFace embedding model should be used: 'Hugging Face Model' or 'OpenAI Model' + Configure 'HuggingFace API Key' or 'OpenAI API Key', depending on the chosen Embedding Model. + Configure 'HuggingFace Model' or 'OpenAI Model' to the name of the model to use. + Configure 'Query' to the text of the query to send to Qdrant. + Configure 'Number of Results' to the number of results to return from Qdrant. + Configure 'Metadata Filter' to apply an optional metadata filter with the query. For example: { "author": "john.doe" } + Configure 'Output Strategy' to indicate how the output should be formatted: 'Row-Oriented', 'Text', or 'Column-Oriented'. + Configure 'Results Field' to the name of the field to insert the results, if the input FlowFile is JSON Formatted,. + Configure 'Include Metadatas' to True if metadata should be included in the output. + Configure 'Include Distances' to True if distances should be included in the output. + """, +) +class QueryQdrant(FlowFileTransform): + class Java: + implements = ["org.apache.nifi.python.processor.FlowFileTransform"] + + class ProcessorDetails: + version = "2.0.0-SNAPSHOT" + description = "Queries Qdrant in order to gather a specified number of documents that are most closely related to the given query." 
+ tags = [ + "qdrant", + "vector", + "vectordb", + "vectorstore", + "embeddings", + "ai", + "artificial intelligence", + "ml", + "machine learning", + "text", + "LLM", + ] + + QUERY = PropertyDescriptor( + name="Query", + description="The text of the query to send to Qdrant.", + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, + ) + NUMBER_OF_RESULTS = PropertyDescriptor( + name="Number of Results", + description="The number of results to return from Qdrant.", + required=True, + validators=[StandardValidators.POSITIVE_INTEGER_VALIDATOR], + default_value="10", + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, + ) + FILTER = PropertyDescriptor( + name="Metadata Filter", + description='Optional metadata filter to apply with the query. For example: { "author": "john.doe" }', + required=False, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES, + ) + + properties = ( + QdrantUtils.QDRANT_PROPERTIES + + QdrantUtils.EMBEDDING_MODEL_PROPERTIES + + [ + QUERY, + FILTER, + NUMBER_OF_RESULTS, + QueryUtils.OUTPUT_STRATEGY, + QueryUtils.RESULTS_FIELD, + QueryUtils.INCLUDE_METADATAS, + QueryUtils.INCLUDE_DISTANCES, + ] + ) + + embeddings = None + query_utils = None + client = None + + def __init__(self, **kwargs): + pass + + def getPropertyDescriptors(self): + return self.properties + + def onScheduled(self, context): + self.client = QdrantClient( + url=context.getProperty(QdrantUtils.QDRANT_URL).getValue(), + api_key=context.getProperty(QdrantUtils.QDRANT_API_KEY).getValue(), + prefer_grpc=context.getProperty(QdrantUtils.PREFER_GRPC).asBoolean(), + https=context.getProperty(QdrantUtils.HTTPS).asBoolean(), + ) + self.embeddings = create_embedding_service(context) + self.query_utils = QueryUtils.QueryUtils(context) + + def transform(self, context, flowfile): + collection_name = ( + 
context.getProperty(QdrantUtils.COLLECTION_NAME) + .evaluateAttributeExpressions(flowfile) + .getValue() + ) + query = ( + context.getProperty(self.QUERY) + .evaluateAttributeExpressions(flowfile) + .getValue() + ) + num_results = ( + context.getProperty(self.NUMBER_OF_RESULTS) + .evaluateAttributeExpressions(flowfile) + .asInteger() + ) + filter = ( + context.getProperty(self.FILTER) + .evaluateAttributeExpressions(flowfile) + .getValue() + ) + vector_store = Qdrant( + client=self.client, + collection_name=collection_name, + embeddings=self.embeddings, + ) + results = vector_store.similarity_search_with_score( + query=query, + k=num_results, + filter=None if filter is None else json.loads(filter), Review Comment: Yeah, this is the point I was trying to make: it would be nice to have a processor that makes it super easy for a NiFi user to run a hybrid search without having to build a multi-step process. For example, QueryQdrant could have an optional configuration property where the user provides keywords (the user may set those as FlowFile attributes upstream in the flow), and the QueryQdrant processor would then run both searches and fuse the results. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@nifi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org