krisztina-zsihovszki commented on code in PR #7894: URL: https://github.com/apache/nifi/pull/7894#discussion_r1373753687
########## nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py: ########## @@ -0,0 +1,165 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from langchain.vectorstores import Pinecone +from langchain.embeddings.openai import OpenAIEmbeddings +from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult +from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope +import QueryUtils +import pinecone + + +class QueryPinecone(FlowFileTransform): + class Java: + implements = ['org.apache.nifi.python.processor.FlowFileTransform'] + + class ProcessorDetails: + version = '2.0.0-SNAPSHOT' + description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query." Review Comment: In my view it'd be useful to mention that QueryPinecone, QueryChroma and PromptChatGPT require an incoming flowfile, otherwise the queries are not triggered (no output flow file is created, even if there was a match based on the processor properties). 
########## nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py: ########## @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from langchain.vectorstores import Pinecone +from langchain.embeddings.openai import OpenAIEmbeddings +from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult +from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope +import pinecone +import json + + +class PutPinecone(FlowFileTransform): + class Java: + implements = ['org.apache.nifi.python.processor.FlowFileTransform'] + + class ProcessorDetails: + version = '2.0.0-SNAPSHOT' + description = """Publishes JSON data to a Pinecone. The Incoming data must be in single JSON per Line format, each with two keys: 'text' and 'metadata'. + The text must be a string, while metadata must be a map with strings for values. 
Any additional fields will be ignored.""" + tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"] + + + PINECONE_API_KEY = PropertyDescriptor( + name="Pinecone API Key", + description="The API Key to use in order to authentication with Pinecone", + sensitive=True, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + OPENAI_API_KEY = PropertyDescriptor( + name="OpenAI API Key", + description="The API Key for OpenAI in order to create embeddings", + sensitive=True, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + PINECONE_ENV = PropertyDescriptor( + name="Pinecone Environment", + description="The name of the Pinecone Environment. This can be found in the Pinecone console next to the API Key.", + sensitive=False, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + INDEX_NAME = PropertyDescriptor( + name="Index Name", + description="The name of the Pinecone index.", + sensitive=False, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES + ) + TEXT_KEY = PropertyDescriptor( + name="Text Key", + description="The key in the document that contains the text to create embeddings for.", + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + default_value="text", + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES + ) + NAMESPACE = PropertyDescriptor( + name="Namespace", + description="The name of the Pinecone Namespace to put the documents to.", + required=False, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES + ) + + properties = [PINECONE_API_KEY, + OPENAI_API_KEY, + PINECONE_ENV, + INDEX_NAME, + TEXT_KEY, + NAMESPACE] + + embeddings = None + + def __init__(self, **kwargs): + pass + + def 
getPropertyDescriptors(self): + return self.properties + + def onScheduled(self, context): + api_key = context.getProperty(self.PINECONE_API_KEY).getValue() + pinecone_env = context.getProperty(self.PINECONE_ENV).getValue() + + # initialize pinecone + pinecone.init( + api_key=api_key, + environment=pinecone_env, + ) + openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue() + self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) + + + def transform(self, context, flowfile): + # First, check if our index already exists. If it doesn't, we create it + index_name = context.getProperty(self.INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue() + namespace = context.getProperty(self.NAMESPACE).evaluateAttributeExpressions(flowfile).getValue() + + index = pinecone.Index(index_name) + + # Read the FlowFile content as "json-lines". + json_lines = flowfile.getContentsAsBytes().decode() + i = 0 + texts = [] + metadatas = [] + ids = [] + for line in json_lines.split("\n"): + doc = json.loads(line) + text = doc.get('text') Review Comment: The field name 'text' is used here, while the text key can be overridden by the TEXT_KEY property. The processor description also mentions 'text' as the key; it may be worth mentioning there that this is only the default key name. ########## nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/ChromaUtils.py: ########## @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nifiapi.properties import PropertyDescriptor, StandardValidators, PropertyDependency, ExpressionLanguageScope + +# Connection Strategies +LOCAL_DISK = "Local Disk" +REMOTE_SERVER = "Remote Chroma Server" + +# Authentication Strategies +TOKEN = "Token Authentication" +BASIC_AUTH = "Basic Authentication" +NONE = "None" + +# Transport Protocols +HTTP = "http" +HTTPS = "https" + +CONNECTION_STRATEGY = PropertyDescriptor( + name="Connection Strategy", + description="Specifies how to connect to the Chroma server", + allowable_values=[LOCAL_DISK, REMOTE_SERVER], + default_value=REMOTE_SERVER, + required=True +) +DIRECTORY = PropertyDescriptor( + name="Directory", + description="The Directory that Chroma should use to persist data", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + required=True, + default_value="./chroma", + dependencies=[PropertyDependency(CONNECTION_STRATEGY, LOCAL_DISK)] +) +HOSTNAME = PropertyDescriptor( + name="Hostname", + description="The hostname to connect to in order to communicate with Chroma", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + default_value="localhost", + required=True, + dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)] +) +PORT = PropertyDescriptor( + name="Port", + description="The port that the Chroma server is listening on", + validators=[StandardValidators.PORT_VALIDATOR], + default_value="8000", + required=True, + dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)] +) +TRANSPORT_PROTOCOL = PropertyDescriptor( + name="Transport Protocol", + 
description="Specifies whether connections should be made over http or https", + allowable_values=[HTTP, HTTPS], + default_value=HTTPS, + required=True, + dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)] +) +AUTH_STRATEGY = PropertyDescriptor( + name="Authentication Strategy", + description="Specifies how to authenticate to Chroma server", + allowable_values=[TOKEN, BASIC_AUTH, NONE], + default_value=TOKEN, + required=True, + dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)] +) +AUTH_TOKEN = PropertyDescriptor( + name="Authentication Token", + description="The token to use for authenticating to Chroma server", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + required=True, + sensitive=True, + dependencies=[PropertyDependency(AUTH_STRATEGY, TOKEN)] +) +USERNAME = PropertyDescriptor( + name="Username", + description="The username to use for authenticating to Chroma server", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + required=True, + dependencies=[PropertyDependency(AUTH_STRATEGY, BASIC_AUTH)] +) +PASSWORD = PropertyDescriptor( + name="Password", + description="The password to use for authenticating to Chroma server", + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + required=True, + sensitive=True, + dependencies=[PropertyDependency(AUTH_STRATEGY, BASIC_AUTH)] +) +COLLECTION_NAME = PropertyDescriptor( + name="Collection Name", + description="The name of the Chroma Collection to update", Review Comment: Minor: It'd be useful to mention here as well that the collection is created if it does not exist. (It is mentioned only in the description of "Distance Method".) ########## nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py: ########## @@ -0,0 +1,165 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from langchain.vectorstores import Pinecone +from langchain.embeddings.openai import OpenAIEmbeddings +from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult +from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope +import QueryUtils +import pinecone + + +class QueryPinecone(FlowFileTransform): + class Java: + implements = ['org.apache.nifi.python.processor.FlowFileTransform'] + + class ProcessorDetails: + version = '2.0.0-SNAPSHOT' + description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query." 
+ tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"] + + + PINECONE_API_KEY = PropertyDescriptor( + name="Pinecone API Key", + description="The API Key to use in order to authentication with Pinecone", + sensitive=True, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + OPENAI_API_KEY = PropertyDescriptor( + name="OpenAI API Key", + description="The API Key for OpenAI in order to create embeddings", + sensitive=True, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + PINECONE_ENV = PropertyDescriptor( + name="Pinecone Environment", + description="The name of the Pinecone Environment. This can be found in the Pinecone console next to the API Key.", + sensitive=False, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + INDEX_NAME = PropertyDescriptor( + name="Index Name", + description="The name of the Pinecone index.", + sensitive=False, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES + ) + QUERY = PropertyDescriptor( + name="Query", + description="The query to issue to Pinecone.", Review Comment: It is not obvious what kind of query is required here (it could just as well have been a vector query). It would be useful to add something like "Text to look up documents similar to." to the description, or to reuse a description similar to the one on QueryChroma's "Query" property. ########## nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py: ########## @@ -0,0 +1,165 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from langchain.vectorstores import Pinecone +from langchain.embeddings.openai import OpenAIEmbeddings +from nifiapi.flowfiletransform import FlowFileTransform, FlowFileTransformResult +from nifiapi.properties import PropertyDescriptor, StandardValidators, ExpressionLanguageScope +import QueryUtils +import pinecone + + +class QueryPinecone(FlowFileTransform): + class Java: + implements = ['org.apache.nifi.python.processor.FlowFileTransform'] + + class ProcessorDetails: + version = '2.0.0-SNAPSHOT' + description = "Queries Pinecone in order to gather a specified number of documents that are most closely related to the given query." + tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", "ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"] + + + PINECONE_API_KEY = PropertyDescriptor( + name="Pinecone API Key", + description="The API Key to use in order to authentication with Pinecone", + sensitive=True, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + OPENAI_API_KEY = PropertyDescriptor( + name="OpenAI API Key", + description="The API Key for OpenAI in order to create embeddings", + sensitive=True, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + PINECONE_ENV = PropertyDescriptor( + name="Pinecone Environment", + description="The name of the Pinecone Environment. 
This can be found in the Pinecone console next to the API Key.", + sensitive=False, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR] + ) + INDEX_NAME = PropertyDescriptor( + name="Index Name", + description="The name of the Pinecone index.", + sensitive=False, + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES + ) + QUERY = PropertyDescriptor( + name="Query", + description="The query to issue to Pinecone.", + required=True, + validators=[StandardValidators.NON_EMPTY_VALIDATOR], + expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES + ) + NUMBER_OF_RESULTS = PropertyDescriptor( + name="Number of Results", + description="The number of results to return from Chroma", Review Comment: "...to return from Pinecone" -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@nifi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org