krisztina-zsihovszki commented on code in PR #7894:
URL: https://github.com/apache/nifi/pull/7894#discussion_r1373753687


##########
nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py:
##########
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from langchain.vectorstores import Pinecone
+from langchain.embeddings.openai import OpenAIEmbeddings
+from nifiapi.flowfiletransform import FlowFileTransform, 
FlowFileTransformResult
+from nifiapi.properties import PropertyDescriptor, StandardValidators, 
ExpressionLanguageScope
+import QueryUtils
+import pinecone
+
+
+class QueryPinecone(FlowFileTransform):
+    class Java:
+        implements = ['org.apache.nifi.python.processor.FlowFileTransform']
+
+    class ProcessorDetails:
+        version = '2.0.0-SNAPSHOT'
+        description = "Queries Pinecone in order to gather a specified number 
of documents that are most closely related to the given query."

Review Comment:
   In my view it'd be useful to mention that QueryPinecone, QueryChroma and 
PromptChatGPT require an incoming FlowFile; otherwise the queries are not 
triggered (no output FlowFile is created, even if there would have been a 
match based on the processor properties). 



##########
nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/PutPinecone.py:
##########
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from langchain.vectorstores import Pinecone
+from langchain.embeddings.openai import OpenAIEmbeddings
+from nifiapi.flowfiletransform import FlowFileTransform, 
FlowFileTransformResult
+from nifiapi.properties import PropertyDescriptor, StandardValidators, 
ExpressionLanguageScope
+import pinecone
+import json
+
+
+class PutPinecone(FlowFileTransform):
+    class Java:
+        implements = ['org.apache.nifi.python.processor.FlowFileTransform']
+
+    class ProcessorDetails:
+        version = '2.0.0-SNAPSHOT'
+        description = """Publishes JSON data to a Pinecone. The Incoming data 
must be in single JSON per Line format, each with two keys: 'text' and 
'metadata'.
+                       The text must be a string, while metadata must be a map 
with strings for values. Any additional fields will be ignored."""
+        tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", 
"ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
+
+
+    PINECONE_API_KEY = PropertyDescriptor(
+        name="Pinecone API Key",
+        description="The API Key to use in order to authentication with 
Pinecone",
+        sensitive=True,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    OPENAI_API_KEY = PropertyDescriptor(
+        name="OpenAI API Key",
+        description="The API Key for OpenAI in order to create embeddings",
+        sensitive=True,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    PINECONE_ENV = PropertyDescriptor(
+        name="Pinecone Environment",
+        description="The name of the Pinecone Environment. This can be found 
in the Pinecone console next to the API Key.",
+        sensitive=False,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    INDEX_NAME = PropertyDescriptor(
+        name="Index Name",
+        description="The name of the Pinecone index.",
+        sensitive=False,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
+    )
+    TEXT_KEY = PropertyDescriptor(
+        name="Text Key",
+        description="The key in the document that contains the text to create 
embeddings for.",
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        default_value="text",
+        expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
+    )
+    NAMESPACE = PropertyDescriptor(
+        name="Namespace",
+        description="The name of the Pinecone Namespace to put the documents 
to.",
+        required=False,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
+    )
+
+    properties = [PINECONE_API_KEY,
+                  OPENAI_API_KEY,
+                  PINECONE_ENV,
+                  INDEX_NAME,
+                  TEXT_KEY,
+                  NAMESPACE]
+
+    embeddings = None
+
+    def __init__(self, **kwargs):
+        pass
+
+    def getPropertyDescriptors(self):
+        return self.properties
+
+    def onScheduled(self, context):
+        api_key = context.getProperty(self.PINECONE_API_KEY).getValue()
+        pinecone_env = context.getProperty(self.PINECONE_ENV).getValue()
+
+        # initialize pinecone
+        pinecone.init(
+            api_key=api_key,
+            environment=pinecone_env,
+        )
+        openai_api_key = context.getProperty(self.OPENAI_API_KEY).getValue()
+        self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
+
+
+    def transform(self, context, flowfile):
+        # First, check if our index already exists. If it doesn't, we create it
+        index_name = 
context.getProperty(self.INDEX_NAME).evaluateAttributeExpressions(flowfile).getValue()
+        namespace = 
context.getProperty(self.NAMESPACE).evaluateAttributeExpressions(flowfile).getValue()
+
+        index = pinecone.Index(index_name)
+
+        # Read the FlowFile content as "json-lines".
+        json_lines = flowfile.getContentsAsBytes().decode()
+        i = 0
+        texts = []
+        metadatas = []
+        ids = []
+        for line in json_lines.split("\n"):
+            doc = json.loads(line)
+            text = doc.get('text')

Review Comment:
   The field name 'text' is hard-coded here, although the text key can be 
overridden by the TEXT_KEY property.
   The processor description also mentions 'text' as the key; maybe it's worth 
mentioning there that this is only the default key name.



##########
nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/ChromaUtils.py:
##########
@@ -0,0 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nifiapi.properties import PropertyDescriptor, StandardValidators, 
PropertyDependency, ExpressionLanguageScope
+
+# Connection Strategies
+LOCAL_DISK = "Local Disk"
+REMOTE_SERVER = "Remote Chroma Server"
+
+# Authentication Strategies
+TOKEN = "Token Authentication"
+BASIC_AUTH = "Basic Authentication"
+NONE = "None"
+
+# Transport Protocols
+HTTP = "http"
+HTTPS = "https"
+
+CONNECTION_STRATEGY = PropertyDescriptor(
+    name="Connection Strategy",
+    description="Specifies how to connect to the Chroma server",
+    allowable_values=[LOCAL_DISK, REMOTE_SERVER],
+    default_value=REMOTE_SERVER,
+    required=True
+)
+DIRECTORY = PropertyDescriptor(
+    name="Directory",
+    description="The Directory that Chroma should use to persist data",
+    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+    required=True,
+    default_value="./chroma",
+    dependencies=[PropertyDependency(CONNECTION_STRATEGY, LOCAL_DISK)]
+)
+HOSTNAME = PropertyDescriptor(
+    name="Hostname",
+    description="The hostname to connect to in order to communicate with 
Chroma",
+    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+    default_value="localhost",
+    required=True,
+    dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)]
+)
+PORT = PropertyDescriptor(
+    name="Port",
+    description="The port that the Chroma server is listening on",
+    validators=[StandardValidators.PORT_VALIDATOR],
+    default_value="8000",
+    required=True,
+    dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)]
+)
+TRANSPORT_PROTOCOL = PropertyDescriptor(
+    name="Transport Protocol",
+    description="Specifies whether connections should be made over http or 
https",
+    allowable_values=[HTTP, HTTPS],
+    default_value=HTTPS,
+    required=True,
+    dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)]
+)
+AUTH_STRATEGY = PropertyDescriptor(
+    name="Authentication Strategy",
+    description="Specifies how to authenticate to Chroma server",
+    allowable_values=[TOKEN, BASIC_AUTH, NONE],
+    default_value=TOKEN,
+    required=True,
+    dependencies=[PropertyDependency(CONNECTION_STRATEGY, REMOTE_SERVER)]
+)
+AUTH_TOKEN = PropertyDescriptor(
+    name="Authentication Token",
+    description="The token to use for authenticating to Chroma server",
+    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+    required=True,
+    sensitive=True,
+    dependencies=[PropertyDependency(AUTH_STRATEGY, TOKEN)]
+)
+USERNAME = PropertyDescriptor(
+    name="Username",
+    description="The username to use for authenticating to Chroma server",
+    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+    required=True,
+    dependencies=[PropertyDependency(AUTH_STRATEGY, BASIC_AUTH)]
+)
+PASSWORD = PropertyDescriptor(
+    name="Password",
+    description="The password to use for authenticating to Chroma server",
+    validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+    required=True,
+    sensitive=True,
+    dependencies=[PropertyDependency(AUTH_STRATEGY, BASIC_AUTH)]
+)
+COLLECTION_NAME = PropertyDescriptor(
+    name="Collection Name",
+    description="The name of the Chroma Collection to update",

Review Comment:
   Minor: It'd be useful to mention here as well that the collection is created 
if it does not exist. 
   (It is mentioned only in the description of "Distance Method".)



##########
nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py:
##########
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from langchain.vectorstores import Pinecone
+from langchain.embeddings.openai import OpenAIEmbeddings
+from nifiapi.flowfiletransform import FlowFileTransform, 
FlowFileTransformResult
+from nifiapi.properties import PropertyDescriptor, StandardValidators, 
ExpressionLanguageScope
+import QueryUtils
+import pinecone
+
+
+class QueryPinecone(FlowFileTransform):
+    class Java:
+        implements = ['org.apache.nifi.python.processor.FlowFileTransform']
+
+    class ProcessorDetails:
+        version = '2.0.0-SNAPSHOT'
+        description = "Queries Pinecone in order to gather a specified number 
of documents that are most closely related to the given query."
+        tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", 
"ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
+
+
+    PINECONE_API_KEY = PropertyDescriptor(
+        name="Pinecone API Key",
+        description="The API Key to use in order to authentication with 
Pinecone",
+        sensitive=True,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    OPENAI_API_KEY = PropertyDescriptor(
+        name="OpenAI API Key",
+        description="The API Key for OpenAI in order to create embeddings",
+        sensitive=True,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    PINECONE_ENV = PropertyDescriptor(
+        name="Pinecone Environment",
+        description="The name of the Pinecone Environment. This can be found 
in the Pinecone console next to the API Key.",
+        sensitive=False,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    INDEX_NAME = PropertyDescriptor(
+        name="Index Name",
+        description="The name of the Pinecone index.",
+        sensitive=False,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
+    )
+    QUERY = PropertyDescriptor(
+        name="Query",
+        description="The query to issue to Pinecone.",

Review Comment:
   It is not obvious what kind of query is required here (it could have been a 
vector query as well).
   It would be useful if something like this was added to the description: 
"Text to look up documents similar to." 
   Alternatively, a description similar to the one used for QueryChroma's 
"Query" property could be used.



##########
nifi-python-extensions/nifi-text-embeddings-module/src/main/python/vectorstores/QueryPinecone.py:
##########
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from langchain.vectorstores import Pinecone
+from langchain.embeddings.openai import OpenAIEmbeddings
+from nifiapi.flowfiletransform import FlowFileTransform, 
FlowFileTransformResult
+from nifiapi.properties import PropertyDescriptor, StandardValidators, 
ExpressionLanguageScope
+import QueryUtils
+import pinecone
+
+
+class QueryPinecone(FlowFileTransform):
+    class Java:
+        implements = ['org.apache.nifi.python.processor.FlowFileTransform']
+
+    class ProcessorDetails:
+        version = '2.0.0-SNAPSHOT'
+        description = "Queries Pinecone in order to gather a specified number 
of documents that are most closely related to the given query."
+        tags = ["pinecone", "vector", "vectordb", "vectorstore", "embeddings", 
"ai", "artificial intelligence", "ml", "machine learning", "text", "LLM"]
+
+
+    PINECONE_API_KEY = PropertyDescriptor(
+        name="Pinecone API Key",
+        description="The API Key to use in order to authentication with 
Pinecone",
+        sensitive=True,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    OPENAI_API_KEY = PropertyDescriptor(
+        name="OpenAI API Key",
+        description="The API Key for OpenAI in order to create embeddings",
+        sensitive=True,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    PINECONE_ENV = PropertyDescriptor(
+        name="Pinecone Environment",
+        description="The name of the Pinecone Environment. This can be found 
in the Pinecone console next to the API Key.",
+        sensitive=False,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR]
+    )
+    INDEX_NAME = PropertyDescriptor(
+        name="Index Name",
+        description="The name of the Pinecone index.",
+        sensitive=False,
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
+    )
+    QUERY = PropertyDescriptor(
+        name="Query",
+        description="The query to issue to Pinecone.",
+        required=True,
+        validators=[StandardValidators.NON_EMPTY_VALIDATOR],
+        expression_language_scope=ExpressionLanguageScope.FLOWFILE_ATTRIBUTES
+    )
+    NUMBER_OF_RESULTS = PropertyDescriptor(
+        name="Number of Results",
+        description="The number of results to return from Chroma",

Review Comment:
   "...to return from Pinecone"



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@nifi.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to