branch: externals/vecdb commit a296dd7075c6d3e4982ab8db49606f15616d7923 Merge: 169b7cb9e8 d7247e32cb Author: Andrew Hyatt <ahy...@gmail.com> Commit: GitHub <nore...@github.com>
Add postgres implementation of vecdb with pgvector --- Eldev | 3 +- README.org | 11 ++ vecdb-integration-test.el | 36 +++++-- vecdb-psql.el | 254 ++++++++++++++++++++++++++++++++++++++++++++++ vecdb.el | 2 +- 5 files changed, 297 insertions(+), 9 deletions(-) diff --git a/Eldev b/Eldev index ad2d86f3ce..43c29f47cf 100644 --- a/Eldev +++ b/Eldev @@ -1,4 +1,5 @@ -; -*- mode: emacs-lisp; lexical-binding: t -*- + ; -*- mode: emacs-lisp; lexical-binding: t -*- (eldev-use-package-archive 'gnu-elpa) +(eldev-use-package-archive 'nongnu-elpa) (eldev-use-plugin 'maintainer) diff --git a/README.org b/README.org index 8643592f65..af9249d08a 100644 --- a/README.org +++ b/README.org @@ -86,3 +86,14 @@ However, the full set of options, here demonstrating the equivalent settings to :tenant "default" :database "default")) #+end_src +** Postgres with pgvector +The popular database Postgres has an extension that allows it to have vector database functionality, [[https://github.com/pgvector/pgvector][pgvector]]. This needs the =pg-el= library. + +A provider defines a database, and the collection will define a table with the collection name in that database. + +For example, +#+begin_src emacs-lisp +(defvar my-postgres-provider (make-vecdb-psqlprovider :dbname "mydatabase" :username "myuser")) +#+end_src + +This also takes an optional password as well. For now, this just uses localhost as a default. diff --git a/vecdb-integration-test.el b/vecdb-integration-test.el index 35f670ba0d..f99ef5c295 100644 --- a/vecdb-integration-test.el +++ b/vecdb-integration-test.el @@ -33,6 +33,7 @@ ;; CHROMA_DATABASE (optional, defaults to "default") ;; For Qdrant: QDRANT_URL (e.g., "http://localhost:6333") ;; QDRANT_API_KEY (e.g., "your-api-key") +;; For Postgres: PSQL_DB (should exist already), PSQL_USERNAME, PSQL_PASSWORD (optional) ;; 3. Execute from the command line: ;; emacs -batch -l ert -l vecdb-integration-test.el -f ert-run-tests-batch-and-exit ;; @@ -46,6 +47,7 @@ (require 'vecdb) (require 'vecdb-chroma) (require 'vecdb-qdrant) +(require 'vecdb-psql) (require 'cl-lib) ;; For cl-remove-if-not, cl-every (declare-function chroma-ext--tmp-project-dir "ext-chroma") @@ -75,7 +77,18 @@ Skips tests if no providers are configured." (make-vecdb-qdrant-provider :url qdrant-url :api-key qdrant-api-key) - (warn "QDRANT_URL is set, but QDRANT_API_KEY is missing. Qdrant provider will not be configured."))))))) + (warn "QDRANT_URL is set, but QDRANT_API_KEY is missing. Qdrant provider will not be configured."))))) + + ;; Postgres Configuration + (let ((postgres-db (getenv "PSQL_DB")) + (postgres-username (getenv "PSQL_USERNAME")) + (postgres-password (getenv "PSQL_PASSWORD"))) + + (when postgres-username + (make-vecdb-psql-provider + :dbname postgres-db + :username postgres-username + :password postgres-password))))) (progn (ert-skip "No vector database provider environment variables set. (CHROMA_URL or QDRANT_URL must be set)") @@ -92,6 +105,7 @@ itself might globally skip if no providers at all are configured)." (declare (indent defun)) (let ((chroma-test-name (intern (format "%s-chroma" base-name))) (qdrant-test-name (intern (format "%s-qdrant" base-name))) + (psql-test-name (intern (format "%s-psql" base-name))) (base-doc (or docstring (format "Test %s for a vector database provider." base-name)))) `(progn (ert-deftest ,chroma-test-name () @@ -110,7 +124,15 @@ itself might globally skip if no providers at all are configured)." (vecdb-test--get-providers)))) (if current-provider (funcall ,body-function current-provider) - (ert-skip (format "Qdrant provider not configured for %s" ',qdrant-test-name)))))))) + (ert-skip (format "Qdrant provider not configured for %s" ',qdrant-test-name))))) + (ert-deftest ,psql-test-name () + ,(format "%s (Postgres)" base-doc) + (interactive) + (let ((current-provider (cl-find-if (lambda (p) (eq (type-of p) 'vecdb-psql-provider)) + (vecdb-test--get-providers)))) + (if current-provider + (funcall ,body-function current-provider) + (ert-skip (format "Postgres provider not configured for %s" ',psql-test-name)))))))) (defmacro with-test-collection (current-provider collection-var collection-name-base options &rest body) "Execute BODY with COLLECTION-VAR bound to a new collection. @@ -122,11 +144,11 @@ The full collection name is generated by appending the provider's name. The collection is created before BODY and deleted afterwards." (declare (indent 1) (debug t)) (let ((full-collection-name (gensym "full-collection-name-")) - (vector-size-val (gensym "vector-size-")) (default-vector-size 3)) `(let* ((,full-collection-name (format "%s-%s" ,collection-name-base (vecdb-provider-name ,current-provider))) - (,vector-size-val (or (plist-get ,options :vector-size) ,default-vector-size)) - (,collection-var (make-vecdb-collection :name ,full-collection-name :vector-size ,vector-size-val))) + (,collection-var (make-vecdb-collection :name ,full-collection-name + :vector-size (or (plist-get ,options :vector-size) ,default-vector-size) + :payload-fields (plist-get ,options :payload-fields)))) (unwind-protect (progn (vecdb-create ,current-provider ,collection-var) @@ -165,7 +187,7 @@ The collection is created before BODY and deleted afterwards." (make-vecdb-item :id 1 :vector [0 1 2] :payload '(:val 1)) (make-vecdb-item :id 2 :vector [0 1 2] :payload '(:val 2)) (make-vecdb-item :id 3 :vector [0 1 2] :payload '(:val 3))))) - (with-test-collection current-provider current-collection collection-name `(:vector-size ,vector-size) + (with-test-collection current-provider current-collection collection-name `(:vector-size ,vector-size :payload-fields ((val . integer))) (vecdb-upsert-items current-provider current-collection items t) (dolist (item items) (let ((retrieved-item (vecdb-get-item current-provider current-collection (vecdb-item-id item)))) @@ -190,7 +212,7 @@ The collection is created before BODY and deleted afterwards." (item2 (make-vecdb-item :id 2 :vector [0.4 0.5 0.6] :payload '(:val 2))) (item3 (make-vecdb-item :id 3 :vector [0.7 0.8 0.9] :payload '(:val 3))) (items (list item1 item2 item3))) - (with-test-collection current-provider current-collection collection-name `(:vector-size ,vector-size) + (with-test-collection current-provider current-collection collection-name `(:vector-size ,vector-size :payload-fields ((val . integer))) (vecdb-upsert-items current-provider current-collection items t) ;; Search for a vector similar to item2 (let ((results (vecdb-search-by-vector current-provider current-collection [0.41 0.51 0.61] 3))) diff --git a/vecdb-psql.el b/vecdb-psql.el new file mode 100644 index 0000000000..40dc662180 --- /dev/null +++ b/vecdb-psql.el @@ -0,0 +1,254 @@ +;;; vecdb-psql.el --- An interface to postgres with vector extension -*- lexical-binding: t; -*- + +;; Copyright (c) 2025 Free Software Foundation, Inc. + +;; Author: Andrew Hyatt <ahy...@gmail.com> +;; Homepage: https://github.com/ahyatt/vecdb +;; SPDX-License-Identifier: GPL-3.0-or-later +;; +;; This program is free software; you can redistribute it and/or +;; modify it under the terms of the GNU General Public License as +;; published by the Free Software Foundation; either version 3 of the +;; License, or (at your option) any later version. +;; +;; This program is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. + +;;; Commentary: +;; This package provides an implementation of vecdb for using with postgres. + +;;; Code: + +(require 'vecdb) +(require 'pg) +(require 'cl-lib) +(require 'map) +(require 'seq) + +(cl-defstruct (vecdb-psql-provider (:include vecdb-provider + (name "postgres"))) + "Provider for the vector database. +DBNAME is the database name, which must have been created by the user." + dbname + username + (password "")) + +(defconst vecdb-psql-connection-cache + (make-hash-table :test 'equal) + "Cache for database connections by db name.") + +(defun vecdb-psql-get-connection (provider) + "Get a connection to the database specified by PROVIDER." + (let* ((key (vecdb-psql-provider-dbname provider)) + (connection (gethash key vecdb-psql-connection-cache))) + (unless connection + (setq connection + (pg-connect + (vecdb-psql-provider-dbname provider) + (vecdb-psql-provider-username provider) + (vecdb-psql-provider-password provider))) + (puthash key connection vecdb-psql-connection-cache)) + connection)) + +(defun vecdb-psql-table-name (collection-name) + "Turn COLLECTION-NAME into a safe table name." + (replace-regexp-in-string "[^a-zA-Z0-9_]" "_" (downcase collection-name))) + +(defun vecdb-psql-type (collection-type) + "Convert COLLECTION-TYPE to a PostgreSQL type string." + (pcase collection-type + ('string "TEXT") + ('integer "INTEGER") + ('float "FLOAT") + (_ (error "Unsupported field type: %s" collection-type)))) + +(defun vecdb-psql-oid (collection-type) + "Convert COLLECTION-TYPE to a psql OID." + (pcase collection-type + ('string "text") + ('integer "int8") + ('float "float8") + (_ (error "Unsupported field type: %s" collection-type)))) + +(cl-defmethod vecdb-create ((provider vecdb-psql-provider) + (collection vecdb-collection)) + "Create COLLECTION in database PROVIDER." + (pg-exec (vecdb-psql-get-connection provider) + (format "CREATE TABLE IF NOT EXISTS %s ( + id INTEGER PRIMARY KEY, + vector VECTOR(%d) NOT NULL%s + %s + );" + (vecdb-psql-table-name (vecdb-collection-name collection)) + (vecdb-collection-vector-size collection) + (if (vecdb-collection-payload-fields collection) "," "") + (mapconcat + (lambda (field) + (format "%s %s NULL" + (car field) + (vecdb-psql-type (cdr field)))) + (vecdb-collection-payload-fields collection) + ", "))) + (pg-exec (vecdb-psql-get-connection provider) + (format "CREATE INDEX IF NOT EXISTS %s_embedding_hnsw_idx ON %s USING hnsw (vector vector_cosine_ops)" + (vecdb-psql-table-name (vecdb-collection-name collection)) + (vecdb-psql-table-name (vecdb-collection-name collection)))) + (mapc (lambda (field) + (pg-exec (vecdb-psql-get-connection provider) + (format "CREATE INDEX IF NOT EXISTS %s_%s_idx ON %s (%s)" + (vecdb-psql-table-name (vecdb-collection-name collection)) + (car field) + (vecdb-psql-table-name (vecdb-collection-name collection)) + (car field)))) + (vecdb-collection-payload-fields collection))) + +(cl-defmethod vecdb-delete ((provider vecdb-psql-provider) + (collection vecdb-collection)) + "Delete COLLECTION from database PROVIDER." + (pg-exec (vecdb-psql-get-connection provider) + (format "DROP TABLE IF EXISTS %s;" + (vecdb-psql-table-name (vecdb-collection-name collection))))) + +(cl-defmethod vecdb-exists ((provider vecdb-psql-provider) + (collection vecdb-collection)) + "Check if the COLLECTION exists in the database specified by PROVIDER." + (let ((result + (pg-exec (vecdb-psql-get-connection provider) + (format "SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = '%s' + );" + (vecdb-psql-table-name (vecdb-collection-name collection)))))) + (and result + (equal (caar (pg-result result :tuples)) t)))) + +(defun vecdb-psql--plist-keys (plist) + "Return a list of keys from PLIST, as strings with the colon removed." + (cl-loop for (k _v) on plist by #'cddr + collect (substring (symbol-name k) 1))) + +(cl-defmethod vecdb-upsert-items ((provider vecdb-psql-provider) + (collection vecdb-collection) + data-list &optional _) + "Upsert items into the COLLECTION in the database PROVIDER. +All items in DATA-LIST must have the same payloads." + (let ((arg-count 0)) + (funcall #'pg-exec-prepared + (vecdb-psql-get-connection provider) + (format "INSERT INTO %s (id, vector%s%s) VALUES %s + ON CONFLICT (id) DO UPDATE SET vector = EXCLUDED.vector%s%s;" + (vecdb-psql-table-name (vecdb-collection-name collection)) + (if (vecdb-collection-payload-fields collection) ", " "") + ;; We assume every vecdb-item has the same payload structure + (mapconcat #'identity (vecdb-psql--plist-keys + (vecdb-item-payload (car data-list))) + ", ") + (mapconcat (lambda (item) + (format "(%s)" + (string-join (cl-loop for i from 1 below (+ 2 (length (vecdb-item-payload item))) + do (cl-incf arg-count) + collect (format "$%d" arg-count)) + ", "))) + data-list + ", ") + (if (vecdb-collection-payload-fields collection) ", " "") + (mapconcat + (lambda (field) + (format "%s = EXCLUDED.%s" (car field) (car field))) + (vecdb-collection-payload-fields collection) + ", ")) + (mapcan (lambda (item) + (append + (list + (cons (vecdb-item-id item) "int8") + (cons (vecdb-item-vector item) "vector")) + (mapcar (lambda (payload-key) + (cons (plist-get (vecdb-item-payload item) payload-key) + (vecdb-psql-oid (assoc-default + (intern (substring (symbol-name payload-key) 1)) + (vecdb-collection-payload-fields collection))))) + (map-keys (vecdb-item-payload (car data-list)))))) + data-list)))) + +(defun vecdb-psql--full-row-to-item (row collection) + "Convert a full database row ROW into a vecdb-item for COLLECTION." + (make-vecdb-item + :id (nth 0 row) + :vector (nth 1 row) + :payload + (flatten-list (cl-loop for field in (vecdb-collection-payload-fields collection) + collect + (list (intern (format ":%s" (car field))) + (nth (+ 2 (cl-position field + (vecdb-collection-payload-fields collection) + :test #'equal)) + row)))))) + +(cl-defmethod vecdb-get-item ((provider vecdb-psql-provider) + (collection vecdb-collection) + id) + "Get an item from COLLECTION by ID. +PROVIDER specifies the database that the collection is in." + (let ((result + (pg-result + (pg-exec-prepared (vecdb-psql-get-connection provider) + (format "SELECT id, vector::vector%s %s FROM %s WHERE id = $1;" + (if (vecdb-collection-payload-fields collection) ", " "") + (mapconcat + (lambda (field) + (format "%s" (car field))) + (vecdb-collection-payload-fields collection) + ", ") + (vecdb-psql-table-name (vecdb-collection-name collection))) + (list (cons id "int8"))) + :tuples))) + (when result + (vecdb-psql--full-row-to-item (car result) collection)))) + +(cl-defmethod vecdb-delete-items ((provider vecdb-psql-provider) + (collection vecdb-collection) + ids &optional _) + "Delete items from COLLECTION by IDs. +PROVIDER is the database that the collection is in." + (when ids + ;; TODO: This should ideally be a prepared statement, but I dont know how to do + ;; this with psql. + (pg-exec (vecdb-psql-get-connection provider) + (format "DELETE FROM %s WHERE id IN (%s);" + (vecdb-psql-table-name (vecdb-collection-name collection)) + (mapconcat #'number-to-string ids ", "))))) + +(cl-defmethod vecdb-search-by-vector ((provider vecdb-psql-provider) + (collection vecdb-collection) + vector + &optional limit) + "Search for items in COLLECTION by VECTOR. +PROVIDER is the database that the collection is in." + (let ((limit-clause (if limit + (format "LIMIT %d" limit) + ""))) + (mapcar (lambda (row) + (vecdb-psql--full-row-to-item row collection)) + (pg-result + (pg-exec-prepared (vecdb-psql-get-connection provider) + (format "SELECT id, vector::vector%s %s FROM %s + ORDER BY vector <-> $1 %s;" + (if (vecdb-collection-payload-fields collection) ", " "") + (mapconcat + (lambda (field) + (format "%s" (car field))) + (vecdb-collection-payload-fields collection) + ", ") + (vecdb-psql-table-name (vecdb-collection-name collection)) + limit-clause) + (list (cons vector "vector"))) + :tuples)))) + +(provide 'vecdb-psql) + +;;; vecdb-psql.el ends here diff --git a/vecdb.el b/vecdb.el index 78179c804b..f56d7baf7b 100644 --- a/vecdb.el +++ b/vecdb.el @@ -4,7 +4,7 @@ ;; Author: Andrew Hyatt <ahy...@gmail.com> ;; Homepage: https://github.com/ahyatt/vecdb -;; Package-Requires: ((emacs "29.1") (plz "0.8")) +;; Package-Requires: ((emacs "29.1") (plz "0.8") (pg "0.56")) ;; Package-Version: 0.1 ;; SPDX-License-Identifier: GPL-3.0-or-later ;;