(systemds) branch main updated: [SYSTEMDS-3213] Add optional space decomposition to quantizeByCluster

mboehm7 Wed, 24 Jul 2024 10:27:31 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 69d04d4679 [SYSTEMDS-3213] Add optional space decomposition to 
quantizeByCluster
69d04d4679 is described below

commit 69d04d4679fedd4c43fe3d734ffa875980d545f8
Author: Can Abdulla <canabdu...@outlook.de>
AuthorDate: Wed Jul 24 19:25:24 2024 +0200

    [SYSTEMDS-3213] Add optional space decomposition to quantizeByCluster
    
    Closes #2053.
---
 scripts/builtin/quantizeByCluster.dml              | 69 ++++++++++++++++++----
 .../part2/BuiltinQuantizeByClusterTest.java        | 36 +++++++----
 .../functions/builtin/quantizeByCluster.dml        | 24 ++++++--
 3 files changed, 101 insertions(+), 28 deletions(-)

diff --git a/scripts/builtin/quantizeByCluster.dml 
b/scripts/builtin/quantizeByCluster.dml
index 82bd9200be..824ac35053 100644
--- a/scripts/builtin/quantizeByCluster.dml
+++ b/scripts/builtin/quantizeByCluster.dml
@@ -17,9 +17,14 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-#-------------------------------------------------------------
+#----------------------------------------------------------------------------------------
 
-# Builtin function that implements product quantization
+# The quantizeByCluster-function implements product quantization. Initially, it
+# divides the original vector space into M subspaces. The resulting lower 
dimensional
+# subvectors are then quantized. If the column count is not divisible by the 
number of
+# subspaces M, the data is padded with zeros. Optimal space decomposition can 
be
+# computed, when the data follows a Gaussian distribution. The function uses 
kmeans for
+# quantizing and svd to compute the space decomposition.
 #
 # INPUT:
 # 
---------------------------------------------------------------------------------------
@@ -33,6 +38,10 @@
 # separate                      Cluster subspaces separately. If value is set 
to true,
 #                               kmeans is run M times, once for each subspace. 
Otherwise
 #                               kmeans is run only once.
+# space_decomp                  Decompose the vector space by multiplying the 
input
+#                               matrix X with an orthogonal matrix R. Assumes 
the data
+#                               follows a parametric Gaussian distribution.
+#                               Time complexity in O(nrow(X)^2 * min(nrow(X), 
ncol(X))).
 # seed                          The seed used for initial sampling. If set to 
-1 random
 #                               seeds are selected.
 # 
---------------------------------------------------------------------------------------
@@ -45,13 +54,45 @@
 # codes     The mapping of vectors to centroids. Each vector of the input 
matrix X is mapped
 #           onto a vector of codes. The entries in the codes matrix are the 
indices of
 #           the vectors in the codebook. The codes matrix has the dimensions 
[nrow(X) x M].
+# R         The orthogonal matrix R which is applied to the input matrix X 
before performing
+#           the product quantization. Only relevant when space_decomp = TRUE.
 # 
------------------------------------------------------------------------------------------
 
 m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, 
Integer runs = 10,
-    Integer max_iter = 1000, Double eps = 1e-6, Integer 
avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Integer seed = -1)
-  return(Matrix[Double] codebook, Matrix[Double] codes)
+    Integer max_iter = 1000, Double eps = 1e-6, Integer 
avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean 
space_decomp=FALSE, Integer seed = -1)
+  return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)
 {
+  #Pad the data with zeros if the number of columns of the input matrix X is 
not divisible by M
+  if(ncol(X) %% M != 0) {
+    zeros = matrix(0, rows=nrow(X), cols=((ncol(X) %/% M) +1) * M - ncol(X))
+    X = cbind(X, zeros)
+  }
   subvector_size = ncol(X) / M
+  #Transform the vector space by an orthogonal matrix R.
+  #R is computed by reordering the principal directions of the input matrix X 
such that the variance of each subspace is balanced.
+  if(space_decomp) {
+    #Perform PCA using SVD
+    X2 = X - colMeans(X)
+    [U, S, V] = svd(X2)
+    eigen_v = diag(S)^2 / (nrow(X)-1)
+    #Balance the variance of the subspaces following a greedy approach.
+    R = matrix(0, rows=nrow(V), cols=ncol(V))
+    subspace_variance = matrix(1, rows=M, cols=1)
+    subspace_idx = seq(0,M-1) * subvector_size + 1
+    for(i in 1:nrow(R)) {
+      #free_buckets[j] == 0 iff subspace j is full
+      free_buckets = subspace_idx - seq(1,M) * subvector_size < 1
+      b = as.scalar(rowIndexMin(t(subspace_variance) %*% (diag(1 / 
free_buckets))))
+      subspace_variance[b] = subspace_variance[b] * eigen_v[i]
+      R[,as.scalar(subspace_idx[b])] = V[,i]
+      subspace_idx[b] = subspace_idx[b] + 1
+    }
+    #Apply space decomposition
+    X = X %*% t(R)
+  }
+  else {
+    R = matrix(0, rows=1, cols=1)
+  }
   #Kmeans is run just once for all subspaces together. Subvectors are mapped 
to vectors of the codebook of size k*M.
   #The ith entry of a code vector has a value in [1, k*M].
   if(!separate) {
@@ -62,14 +103,20 @@ m_quantizeByCluster = function(Matrix[Double]X, Integer M 
= 4, Integer k = 10, I
   #Kmeans is run for every subspace separately. Subvectors are mapped to a 
subset of k vectors of the codebook.
   #The ith entry of a code vector has a value in ((i-1)*k, i*k].
   else {
-    l = k
-    codebook = matrix(1, rows=l*M, cols=subvector_size)
+    codebook = matrix(1, rows=k*M, cols=subvector_size)
     codes = matrix(1, rows=nrow(X), cols=M)
-    parfor(i in 1:M, check=0) {
-       [tmp_cbook, tmp_c] = 
kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], l, runs, max_iter, eps, 
FALSE, avg_sample_size_per_centroid, seed)
-       codebook[(i-1)*l+1:i*l,] = tmp_cbook
-       offset = matrix((i-1)*l, rows=nrow(codes), cols=1)
-       codes[,i] = tmp_c + offset
+    for(i in 1:M, check=0) {
+      [tmp_cbook, tmp_c] = kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], 
k, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed)
+      #If no output is produced, use a single centroid
+      if(as.scalar(tmp_c[1,1]) < 1) {
+        tmp_cbook = matrix(0, rows=k, cols=subvector_size)
+        tmp_cbook[1,] =  colSums(X[,(i-1)*subvector_size+1:i*subvector_size]) 
/ nrow(X)
+        tmp_c = matrix(1, rows=nrow(X), cols=1)
+      }
+      codebook[(i-1)*k+1:i*k,] = tmp_cbook
+      offset = matrix((i-1)*k, rows=nrow(codes), cols=1)
+      codes[,i] = tmp_c + offset
     }
   }
 }
+
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
index 64908ee073..e398a0ad06 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
@@ -53,6 +53,7 @@ public class BuiltinQuantizeByClusterTest extends 
AutomatedTestBase {
        public int vectors_per_cluster;
        @Parameter(7)
        public boolean quantize_separately;
+       public boolean space_decomp;
 
        private final static String TEST_NAME = "quantizeByCluster";
        private final static String TEST_DIR = "functions/builtin/";
@@ -60,7 +61,6 @@ public class BuiltinQuantizeByClusterTest extends 
AutomatedTestBase {
        private final static double eps = 1e-10;
        private final static int runs = 3;
        private final static int max_iter = 1000;
-//     private final static double cluster_offset = 0.1;
 
        @Parameterized.Parameters(name = "{0}: rows={1}, cols={2}, c={3}, 
subspaces={4}, k={5}, v_per_c={6}, sep={7}")
        public static Collection<Object[]> data() {
@@ -73,6 +73,8 @@ public class BuiltinQuantizeByClusterTest extends 
AutomatedTestBase {
                        {"uniform", 1024, 64, 12, 8, 12, 40, false}, 
{"uniform", 1024, 64, 12, 4, 12, 40, false}, {"uniform", 1024, 64, 12, 2, 12, 
40, false},
                        {"normal",  1024, 64, 12, 8, 12, 40, true},  {"normal", 
 1024, 64, 12, 4, 12, 40, true},  {"normal",  1024, 64, 12, 2, 12, 40, true},
                        {"normal",  1024, 64, 12, 8, 12, 40, false}, {"normal", 
 1024, 64, 12, 4, 12, 40, false}, {"normal",  1024, 64, 12, 2, 12, 40, false},
+                       {"normal",  1024, 53, 12, 8, 12, 40, true},  {"normal", 
 1024, 61, 12, 4, 12, 40, true},  {"normal",  1024, 83, 12, 2, 12, 40, true},
+                       {"normal",  1024, 53, 12, 8, 12, 40, false},  
{"normal",  1024, 61, 12, 4, 12, 40, false},  {"normal",  1024, 83, 12, 2, 12, 
40, false},
                });
        }
 
@@ -83,7 +85,10 @@ public class BuiltinQuantizeByClusterTest extends 
AutomatedTestBase {
 
        @Test
        public void basicTest() {
-               runQuantizeByClusterTest();
+               for (boolean b : new boolean[]{true, false}) {
+                       space_decomp = b;
+                       runQuantizeByClusterTest();
+               }
        }
 
        /*The tests use kmeans clustering as a baseline and check whether the 
distortion is within
@@ -93,30 +98,35 @@ public class BuiltinQuantizeByClusterTest extends 
AutomatedTestBase {
                String HOME = SCRIPT_DIR + TEST_DIR;
                fullDMLScriptName = HOME + TEST_NAME + ".dml";
                programArgs = new String[]{"-nvargs", "codes=" + 
output("codes"), "codebook=" + output("codebook"),
-                               "pq_distortion=" + output("pq_distortion"), 
"k_distortion=" + output("k_distortion"),
-                               "clusters=" + clusters, "test_case=" + 
test_case, "rows=" + rows,
-                               "cols=" + cols, "subspaces=" + subspaces, "k=" 
+ k, "runs=" + runs, "max_iter=" + max_iter,
-                               "eps=" + eps, "vectors_per_cluster=" + 
vectors_per_cluster, "sep=" + quantize_separately};
+                       "pq_distortion=" + output("pq_distortion"), 
"k_distortion=" + output("k_distortion"), "is_orthogonal=" + 
output("is_orthogonal"),
+                       "clusters=" + clusters, "test_case=" + test_case, 
"rows=" + rows,
+                       "cols=" + cols, "subspaces=" + subspaces, "k=" + k, 
"runs=" + runs, "max_iter=" + max_iter,
+                       "eps=" + eps, "vectors_per_cluster=" + 
vectors_per_cluster, "sep=" + quantize_separately, "space_decomp=" + 
space_decomp};
 
                runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
 
-               // check if output dimensions are correct
+               //check if R is an orthogonal matrix
+               if(space_decomp) {
+                       double is_orthogonal = 
readDMLScalarFromOutputDir("is_orthogonal").get(new MatrixValue.CellIndex(1, 
1));
+                       Assert.assertEquals(0, is_orthogonal, 0.001);
+               }
+
+               //check if output dimensions are correct
                MatrixCharacteristics meta_codes = readDMLMetaDataFile("codes");
                MatrixCharacteristics meta_codebook = 
readDMLMetaDataFile("codebook");
                Assert.assertTrue("Matrix dimensions should be equal to 
expected dimensions",
                        meta_codes.getRows() == (long) clusters * 
vectors_per_cluster 
                        && meta_codes.getCols() == subspaces);
                Assert.assertEquals("Centroid dimensions should be equal to 
expected dimensions",
-                       cols / subspaces, meta_codebook.getCols());
-
-               double pq_distortion = 
readDMLMatrixFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1, 
1));
-               double k_distortion = 
readDMLMatrixFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1));
+                               (int) Math.ceil((double) cols / subspaces), 
meta_codebook.getCols());
 
                //check if distortion is within a threshold
+               double pq_distortion = 
readDMLScalarFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1,1));
+               double k_distortion = 
readDMLScalarFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1));
                if (!test_case.equals("cluster")) {
-                       Assert.assertTrue(pq_distortion < 1.2 * k_distortion + 
0.1);
+                       Assert.assertTrue(pq_distortion < 1.2 * k_distortion + 
0.5);
                } else {
-                       Assert.assertTrue(pq_distortion < 2 * k_distortion + 
0.1);
+                       Assert.assertTrue(pq_distortion < 2 * k_distortion + 2);
                }
        }
 }
diff --git a/src/test/scripts/functions/builtin/quantizeByCluster.dml 
b/src/test/scripts/functions/builtin/quantizeByCluster.dml
index 62fa475de2..4e235acccf 100644
--- a/src/test/scripts/functions/builtin/quantizeByCluster.dml
+++ b/src/test/scripts/functions/builtin/quantizeByCluster.dml
@@ -47,6 +47,8 @@ offset = max / 10
 
 subvector_size = $cols / $subspaces
 rows = $clusters * $vectors_per_cluster
+space_decomp = as.logical($space_decomp)
+sep = as.logical($sep)
 
 # Generate points by concatenating sub_points around sub_clusters
 if($test_case == "sub_cluster") {
@@ -75,16 +77,29 @@ else {
   vectors = rand(rows = rows, cols = $cols, min=min, max=max, pdf=$test_case, 
seed=2)
 }
 
-[codebook, codes] = quantizeByCluster(vectors, $subspaces, $k, $runs, 
$max_iter, $eps, $vectors_per_cluster, $sep, 2)
+#Perform quantization
+[codebook, codes, R] = quantizeByCluster(vectors, $subspaces, $k, $runs, 
$max_iter, $eps, $vectors_per_cluster, sep, space_decomp, 2)
 [k_codebook, k_codes] = kmeans(vectors, $k * $subspaces, $runs, $max_iter, 
$eps, FALSE, $vectors_per_cluster, 2)
 
 #construct vectors from codes
-pq_result = construct_vectors(codes, codebook)
 k_result = construct_vectors(k_codes, k_codebook)
+pq_result = construct_vectors(codes, codebook)
+if(space_decomp) {
+  pq_result = pq_result %*% R
+  #check if R is an orthogonal matrix
+  is_orthogonal = sum((t(R) %*% R - diag(matrix(1, nrow(R), 1)))^2)
+}
+else {
+  is_orthogonal = 0
+}
+
+if($cols %% $subspaces != 0) {
+  pq_result = pq_result[,1:$cols]
+}
 
 #calculate distortion
-pq_distortion = colSums(rowSums((vectors - pq_result)^2)) / rows
-k_distortion = colSums(rowSums((vectors - k_result)^2)) / rows
+pq_distortion = as.scalar(colSums(rowSums((vectors - pq_result)^2)) / rows)
+k_distortion = as.scalar(colSums(rowSums((vectors - k_result)^2)) / rows)
 
 print("Product quantization distortion: " + toString(pq_distortion))
 print("Kmeans distortion: " + toString(k_distortion))
@@ -93,4 +108,5 @@ write(codes, $codes)
 write(codebook, $codebook)
 write(pq_distortion, $pq_distortion)
 write(k_distortion, $k_distortion)
+write(is_orthogonal, $is_orthogonal)

(systemds) branch main updated: [SYSTEMDS-3213] Add optional space decomposition to quantizeByCluster

Reply via email to