This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 9ff9e1635f [SYSTEMDS-3213] New builtin for cluster-based quantization
9ff9e1635f is described below

commit 9ff9e1635fcc76a1e0fc21cfa4f6aefba1e64326
Author: Can Abdulla <canabdu...@outlook.de>
AuthorDate: Tue Jun 4 10:10:43 2024 +0200

    [SYSTEMDS-3213] New builtin for cluster-based quantization
    
    LDE SoSe'24 project
    Closes #2030.
---
 scripts/builtin/quantizeByCluster.dml              |  75 +++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |   1 +
 .../part2/BuiltinQuantizeByClusterTest.java        | 120 +++++++++++++++++++++
 .../functions/builtin/quantizeByCluster.dml        |  96 +++++++++++++++++
 4 files changed, 292 insertions(+)

diff --git a/scripts/builtin/quantizeByCluster.dml 
b/scripts/builtin/quantizeByCluster.dml
new file mode 100644
index 0000000000..82bd9200be
--- /dev/null
+++ b/scripts/builtin/quantizeByCluster.dml
@@ -0,0 +1,75 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Builtin function that implements product quantization
+#
+# INPUT:
+# 
---------------------------------------------------------------------------------------
+# X                             The input matrix to perform product 
quantization on
+# M                             Number of subspaces
+# k                             Number of vectors in the subcodebooks
+# runs                          Number of runs (with different initial 
centroids)
+# max_iter                      Maximum number of iterations per run
+# eps                           Tolerance (epsilon) for WCSS change ratio
+# avg_sample_size_per_centroid  Average number of records per centroid in data 
samples
+# separate                      Cluster subspaces separately. If value is set 
to true,
+#                               kmeans is run M times, once for each subspace. 
Otherwise
+#                               kmeans is run only once.
+# seed                          The seed used for initial sampling. If set to 
-1 random
+#                               seeds are selected.
+# 
---------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# 
------------------------------------------------------------------------------------------
+# codebook  The matrix containing the centroids. If clustered separately, the 
ith
+#           subcodebook is the ith chunk of size k. The codebook matrix has 
the dimensions
+#           [k*M x ncol(X)/M].
+# codes     The mapping of vectors to centroids. Each vector of the input 
matrix X is mapped
+#           onto a vector of codes. The entries in the codes matrix are the 
indices of
+#           the vectors in the codebook. The codes matrix has the dimensions 
[nrow(X) x M].
+# 
------------------------------------------------------------------------------------------
+
+m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, 
Integer runs = 10,
+    Integer max_iter = 1000, Double eps = 1e-6, Integer 
avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Integer seed = -1)
+  return(Matrix[Double] codebook, Matrix[Double] codes)
+{
+  subvector_size = ncol(X) / M
+  #Kmeans is run just once for all subspaces together. Subvectors are mapped 
to vectors of the codebook of size k*M.
+  #The ith entry of a code vector has a value in [1, k*M].
+  if(!separate) {
+    A = matrix(X, rows= nrow(X) * M, cols=subvector_size)
+    [codebook, B] = kmeans(A, k * M, runs, max_iter, eps, FALSE, 
avg_sample_size_per_centroid, seed)
+    codes = matrix(B, rows = nrow(B) / M, cols = ncol(B) * M)
+  }
+  #Kmeans is run for every subspace separately. Subvectors are mapped to a 
subset of k vectors of the codebook.
+  #The ith entry of a code vector has a value in ((i-1)*k, i*k].
+  else {
+    l = k
+    codebook = matrix(1, rows=l*M, cols=subvector_size)
+    codes = matrix(1, rows=nrow(X), cols=M)
+    parfor(i in 1:M, check=0) {
+       [tmp_cbook, tmp_c] = 
kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], l, runs, max_iter, eps, 
FALSE, avg_sample_size_per_centroid, seed)
+       codebook[(i-1)*l+1:i*l,] = tmp_cbook
+       offset = matrix((i-1)*l, rows=nrow(codes), cols=1)
+       codes[,i] = tmp_c + offset
+    }
+  }
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index 59d41fb227..b9cda07791 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -271,6 +271,7 @@ public enum Builtins {
        PSNR("psnr", true),
        QR("qr", false, ReturnType.MULTI_RETURN),
        QUANTILE("quantile", false),
+       QUANTIZEBYCLUSTER("quantizeByCluster", true),
        RANDOM_FOREST("randomForest", true),
        RANDOM_FOREST_PREDICT("randomForestPredict", true),
        RANGE("range", false),
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
new file mode 100644
index 0000000000..3f33c1bb24
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+
+@RunWith(Parameterized.class)
+public class BuiltinQuantizeByClusterTest extends AutomatedTestBase {
+
+       @Parameter
+       public String test_case;
+       @Parameter(1)
+       public int rows;
+       @Parameter(2)
+       public int cols;
+       @Parameter(3)
+       public int clusters;
+       @Parameter(4)
+       public int subspaces;
+       @Parameter(5)
+       public int k;
+       @Parameter(6)
+       public int vectors_per_cluster;
+       @Parameter(7)
+       public boolean quantize_separately;
+
+       private final static String TEST_NAME = "quantizeByCluster";
+       private final static String TEST_DIR = "functions/builtin/";
+       private final static String TEST_CLASS_DIR = TEST_DIR + 
BuiltinQuantizeByClusterTest.class.getSimpleName() + "/";
+       private final static double eps = 1e-10;
+       private final static int runs = 3;
+       private final static int max_iter = 1000;
+//     private final static double cluster_offset = 0.1;
+
+       @Parameterized.Parameters(name = "{0}: rows={1}, cols={2}, c={3}, 
subspaces={4}, k={5}, v_per_c={6}, sep={7}")
+       public static Collection<Object[]> data() {
+               return Arrays.asList(new Object[][]{
+                       {"sub_cluster", 1024, 64, 12, 8, 12, 40, true},  
{"sub_cluster", 1024, 64, 12, 4, 12, 40, true},  {"sub_cluster", 1024, 64, 12, 
2, 12, 40, true},
+                       {"sub_cluster", 1024, 64, 12, 8, 12, 40, false}, 
{"sub_cluster", 1024, 64, 12, 4, 12, 40, false}, {"sub_cluster", 1024, 64, 12, 
2, 12, 40, false},
+                       {"cluster", 1024, 64, 12, 8, 12, 40, true},  
{"cluster", 1024, 64, 12, 4, 12, 40, true},  {"cluster", 1024, 64, 12, 2, 12, 
40, true},
+                       {"cluster", 1024, 64, 20, 8, 12, 40, false}, 
{"cluster", 1024, 64, 12, 4, 12, 40, false}, {"cluster", 1024, 64, 12, 2, 12, 
40, false},
+                       {"uniform", 1024, 64, 12, 8, 12, 40, true},  
{"uniform", 1024, 64, 12, 4, 12, 40, true},  {"uniform", 1024, 64, 12, 2, 12, 
40, true},
+                       {"uniform", 1024, 64, 12, 8, 12, 40, false}, 
{"uniform", 1024, 64, 12, 4, 12, 40, false}, {"uniform", 1024, 64, 12, 2, 12, 
40, false},
+                       {"normal",  1024, 64, 12, 8, 12, 40, true},  {"normal", 
 1024, 64, 12, 4, 12, 40, true},  {"normal",  1024, 64, 12, 2, 12, 40, true},
+                       {"normal",  1024, 64, 12, 8, 12, 40, false}, {"normal", 
 1024, 64, 12, 4, 12, 40, false}, {"normal",  1024, 64, 12, 2, 12, 40, false},
+               });
+       }
+
+       @Override
+       public void setUp() {
+               addTestConfiguration(TEST_NAME, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME));
+       }
+
+       @Test
+       public void basicTest() {
+               runQuantizeByClusterTest();
+       }
+
+       /*The tests use kmeans clustering as a baseline and check whether the 
distortion is within
+       a certain threshold.*/
+       private void runQuantizeByClusterTest() {
+               loadTestConfiguration(getTestConfiguration(TEST_NAME));
+               String HOME = SCRIPT_DIR + TEST_DIR;
+               fullDMLScriptName = HOME + TEST_NAME + ".dml";
+               programArgs = new String[]{"-nvargs", "codes=" + 
output("codes"), "codebook=" + output("codebook"),
+                               "pq_distortion=" + output("pq_distortion"), 
"k_distortion=" + output("k_distortion"),
+                               "clusters=" + clusters, "test_case=" + 
test_case, "rows=" + rows,
+                               "cols=" + cols, "subspaces=" + subspaces, "k=" 
+ k, "runs=" + runs, "max_iter=" + max_iter,
+                               "eps=" + eps, "vectors_per_cluster=" + 
vectors_per_cluster, "sep=" + quantize_separately};
+
+               runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+
+               // check if output dimensions are correct
+               MatrixCharacteristics meta_codes = readDMLMetaDataFile("codes");
+               MatrixCharacteristics meta_codebook = 
readDMLMetaDataFile("codebook");
+               Assert.assertTrue("Matrix dimensions should be equal to 
expected dimensions",
+                       meta_codes.getRows() == (long) clusters * 
vectors_per_cluster 
+                       && meta_codes.getCols() == subspaces);
+               Assert.assertEquals("Centroid dimensions should be equal to 
expected dimensions",
+                       cols / subspaces, meta_codebook.getCols());
+
+               double pq_distortion = 
readDMLMatrixFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1, 
1));
+               double k_distortion = 
readDMLMatrixFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1));
+
+               //check if distortion is within a threshold
+               if (!test_case.equals("cluster")) {
+                       Assert.assertTrue(pq_distortion < 1.2 * k_distortion + 
0.1);
+               } else {
+                       Assert.assertTrue(pq_distortion < 2 * k_distortion + 
0.1);
+               }
+       }
+}
diff --git a/src/test/scripts/functions/builtin/quantizeByCluster.dml 
b/src/test/scripts/functions/builtin/quantizeByCluster.dml
new file mode 100644
index 0000000000..62fa475de2
--- /dev/null
+++ b/src/test/scripts/functions/builtin/quantizeByCluster.dml
@@ -0,0 +1,96 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#duplicate rows of matix n times
+duplicate_rows = function (Matrix [Double] V, Integer N)
+  return(Matrix [Double] V)
+{
+  tmp = V
+  for(i in seq(1, N-1, 1)) {
+    tmp = rbind(tmp, V)
+  }
+  V = tmp
+}
+#construct vectors from codes
+construct_vectors = function (Matrix [Double] codes, Matrix [Double] codebook)
+  return(Matrix [Double] vectors)
+{
+  vectors = matrix(0, rows=nrow(codes), cols=ncol(codes)*ncol(codebook))
+  parfor (i in 1:nrow(codes), check=0) {
+    parfor (j in 1:ncol(codes), check=0) {
+      vectors[i, 1 + (j-1)* ncol(codebook): j * ncol(codebook)] = 
codebook[as.scalar(codes[i, j])]
+    }
+  }
+}
+
+max = 1
+min = -max
+offset = max / 10
+
+subvector_size = $cols / $subspaces
+rows = $clusters * $vectors_per_cluster
+
+# Generate points by concatenating sub_points around sub_clusters
+if($test_case == "sub_cluster") {
+  offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset, 
pdf="uniform", seed=2)
+  cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min, 
max=max, seed=2)
+  vectors = matrix(cluster_centers, nrow(cluster_centers), 
ncol(cluster_centers))
+  for(i in 1:$subspaces-1)  {
+    cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min, 
max=max, seed=2)
+    vectors = cbind(vectors, cluster_centers)
+  }
+  #ensure correct number of vectors
+  vectors = duplicate_rows(vectors, $vectors_per_cluster)
+  vectors = vectors + offset_matrix
+}
+# Generate points around clusters
+else if ($test_case == "cluster") {
+  cluster_centers = rand(rows = $clusters, cols = $cols, min=min, max=max, 
pdf="uniform", seed=2)
+  vectors = matrix(cluster_centers, nrow(cluster_centers), 
ncol(cluster_centers))
+  #ensure correct number of vectors
+  vectors = duplicate_rows(vectors, $vectors_per_cluster)
+  offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset, 
pdf="uniform", seed=2)
+  vectors = vectors + offset_matrix
+}
+# Generate random points
+else {
+  vectors = rand(rows = rows, cols = $cols, min=min, max=max, pdf=$test_case, 
seed=2)
+}
+
+[codebook, codes] = quantizeByCluster(vectors, $subspaces, $k, $runs, 
$max_iter, $eps, $vectors_per_cluster, $sep, 2)
+[k_codebook, k_codes] = kmeans(vectors, $k * $subspaces, $runs, $max_iter, 
$eps, FALSE, $vectors_per_cluster, 2)
+
+#construct vectors from codes
+pq_result = construct_vectors(codes, codebook)
+k_result = construct_vectors(k_codes, k_codebook)
+
+#calculate distortion
+pq_distortion = colSums(rowSums((vectors - pq_result)^2)) / rows
+k_distortion = colSums(rowSums((vectors - k_result)^2)) / rows
+
+print("Product quantization distortion: " + toString(pq_distortion))
+print("Kmeans distortion: " + toString(k_distortion))
+
+write(codes, $codes)
+write(codebook, $codebook)
+write(pq_distortion, $pq_distortion)
+write(k_distortion, $k_distortion)
+

Reply via email to