This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 9ff9e1635f [SYSTEMDS-3213] New builtin for cluster-based quantization
9ff9e1635f is described below
commit 9ff9e1635fcc76a1e0fc21cfa4f6aefba1e64326
Author: Can Abdulla <[email protected]>
AuthorDate: Tue Jun 4 10:10:43 2024 +0200
[SYSTEMDS-3213] New builtin for cluster-based quantization
LDE SoSe'24 project
Closes #2030.
---
scripts/builtin/quantizeByCluster.dml | 75 +++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../part2/BuiltinQuantizeByClusterTest.java | 120 +++++++++++++++++++++
.../functions/builtin/quantizeByCluster.dml | 96 +++++++++++++++++
4 files changed, 292 insertions(+)
diff --git a/scripts/builtin/quantizeByCluster.dml
b/scripts/builtin/quantizeByCluster.dml
new file mode 100644
index 0000000000..82bd9200be
--- /dev/null
+++ b/scripts/builtin/quantizeByCluster.dml
@@ -0,0 +1,75 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Builtin function that implements product quantization
+#
+# INPUT:
+#
---------------------------------------------------------------------------------------
+# X The input matrix to perform product
quantization on
+# M Number of subspaces
+# k Number of vectors in the subcodebooks
+# runs Number of runs (with different initial
centroids)
+# max_iter Maximum number of iterations per run
+# eps Tolerance (epsilon) for WCSS change ratio
+# avg_sample_size_per_centroid Average number of records per centroid in data
samples
+# separate Cluster subspaces separately. If value is set
to true,
+# kmeans is run M times, once for each subspace.
Otherwise
+# kmeans is run only once.
+# seed The seed used for initial sampling. If set to
-1 random
+# seeds are selected.
+#
---------------------------------------------------------------------------------------
+#
+# OUTPUT:
+#
------------------------------------------------------------------------------------------
+# codebook The matrix containing the centroids. If clustered separately, the
ith
+# subcodebook is the ith chunk of size k. The codebook matrix has
the dimensions
+# [k*M x ncol(X)/M].
+# codes The mapping of vectors to centroids. Each vector of the input
matrix X is mapped
+# onto a vector of codes. The entries in the codes matrix are the
indices of
+# the vectors in the codebook. The codes matrix has the dimensions
[nrow(X) x M].
+#
------------------------------------------------------------------------------------------
+
+m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10,
Integer runs = 10,
+ Integer max_iter = 1000, Double eps = 1e-6, Integer
avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Integer seed = -1)
+ return(Matrix[Double] codebook, Matrix[Double] codes)
+{
+ subvector_size = ncol(X) / M
+ #Kmeans is run just once for all subspaces together. Subvectors are mapped
to vectors of the codebook of size k*M.
+ #The ith entry of a code vector has a value in [1, k*M].
+ if(!separate) {
+ A = matrix(X, rows= nrow(X) * M, cols=subvector_size)
+ [codebook, B] = kmeans(A, k * M, runs, max_iter, eps, FALSE,
avg_sample_size_per_centroid, seed)
+ codes = matrix(B, rows = nrow(B) / M, cols = ncol(B) * M)
+ }
+ #Kmeans is run for every subspace separately. Subvectors are mapped to a
subset of k vectors of the codebook.
+ #The ith entry of a code vector has a value in ((i-1)*k, i*k].
+ else {
+ l = k
+ codebook = matrix(1, rows=l*M, cols=subvector_size)
+ codes = matrix(1, rows=nrow(X), cols=M)
+ parfor(i in 1:M, check=0) {
+ [tmp_cbook, tmp_c] =
kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], l, runs, max_iter, eps,
FALSE, avg_sample_size_per_centroid, seed)
+ codebook[(i-1)*l+1:i*l,] = tmp_cbook
+ offset = matrix((i-1)*l, rows=nrow(codes), cols=1)
+ codes[,i] = tmp_c + offset
+ }
+ }
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index 59d41fb227..b9cda07791 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -271,6 +271,7 @@ public enum Builtins {
PSNR("psnr", true),
QR("qr", false, ReturnType.MULTI_RETURN),
QUANTILE("quantile", false),
+ QUANTIZEBYCLUSTER("quantizeByCluster", true),
RANDOM_FOREST("randomForest", true),
RANDOM_FOREST_PREDICT("randomForestPredict", true),
RANGE("range", false),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
new file mode 100644
index 0000000000..3f33c1bb24
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+
+@RunWith(Parameterized.class)
+public class BuiltinQuantizeByClusterTest extends AutomatedTestBase {
+
+ @Parameter
+ public String test_case;
+ @Parameter(1)
+ public int rows;
+ @Parameter(2)
+ public int cols;
+ @Parameter(3)
+ public int clusters;
+ @Parameter(4)
+ public int subspaces;
+ @Parameter(5)
+ public int k;
+ @Parameter(6)
+ public int vectors_per_cluster;
+ @Parameter(7)
+ public boolean quantize_separately;
+
+ private final static String TEST_NAME = "quantizeByCluster";
+ private final static String TEST_DIR = "functions/builtin/";
+ private final static String TEST_CLASS_DIR = TEST_DIR +
BuiltinQuantizeByClusterTest.class.getSimpleName() + "/";
+ private final static double eps = 1e-10;
+ private final static int runs = 3;
+ private final static int max_iter = 1000;
+// private final static double cluster_offset = 0.1;
+
+ @Parameterized.Parameters(name = "{0}: rows={1}, cols={2}, c={3},
subspaces={4}, k={5}, v_per_c={6}, sep={7}")
+ public static Collection<Object[]> data() {
+ return Arrays.asList(new Object[][]{
+ {"sub_cluster", 1024, 64, 12, 8, 12, 40, true},
{"sub_cluster", 1024, 64, 12, 4, 12, 40, true}, {"sub_cluster", 1024, 64, 12,
2, 12, 40, true},
+ {"sub_cluster", 1024, 64, 12, 8, 12, 40, false},
{"sub_cluster", 1024, 64, 12, 4, 12, 40, false}, {"sub_cluster", 1024, 64, 12,
2, 12, 40, false},
+ {"cluster", 1024, 64, 12, 8, 12, 40, true},
{"cluster", 1024, 64, 12, 4, 12, 40, true}, {"cluster", 1024, 64, 12, 2, 12,
40, true},
+ {"cluster", 1024, 64, 20, 8, 12, 40, false},
{"cluster", 1024, 64, 12, 4, 12, 40, false}, {"cluster", 1024, 64, 12, 2, 12,
40, false},
+ {"uniform", 1024, 64, 12, 8, 12, 40, true},
{"uniform", 1024, 64, 12, 4, 12, 40, true}, {"uniform", 1024, 64, 12, 2, 12,
40, true},
+ {"uniform", 1024, 64, 12, 8, 12, 40, false},
{"uniform", 1024, 64, 12, 4, 12, 40, false}, {"uniform", 1024, 64, 12, 2, 12,
40, false},
+ {"normal", 1024, 64, 12, 8, 12, 40, true}, {"normal",
1024, 64, 12, 4, 12, 40, true}, {"normal", 1024, 64, 12, 2, 12, 40, true},
+ {"normal", 1024, 64, 12, 8, 12, 40, false}, {"normal",
1024, 64, 12, 4, 12, 40, false}, {"normal", 1024, 64, 12, 2, 12, 40, false},
+ });
+ }
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME, new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME));
+ }
+
+ @Test
+ public void basicTest() {
+ runQuantizeByClusterTest();
+ }
+
+ /*The tests use kmeans clustering as a baseline and check whether the
distortion is within
+ a certain threshold.*/
+ private void runQuantizeByClusterTest() {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[]{"-nvargs", "codes=" +
output("codes"), "codebook=" + output("codebook"),
+ "pq_distortion=" + output("pq_distortion"),
"k_distortion=" + output("k_distortion"),
+ "clusters=" + clusters, "test_case=" +
test_case, "rows=" + rows,
+ "cols=" + cols, "subspaces=" + subspaces, "k="
+ k, "runs=" + runs, "max_iter=" + max_iter,
+ "eps=" + eps, "vectors_per_cluster=" +
vectors_per_cluster, "sep=" + quantize_separately};
+
+ runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+
+ // check if output dimensions are correct
+ MatrixCharacteristics meta_codes = readDMLMetaDataFile("codes");
+ MatrixCharacteristics meta_codebook =
readDMLMetaDataFile("codebook");
+ Assert.assertTrue("Matrix dimensions should be equal to
expected dimensions",
+ meta_codes.getRows() == (long) clusters *
vectors_per_cluster
+ && meta_codes.getCols() == subspaces);
+ Assert.assertEquals("Centroid dimensions should be equal to
expected dimensions",
+ cols / subspaces, meta_codebook.getCols());
+
+ double pq_distortion =
readDMLMatrixFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1,
1));
+ double k_distortion =
readDMLMatrixFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1));
+
+ //check if distortion is within a threshold
+ if (!test_case.equals("cluster")) {
+ Assert.assertTrue(pq_distortion < 1.2 * k_distortion +
0.1);
+ } else {
+ Assert.assertTrue(pq_distortion < 2 * k_distortion +
0.1);
+ }
+ }
+}
diff --git a/src/test/scripts/functions/builtin/quantizeByCluster.dml
b/src/test/scripts/functions/builtin/quantizeByCluster.dml
new file mode 100644
index 0000000000..62fa475de2
--- /dev/null
+++ b/src/test/scripts/functions/builtin/quantizeByCluster.dml
@@ -0,0 +1,96 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#duplicate rows of matix n times
+duplicate_rows = function (Matrix [Double] V, Integer N)
+ return(Matrix [Double] V)
+{
+ tmp = V
+ for(i in seq(1, N-1, 1)) {
+ tmp = rbind(tmp, V)
+ }
+ V = tmp
+}
+#construct vectors from codes
+construct_vectors = function (Matrix [Double] codes, Matrix [Double] codebook)
+ return(Matrix [Double] vectors)
+{
+ vectors = matrix(0, rows=nrow(codes), cols=ncol(codes)*ncol(codebook))
+ parfor (i in 1:nrow(codes), check=0) {
+ parfor (j in 1:ncol(codes), check=0) {
+ vectors[i, 1 + (j-1)* ncol(codebook): j * ncol(codebook)] =
codebook[as.scalar(codes[i, j])]
+ }
+ }
+}
+
+max = 1
+min = -max
+offset = max / 10
+
+subvector_size = $cols / $subspaces
+rows = $clusters * $vectors_per_cluster
+
+# Generate points by concatenating sub_points around sub_clusters
+if($test_case == "sub_cluster") {
+ offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset,
pdf="uniform", seed=2)
+ cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min,
max=max, seed=2)
+ vectors = matrix(cluster_centers, nrow(cluster_centers),
ncol(cluster_centers))
+ for(i in 1:$subspaces-1) {
+ cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min,
max=max, seed=2)
+ vectors = cbind(vectors, cluster_centers)
+ }
+ #ensure correct number of vectors
+ vectors = duplicate_rows(vectors, $vectors_per_cluster)
+ vectors = vectors + offset_matrix
+}
+# Generate points around clusters
+else if ($test_case == "cluster") {
+ cluster_centers = rand(rows = $clusters, cols = $cols, min=min, max=max,
pdf="uniform", seed=2)
+ vectors = matrix(cluster_centers, nrow(cluster_centers),
ncol(cluster_centers))
+ #ensure correct number of vectors
+ vectors = duplicate_rows(vectors, $vectors_per_cluster)
+ offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset,
pdf="uniform", seed=2)
+ vectors = vectors + offset_matrix
+}
+# Generate random points
+else {
+ vectors = rand(rows = rows, cols = $cols, min=min, max=max, pdf=$test_case,
seed=2)
+}
+
+[codebook, codes] = quantizeByCluster(vectors, $subspaces, $k, $runs,
$max_iter, $eps, $vectors_per_cluster, $sep, 2)
+[k_codebook, k_codes] = kmeans(vectors, $k * $subspaces, $runs, $max_iter,
$eps, FALSE, $vectors_per_cluster, 2)
+
+#construct vectors from codes
+pq_result = construct_vectors(codes, codebook)
+k_result = construct_vectors(k_codes, k_codebook)
+
+#calculate distortion
+pq_distortion = colSums(rowSums((vectors - pq_result)^2)) / rows
+k_distortion = colSums(rowSums((vectors - k_result)^2)) / rows
+
+print("Product quantization distortion: " + toString(pq_distortion))
+print("Kmeans distortion: " + toString(k_distortion))
+
+write(codes, $codes)
+write(codebook, $codebook)
+write(pq_distortion, $pq_distortion)
+write(k_distortion, $k_distortion)
+