This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new 9ff9e1635f [SYSTEMDS-3213] New builtin for cluster-based quantization 9ff9e1635f is described below commit 9ff9e1635fcc76a1e0fc21cfa4f6aefba1e64326 Author: Can Abdulla <canabdu...@outlook.de> AuthorDate: Tue Jun 4 10:10:43 2024 +0200 [SYSTEMDS-3213] New builtin for cluster-based quantization LDE SoSe'24 project Closes #2030. --- scripts/builtin/quantizeByCluster.dml | 75 +++++++++++++ .../java/org/apache/sysds/common/Builtins.java | 1 + .../part2/BuiltinQuantizeByClusterTest.java | 120 +++++++++++++++++++++ .../functions/builtin/quantizeByCluster.dml | 96 +++++++++++++++++ 4 files changed, 292 insertions(+) diff --git a/scripts/builtin/quantizeByCluster.dml b/scripts/builtin/quantizeByCluster.dml new file mode 100644 index 0000000000..82bd9200be --- /dev/null +++ b/scripts/builtin/quantizeByCluster.dml @@ -0,0 +1,75 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Builtin function that implements product quantization +# +# INPUT: +# --------------------------------------------------------------------------------------- +# X The input matrix to perform product quantization on +# M Number of subspaces +# k Number of vectors in the subcodebooks +# runs Number of runs (with different initial centroids) +# max_iter Maximum number of iterations per run +# eps Tolerance (epsilon) for WCSS change ratio +# avg_sample_size_per_centroid Average number of records per centroid in data samples +# separate Cluster subspaces separately. If value is set to true, +# kmeans is run M times, once for each subspace. Otherwise +# kmeans is run only once. +# seed The seed used for initial sampling. If set to -1 random +# seeds are selected. +# --------------------------------------------------------------------------------------- +# +# OUTPUT: +# ------------------------------------------------------------------------------------------ +# codebook The matrix containing the centroids. If clustered separately, the ith +# subcodebook is the ith chunk of size k. The codebook matrix has the dimensions +# [k*M x ncol(X)/M]. +# codes The mapping of vectors to centroids. Each vector of the input matrix X is mapped +# onto a vector of codes. The entries in the codes matrix are the indices of +# the vectors in the codebook. The codes matrix has the dimensions [nrow(X) x M]. +# ------------------------------------------------------------------------------------------ + +m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10, + Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Integer seed = -1) + return(Matrix[Double] codebook, Matrix[Double] codes) +{ + subvector_size = ncol(X) / M + #Kmeans is run just once for all subspaces together. Subvectors are mapped to vectors of the codebook of size k*M. + #The ith entry of a code vector has a value in [1, k*M]. + if(!separate) { + A = matrix(X, rows= nrow(X) * M, cols=subvector_size) + [codebook, B] = kmeans(A, k * M, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed) + codes = matrix(B, rows = nrow(B) / M, cols = ncol(B) * M) + } + #Kmeans is run for every subspace separately. Subvectors are mapped to a subset of k vectors of the codebook. + #The ith entry of a code vector has a value in ((i-1)*k, i*k]. + else { + l = k + codebook = matrix(1, rows=l*M, cols=subvector_size) + codes = matrix(1, rows=nrow(X), cols=M) + parfor(i in 1:M, check=0) { + [tmp_cbook, tmp_c] = kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], l, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed) + codebook[(i-1)*l+1:i*l,] = tmp_cbook + offset = matrix((i-1)*l, rows=nrow(codes), cols=1) + codes[,i] = tmp_c + offset + } + } +} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index 59d41fb227..b9cda07791 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -271,6 +271,7 @@ public enum Builtins { PSNR("psnr", true), QR("qr", false, ReturnType.MULTI_RETURN), QUANTILE("quantile", false), + QUANTIZEBYCLUSTER("quantizeByCluster", true), RANDOM_FOREST("randomForest", true), RANDOM_FOREST_PREDICT("randomForestPredict", true), RANGE("range", false), diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java new file mode 100644 index 0000000000..3f33c1bb24 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part2; + +import java.util.Arrays; +import java.util.Collection; + +import org.apache.sysds.runtime.matrix.data.MatrixValue; +import org.apache.sysds.runtime.meta.MatrixCharacteristics; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameter; + +@RunWith(Parameterized.class) +public class BuiltinQuantizeByClusterTest extends AutomatedTestBase { + + @Parameter + public String test_case; + @Parameter(1) + public int rows; + @Parameter(2) + public int cols; + @Parameter(3) + public int clusters; + @Parameter(4) + public int subspaces; + @Parameter(5) + public int k; + @Parameter(6) + public int vectors_per_cluster; + @Parameter(7) + public boolean quantize_separately; + + private final static String TEST_NAME = "quantizeByCluster"; + private final static String TEST_DIR = "functions/builtin/"; + private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinQuantizeByClusterTest.class.getSimpleName() + "/"; + private final static double eps = 1e-10; + private final static int runs = 3; + private final static int max_iter = 1000; +// private final static double cluster_offset = 0.1; + + @Parameterized.Parameters(name = "{0}: rows={1}, cols={2}, c={3}, subspaces={4}, k={5}, v_per_c={6}, sep={7}") + public static Collection<Object[]> data() { + return Arrays.asList(new Object[][]{ + {"sub_cluster", 1024, 64, 12, 8, 12, 40, true}, {"sub_cluster", 1024, 64, 12, 4, 12, 40, true}, {"sub_cluster", 1024, 64, 12, 2, 12, 40, true}, + {"sub_cluster", 1024, 64, 12, 8, 12, 40, false}, {"sub_cluster", 1024, 64, 12, 4, 12, 40, false}, {"sub_cluster", 1024, 64, 12, 2, 12, 40, false}, + {"cluster", 1024, 64, 12, 8, 12, 40, true}, {"cluster", 1024, 64, 12, 4, 12, 40, true}, {"cluster", 1024, 64, 12, 2, 12, 40, true}, + {"cluster", 1024, 64, 20, 8, 12, 40, false}, {"cluster", 1024, 64, 12, 4, 12, 40, false}, {"cluster", 1024, 64, 12, 2, 12, 40, false}, + {"uniform", 1024, 64, 12, 8, 12, 40, true}, {"uniform", 1024, 64, 12, 4, 12, 40, true}, {"uniform", 1024, 64, 12, 2, 12, 40, true}, + {"uniform", 1024, 64, 12, 8, 12, 40, false}, {"uniform", 1024, 64, 12, 4, 12, 40, false}, {"uniform", 1024, 64, 12, 2, 12, 40, false}, + {"normal", 1024, 64, 12, 8, 12, 40, true}, {"normal", 1024, 64, 12, 4, 12, 40, true}, {"normal", 1024, 64, 12, 2, 12, 40, true}, + {"normal", 1024, 64, 12, 8, 12, 40, false}, {"normal", 1024, 64, 12, 4, 12, 40, false}, {"normal", 1024, 64, 12, 2, 12, 40, false}, + }); + } + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME)); + } + + @Test + public void basicTest() { + runQuantizeByClusterTest(); + } + + /*The tests use kmeans clustering as a baseline and check whether the distortion is within + a certain threshold.*/ + private void runQuantizeByClusterTest() { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[]{"-nvargs", "codes=" + output("codes"), "codebook=" + output("codebook"), + "pq_distortion=" + output("pq_distortion"), "k_distortion=" + output("k_distortion"), + "clusters=" + clusters, "test_case=" + test_case, "rows=" + rows, + "cols=" + cols, "subspaces=" + subspaces, "k=" + k, "runs=" + runs, "max_iter=" + max_iter, + "eps=" + eps, "vectors_per_cluster=" + vectors_per_cluster, "sep=" + quantize_separately}; + + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // check if output dimensions are correct + MatrixCharacteristics meta_codes = readDMLMetaDataFile("codes"); + MatrixCharacteristics meta_codebook = readDMLMetaDataFile("codebook"); + Assert.assertTrue("Matrix dimensions should be equal to expected dimensions", + meta_codes.getRows() == (long) clusters * vectors_per_cluster + && meta_codes.getCols() == subspaces); + Assert.assertEquals("Centroid dimensions should be equal to expected dimensions", + cols / subspaces, meta_codebook.getCols()); + + double pq_distortion = readDMLMatrixFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1, 1)); + double k_distortion = readDMLMatrixFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1)); + + //check if distortion is within a threshold + if (!test_case.equals("cluster")) { + Assert.assertTrue(pq_distortion < 1.2 * k_distortion + 0.1); + } else { + Assert.assertTrue(pq_distortion < 2 * k_distortion + 0.1); + } + } +} diff --git a/src/test/scripts/functions/builtin/quantizeByCluster.dml b/src/test/scripts/functions/builtin/quantizeByCluster.dml new file mode 100644 index 0000000000..62fa475de2 --- /dev/null +++ b/src/test/scripts/functions/builtin/quantizeByCluster.dml @@ -0,0 +1,96 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +#duplicate rows of matix n times +duplicate_rows = function (Matrix [Double] V, Integer N) + return(Matrix [Double] V) +{ + tmp = V + for(i in seq(1, N-1, 1)) { + tmp = rbind(tmp, V) + } + V = tmp +} +#construct vectors from codes +construct_vectors = function (Matrix [Double] codes, Matrix [Double] codebook) + return(Matrix [Double] vectors) +{ + vectors = matrix(0, rows=nrow(codes), cols=ncol(codes)*ncol(codebook)) + parfor (i in 1:nrow(codes), check=0) { + parfor (j in 1:ncol(codes), check=0) { + vectors[i, 1 + (j-1)* ncol(codebook): j * ncol(codebook)] = codebook[as.scalar(codes[i, j])] + } + } +} + +max = 1 +min = -max +offset = max / 10 + +subvector_size = $cols / $subspaces +rows = $clusters * $vectors_per_cluster + +# Generate points by concatenating sub_points around sub_clusters +if($test_case == "sub_cluster") { + offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset, pdf="uniform", seed=2) + cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min, max=max, seed=2) + vectors = matrix(cluster_centers, nrow(cluster_centers), ncol(cluster_centers)) + for(i in 1:$subspaces-1) { + cluster_centers = rand(rows = $clusters, cols = subvector_size, min=min, max=max, seed=2) + vectors = cbind(vectors, cluster_centers) + } + #ensure correct number of vectors + vectors = duplicate_rows(vectors, $vectors_per_cluster) + vectors = vectors + offset_matrix +} +# Generate points around clusters +else if ($test_case == "cluster") { + cluster_centers = rand(rows = $clusters, cols = $cols, min=min, max=max, pdf="uniform", seed=2) + vectors = matrix(cluster_centers, nrow(cluster_centers), ncol(cluster_centers)) + #ensure correct number of vectors + vectors = duplicate_rows(vectors, $vectors_per_cluster) + offset_matrix = rand(rows=rows, cols=$cols, min=-offset, max=offset, pdf="uniform", seed=2) + vectors = vectors + offset_matrix +} +# Generate random points +else { + vectors = rand(rows = rows, cols = $cols, min=min, max=max, pdf=$test_case, seed=2) +} + +[codebook, codes] = quantizeByCluster(vectors, $subspaces, $k, $runs, $max_iter, $eps, $vectors_per_cluster, $sep, 2) +[k_codebook, k_codes] = kmeans(vectors, $k * $subspaces, $runs, $max_iter, $eps, FALSE, $vectors_per_cluster, 2) + +#construct vectors from codes +pq_result = construct_vectors(codes, codebook) +k_result = construct_vectors(k_codes, k_codebook) + +#calculate distortion +pq_distortion = colSums(rowSums((vectors - pq_result)^2)) / rows +k_distortion = colSums(rowSums((vectors - k_result)^2)) / rows + +print("Product quantization distortion: " + toString(pq_distortion)) +print("Kmeans distortion: " + toString(k_distortion)) + +write(codes, $codes) +write(codebook, $codebook) +write(pq_distortion, $pq_distortion) +write(k_distortion, $k_distortion) +