This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new 69d04d4679 [SYSTEMDS-3213] Add optional space decomposition to quantizeByCluster 69d04d4679 is described below commit 69d04d4679fedd4c43fe3d734ffa875980d545f8 Author: Can Abdulla <canabdu...@outlook.de> AuthorDate: Wed Jul 24 19:25:24 2024 +0200 [SYSTEMDS-3213] Add optional space decomposition to quantizeByCluster Closes #2053. --- scripts/builtin/quantizeByCluster.dml | 69 ++++++++++++++++++---- .../part2/BuiltinQuantizeByClusterTest.java | 36 +++++++---- .../functions/builtin/quantizeByCluster.dml | 24 ++++++-- 3 files changed, 101 insertions(+), 28 deletions(-) diff --git a/scripts/builtin/quantizeByCluster.dml b/scripts/builtin/quantizeByCluster.dml index 82bd9200be..824ac35053 100644 --- a/scripts/builtin/quantizeByCluster.dml +++ b/scripts/builtin/quantizeByCluster.dml @@ -17,9 +17,14 @@ # specific language governing permissions and limitations # under the License. # -#------------------------------------------------------------- +#---------------------------------------------------------------------------------------- -# Builtin function that implements product quantization +# The quantizeByCluster-function implements product quantization. Initially, it +# divides the original vector space into M subspaces. The resulting lower dimensional +# subvectors are then quantized. If the column count is not divisible by the number of +# subspaces M, the data is padded with zeros. Optimal space decomposition can be +# computed, when the data follows a Gaussian distribution. The function uses kmeans for +# quantizing and svd to compute the space decomposition. # # INPUT: # --------------------------------------------------------------------------------------- @@ -33,6 +38,10 @@ # separate Cluster subspaces separately. If value is set to true, # kmeans is run M times, once for each subspace. Otherwise # kmeans is run only once. +# space_decomp Decompose the vector space by multiplying the input +# matrix X with an orthogonal matrix R. Assumes the data +# follows a parametric Gaussian distribution. +# Time complexity in O(nrow(X)^2 * min(nrow(X), ncol(X))). # seed The seed used for initial sampling. If set to -1 random # seeds are selected. # --------------------------------------------------------------------------------------- @@ -45,13 +54,45 @@ # codes The mapping of vectors to centroids. Each vector of the input matrix X is mapped # onto a vector of codes. The entries in the codes matrix are the indices of # the vectors in the codebook. The codes matrix has the dimensions [nrow(X) x M]. +# R The orthogonal matrix R which is applied to the input matrix X before performing +# the product quantization. Only relevant when space_decomp = TRUE. # ------------------------------------------------------------------------------------------ m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, Integer runs = 10, - Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Integer seed = -1) - return(Matrix[Double] codebook, Matrix[Double] codes) + Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1) + return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R) { + #Pad the data with zeros if the number of columns of the input matrix X is not divisible by M + if(ncol(X) %% M != 0) { + zeros = matrix(0, rows=nrow(X), cols=((ncol(X) %/% M) +1) * M - ncol(X)) + X = cbind(X, zeros) + } subvector_size = ncol(X) / M + #Transform the vector space by an orthogonal matrix R. + #R is computed by reordering the principal directions of the input matrix X such that the variance of each subspace is balanced. + if(space_decomp) { + #Perform PCA using SVD + X2 = X - colMeans(X) + [U, S, V] = svd(X2) + eigen_v = diag(S)^2 / (nrow(X)-1) + #Balance the variance of the subspaces following a greedy approach. + R = matrix(0, rows=nrow(V), cols=ncol(V)) + subspace_variance = matrix(1, rows=M, cols=1) + subspace_idx = seq(0,M-1) * subvector_size + 1 + for(i in 1:nrow(R)) { + #free_buckets[j] == 0 iff subspace j is full + free_buckets = subspace_idx - seq(1,M) * subvector_size < 1 + b = as.scalar(rowIndexMin(t(subspace_variance) %*% (diag(1 / free_buckets)))) + subspace_variance[b] = subspace_variance[b] * eigen_v[i] + R[,as.scalar(subspace_idx[b])] = V[,i] + subspace_idx[b] = subspace_idx[b] + 1 + } + #Apply space decomposition + X = X %*% t(R) + } + else { + R = matrix(0, rows=1, cols=1) + } #Kmeans is run just once for all subspaces together. Subvectors are mapped to vectors of the codebook of size k*M. #The ith entry of a code vector has a value in [1, k*M]. if(!separate) { @@ -62,14 +103,20 @@ m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10, I #Kmeans is run for every subspace separately. Subvectors are mapped to a subset of k vectors of the codebook. #The ith entry of a code vector has a value in ((i-1)*k, i*k]. else { - l = k - codebook = matrix(1, rows=l*M, cols=subvector_size) + codebook = matrix(1, rows=k*M, cols=subvector_size) codes = matrix(1, rows=nrow(X), cols=M) - parfor(i in 1:M, check=0) { - [tmp_cbook, tmp_c] = kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], l, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed) - codebook[(i-1)*l+1:i*l,] = tmp_cbook - offset = matrix((i-1)*l, rows=nrow(codes), cols=1) - codes[,i] = tmp_c + offset + for(i in 1:M, check=0) { + [tmp_cbook, tmp_c] = kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], k, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed) + #If no output is produced, use a single centroid + if(as.scalar(tmp_c[1,1]) < 1) { + tmp_cbook = matrix(0, rows=k, cols=subvector_size) + tmp_cbook[1,] = colSums(X[,(i-1)*subvector_size+1:i*subvector_size]) / nrow(X) + tmp_c = matrix(1, rows=nrow(X), cols=1) + } + codebook[(i-1)*k+1:i*k,] = tmp_cbook + offset = matrix((i-1)*k, rows=nrow(codes), cols=1) + codes[,i] = tmp_c + offset } } } + diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java index 64908ee073..e398a0ad06 100644 --- a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java @@ -53,6 +53,7 @@ public class BuiltinQuantizeByClusterTest extends AutomatedTestBase { public int vectors_per_cluster; @Parameter(7) public boolean quantize_separately; + public boolean space_decomp; private final static String TEST_NAME = "quantizeByCluster"; private final static String TEST_DIR = "functions/builtin/"; @@ -60,7 +61,6 @@ public class BuiltinQuantizeByClusterTest extends AutomatedTestBase { private final static double eps = 1e-10; private final static int runs = 3; private final static int max_iter = 1000; -// private final static double cluster_offset = 0.1; @Parameterized.Parameters(name = "{0}: rows={1}, cols={2}, c={3}, subspaces={4}, k={5}, v_per_c={6}, sep={7}") public static Collection<Object[]> data() { @@ -73,6 +73,8 @@ public class BuiltinQuantizeByClusterTest extends AutomatedTestBase { {"uniform", 1024, 64, 12, 8, 12, 40, false}, {"uniform", 1024, 64, 12, 4, 12, 40, false}, {"uniform", 1024, 64, 12, 2, 12, 40, false}, {"normal", 1024, 64, 12, 8, 12, 40, true}, {"normal", 1024, 64, 12, 4, 12, 40, true}, {"normal", 1024, 64, 12, 2, 12, 40, true}, {"normal", 1024, 64, 12, 8, 12, 40, false}, {"normal", 1024, 64, 12, 4, 12, 40, false}, {"normal", 1024, 64, 12, 2, 12, 40, false}, + {"normal", 1024, 53, 12, 8, 12, 40, true}, {"normal", 1024, 61, 12, 4, 12, 40, true}, {"normal", 1024, 83, 12, 2, 12, 40, true}, + {"normal", 1024, 53, 12, 8, 12, 40, false}, {"normal", 1024, 61, 12, 4, 12, 40, false}, {"normal", 1024, 83, 12, 2, 12, 40, false}, }); } @@ -83,7 +85,10 @@ public class BuiltinQuantizeByClusterTest extends AutomatedTestBase { @Test public void basicTest() { - runQuantizeByClusterTest(); + for (boolean b : new boolean[]{true, false}) { + space_decomp = b; + runQuantizeByClusterTest(); + } } /*The tests use kmeans clustering as a baseline and check whether the distortion is within @@ -93,30 +98,35 @@ public class BuiltinQuantizeByClusterTest extends AutomatedTestBase { String HOME = SCRIPT_DIR + TEST_DIR; fullDMLScriptName = HOME + TEST_NAME + ".dml"; programArgs = new String[]{"-nvargs", "codes=" + output("codes"), "codebook=" + output("codebook"), - "pq_distortion=" + output("pq_distortion"), "k_distortion=" + output("k_distortion"), - "clusters=" + clusters, "test_case=" + test_case, "rows=" + rows, - "cols=" + cols, "subspaces=" + subspaces, "k=" + k, "runs=" + runs, "max_iter=" + max_iter, - "eps=" + eps, "vectors_per_cluster=" + vectors_per_cluster, "sep=" + quantize_separately}; + "pq_distortion=" + output("pq_distortion"), "k_distortion=" + output("k_distortion"), "is_orthogonal=" + output("is_orthogonal"), + "clusters=" + clusters, "test_case=" + test_case, "rows=" + rows, + "cols=" + cols, "subspaces=" + subspaces, "k=" + k, "runs=" + runs, "max_iter=" + max_iter, + "eps=" + eps, "vectors_per_cluster=" + vectors_per_cluster, "sep=" + quantize_separately, "space_decomp=" + space_decomp}; runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); - // check if output dimensions are correct + //check if R is an orthogonal matrix + if(space_decomp) { + double is_orthogonal = readDMLScalarFromOutputDir("is_orthogonal").get(new MatrixValue.CellIndex(1, 1)); + Assert.assertEquals(0, is_orthogonal, 0.001); + } + + //check if output dimensions are correct MatrixCharacteristics meta_codes = readDMLMetaDataFile("codes"); MatrixCharacteristics meta_codebook = readDMLMetaDataFile("codebook"); Assert.assertTrue("Matrix dimensions should be equal to expected dimensions", meta_codes.getRows() == (long) clusters * vectors_per_cluster && meta_codes.getCols() == subspaces); Assert.assertEquals("Centroid dimensions should be equal to expected dimensions", - cols / subspaces, meta_codebook.getCols()); - - double pq_distortion = readDMLMatrixFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1, 1)); - double k_distortion = readDMLMatrixFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1)); + (int) Math.ceil((double) cols / subspaces), meta_codebook.getCols()); //check if distortion is within a threshold + double pq_distortion = readDMLScalarFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1,1)); + double k_distortion = readDMLScalarFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1)); if (!test_case.equals("cluster")) { - Assert.assertTrue(pq_distortion < 1.2 * k_distortion + 0.1); + Assert.assertTrue(pq_distortion < 1.2 * k_distortion + 0.5); } else { - Assert.assertTrue(pq_distortion < 2 * k_distortion + 0.1); + Assert.assertTrue(pq_distortion < 2 * k_distortion + 2); } } } diff --git a/src/test/scripts/functions/builtin/quantizeByCluster.dml b/src/test/scripts/functions/builtin/quantizeByCluster.dml index 62fa475de2..4e235acccf 100644 --- a/src/test/scripts/functions/builtin/quantizeByCluster.dml +++ b/src/test/scripts/functions/builtin/quantizeByCluster.dml @@ -47,6 +47,8 @@ offset = max / 10 subvector_size = $cols / $subspaces rows = $clusters * $vectors_per_cluster +space_decomp = as.logical($space_decomp) +sep = as.logical($sep) # Generate points by concatenating sub_points around sub_clusters if($test_case == "sub_cluster") { @@ -75,16 +77,29 @@ else { vectors = rand(rows = rows, cols = $cols, min=min, max=max, pdf=$test_case, seed=2) } -[codebook, codes] = quantizeByCluster(vectors, $subspaces, $k, $runs, $max_iter, $eps, $vectors_per_cluster, $sep, 2) +#Perform quantization +[codebook, codes, R] = quantizeByCluster(vectors, $subspaces, $k, $runs, $max_iter, $eps, $vectors_per_cluster, sep, space_decomp, 2) [k_codebook, k_codes] = kmeans(vectors, $k * $subspaces, $runs, $max_iter, $eps, FALSE, $vectors_per_cluster, 2) #construct vectors from codes -pq_result = construct_vectors(codes, codebook) k_result = construct_vectors(k_codes, k_codebook) +pq_result = construct_vectors(codes, codebook) +if(space_decomp) { + pq_result = pq_result %*% R + #check if R is an orthogonal matrix + is_orthogonal = sum((t(R) %*% R - diag(matrix(1, nrow(R), 1)))^2) +} +else { + is_orthogonal = 0 +} + +if($cols %% $subspaces != 0) { + pq_result = pq_result[,1:$cols] +} #calculate distortion -pq_distortion = colSums(rowSums((vectors - pq_result)^2)) / rows -k_distortion = colSums(rowSums((vectors - k_result)^2)) / rows +pq_distortion = as.scalar(colSums(rowSums((vectors - pq_result)^2)) / rows) +k_distortion = as.scalar(colSums(rowSums((vectors - k_result)^2)) / rows) print("Product quantization distortion: " + toString(pq_distortion)) print("Kmeans distortion: " + toString(k_distortion)) @@ -93,4 +108,5 @@ write(codes, $codes) write(codebook, $codebook) write(pq_distortion, $pq_distortion) write(k_distortion, $k_distortion) +write(is_orthogonal, $is_orthogonal)