This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 69d04d4679 [SYSTEMDS-3213] Add optional space decomposition to
quantizeByCluster
69d04d4679 is described below
commit 69d04d4679fedd4c43fe3d734ffa875980d545f8
Author: Can Abdulla <[email protected]>
AuthorDate: Wed Jul 24 19:25:24 2024 +0200
[SYSTEMDS-3213] Add optional space decomposition to quantizeByCluster
Closes #2053.
---
scripts/builtin/quantizeByCluster.dml | 69 ++++++++++++++++++----
.../part2/BuiltinQuantizeByClusterTest.java | 36 +++++++----
.../functions/builtin/quantizeByCluster.dml | 24 ++++++--
3 files changed, 101 insertions(+), 28 deletions(-)
diff --git a/scripts/builtin/quantizeByCluster.dml
b/scripts/builtin/quantizeByCluster.dml
index 82bd9200be..824ac35053 100644
--- a/scripts/builtin/quantizeByCluster.dml
+++ b/scripts/builtin/quantizeByCluster.dml
@@ -17,9 +17,14 @@
# specific language governing permissions and limitations
# under the License.
#
-#-------------------------------------------------------------
+#----------------------------------------------------------------------------------------
-# Builtin function that implements product quantization
+# The quantizeByCluster-function implements product quantization. Initially, it
+# divides the original vector space into M subspaces. The resulting lower
dimensional
+# subvectors are then quantized. If the column count is not divisible by the
number of
+# subspaces M, the data is padded with zeros. Optimal space decomposition can
be
+# computed when the data follows a Gaussian distribution. The function uses
kmeans for
+# quantizing and svd to compute the space decomposition.
#
# INPUT:
#
---------------------------------------------------------------------------------------
@@ -33,6 +38,10 @@
# separate Cluster subspaces separately. If value is set
to true,
# kmeans is run M times, once for each subspace.
Otherwise
# kmeans is run only once.
+# space_decomp Decompose the vector space by multiplying the
input
matrix X by an orthogonal matrix R. Assumes
the data
+# follows a parametric Gaussian distribution.
+# Time complexity in O(nrow(X)^2 * min(nrow(X),
ncol(X))).
# seed The seed used for initial sampling. If set to
-1 random
# seeds are selected.
#
---------------------------------------------------------------------------------------
@@ -45,13 +54,45 @@
# codes The mapping of vectors to centroids. Each vector of the input
matrix X is mapped
# onto a vector of codes. The entries in the codes matrix are the
indices of
# the vectors in the codebook. The codes matrix has the dimensions
[nrow(X) x M].
+# R The orthogonal matrix R which is applied to the input matrix X
before performing
+# the product quantization. Only relevant when space_decomp = TRUE.
#
------------------------------------------------------------------------------------------
m_quantizeByCluster = function(Matrix[Double]X, Integer M = 4, Integer k = 10,
Integer runs = 10,
- Integer max_iter = 1000, Double eps = 1e-6, Integer
avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Integer seed = -1)
- return(Matrix[Double] codebook, Matrix[Double] codes)
+ Integer max_iter = 1000, Double eps = 1e-6, Integer
avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean
space_decomp=FALSE, Integer seed = -1)
+ return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R)
{
+ #Pad the data with zeros if the number of columns of the input matrix X is
not divisible by M
+ if(ncol(X) %% M != 0) {
+ zeros = matrix(0, rows=nrow(X), cols=((ncol(X) %/% M) +1) * M - ncol(X))
+ X = cbind(X, zeros)
+ }
subvector_size = ncol(X) / M
+ #Transform the vector space by an orthogonal matrix R.
+ #R is computed by reordering the principal directions of the input matrix X
such that the variance of each subspace is balanced.
+ if(space_decomp) {
+ #Perform PCA using SVD
+ X2 = X - colMeans(X)
+ [U, S, V] = svd(X2)
+ eigen_v = diag(S)^2 / (nrow(X)-1)
+ #Balance the variance of the subspaces following a greedy approach.
+ R = matrix(0, rows=nrow(V), cols=ncol(V))
+ subspace_variance = matrix(1, rows=M, cols=1)
+ subspace_idx = seq(0,M-1) * subvector_size + 1
+ for(i in 1:nrow(R)) {
+ #free_buckets[j] == 0 iff subspace j is full
+ free_buckets = subspace_idx - seq(1,M) * subvector_size < 1
+ b = as.scalar(rowIndexMin(t(subspace_variance) %*% (diag(1 /
free_buckets))))
+ subspace_variance[b] = subspace_variance[b] * eigen_v[i]
+ R[,as.scalar(subspace_idx[b])] = V[,i]
+ subspace_idx[b] = subspace_idx[b] + 1
+ }
+ #Apply space decomposition
+ X = X %*% t(R)
+ }
+ else {
+ R = matrix(0, rows=1, cols=1)
+ }
#Kmeans is run just once for all subspaces together. Subvectors are mapped
to vectors of the codebook of size k*M.
#The ith entry of a code vector has a value in [1, k*M].
if(!separate) {
@@ -62,14 +103,20 @@ m_quantizeByCluster = function(Matrix[Double]X, Integer M
= 4, Integer k = 10, I
#Kmeans is run for every subspace separately. Subvectors are mapped to a
subset of k vectors of the codebook.
#The ith entry of a code vector has a value in ((i-1)*k, i*k].
else {
- l = k
- codebook = matrix(1, rows=l*M, cols=subvector_size)
+ codebook = matrix(1, rows=k*M, cols=subvector_size)
codes = matrix(1, rows=nrow(X), cols=M)
- parfor(i in 1:M, check=0) {
- [tmp_cbook, tmp_c] =
kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], l, runs, max_iter, eps,
FALSE, avg_sample_size_per_centroid, seed)
- codebook[(i-1)*l+1:i*l,] = tmp_cbook
- offset = matrix((i-1)*l, rows=nrow(codes), cols=1)
- codes[,i] = tmp_c + offset
+ for(i in 1:M, check=0) {
+ [tmp_cbook, tmp_c] = kmeans(X[,(i-1)*subvector_size+1:i*subvector_size],
k, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed)
+ #If no output is produced, use a single centroid
+ if(as.scalar(tmp_c[1,1]) < 1) {
+ tmp_cbook = matrix(0, rows=k, cols=subvector_size)
+ tmp_cbook[1,] = colSums(X[,(i-1)*subvector_size+1:i*subvector_size])
/ nrow(X)
+ tmp_c = matrix(1, rows=nrow(X), cols=1)
+ }
+ codebook[(i-1)*k+1:i*k,] = tmp_cbook
+ offset = matrix((i-1)*k, rows=nrow(codes), cols=1)
+ codes[,i] = tmp_c + offset
}
}
}
+
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
index 64908ee073..e398a0ad06 100644
---
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinQuantizeByClusterTest.java
@@ -53,6 +53,7 @@ public class BuiltinQuantizeByClusterTest extends
AutomatedTestBase {
public int vectors_per_cluster;
@Parameter(7)
public boolean quantize_separately;
+ public boolean space_decomp;
private final static String TEST_NAME = "quantizeByCluster";
private final static String TEST_DIR = "functions/builtin/";
@@ -60,7 +61,6 @@ public class BuiltinQuantizeByClusterTest extends
AutomatedTestBase {
private final static double eps = 1e-10;
private final static int runs = 3;
private final static int max_iter = 1000;
-// private final static double cluster_offset = 0.1;
@Parameterized.Parameters(name = "{0}: rows={1}, cols={2}, c={3},
subspaces={4}, k={5}, v_per_c={6}, sep={7}")
public static Collection<Object[]> data() {
@@ -73,6 +73,8 @@ public class BuiltinQuantizeByClusterTest extends
AutomatedTestBase {
{"uniform", 1024, 64, 12, 8, 12, 40, false},
{"uniform", 1024, 64, 12, 4, 12, 40, false}, {"uniform", 1024, 64, 12, 2, 12,
40, false},
{"normal", 1024, 64, 12, 8, 12, 40, true}, {"normal",
1024, 64, 12, 4, 12, 40, true}, {"normal", 1024, 64, 12, 2, 12, 40, true},
{"normal", 1024, 64, 12, 8, 12, 40, false}, {"normal",
1024, 64, 12, 4, 12, 40, false}, {"normal", 1024, 64, 12, 2, 12, 40, false},
+ {"normal", 1024, 53, 12, 8, 12, 40, true}, {"normal",
1024, 61, 12, 4, 12, 40, true}, {"normal", 1024, 83, 12, 2, 12, 40, true},
+ {"normal", 1024, 53, 12, 8, 12, 40, false},
{"normal", 1024, 61, 12, 4, 12, 40, false}, {"normal", 1024, 83, 12, 2, 12,
40, false},
});
}
@@ -83,7 +85,10 @@ public class BuiltinQuantizeByClusterTest extends
AutomatedTestBase {
@Test
public void basicTest() {
- runQuantizeByClusterTest();
+ for (boolean b : new boolean[]{true, false}) {
+ space_decomp = b;
+ runQuantizeByClusterTest();
+ }
}
/*The tests use kmeans clustering as a baseline and check whether the
distortion is within
@@ -93,30 +98,35 @@ public class BuiltinQuantizeByClusterTest extends
AutomatedTestBase {
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[]{"-nvargs", "codes=" +
output("codes"), "codebook=" + output("codebook"),
- "pq_distortion=" + output("pq_distortion"),
"k_distortion=" + output("k_distortion"),
- "clusters=" + clusters, "test_case=" +
test_case, "rows=" + rows,
- "cols=" + cols, "subspaces=" + subspaces, "k="
+ k, "runs=" + runs, "max_iter=" + max_iter,
- "eps=" + eps, "vectors_per_cluster=" +
vectors_per_cluster, "sep=" + quantize_separately};
+ "pq_distortion=" + output("pq_distortion"),
"k_distortion=" + output("k_distortion"), "is_orthogonal=" +
output("is_orthogonal"),
+ "clusters=" + clusters, "test_case=" + test_case,
"rows=" + rows,
+ "cols=" + cols, "subspaces=" + subspaces, "k=" + k,
"runs=" + runs, "max_iter=" + max_iter,
+ "eps=" + eps, "vectors_per_cluster=" +
vectors_per_cluster, "sep=" + quantize_separately, "space_decomp=" +
space_decomp};
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
- // check if output dimensions are correct
+ //check if R is an orthogonal matrix
+ if(space_decomp) {
+ double is_orthogonal =
readDMLScalarFromOutputDir("is_orthogonal").get(new MatrixValue.CellIndex(1,
1));
+ Assert.assertEquals(0, is_orthogonal, 0.001);
+ }
+
+ //check if output dimensions are correct
MatrixCharacteristics meta_codes = readDMLMetaDataFile("codes");
MatrixCharacteristics meta_codebook =
readDMLMetaDataFile("codebook");
Assert.assertTrue("Matrix dimensions should be equal to
expected dimensions",
meta_codes.getRows() == (long) clusters *
vectors_per_cluster
&& meta_codes.getCols() == subspaces);
Assert.assertEquals("Centroid dimensions should be equal to
expected dimensions",
- cols / subspaces, meta_codebook.getCols());
-
- double pq_distortion =
readDMLMatrixFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1,
1));
- double k_distortion =
readDMLMatrixFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1));
+ (int) Math.ceil((double) cols / subspaces),
meta_codebook.getCols());
//check if distortion is within a threshold
+ double pq_distortion =
readDMLScalarFromOutputDir("pq_distortion").get(new MatrixValue.CellIndex(1,1));
+ double k_distortion =
readDMLScalarFromOutputDir("k_distortion").get(new MatrixValue.CellIndex(1, 1));
if (!test_case.equals("cluster")) {
- Assert.assertTrue(pq_distortion < 1.2 * k_distortion +
0.1);
+ Assert.assertTrue(pq_distortion < 1.2 * k_distortion +
0.5);
} else {
- Assert.assertTrue(pq_distortion < 2 * k_distortion +
0.1);
+ Assert.assertTrue(pq_distortion < 2 * k_distortion + 2);
}
}
}
diff --git a/src/test/scripts/functions/builtin/quantizeByCluster.dml
b/src/test/scripts/functions/builtin/quantizeByCluster.dml
index 62fa475de2..4e235acccf 100644
--- a/src/test/scripts/functions/builtin/quantizeByCluster.dml
+++ b/src/test/scripts/functions/builtin/quantizeByCluster.dml
@@ -47,6 +47,8 @@ offset = max / 10
subvector_size = $cols / $subspaces
rows = $clusters * $vectors_per_cluster
+space_decomp = as.logical($space_decomp)
+sep = as.logical($sep)
# Generate points by concatenating sub_points around sub_clusters
if($test_case == "sub_cluster") {
@@ -75,16 +77,29 @@ else {
vectors = rand(rows = rows, cols = $cols, min=min, max=max, pdf=$test_case,
seed=2)
}
-[codebook, codes] = quantizeByCluster(vectors, $subspaces, $k, $runs,
$max_iter, $eps, $vectors_per_cluster, $sep, 2)
+#Perform quantization
+[codebook, codes, R] = quantizeByCluster(vectors, $subspaces, $k, $runs,
$max_iter, $eps, $vectors_per_cluster, sep, space_decomp, 2)
[k_codebook, k_codes] = kmeans(vectors, $k * $subspaces, $runs, $max_iter,
$eps, FALSE, $vectors_per_cluster, 2)
#construct vectors from codes
-pq_result = construct_vectors(codes, codebook)
k_result = construct_vectors(k_codes, k_codebook)
+pq_result = construct_vectors(codes, codebook)
+if(space_decomp) {
+ pq_result = pq_result %*% R
+ #check if R is an orthogonal matrix
+ is_orthogonal = sum((t(R) %*% R - diag(matrix(1, nrow(R), 1)))^2)
+}
+else {
+ is_orthogonal = 0
+}
+
+if($cols %% $subspaces != 0) {
+ pq_result = pq_result[,1:$cols]
+}
#calculate distortion
-pq_distortion = colSums(rowSums((vectors - pq_result)^2)) / rows
-k_distortion = colSums(rowSums((vectors - k_result)^2)) / rows
+pq_distortion = as.scalar(colSums(rowSums((vectors - pq_result)^2)) / rows)
+k_distortion = as.scalar(colSums(rowSums((vectors - k_result)^2)) / rows)
print("Product quantization distortion: " + toString(pq_distortion))
print("Kmeans distortion: " + toString(k_distortion))
@@ -93,4 +108,5 @@ write(codes, $codes)
write(codebook, $codebook)
write(pq_distortion, $pq_distortion)
write(k_distortion, $k_distortion)
+write(is_orthogonal, $is_orthogonal)