This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 00e02f32bf [SYSTEMDS-3858] New HDBSCAN builtin function
00e02f32bf is described below
commit 00e02f32bf7858abfbac4eb2ceb42b4bb5984f4e
Author: anuunchin <[email protected]>
AuthorDate: Sun Mar 29 11:06:29 2026 +0200
[SYSTEMDS-3858] New HDBSCAN builtin function
Closes #2381.
---
scripts/staging/hdbscan/hdbscan.dml | 456 +++++++++++++++++++++
scripts/staging/hdbscan/test_build_hierarchy.dml | 142 +++++++
scripts/staging/hdbscan/test_build_mst.dml | 68 +++
scripts/staging/hdbscan/test_cluster_model.dml | 151 +++++++
.../hdbscan/test_extract_stabe_clusters.dml | 158 +++++++
scripts/staging/hdbscan/test_hdbscan.dml | 73 ++++
scripts/staging/hdbscan/test_kth_smallest.dml | 36 ++
.../staging/hdbscan/test_mutual_reachability.dml | 48 +++
scripts/staging/hdbscan/test_union_find.dml | 161 ++++++++
9 files changed, 1293 insertions(+)
diff --git a/scripts/staging/hdbscan/hdbscan.dml
b/scripts/staging/hdbscan/hdbscan.dml
new file mode 100644
index 0000000000..5d989574a9
--- /dev/null
+++ b/scripts/staging/hdbscan/hdbscan.dml
@@ -0,0 +1,456 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# The hdbscan function is used to perform the hdbscan clustering
+# algorithm using knn-based mutual reachability distance and minimum spanning
tree.
+#
+# INPUT:
+# ------------------------------------------------------------
+# X The input Matrix to do hdbscan on.
+# minPts Minimum number of points for core distance computation.
(Defaults to 5)
+# minClSize Minimum cluster size (Defaults to minPts)
+# ------------------------------------------------------------
+#
+# OUTPUT:
+# ------------------------------------------------------------
+# clusterMems Cluster labels for each point
+# clusterModel The cluster centroids for prediction
+# ------------------------------------------------------------
+
+# TODO: m,s , f?
+m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize =
-1)
+ return (Matrix[Double] clusterMems, Matrix[Double] clusterModel)
+{
+ if(minPts < 2) {
+ stop("HDBSCAN: minPts should be at least 2")
+ }
+
+ if(minClSize < 0) {
+ minClSize = minPts
+ }
+
+ n = nrow(X)
+ d = ncol(X)
+
+ if(n < minPts) {
+ stop("HDBSCAN: Number of data points should be at least minPts")
+ }
+
+ distances = dist(X)
+
+ coreDistances = matrix(0, rows=n, cols=1)
+ for(i in 1:n) {
+ kthDist = computeKthSmallest(t(distances[i,]), minPts) # Add t() here!
+ coreDistances[i] = kthDist
+ }
+
+ mutualReachDist = computeMutualReachability(distances, coreDistances)
+
+ [mstEdges, mstWeights] = buildMST(mutualReachDist, n)
+
+ [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n)
+
+ [clusterMems, stabilities, clusterToNode] =
extractStableClusters(hierarchy, mstWeights, n, minClSize)
+
+ [centroids, clusterInfo] = buildClusterModel(X, clusterMems, stabilities,
clusterToNode)
+
+ clusterModel = centroids
+}
+
+
+computeKthSmallest = function(Matrix[Double] array, Integer k)
+ return (Double res)
+{
+ sorted = order(target=array, by=1, decreasing=FALSE)
+ res = as.scalar(sorted[k+1, 1])
+}
+
+
+computeMutualReachability = function(Matrix[Double] distances, Matrix[Double]
coreDistances)
+ return (Matrix[Double] mutualReach)
+{
+ # mutualReach(i,j) = max(dist(i,j), coreDist(i), coreDist(j))
+ # Diagonal is set to zero.
+
+ n = nrow(distances)
+
+ coreDistRow = t(coreDistances)
+ coreDistCol = coreDistances
+
+ maxCoreRow = (distances > coreDistRow) * distances + (distances <=
coreDistRow) * coreDistRow
+ mutualReach = (maxCoreRow > coreDistCol) * maxCoreRow + (maxCoreRow <=
coreDistCol) * coreDistCol
+
+ mutualReach = mutualReach * (1 - diag(matrix(1, rows=n, cols=1)))
+}
+
+
+buildMST = function(Matrix[Double] distances, Integer n)
+ return (Matrix[Double] edges, Matrix[Double] weights)
+{
+ edges = matrix(0, rows=n-1, cols=2)
+ weights = matrix(0, rows=n-1, cols=1)
+
+ inMST = matrix(0, rows=n, cols=1)
+ inMST[1] = 1
+
+ minDist = distances[1,]
+ minDist = t(minDist)
+
+ for(i in 1:(n-1)) {
+ candidates = minDist + inMST * 1e15
+ minIdx = as.scalar(rowIndexMin(t(candidates)))
+ minWeight = as.scalar(minDist[minIdx])
+
+ # Find which node in MST connects to minIdx
+ connectIdx = as.scalar(rowIndexMin(distances[minIdx,] + t(1-inMST) * 1e15))
+ edges[i,1] = minIdx
+ edges[i,2] = connectIdx
+ weights[i] = minWeight
+
+ inMST[minIdx] = 1
+ newDists = distances[minIdx,]
+ minDist = (minDist < t(newDists)) * minDist + (minDist >= t(newDists)) *
t(newDists)
+ }
+}
+
+
+# Union-find utils
+find = function(Matrix[Double] parent, Integer x)
+ return (Integer root)
+{
+ root = x
+ while(as.scalar(parent[root]) != root) {
+ root = as.integer(as.scalar(parent[root]))
+ }
+}
+
+
+union = function(Matrix[Double] parent, Matrix[Double] rank,
+ Integer x, Integer y, Matrix[Double] size,
+ Matrix[Double] compToNode, Integer newId)
+ return (Matrix[Double] newParent, Matrix[Double] newRank,
+ Matrix[Double] newSize, Matrix[Double] newCompToNode, Integer
newRoot)
+{
+ newParent = parent
+ newRank = rank
+ newSize = size
+ newCompToNode = compToNode
+
+ rankX = as.scalar(rank[x,1])
+ rankY = as.scalar(rank[y,1])
+
+ combinedSize = as.scalar(size[x,1]) + as.scalar(size[y,1])
+
+ if(rankX < rankY) {
+ newParent[x] = y
+ newSize[y,1] = combinedSize
+ newCompToNode[y,1] = newId
+ newRoot = y
+ }
+ else if(rankX > rankY) {
+ newParent[y] = x
+ newSize[x,1] = combinedSize
+ newCompToNode[x,1] = newId
+ newRoot = x
+ }
+ else {
+ newParent[y] = x
+ newRank[x,1] = rankX + 1
+ newSize[x,1] = combinedSize
+ newCompToNode[x,1] = newId
+ newRoot = x
+ }
+}
+
+
+buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights,
Integer n)
+ return (Matrix[Double] hierarchy, Matrix[Double] sizes)
+{
+ # create indexed weights to preserve original positions after sorting
+ indexedWeights = cbind(seq(1, nrow(weights)), weights)
+ sorted = order(target=indexedWeights, by=2, decreasing=FALSE)
+
+ # parent[i] = i, meaning each point is its own parent in the beginning
+ parent = seq(1, n)
+ # tree depth when unioning (icnreases only when merged trees are of same
height)
+ rank = matrix(0, rows=n, cols=1)
+
+ # initially, each component is a single point, so root i maps to leaf node
i.
+ # Later, when two components merge, the new component root will map to a
new
+ # internal linkage node id (n+1, n+2, ..., 2n-1)
+ compToNode = seq(1, n)
+
+ # size (number of original points) for each linkage-tree node id,
+ # leaves 1..n have size 1.
+ nodeSize = matrix(0, rows=2*n-1, cols=1)
+ for(j in 1:n) {
+ nodeSize[j,1] = 1
+ }
+
+ # hierarcy[i, :] = [cluster1_id, cluster2_id, merge_distance]
+ hierarchy = matrix(0, rows=n-1, cols=3)
+
+ # sizes[i] = size of the cluster created by merge i
+ sizes = matrix(0, rows=n-1, cols=1)
+
+ row = 1
+ for(i in 1:(n-1)) {
+ idx = as.integer(as.scalar(sorted[i,1]))
+ u = as.integer(as.scalar(edges[idx,1]))
+ v = as.integer(as.scalar(edges[idx,2]))
+ w = as.scalar(weights[idx,1])
+
+ root_u = find(parent, u)
+ root_v = find(parent, v)
+
+ if(root_u != root_v) {
+ left = as.integer(as.scalar(compToNode[root_u]))
+ right = as.integer(as.scalar(compToNode[root_v]))
+
+ newId = n + row
+
+ hierarchy[row,1] = left
+ hierarchy[row,2] = right
+ hierarchy[row,3] = w
+
+ nodeSize[newId,1] = as.scalar(nodeSize[left,1]) +
as.scalar(nodeSize[right,1])
+ sizes[row,1] = as.scalar(nodeSize[newId,1])
+
+ [parent, rank, ufSize, compToNode, newRoot] =
+ union(parent, rank, root_u, root_v, nodeSize, compToNode,
newId)
+
+ row = row + 1
+ }
+ }
+}
+
+
+# Leaf descendants util
+getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer
nodeId)
+ return (Matrix[Double] leaves)
+{
+ if(nodeId <= n) {
+ leaves = matrix(nodeId, rows=1, cols=1)
+ } else {
+ mergeIdx = nodeId - n
+ left = as.integer(as.scalar(hierarchy[mergeIdx,1]))
+ right = as.integer(as.scalar(hierarchy[mergeIdx,2]))
+
+ leftLeaves = getLeafDescendants(hierarchy, n, left)
+ rightLeaves = getLeafDescendants(hierarchy, n, right)
+
+ leaves = rbind(leftLeaves, rightLeaves)
+ }
+}
+
+
+extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double]
weights,
+ Integer n, Integer minClSize)
+ return (Matrix[Double] labels, Matrix[Double] stabilities, Matrix[Double]
clusterToNode)
+{
+ numMerges = n - 1 # hierarchical tree over n points has exactly n-1 merge
events
+ numNodes = 2*n - 1 # total nodes in the dendogram
+
+ # convert distances to lambda (density)
+ lambda = matrix(0, rows=numMerges, cols=1)
+ for(i in 1:numMerges) {
+ dist = as.scalar(hierarchy[i,3])
+ if(dist > 0) {
+ lambda[i,1] = 1.0 / dist
+ } else {
+ lambda[i,1] = 1e15
+ }
+ }
+
+ lambda_birth = matrix(1e15, rows=numNodes, cols=1)
+ lambda_death = matrix(0, rows=numNodes, cols=1)
+ cluster_size = matrix(0, rows=numNodes, cols=1)
+
+ # initialize the leaf nodes to have cluster size 1
+ for(i in 1:n) {
+ cluster_size[i,1] = 1
+ }
+
+ for(i in 1:numMerges) {
+ left = as.integer(as.scalar(hierarchy[i,1]))
+ right = as.integer(as.scalar(hierarchy[i,2]))
+ newId = n + i
+ merge_lambda = as.scalar(lambda[i,1])
+
+ # cluster newId starts existing as its own cluster at this density
level
+ # and that's why the children get their det set at the same density
+ lambda_birth[newId,1] = merge_lambda
+ lambda_death[left,1] = merge_lambda
+ lambda_death[right,1] = merge_lambda
+ cluster_size[newId,1] = as.scalar(cluster_size[left,1]) +
as.scalar(cluster_size[right,1])
+ }
+
+ # root cluster exists all the way
+ rootId = 2*n - 1
+ lambda_death[rootId,1] = 0
+
+ # compute own stability for each internal node
+ # NOTE: If the cluster is big enough, we assign stability.
+ # The more long-lived it is (birth - death) and
+ # the larger it is, the more stable it is.
+ stability = matrix(0, rows=numNodes, cols=1)
+ for(nodeId in (n+1):numNodes) {
+ size = as.scalar(cluster_size[nodeId,1])
+ birth = as.scalar(lambda_birth[nodeId,1])
+ death = as.scalar(lambda_death[nodeId,1])
+ if(size >= minClSize) {
+ stability[nodeId,1] = size * (birth - death)
+ }
+ }
+
+ # compute subtree stability (best achievable from each subtree)
+ subtree_stability = matrix(0, rows=numNodes, cols=1)
+
+ # leaf nodes have 0 subtree stability
+ for(i in 1:n) {
+ subtree_stability[i,1] = 0
+ }
+
+ # process merges in order (bottom-up)
+ for(i in 1:numMerges) {
+ nodeId = n + i
+ left = as.integer(as.scalar(hierarchy[i,1]))
+ right = as.integer(as.scalar(hierarchy[i,2]))
+
+ children_subtree = as.scalar(subtree_stability[left,1]) +
as.scalar(subtree_stability[right,1])
+ own_stab = as.scalar(stability[nodeId,1])
+
+ # Subtree stability is the best we can achieve from this subtree
+ if(children_subtree > own_stab) {
+ subtree_stability[nodeId,1] = children_subtree
+ } else {
+ subtree_stability[nodeId,1] = own_stab
+ }
+ }
+
+ # select clusters
+ selected = matrix(0, rows=numNodes, cols=1)
+ selected[rootId,1] = 1
+
+ i = numMerges
+ while(i >= 1) {
+ nodeId = n + i
+
+ if(as.scalar(selected[nodeId,1]) == 1) {
+ left = as.integer(as.scalar(hierarchy[i,1]))
+ right = as.integer(as.scalar(hierarchy[i,2]))
+
+ children_subtree = as.scalar(subtree_stability[left,1]) +
as.scalar(subtree_stability[right,1])
+ own_stab = as.scalar(stability[nodeId,1])
+ parent_size = as.scalar(cluster_size[nodeId,1])
+
+ # select children if they have higher subtree stability
+ if(parent_size < minClSize | children_subtree > own_stab) {
+ selected[nodeId,1] = 0
+ selected[left,1] = 1
+ selected[right,1] = 1
+ }
+ }
+
+ i = i - 1
+ }
+
+ # assign labels
+ labels = matrix(-1, rows=n, cols=1)
+ cluster_id = 1
+
+ # while tracking which node each cluster comes from
+ maxClusters = sum(selected) # upper bound on number of clusters
+ clusterToNode = matrix(0, rows=maxClusters, cols=1)
+
+ for(nodeId in 1:numNodes) {
+ if(as.scalar(selected[nodeId,1]) == 1) {
+ size = as.scalar(cluster_size[nodeId,1])
+
+ if(size >= minClSize) {
+ clusterToNode[cluster_id,1] = nodeId # ADD THIS LINE
+
+ leaves = getLeafDescendants(hierarchy, n, nodeId)
+
+ for(j in 1:nrow(leaves)) {
+ leafId = as.integer(as.scalar(leaves[j,1]))
+ if(leafId >= 1 & leafId <= n) {
+ labels[leafId,1] = cluster_id
+ }
+ }
+
+ cluster_id = cluster_id + 1
+ }
+ }
+ }
+
+ # trim clusterToNode to actual number of clusters
+ numActualClusters = cluster_id - 1
+ if(numActualClusters > 0) {
+ clusterToNode = clusterToNode[1:numActualClusters,]
+ } else {
+ clusterToNode = matrix(0, rows=0, cols=1)
+ }
+
+ stabilities = stability
+
+}
+
+
+buildClusterModel = function(Matrix[Double] X, Matrix[Double] labels,
+ Matrix[Double] stabilities, Matrix[Double]
clusterToNode)
+ return (Matrix[Double] centroids, Matrix[Double] clusterInfo)
+{
+ n = nrow(X)
+ d = ncol(X)
+
+ numClusters = as.integer(max(labels))
+
+ if(numClusters <= 0) {
+ # No clusters found, return empty model
+ centroids = matrix(0, rows=0, cols=d)
+ clusterInfo = matrix(0, rows=0, cols=2)
+ } else {
+ centroids = matrix(0, rows=numClusters, cols=d)
+ clusterInfo = matrix(0, rows=numClusters, cols=2)
+
+ for(c in 1:numClusters) {
+ # find all points in cluster c
+ mask = (labels == c)
+ clusterSize = sum(mask)
+
+ if(clusterSize > 0) {
+ # get centroid as mean of all points in cluster
+ clusterSum = t(mask) %*% X
+ centroids[c,] = clusterSum / clusterSize
+
+ # store cluster metadata: [size, stability]
+ clusterInfo[c,1] = clusterSize
+
+ # get stability for this cluster from the mapping
+ if(nrow(clusterToNode) >= c & as.scalar(clusterToNode[c,1]) >
0) {
+ nodeId = as.integer(as.scalar(clusterToNode[c,1]))
+ clusterInfo[c,2] = as.scalar(stabilities[nodeId,1])
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/scripts/staging/hdbscan/test_build_hierarchy.dml
b/scripts/staging/hdbscan/test_build_hierarchy.dml
new file mode 100644
index 0000000000..fc323c23d2
--- /dev/null
+++ b/scripts/staging/hdbscan/test_build_hierarchy.dml
@@ -0,0 +1,142 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# 4
+# / | \
+# / | \
+# / (2) (5)
+# | | \
+# | | \
+# (2) 1-(3)--3
+# | | /
+# \ | (4)
+# \ (1) /
+# \ | /
+# \|/
+# 2
+
+distances = matrix(0, rows=4, cols=4)
+distances[1,2] = 1
+distances[2,1] = 1
+
+distances[1,3] = 3
+distances[3,1] = 3
+
+distances[1,4] = 2
+distances[4,1] = 2
+
+distances[2,3] = 4
+distances[3,2] = 4
+
+distances[2,4] = 2
+distances[4,2] = 2
+
+distances[3,4] = 5
+distances[4,3] = 5
+
+[edges, weights] = hdb::buildMST(distances, 4)
+
+[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, 4)
+
+print("Hierarchy (format: [cluster1, cluster2, merge_distance]):")
+print(toString(hierarchy))
+
+# Should have n-1 merge operations for n nodes
+num_merges = nrow(hierarchy)
+print("Number of merges: " + num_merges + " (should be 3)")
+test1 = (num_merges == 3)
+
+# Merge distances should be in ascending order (or equal)
+# Because we process edges from low weight to high weight
+dist1 = as.scalar(hierarchy[1,3])
+dist2 = as.scalar(hierarchy[2,3])
+dist3 = as.scalar(hierarchy[3,3])
+print("\nMerge distances: [" + dist1 + ", " + dist2 + ", " + dist3 + "]" + "
(Should be in ascending order)")
+test2 = (dist1 <= dist2) & (dist2 <= dist3)
+
+# Cluster sizes should increase
+size1 = as.scalar(sizes[1])
+size2 = as.scalar(sizes[2])
+size3 = as.scalar(sizes[3])
+print("\nCluster sizes: [" + size1 + ", " + size2 + ", " + size3 + "]" + "
(Should be increasing)")
+test3 = (size1 <= size2) & (size2 <= size3)
+
+# Final size should equal total number of nodes
+print("Final cluster size: " + size3 + " (should be 4)")
+test4 = (size3 == 4)
+
+# First merge should be size 2
+print("First merge size: " + size1 + " (should be 2)")
+test5 = (size1 == 2)
+
+# New classic-linkage checks
+n = 4
+hasInternal = (sum(hierarchy[,1] > n) + sum(hierarchy[,2] > n)) > 0
+print("Has internal node ids (>n): " + hasInternal + " (should be true)")
+test6 = hasInternal
+
+# Check that child ids are within valid range
+maxChild1 = max(hierarchy[,1])
+maxChild2 = max(hierarchy[,2])
+maxChild = maxChild1
+if(maxChild2 > maxChild) { maxChild = maxChild2 }
+
+print("Max child id: " + maxChild + " (should be <= " + (2*n-2) + ")")
+test7 = (maxChild <= (2*n-2))
+
+# Check that internal ids are “created in order”
+test8 = TRUE
+for(r in 1:(n-1)) {
+ child1 = as.scalar(hierarchy[r,1])
+ child2 = as.scalar(hierarchy[r,2])
+ newId = n + r
+ ok = (child1 < newId) & (child2 < newId)
+ test8 = test8 & ok
+}
+print("Children reference only existing nodes: " + test8 + " (should be true)")
+
+# Recompute node sizes from hierarchy and verify
+nodeSize = matrix(0, rows=2*n-1, cols=1)
+for(j in 1:n) { nodeSize[j,1] = 1 }
+
+test9 = TRUE
+for(r in 1:(n-1)) {
+ left = as.integer(as.scalar(hierarchy[r,1]))
+ right = as.integer(as.scalar(hierarchy[r,2]))
+ newId = n + r
+
+ expected = as.scalar(nodeSize[left,1]) + as.scalar(nodeSize[right,1])
+ nodeSize[newId,1] = expected
+
+ ok = (as.scalar(sizes[r,1]) == expected)
+ test9 = test9 & ok
+}
+print("sizes[r] equals sum of child sizes: " + test9 + " (should be true)")
+
+test_pass = test1 & test2 & test3 & test4 & test5 & test6 & test7 & test8 &
test9
+
+if(test_pass) {
+ print("Passed")
+} else {
+ print("Failed")
+}
\ No newline at end of file
diff --git a/scripts/staging/hdbscan/test_build_mst.dml
b/scripts/staging/hdbscan/test_build_mst.dml
new file mode 100644
index 0000000000..4ec05e18fb
--- /dev/null
+++ b/scripts/staging/hdbscan/test_build_mst.dml
@@ -0,0 +1,68 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# 4
+# / | \
+# / | \
+# / (2) (5)
+# | | \
+# | | \
+# (2) 1-(3)--3
+# | | /
+# \ | (4)
+# \ (1) /
+# \ | /
+# \|/
+# 2
+
+distances = matrix(0, rows=4, cols=4)
+distances[1,2] = 1
+distances[2,1] = 1
+
+distances[1,3] = 3
+distances[3,1] = 3
+
+distances[1,4] = 2
+distances[4,1] = 2
+
+distances[2,3] = 4
+distances[3,2] = 4
+
+distances[2,4] = 2
+distances[4,2] = 2
+
+distances[3,4] = 5
+distances[4,3] = 5
+
+[edges, weights] = hdb::buildMST(distances, 4)
+
+totalWeight = sum(weights)
+
+test_pass = (nrow(edges) == 3) & (totalWeight == 6)
+
+if(test_pass) {
+ print("Passed")
+} else {
+ print("Failed")
+}
\ No newline at end of file
diff --git a/scripts/staging/hdbscan/test_cluster_model.dml
b/scripts/staging/hdbscan/test_cluster_model.dml
new file mode 100644
index 0000000000..363b356ab5
--- /dev/null
+++ b/scripts/staging/hdbscan/test_cluster_model.dml
@@ -0,0 +1,151 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# 3 clear clusters
+# A: points around (0, 0)
+# B: points around (10, 10)
+# C: points around (20, 0)
+
+n = 12
+d = 2
+X = matrix(0, rows=n, cols=d)
+
+# A
+X[1,] = matrix("0.0 0.0", rows=1, cols=2)
+X[2,] = matrix("0.5 0.5", rows=1, cols=2)
+X[3,] = matrix("-0.5 0.5", rows=1, cols=2)
+X[4,] = matrix("0.0 -0.5", rows=1, cols=2)
+
+# B
+X[5,] = matrix("10.0 10.0", rows=1, cols=2)
+X[6,] = matrix("10.5 10.5", rows=1, cols=2)
+X[7,] = matrix("9.5 10.5", rows=1, cols=2)
+X[8,] = matrix("10.0 9.5", rows=1, cols=2)
+
+# C
+X[9,] = matrix("20.0 0.0", rows=1, cols=2)
+X[10,] = matrix("20.5 0.5", rows=1, cols=2)
+X[11,] = matrix("19.5 0.5", rows=1, cols=2)
+X[12,] = matrix("20.0 -0.5", rows=1, cols=2)
+
+print(toString(X))
+
+
+# get distances
+distances = hdb::dist(X)
+
+
+# get core distances
+minPts = 3
+coreDistances = matrix(0, rows=n, cols=1)
+for(i in 1:n) {
+ kthDist = hdb::computeKthSmallest(t(distances[i,]), minPts)
+ coreDistances[i] = kthDist
+}
+
+
+# get mutual reachability
+mutualReach = hdb::computeMutualReachability(distances, coreDistances)
+
+
+# get MST
+[edges, weights] = hdb::buildMST(mutualReach, n)
+
+
+# get hierarchy
+[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, n)
+
+
+# get stable clusters
+minClSize = 3
+[labels, stabilities, clusterToNode] = hdb::extractStableClusters(hierarchy,
weights, n, minClSize)
+expected_labels = matrix("1 1 1 1 2 2 2 2 3 3 3 3", rows=12, cols=1)
+labels_match = (min(labels == expected_labels) == 1)
+if (labels_match) {
+ print("Pass: labels match.")
+} else {
+ print("Fail: labels don't match.")
+}
+print("Cluster labels:")
+print(toString(labels))
+
+
+# get cluster model
+[centroids, clusterInfo] = hdb::buildClusterModel(X, labels, stabilities,
clusterToNode)
+
+print("\nCentroids:")
+print(toString(centroids))
+
+print("\nInfo [size, stability]:")
+print(toString(clusterInfo))
+
+
+# check results
+numClusters = nrow(centroids)
+print("\nNumber of clusters found: " + numClusters)
+
+
+# should find 3 clusters
+test1 = (numClusters == 3)
+print("Found 3 clusters: " + test1)
+
+
+# each cluster should have 4 points
+allSizesFour = TRUE
+for(c in 1:numClusters) {
+ size = as.scalar(clusterInfo[c,1])
+ allSizesFour = allSizesFour & (size == 4)
+}
+print("All clusters have size 4: " + allSizesFour)
+
+
+test_A = min(sqrt(rowSums((centroids - matrix("0 0.125", 1, 2))^2))) < 0.001
+test_B = min(sqrt(rowSums((centroids - matrix("10 10.125", 1, 2))^2))) < 0.001
+test_C = min(sqrt(rowSums((centroids - matrix("20 0.125", 1, 2))^2))) < 0.001
+all_found = test_A & test_B & test_C
+print("Expected centroids near expected positions: " + all_found)
+
+
+# no noise (all assigned to clusters)
+numNoise = sum(labels == -1)
+test4 = (numNoise == 0)
+print("No noise points: " + test4)
+
+
+# stabilities are populated (not 0)
+test_stability = TRUE
+for(c in 1:numClusters) {
+ stab = as.scalar(clusterInfo[c,2])
+ test_stability = test_stability & (stab > 0)
+}
+print("Stabilities populated: " + test_stability)
+
+
+test_pass = test1 & allSizesFour & all_found & test4 & test_stability
+
+if(test_pass) {
+ print("\nAll tests passed")
+} else {
+ print("\nTests failed")
+}
\ No newline at end of file
diff --git a/scripts/staging/hdbscan/test_extract_stabe_clusters.dml
b/scripts/staging/hdbscan/test_extract_stabe_clusters.dml
new file mode 100644
index 0000000000..8d19acf7cd
--- /dev/null
+++ b/scripts/staging/hdbscan/test_extract_stabe_clusters.dml
@@ -0,0 +1,158 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# 6 point example with clear cluster structure
+# 1,2,3 form tight cluster A, 4,5,6 form tight cluster B, A and B are far
apart (10)
+
+n = 6
+distances = matrix(10, rows=n, cols=n)
+
+# points 1,2,3 (cluster A)
+distances[1,2] = 1
+distances[2,1] = 1
+
+distances[1,3] = 2
+distances[3,1] = 2
+
+distances[2,3] = 1
+distances[3,2] = 1
+
+# points 4,5,6 (cluster B)
+distances[4,5] = 1
+distances[5,4] = 1
+
+distances[4,6] = 2
+distances[6,4] = 2
+
+distances[5,6] = 1
+distances[6,5] = 1
+
+# zero diagonal (to self)
+for(i in 1:n) {
+ distances[i,i] = 0
+}
+
+
+print("\nBuilding MST")
+expected_edges = matrix("2 1 3 2 6 3 5 6 4 5", rows=5, cols=2)
+expected_weights = matrix("1 1 10 1 1", rows=5, cols=1)
+[edges, weights] = hdb::buildMST(distances, n)
+edges_match = (min(edges == expected_edges) == 1)
+weights_match = (min(weights == expected_weights) == 1)
+if (edges_match) {
+ print("Pass: edges match.")
+} else {
+ print("Fail: edges don't match.")
+}
+if (weights_match) {
+ print("Pass: weights match.")
+} else {
+ print("Fail: weights don't match.")
+}
+print("MST edges:\n" + toString(edges))
+print("MST weights:\n " + toString(weights))
+
+
+print("\nBuilding hierarchy")
+[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, n)
+expected_hierarchy = matrix("2 1 1 3 7 1 5 6 1 4 9 1 10 8 10", rows=5,
cols=3)
+expected_sizes = matrix("2 3 2 3 6", rows=5, cols=1)
+hierachy_match = (min(hierarchy == expected_hierarchy) == 1)
+sizes_match = (min(sizes == expected_sizes) == 1)
+if (hierachy_match) {
+ print("Pass: hierachy mathes.")
+} else {
+ print("Fail: hierarchy doesn't match.")
+}
+if (sizes_match) {
+ print("Pass: sizes match.")
+} else {
+ print("Fail: sizes don't match.")
+}
+print("Hierarchy:\n" + toString(hierarchy))
+print("Sizes:\n" + toString(sizes))
+
+
+print("\nExtracting stable clusters with minClSize=2")
+[labels, stabilities] = hdb::extractStableClusters(hierarchy, weights, n, 2)
+expected_labels = matrix("1 1 1 2 2 2", rows=6, cols=1)
+expected_stabilites = matrix("0 0 0 0 0 0 0 2.7 0 2.7 0.6", rows=n*2-1, cols=1)
+labels_match = (min(labels == expected_labels) == 1)
+tolerance = 1e-10
+stabilities_match = max(abs(stabilities - expected_stabilites)) < tolerance
+if (labels_match) {
+ print("Pass: labels match.")
+} else {
+ print("Fail: labels don't match.")
+}
+if (stabilities_match) {
+ print("Pass: stabilities match.")
+} else {
+ print("Fail: stabilities don't match.")
+}
+print("Cluster labels:\n" + toString(labels))
+print("Top stabilities:\n" + toString(stabilities))
+
+
+
+# check results (we have some duplicate logic here, but anyway)
+num_clusters = max(labels)
+num_noise = sum(labels == -1)
+
+print("\nNumber of clusters found: " + num_clusters)
+print("Number of noise points: " + num_noise)
+
+# should find 2 clusters
+test1 = (num_clusters == 2)
+print("Found 2 clusters: " + test1)
+
+# no points should be noise
+test2 = (num_noise == 0)
+print("No noise points: " + test2)
+
+# points 1,2,3 should be in same cluster
+label1 = as.scalar(labels[1])
+label2 = as.scalar(labels[2])
+label3 = as.scalar(labels[3])
+test3 = (label1 == label2) & (label2 == label3) & (label1 > 0)
+print("points 1,2,3 in same cluster: " + test3)
+
+# points 4,5,6 should be in same
+label4 = as.scalar(labels[4])
+label5 = as.scalar(labels[5])
+label6 = as.scalar(labels[6])
+test4 = (label4 == label5) & (label5 == label6) & (label4 > 0)
+print("Points 4,5,6 in same cluster: " + test4)
+
+# clusters A and B should be different
+test5 = (label1 != label4)
+print("Two clusters are different: " + test5)
+
+test_pass = edges_match & weights_match & hierachy_match & sizes_match &
labels_match & stabilities_match & test1 & test2 & test3 & test4 & test5
+
+if(test_pass) {
+ print("\nAll tests passed\n")
+} else {
+ print("\nTests failed\n")
+}
\ No newline at end of file
diff --git a/scripts/staging/hdbscan/test_hdbscan.dml
b/scripts/staging/hdbscan/test_hdbscan.dml
new file mode 100644
index 0000000000..411ef0d047
--- /dev/null
+++ b/scripts/staging/hdbscan/test_hdbscan.dml
@@ -0,0 +1,73 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# A: 5 points tightly grouped around (0, 0)
+# B: 5 points tightly grouped around (10, 0)
+# Outliers: 2 points far from any cluster
+
+n = 12
+d = 2
+X = matrix(0, rows=n, cols=d)
+
+# A
+X[1,] = matrix("0.0 0.0", rows=1, cols=2)
+X[2,] = matrix("0.3 0.2", rows=1, cols=2)
+X[3,] = matrix("-0.2 0.3", rows=1, cols=2)
+X[4,] = matrix("0.1 -0.2", rows=1, cols=2)
+X[5,] = matrix("-0.1 -0.1", rows=1, cols=2)
+
+# B
+X[6,] = matrix("10.0 0.0", rows=1, cols=2)
+X[7,] = matrix("10.3 0.2", rows=1, cols=2)
+X[8,] = matrix("9.8 0.3", rows=1, cols=2)
+X[9,] = matrix("10.1 -0.2", rows=1, cols=2)
+X[10,] = matrix("9.9 -0.1", rows=1, cols=2)
+
+# Outliers
+X[11,] = matrix("5.0 5.0", rows=1, cols=2)
+X[12,] = matrix("-5.0 -5.0", rows=1, cols=2)
+
+print(toString(X))
+
+minPts = 3
+minClSize = 4
+
+
+# run hdbscan
+[labels, centroids] = hdb::m_hdbscan(X, minPts, minClSize)
+expected_labels = matrix("1 1 1 1 1 2 2 2 2 2 -1 -1", rows=12, cols=1)
+labels_match = (min(labels == expected_labels) == 1)
+if (labels_match) {
+ print("Pass: labels match.")
+} else {
+ print("Fail: labels don't match.")
+}
+test_A = min(sqrt(rowSums((centroids - matrix("0.020 0.040", 1, 2))^2))) <
0.001
+test_B = min(sqrt(rowSums((centroids - matrix("10.020 0.040", 1, 2))^2))) <
0.001
+all_found = test_A & test_B
+print("Expected centroids near expected positions: " + all_found)
+print("Cluster labels:")
+print(toString(labels))
+print("\nCluster centroids:")
+print(toString(centroids))
diff --git a/scripts/staging/hdbscan/test_kth_smallest.dml
b/scripts/staging/hdbscan/test_kth_smallest.dml
new file mode 100644
index 0000000000..0ddf5d6a03
--- /dev/null
+++ b/scripts/staging/hdbscan/test_kth_smallest.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+source("scripts/builtin/hdbscan.dml") as hdb
+
+array = matrix("0 1 4 2", rows=4, cols=1)
+
+res1 = hdb::computeKthSmallest(array, 1)
+res2 = hdb::computeKthSmallest(array, 2)
+
+test_pass = (res1 == 1) & (res2 == 2)
+
+if(test_pass) {
+ print("Passed")
+} else {
+ print("Failed")
+}
\ No newline at end of file
diff --git a/scripts/staging/hdbscan/test_mutual_reachability.dml
b/scripts/staging/hdbscan/test_mutual_reachability.dml
new file mode 100644
index 0000000000..757b5f98ee
--- /dev/null
+++ b/scripts/staging/hdbscan/test_mutual_reachability.dml
@@ -0,0 +1,48 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+source("scripts/builtin/hdbscan.dml") as hdb
+
+distances = matrix("0 1 5 1 0 4 5 4 0", rows=3, cols=3)
+coreDistances = matrix("1 1 4", rows=3, cols=1)
+
+mutualReach = hdb::computeMutualReachability(distances, coreDistances)
+
+diagSum = sum(diag(mutualReach))
+
+val_AB = as.scalar(mutualReach[1,2])
+val_AC = as.scalar(mutualReach[1,3])
+val_BC = as.scalar(mutualReach[2,3])
+
+sym_AB = as.scalar(mutualReach[2,1])
+sym_AC = as.scalar(mutualReach[3,1])
+sym_BC = as.scalar(mutualReach[3,2])
+
+test1_pass = (val_AB == 1) & (val_AC == 5) & (val_BC == 4) &
+ (diagSum == 0) &
+ (val_AB == sym_AB) & (val_AC == sym_AC) & (val_BC == sym_BC)
+
+if(test1_pass) {
+ print("Passed")
+} else {
+ print("Failed")
+}
diff --git a/scripts/staging/hdbscan/test_union_find.dml
b/scripts/staging/hdbscan/test_union_find.dml
new file mode 100644
index 0000000000..05b0d70604
--- /dev/null
+++ b/scripts/staging/hdbscan/test_union_find.dml
@@ -0,0 +1,161 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# 1 -> 1
+# 2 -> 1
+# 3 -> 2 -> 1
+# 4 -> 4
+
+n = 4
+parent = matrix(0, rows=n, cols=1)
+parent[1] = 1
+parent[2] = 1
+parent[3] = 2
+parent[4] = 4
+
+root1 = hdb::find(parent, 1)
+root2 = hdb::find(parent, 2)
+root3 = hdb::find(parent, 3)
+root4 = hdb::find(parent, 4)
+
+test_find = (root1 == 1) & (root2 == 1) & (root3 == 1) & (root4 == 4)
+
+if(test_find) {
+ print("Passed")
+} else {
+ print("Failed")
+}
+
+# ensure union() attaches lower-rank root under higher-rank root
+# rank[1] < rank[2] => parent[1] becomes 2
+
+newId = 5 # arbitrary "new linkage node id" for this unit test
+
+parentA = matrix(0, rows=n, cols=1)
+parentA[1] = 1
+parentA[2] = 2
+parentA[3] = 3
+parentA[4] = 4
+
+rankA = matrix(0, rows=n, cols=1)
+rankA[1] = 0
+rankA[2] = 1
+rankA[3] = 0
+rankA[4] = 0
+
+sizeA = matrix(1, rows=n, cols=1)
+
+compToNodeA = matrix(0, rows=n, cols=1)
+compToNodeA[1] = 1
+compToNodeA[2] = 2
+compToNodeA[3] = 3
+compToNodeA[4] = 4
+
+[newParentA, newRankA, newSizeA, newCompToNodeA, newRootA] =
hdb::union(parentA, rankA, 1, 2, sizeA, compToNodeA, newId)
+test_unionA = (as.scalar(newParentA[1]) == 2) & (as.scalar(newParentA[2]) == 2)
+
+if(test_unionA) {
+ print("Passed")
+} else {
+ print("Failed")
+}
+
+# ensure union() attaches lower-rank root under higher-rank root
+# rank[1] > rank[2] => parent[2] becomes 1
+
+parentB = matrix(0, rows=n, cols=1)
+parentB[1] = 1
+parentB[2] = 2
+parentB[3] = 3
+parentB[4] = 4
+
+rankB = matrix(0, rows=n, cols=1)
+rankB[1] = 2
+rankB[2] = 1
+rankB[3] = 0
+rankB[4] = 0
+
+sizeB = matrix(1, rows=n, cols=1)
+
+compToNodeB = matrix(0, rows=n, cols=1)
+compToNodeB[1] = 1
+compToNodeB[2] = 2
+compToNodeB[3] = 3
+compToNodeB[4] = 4
+
+[newParentB, newRankB, newSizeB, newCompToNodeB, newRootB] =
hdb::union(parentB, rankB, 1, 2, sizeB, compToNodeB, newId)
+test_unionB = (as.scalar(newParentB[2]) == 1) & (as.scalar(newParentB[1]) == 1)
+
+if(test_unionB) {
+ print("Passed")
+} else {
+ print("Failed")
+}
+
+# check linkage
+test_unionB = test_unionB & (newRootB == 1) & (as.scalar(newCompToNodeB[1]) ==
newId)
+
+if(test_unionB) {
+ print("Passed")
+} else {
+ print("Failed")
+}
+
+
+# ensure union() increments if height is same
+
+parentC = matrix(0, rows=n, cols=1)
+parentC[1] = 1
+parentC[2] = 2
+parentC[3] = 3
+parentC[4] = 4
+
+rankC = matrix(0, rows=n, cols=1)
+rankC[1] = 0
+rankC[2] = 0
+rankC[3] = 0
+rankC[4] = 0
+
+sizeC = matrix(1, rows=n, cols=1)
+
+compToNodeC = matrix(0, rows=n, cols=1)
+compToNodeC[1] = 1
+compToNodeC[2] = 2
+compToNodeC[3] = 3
+compToNodeC[4] = 4
+
+[newParentC, newRankC, newSizeC, newCompToNodeC, newRootC] =
hdb::union(parentC, rankC, 1, 2, sizeC, compToNodeC, newId)
+
+test_unionC_parent = (as.scalar(newParentC[2]) == 1)
+test_unionC_rank = (as.scalar(newRankC[1]) == 1)
+
+test_unionC = test_unionC_parent & test_unionC_rank
+
+test_unionC = test_unionC & (newRootC == 1) & (as.scalar(newCompToNodeC[1]) ==
newId)
+
+if(test_unionC) {
+ print("Passed")
+} else {
+ print("Failed")
+}