IGNITE-5246: Fuzzy c-means (FCM) this closes #2969
Project: http://git-wip-us.apache.org/repos/asf/ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/b13f088b Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/b13f088b Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/b13f088b Branch: refs/heads/ignite-zk Commit: b13f088b210b66b4b38c260b48d6853e03f46a6c Parents: cd0d2eb Author: Ilya Nozhkin <[email protected]> Authored: Sun Dec 3 23:08:49 2017 +0300 Committer: Yury Babak <[email protected]> Committed: Sun Dec 3 23:08:49 2017 +0300 ---------------------------------------------------------------------- .../ml/clustering/FuzzyCMeansExample.java | 125 +++++ .../examples/ml/clustering/package-info.java | 22 + .../ignite/ml/FuzzyCMeansModelFormat.java | 76 +++ .../ml/clustering/BaseFuzzyCMeansClusterer.java | 90 ++++ .../FuzzyCMeansDistributedClusterer.java | 512 +++++++++++++++++++ .../clustering/FuzzyCMeansLocalClusterer.java | 254 +++++++++ .../ignite/ml/clustering/FuzzyCMeansModel.java | 88 ++++ .../ignite/ml/clustering/WeightedClusterer.java | 2 +- .../ml/clustering/ClusteringTestSuite.java | 4 +- .../FuzzyCMeansDistributedClustererTest.java | 177 +++++++ .../FuzzyCMeansLocalClustererTest.java | 203 ++++++++ 11 files changed, 1551 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/FuzzyCMeansExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/FuzzyCMeansExample.java b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/FuzzyCMeansExample.java new file mode 100644 index 0000000..9c47186 --- /dev/null +++ b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/FuzzyCMeansExample.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.examples.ml.clustering; + +import org.apache.ignite.Ignite; +import org.apache.ignite.Ignition; +import org.apache.ignite.ml.clustering.BaseFuzzyCMeansClusterer; +import org.apache.ignite.ml.clustering.FuzzyCMeansDistributedClusterer; +import org.apache.ignite.ml.clustering.FuzzyCMeansModel; +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.EuclideanDistance; +import org.apache.ignite.ml.math.StorageConstants; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; +import org.apache.ignite.thread.IgniteThread; + +/** + * This example shows how to use Fuzzy C-Means clusterer + * ({@link org.apache.ignite.ml.clustering.FuzzyCMeansDistributedClusterer}). + */ +public final class FuzzyCMeansExample { + /** + * Executes example. + * + * @param args Command line arguments, none required. + */ + public static void main(String[] args) throws InterruptedException { + System.out.println(); + System.out.println(">>> Fuzzy C-Means usage example started."); + // Start ignite grid. + try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) { + System.out.println(">>> Ignite grid started."); + + // Start new Ignite thread. + IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(), + FuzzyCMeansExample.class.getSimpleName(), + () -> { + + // Distance measure that computes distance between two points. + DistanceMeasure distanceMeasure = new EuclideanDistance(); + + // "Fuzziness" - specific constant that is used in membership calculation (1.0+-eps ~ K-Means). + double exponentialWeight = 2.0; + + // Condition that indicated when algorithm must stop. + // In this example algorithm stops if memberships have changed insignificantly. + BaseFuzzyCMeansClusterer.StopCondition stopCond = + BaseFuzzyCMeansClusterer.StopCondition.STABLE_MEMBERSHIPS; + + // Maximum difference between new and old membership values with which algorithm will continue to work. + double maxDelta = 0.01; + + // The maximum number of FCM iterations. + int maxIterations = 50; + + // Value that is used to initialize random numbers generator. You can choose it randomly. + Long seed = null; + + // Number of steps of primary centers selection (more steps more candidates). + int initializationSteps = 2; + + // Number of K-Means iteration that is used to choose required number of primary centers from candidates. + int kMeansMaxIterations = 50; + + // Create new distributed clusterer with parameters described above. + System.out.println(">>> Create new Distributed Fuzzy C-Means clusterer."); + FuzzyCMeansDistributedClusterer clusterer = new FuzzyCMeansDistributedClusterer( + distanceMeasure, exponentialWeight, stopCond, maxDelta, maxIterations, + seed, initializationSteps, kMeansMaxIterations); + + // Create sample data. + double[][] points = new double[][]{{-10, -10}, {-9, -11}, {-10, -9}, {-11, -9}, + {10, 10}, {9, 11}, {10, 9}, {11, 9}, + {-10, 10}, {-9, 11}, {-10, 9}, {-11, 9}, + {10, -10}, {9, -11}, {10, -9}, {11, -9}}; + + // Initialize matrix of data points. Each row contains one point. + int rows = points.length; + int cols = points[0].length; + + System.out.println(">>> Create the matrix that contains sample points."); + SparseDistributedMatrix pntMatrix = new SparseDistributedMatrix(rows, cols, + StorageConstants.ROW_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE); + + // Store points into matrix. + pntMatrix.assign(points); + + // Call clusterization method with some number of centers. + // It returns model that can predict results for new points. + System.out.println(">>> Perform clusterization."); + int numCenters = 4; + FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, numCenters); + + // You can also get centers of clusters that is computed by Fuzzy C-Means algorithm. + Vector[] centers = mdl.centers(); + + StringBuilder results = new StringBuilder(">>> Results:\n"); + results.append(">>> 1st center: " + centers[0].get(0) + " " + centers[0].get(1) + "\n"); + results.append(">>> 2nd center: " + centers[1].get(0) + " " + centers[1].get(1) + "\n"); + results.append(">>> 3rd center: " + centers[2].get(0) + " " + centers[2].get(1) + "\n"); + results.append(">>> 4th center: " + centers[3].get(0) + " " + centers[3].get(1) + "\n"); + + System.out.println(results.toString()); + }); + + igniteThread.start(); + igniteThread.join(); + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java ---------------------------------------------------------------------- diff --git a/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java new file mode 100644 index 0000000..7051912 --- /dev/null +++ b/examples/src/main/ml/org/apache/ignite/examples/ml/clustering/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <!-- Package description. --> + * Clustering examples. + */ +package org.apache.ignite.examples.ml.clustering; http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/main/java/org/apache/ignite/ml/FuzzyCMeansModelFormat.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/FuzzyCMeansModelFormat.java b/modules/ml/src/main/java/org/apache/ignite/ml/FuzzyCMeansModelFormat.java new file mode 100644 index 0000000..2b27e86 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/FuzzyCMeansModelFormat.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml; + +import java.io.Serializable; +import java.util.Arrays; +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.Vector; + +/** Fuzzy C-Means model representation. */ +public class FuzzyCMeansModelFormat implements Serializable { + /** Centers of clusters. */ + private final Vector[] centers; + + /** Distance measure. */ + private final DistanceMeasure measure; + + /** + * Constructor that retains result of clusterization and distance measure. + * + * @param centers Centers found while clusterization. + * @param measure Distance measure. + */ + public FuzzyCMeansModelFormat(Vector[] centers, DistanceMeasure measure) { + this.centers = centers; + this.measure = measure; + } + + /** Distance measure used while clusterization. */ + public DistanceMeasure getMeasure() { + return measure; + } + + /** Get cluster centers. */ + public Vector[] getCenters() { + return centers; + } + + /** {@inheritDoc} */ + @Override public int hashCode() { + int res = 1; + + res = res * 37 + measure.hashCode(); + res = res * 37 + Arrays.hashCode(centers); + + return res; + } + + /** {@inheritDoc} */ + @Override public boolean equals(Object obj) { + if (this == obj) + return true; + + if (obj == null || getClass() != obj.getClass()) + return false; + + FuzzyCMeansModelFormat that = (FuzzyCMeansModelFormat) obj; + + return measure.equals(that.measure) && Arrays.deepEquals(centers, that.centers); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseFuzzyCMeansClusterer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseFuzzyCMeansClusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseFuzzyCMeansClusterer.java new file mode 100644 index 0000000..65aaeee --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/BaseFuzzyCMeansClusterer.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.clustering; + +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.Matrix; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.exceptions.ConvergenceException; +import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException; + +/** The abstract class that defines the basic interface of Fuzzy C-Means clusterers */ +public abstract class BaseFuzzyCMeansClusterer<T extends Matrix> implements Clusterer<T, FuzzyCMeansModel> { + /** Distance measure. */ + protected DistanceMeasure measure; + + /** Specific constant which is used in calculating of membership matrix. */ + protected double exponentialWeight; + + /** The maximum distance between old and new centers or the maximum difference between new and old membership matrix + * elements for which algorithm must stop. */ + protected double maxDelta; + + /** The flag that tells when algorithm should stop. */ + protected StopCondition stopCond; + + /** + * Constructor that stores some required parameters. + * + * @param measure Distance measure. + * @param exponentialWeight Specific constant which is used in calculating of membership matrix. + * @param stopCond Flag that tells when algorithm should stop. + * @param maxDelta The maximum distance between old and new centers or maximum difference between new and old + * membership matrix elements for which algorithm must stop. + */ + protected BaseFuzzyCMeansClusterer(DistanceMeasure measure, double exponentialWeight, StopCondition stopCond, + double maxDelta) { + this.measure = measure; + this.exponentialWeight = exponentialWeight; + this.stopCond = stopCond; + this.maxDelta = maxDelta; + } + + /** + * Perform a cluster analysis on the given set of points. + * + * @param points The set of points. + * @return A list of clusters. + * @throws MathIllegalArgumentException If points are null or the number of data points is not compatible with this + * clusterer. + * @throws ConvergenceException If the algorithm has not yet converged after the maximum number of iterations has + * been exceeded. + */ + public abstract FuzzyCMeansModel cluster(T points, int k); + + /** + * Calculates the distance between two vectors. * with the configured {@link DistanceMeasure}. + * + * @return The distance between two points. + */ + protected double distance(final Vector v1, final Vector v2) { + return measure.compute(v1, v2); + } + + /** Enumeration that contains different conditions under which algorithm must stop. */ + public enum StopCondition { + /** Algorithm stops if the maximum distance between new and old centers is less than {@link #maxDelta}. */ + STABLE_CENTERS, + + /** + * Algorithm stops if the maximum difference between elements of new and old membership matrix is less than + * {@link #maxDelta}. + */ + STABLE_MEMBERSHIPS + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClusterer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClusterer.java new file mode 100644 index 0000000..a5cd871 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClusterer.java @@ -0,0 +1,512 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.clustering; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.stream.Collectors; +import javax.cache.Cache; +import org.apache.ignite.internal.util.GridArgumentCheck; +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.VectorUtils; +import org.apache.ignite.ml.math.distributed.CacheUtils; +import org.apache.ignite.ml.math.distributed.keys.impl.SparseMatrixKey; +import org.apache.ignite.ml.math.exceptions.ConvergenceException; +import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException; +import org.apache.ignite.ml.math.functions.Functions; +import org.apache.ignite.ml.math.functions.IgniteBiFunction; +import org.apache.ignite.ml.math.functions.IgniteSupplier; +import org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix; +import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; +import org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; +import org.apache.ignite.ml.math.util.MatrixUtil; + +/** This class implements distributed version of Fuzzy C-Means clusterization of equal-weighted points. */ +public class FuzzyCMeansDistributedClusterer extends BaseFuzzyCMeansClusterer<SparseDistributedMatrix> { + /** Random numbers generator which is used in centers selection. */ + private Random rnd; + + /** The value that is used to initialize random numbers generator. */ + private long seed; + + /** The number of initialization steps each of which adds some number of candidates for being a center. */ + private int initSteps; + + /** The maximum number of iterations of K-Means algorithm which selects the required number of centers. */ + private int kMeansMaxIterations; + + /** The maximum number of FCM iterations. */ + private int cMeansMaxIterations; + + /** + * Constructor that retains all required parameters. + * + * @param measure Distance measure. + * @param exponentialWeight Specific constant which is used in calculating of membership matrix. + * @param stopCond Flag that tells when algorithm should stop. + * @param maxDelta The maximum distance between old and new centers or maximum difference between new and old + * membership matrix elements for which algorithm must stop. + * @param cMeansMaxIterations The maximum number of FCM iterations. + * @param seed Seed for random numbers generator. + * @param initSteps Number of steps of primary centers selection (the more steps, the more candidates). + * @param kMeansMaxIterations The maximum number of K-Means iteration in primary centers selection. + */ + public FuzzyCMeansDistributedClusterer(DistanceMeasure measure, double exponentialWeight, + StopCondition stopCond, double maxDelta, int cMeansMaxIterations, + Long seed, int initSteps, int kMeansMaxIterations) { + super(measure, exponentialWeight, stopCond, maxDelta); + + this.seed = seed != null ? seed : new Random().nextLong(); + this.initSteps = initSteps; + this.cMeansMaxIterations = cMeansMaxIterations; + this.kMeansMaxIterations = kMeansMaxIterations; + rnd = new Random(this.seed); + } + + /** {@inheritDoc} */ + @Override public FuzzyCMeansModel cluster(SparseDistributedMatrix points, int k) + throws MathIllegalArgumentException, ConvergenceException { + GridArgumentCheck.notNull(points, "points"); + + if (k < 2) + throw new MathIllegalArgumentException("The number of clusters is less than 2"); + + Vector[] centers = initializeCenters(points, k); + + MembershipsAndSums membershipsAndSums = null; + + int iteration = 0; + boolean finished = false; + while (!finished && iteration < cMeansMaxIterations) { + MembershipsAndSums newMembershipsAndSums = calculateMembership(points, centers); + Vector[] newCenters = calculateNewCenters(points, newMembershipsAndSums, k); + + if (stopCond == StopCondition.STABLE_CENTERS) + finished = isFinished(centers, newCenters); + else + finished = isFinished(membershipsAndSums, newMembershipsAndSums); + + centers = newCenters; + membershipsAndSums = newMembershipsAndSums; + + iteration++; + } + + if (iteration == cMeansMaxIterations) + throw new ConvergenceException("Fuzzy C-Means algorithm has not converged after " + + Integer.toString(iteration) + " iterations"); + + return new FuzzyCMeansModel(centers, measure); + } + + /** + * Choose k primary centers from source points. + * + * @param points Matrix with source points. + * @param k Number of centers. + * @return Array of primary centers. + */ + private Vector[] initializeCenters(SparseDistributedMatrix points, int k) { + int pointsNum = points.rowSize(); + + Vector firstCenter = points.viewRow(rnd.nextInt(pointsNum)); + + List<Vector> centers = new ArrayList<>(); + List<Vector> newCenters = new ArrayList<>(); + + centers.add(firstCenter); + newCenters.add(firstCenter); + + ConcurrentHashMap<Integer, Double> costs = new ConcurrentHashMap<>(); + + int step = 0; + UUID uuid = points.getUUID(); + String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName(); + + while(step < initSteps) { + ConcurrentHashMap<Integer, Double> newCosts = getNewCosts(cacheName, uuid, newCenters); + + for (Integer key : newCosts.keySet()) + costs.merge(key, newCosts.get(key), Math::min); + + double costsSum = costs.values().stream().mapToDouble(Double::valueOf).sum(); + + newCenters = getNewCenters(cacheName, uuid, costs, costsSum, k); + centers.addAll(newCenters); + + step++; + } + + return chooseKCenters(cacheName, uuid, centers, k); + } + + /** + * Calculate new distances from each point to the nearest center. + * + * @param cacheName Cache name of point matrix. + * @param uuid Uuid of point matrix. + * @param newCenters The list of centers that was added on previous step. + * @return Hash map with distances. + */ + private ConcurrentHashMap<Integer, Double> getNewCosts(String cacheName, UUID uuid, + List<Vector> newCenters) { + return CacheUtils.distributedFold(cacheName, + (IgniteBiFunction<Cache.Entry<SparseMatrixKey, ConcurrentHashMap<Integer, Double>>, + ConcurrentHashMap<Integer, Double>, + ConcurrentHashMap<Integer, Double>>)(vectorWithIndex, map) -> { + Vector vector = VectorUtils.fromMap(vectorWithIndex.getValue(), false); + + for (Vector center : newCenters) + map.merge(vectorWithIndex.getKey().index(), distance(vector, center), Functions.MIN); + + return map; + }, + key -> key.dataStructureId().equals(uuid), + (map1, map2) -> { + map1.putAll(map2); + return map1; + }, + ConcurrentHashMap::new); + } + + /** + * Choose some number of center candidates from source points according to their costs. + * + * @param cacheName Cache name of point matrix. + * @param uuid Uuid of point matrix. + * @param costs Hash map with costs (distances to nearest center). + * @param costsSum The sum of costs. + * @param k The estimated number of centers. + * @return The list of new candidates. + */ + private List<Vector> getNewCenters(String cacheName, UUID uuid, + ConcurrentHashMap<Integer, Double> costs, double costsSum, int k) { + return CacheUtils.distributedFold(cacheName, + (IgniteBiFunction<Cache.Entry<SparseMatrixKey, Map<Integer, Double>>, + List<Vector>, + List<Vector>>)(vectorWithIndex, centers) -> { + Integer idx = vectorWithIndex.getKey().index(); + Vector vector = VectorUtils.fromMap(vectorWithIndex.getValue(), false); + + double probability = (costs.get(idx) * 2.0 * k) / costsSum; + + if (rnd.nextDouble() < probability) + centers.add(vector); + + return centers; + }, + key -> key.dataStructureId().equals(uuid), + (list1, list2) -> { + list1.addAll(list2); + return list1; + }, + ArrayList::new); + } + + /** + * Weight candidates and use K-Means to choose required number of them. + * + * @param cacheName Cache name of the point matrix. + * @param uuid Uuid of the point matrix. + * @param centers The list of candidates. + * @param k The estimated number of centers. + * @return {@code k} centers. + */ + private Vector[] chooseKCenters(String cacheName, UUID uuid, List<Vector> centers, int k) { + centers = centers.stream().distinct().collect(Collectors.toList()); + + ConcurrentHashMap<Integer, Integer> weightsMap = weightCenters(cacheName, uuid, centers); + + List<Double> weights = new ArrayList<>(centers.size()); + + for (int i = 0; i < centers.size(); i++) + weights.add(i, Double.valueOf(weightsMap.getOrDefault(i, 0))); + + DenseLocalOnHeapMatrix centersMatrix = MatrixUtil.fromList(centers, true); + + KMeansLocalClusterer clusterer = new KMeansLocalClusterer(measure, kMeansMaxIterations, seed); + return clusterer.cluster(centersMatrix, k, weights).centers(); + } + + /** + * Weight each center with number of points for which this center is the nearest. + * + * @param cacheName Cache name of the point matrix. + * @param uuid Uuid of the point matrix. + * @param centers The list of centers. + * @return Hash map with weights. + */ + public ConcurrentHashMap<Integer, Integer> weightCenters(String cacheName, UUID uuid, List<Vector> centers) { + if (centers.size() == 0) + return new ConcurrentHashMap<>(); + + return CacheUtils.distributedFold(cacheName, + (IgniteBiFunction<Cache.Entry<SparseMatrixKey, ConcurrentHashMap<Integer, Double>>, + ConcurrentHashMap<Integer, Integer>, + ConcurrentHashMap<Integer, Integer>>)(vectorWithIndex, counts) -> { + Vector vector = VectorUtils.fromMap(vectorWithIndex.getValue(), false); + + int nearest = 0; + double minDistance = distance(centers.get(nearest), vector); + + for (int i = 0; i < centers.size(); i++) { + double currDistance = distance(centers.get(i), vector); + if (currDistance < minDistance) { + minDistance = currDistance; + nearest = i; + } + } + + counts.compute(nearest, (index, value) -> value == null ? 1 : value + 1); + + return counts; + }, + key -> key.dataStructureId().equals(uuid), + (map1, map2) -> { + map1.putAll(map2); + return map1; + }, + ConcurrentHashMap::new); + } + + /** + * Calculate matrix of membership coefficients for each point and each center. + * + * @param points Matrix with source points. + * @param centers Array of current centers. + * @return Membership matrix and sums of membership coefficients for each center. + */ + private MembershipsAndSums calculateMembership(SparseDistributedMatrix points, Vector[] centers) { + String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName(); + UUID uuid = points.getUUID(); + double fuzzyMembershipCoefficient = 2 / (exponentialWeight - 1); + + MembershipsAndSumsSupplier supplier = new MembershipsAndSumsSupplier(centers.length); + + return CacheUtils.distributedFold(cacheName, + (IgniteBiFunction<Cache.Entry<SparseMatrixKey, ConcurrentHashMap<Integer, Double>>, + MembershipsAndSums, + MembershipsAndSums>)(vectorWithIndex, membershipsAndSums) -> { + Integer idx = vectorWithIndex.getKey().index(); + Vector pnt = VectorUtils.fromMap(vectorWithIndex.getValue(), false); + Vector distances = new DenseLocalOnHeapVector(centers.length); + Vector pntMemberships = new DenseLocalOnHeapVector(centers.length); + + for (int i = 0; i < centers.length; i++) + distances.setX(i, distance(centers[i], pnt)); + + for (int i = 0; i < centers.length; i++) { + double invertedFuzzyWeight = 0.0; + + for (int j = 0; j < centers.length; j++) { + double val = Math.pow(distances.getX(i) / distances.getX(j), fuzzyMembershipCoefficient); + if (Double.isNaN(val)) + val = 1.0; + + invertedFuzzyWeight += val; + } + + double membership = Math.pow(1.0 / invertedFuzzyWeight, exponentialWeight); + pntMemberships.setX(i, membership); + } + + membershipsAndSums.memberships.put(idx, pntMemberships); + membershipsAndSums.membershipSums = membershipsAndSums.membershipSums.plus(pntMemberships); + + return membershipsAndSums; + }, + key -> key.dataStructureId().equals(uuid), + (mem1, mem2) -> { + mem1.merge(mem2); + return mem1; + }, + supplier); + } + + /** + * Calculate new centers according to membership matrix. + * + * @param points Matrix with source points. + * @param membershipsAndSums Membership matrix and sums of membership coefficient for each center. + * @param k The number of centers. + * @return Array of new centers. + */ + private Vector[] calculateNewCenters(SparseDistributedMatrix points, MembershipsAndSums membershipsAndSums, int k) { + String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName(); + UUID uuid = points.getUUID(); + + CentersArraySupplier supplier = new CentersArraySupplier(k, points.columnSize()); + + Vector[] centers = CacheUtils.distributedFold(cacheName, + (IgniteBiFunction<Cache.Entry<SparseMatrixKey, ConcurrentHashMap<Integer, Double>>, + Vector[], + Vector[]>)(vectorWithIndex, centerSums) -> { + Integer idx = vectorWithIndex.getKey().index(); + Vector pnt = MatrixUtil.localCopyOf(VectorUtils.fromMap(vectorWithIndex.getValue(), false)); + Vector pntMemberships = membershipsAndSums.memberships.get(idx); + + for (int i = 0; i < k; i++) { + Vector weightedPnt = pnt.times(pntMemberships.getX(i)); + centerSums[i] = centerSums[i].plus(weightedPnt); + } + + return centerSums; + }, + key -> key.dataStructureId().equals(uuid), + (sums1, sums2) -> { + for (int i = 0; i < k; i++) + sums1[i] = sums1[i].plus(sums2[i]); + + return sums1; + }, + supplier); + + for (int i = 0; i < k; i++) + centers[i] = centers[i].divide(membershipsAndSums.membershipSums.getX(i)); + + return centers; + } + + /** + * Check if centers have moved insignificantly. + * + * @param centers Old centers. + * @param newCenters New centers. + * @return The result of comparison. + */ + private boolean isFinished(Vector[] centers, Vector[] newCenters) { + int numCenters = centers.length; + + for (int i = 0; i < numCenters; i++) + if (distance(centers[i], newCenters[i]) > maxDelta) + return false; + + return true; + } + + /** + * Check memberships difference. + * + * @param membershipsAndSums Old memberships. + * @param newMembershipsAndSums New memberships. + * @return The result of comparison. + */ + private boolean isFinished(MembershipsAndSums membershipsAndSums, MembershipsAndSums newMembershipsAndSums) { + if (membershipsAndSums == null) + return false; + + double currMaxDelta = 0.0; + for (Integer key : membershipsAndSums.memberships.keySet()) { + double distance = measure.compute(membershipsAndSums.memberships.get(key), + newMembershipsAndSums.memberships.get(key)); + if (distance > currMaxDelta) + currMaxDelta = distance; + } + + return currMaxDelta <= maxDelta; + } + + /** Service class used to optimize counting of membership sums. */ + private class MembershipsAndSums { + /** Membership matrix. */ + public ConcurrentHashMap<Integer, Vector> memberships = new ConcurrentHashMap<>(); + + /** Membership sums. */ + public Vector membershipSums; + + /** + * Default constructor. + * + * @param k The number of centers. + */ + public MembershipsAndSums(int k) { + membershipSums = new DenseLocalOnHeapVector(k); + } + + /** + * Merge results of calculation for different parts of points. + * @param another Another part of memberships and sums. + */ + public void merge(MembershipsAndSums another) { + memberships.putAll(another.memberships); + membershipSums = membershipSums.plus(another.membershipSums); + } + } + + /** Service class that is used to create new {@link MembershipsAndSums} instances. */ + private class MembershipsAndSumsSupplier implements IgniteSupplier<MembershipsAndSums> { + /** The number of centers */ + int k; + + /** + * Constructor that retains the number of centers. + * + * @param k The number of centers. + */ + public MembershipsAndSumsSupplier(int k) { + this.k = k; + } + + /** + * Create new instance of {@link MembershipsAndSums}. + * + * @return {@link MembershipsAndSums} object. + */ + @Override public MembershipsAndSums get() { + return new MembershipsAndSums(k); + } + } + + /** Service class that is used to create new arrays of vectors. */ + private class CentersArraySupplier implements IgniteSupplier<Vector[]> { + /** The number of centers. */ + int k; + + /** The number of coordinates. */ + int dim; + + /** + * Constructor that retains all required parameters. + * + * @param k The number of centers. + * @param dim The number of coordinates. + */ + public CentersArraySupplier(int k, int dim) { + this.k = k; + this.dim = dim; + } + + /** + * Create new array of vectors. + * + * @return Array of vectors. + */ + @Override public Vector[] get() { + DenseLocalOnHeapVector[] centerSumsArr = new DenseLocalOnHeapVector[k]; + for (int i = 0; i < k; i++) + centerSumsArr[i] = new DenseLocalOnHeapVector(dim); + return centerSumsArr; + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClusterer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClusterer.java new file mode 100644 index 0000000..1724da3 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClusterer.java @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.clustering; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Random; +import org.apache.ignite.internal.util.GridArgumentCheck; +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.Matrix; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.exceptions.ConvergenceException; +import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException; +import org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; + +/** Implements the local version of Fuzzy C-Means algorithm for weighted points. */ +public class FuzzyCMeansLocalClusterer extends BaseFuzzyCMeansClusterer<DenseLocalOnHeapMatrix> implements + WeightedClusterer<DenseLocalOnHeapMatrix, FuzzyCMeansModel> { + /** The maximum number of iterations. */ + private int maxIterations; + + /** The random numbers generator that is used to choose primary centers. */ + private Random rnd; + + /** + * Constructor that retains all required parameters. + * + * @param measure Distance measure. + * @param exponentialWeight Specific constant which is used in calculating of membership matrix. + * @param stopCond Flag that tells when algorithm should stop. + * @param maxDelta The maximum distance between old and new centers or maximum difference between new and old + * membership matrix elements for which algorithm must stop. + * @param maxIterations The maximum number of FCM iterations. + */ + public FuzzyCMeansLocalClusterer(DistanceMeasure measure, double exponentialWeight, StopCondition stopCond, + double maxDelta, int maxIterations, Long seed) { + super(measure, exponentialWeight, stopCond, maxDelta); + this.maxIterations = maxIterations; + rnd = seed != null ? new Random(seed) : new Random(); + } + + /** {@inheritDoc} */ + @Override public FuzzyCMeansModel cluster(DenseLocalOnHeapMatrix points, int k) { + List<Double> ones = new ArrayList<>(Collections.nCopies(points.rowSize(), 1.0)); + return cluster(points, k, ones); + } + + /** {@inheritDoc} */ + @Override public FuzzyCMeansModel cluster(DenseLocalOnHeapMatrix points, int k, List<Double> weights) + throws MathIllegalArgumentException, ConvergenceException { + GridArgumentCheck.notNull(points, "points"); + GridArgumentCheck.notNull(weights, "weights"); + + if (points.rowSize() != weights.size()) + throw new MathIllegalArgumentException("The number of points and the number of weights are not equal"); + + if (k < 2) + throw new MathIllegalArgumentException("The number of clusters is less than 2"); + + Matrix centers = new DenseLocalOnHeapMatrix(k, points.columnSize()); + Matrix distances = new DenseLocalOnHeapMatrix(k, points.rowSize()); + Matrix membership = new DenseLocalOnHeapMatrix(k, points.rowSize()); + Vector weightsVector = new DenseLocalOnHeapVector(weights.size()); + for (int i = 0; i < weights.size(); i++) + weightsVector.setX(i, weights.get(i)); + + initializeCenters(centers, points, k, weightsVector); + + int iteration = 0; + boolean finished = false; + while (iteration < maxIterations && !finished) { + calculateDistances(distances, points, centers); + Matrix newMembership = calculateMembership(distances, weightsVector); + Matrix newCenters = calculateNewCenters(points, newMembership); + + if (this.stopCond == StopCondition.STABLE_CENTERS) + finished = areCentersStable(centers, newCenters); + else + finished = areMembershipStable(membership, newMembership); + + centers = newCenters; + membership = newMembership; + iteration++; + } + + if (iteration == maxIterations) + throw new ConvergenceException("Fuzzy C-Means algorithm has not converged after " + + Integer.toString(iteration) + " iterations"); + + Vector[] centersArr = new Vector[k]; + for (int i = 0; i < k; i++) + centersArr[i] = centers.getRow(i); + + return new FuzzyCMeansModel(centersArr, measure); + } + + /** + * Choose {@code k} centers according to their weights. + * + * @param centers Output matrix containing primary centers. + * @param points Matrix of source points. + * @param k The number of centers. + * @param weights Vector of weights. + */ + private void initializeCenters(Matrix centers, Matrix points, int k, Vector weights) { + //int dimensions = points.columnSize(); + int numPoints = points.rowSize(); + + Vector firstCenter = points.viewRow(rnd.nextInt(numPoints)); + centers.setRow(0, firstCenter.getStorage().data()); + + Vector costs = points.foldRows(vector -> distance(vector, firstCenter)); + costs = costs.times(weights); + + double sum = costs.sum(); + + for (int i = 1; i < k; i++) { + double probe = rnd.nextDouble() * sum; + double cntr = 0; + int id = 0; + + for (int j = 0; j < numPoints; j++) { + cntr += costs.getX(j); + if (cntr >= probe) { + id = j; + break; + } + } + + centers.setRow(i, points.viewRow(id).getStorage().data()); + sum -= costs.get(id); + costs.set(id, 0.0); + } + } + + /** + * Calculate matrix of distances form each point to each center. + * + * @param distances Output matrix. + * @param points Matrix that contains source points. + * @param centers Matrix that contains centers. + */ + private void calculateDistances(Matrix distances, Matrix points, Matrix centers) { + int numPoints = points.rowSize(); + int numCenters = centers.rowSize(); + + for (int i = 0; i < numCenters; i++) + for (int j = 0; j < numPoints; j++) + distances.set(i, j, distance(centers.viewRow(i), points.viewRow(j))); + } + + /** + * Calculate membership matrix. + * + * @param distances Matrix of distances. + * @param weights Vector of weights. + * @ + */ + private Matrix calculateMembership(Matrix distances, Vector weights) { + Matrix newMembership = new DenseLocalOnHeapMatrix(distances.rowSize(), distances.columnSize()); + int numPoints = distances.columnSize(); + int numCenters = distances.rowSize(); + double fuzzyMembershipCoefficient = 2 / (exponentialWeight - 1); + + for (int i = 0; i < numCenters; i++) { + for (int j = 0; j < numPoints; j++) { + double invertedFuzzyWeight = 0.0; + + for (int k = 0; k < numCenters; k++) { + double val = Math.pow(distances.get(i, j) / distances.get(k, j), + fuzzyMembershipCoefficient); + if (Double.isNaN(val)) + val = 1.0; + + invertedFuzzyWeight += val; + } + + double weight = 1.0 / invertedFuzzyWeight * weights.getX(j); + newMembership.setX(i, j, Math.pow(weight, exponentialWeight)); + } + } + return newMembership; + } + + /** + * Calculate new centers using membership matrix. + * + * @param points Matrix of source points. + * @param membership Matrix that contains membership coefficients. + * @return Matrix that contains new centers. + */ + private Matrix calculateNewCenters(Matrix points, Matrix membership) { + Vector membershipSums = membership.foldRows(Vector::sum); + Matrix newCenters = membership.times(points); + + int numCenters = newCenters.rowSize(); + for (int i = 0; i < numCenters; i++) + newCenters.viewRow(i).divide(membershipSums.getX(i)); + + return newCenters; + } + + /** + * Check if centers have moved insignificantly. + * + * @param centers Old centers. + * @param newCenters New centers. + * @return The result of comparison. + */ + private boolean areCentersStable(Matrix centers, Matrix newCenters) { + int numCenters = centers.rowSize(); + for (int i = 0; i < numCenters; i++) + if (distance(centers.viewRow(i), newCenters.viewRow(i)) > maxDelta) + return false; + + return true; + } + + /** + * Check if membership matrix has changed insignificantly. + * + * @param membership Old membership matrix. + * @param newMembership New membership matrix. + * @return The result of comparison. + */ + private boolean areMembershipStable(Matrix membership, Matrix newMembership) { + int numCenters = membership.rowSize(); + int numPoints = membership.columnSize(); + + for (int i = 0; i < numCenters; i++) + for (int j = 0; j < numPoints; j++) + if (Math.abs(newMembership.getX(i, j) - membership.getX(i, j)) > maxDelta) + return false; + + return true; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansModel.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansModel.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansModel.java new file mode 100644 index 0000000..41267b9 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/FuzzyCMeansModel.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.clustering; + +import java.util.Arrays; +import org.apache.ignite.ml.Exportable; +import org.apache.ignite.ml.Exporter; +import org.apache.ignite.ml.FuzzyCMeansModelFormat; +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.Vector; + +/** This class incapsulates result of clusterization. */ +public class FuzzyCMeansModel implements ClusterizationModel<Vector, Integer>, Exportable<FuzzyCMeansModelFormat> { + /** Centers of clusters. */ + private Vector[] centers; + + /** Distance measure. */ + private DistanceMeasure measure; + + /** + * Constructor that creates FCM model by centers and measure. + * + * @param centers Array of centers. + * @param measure Distance measure. + */ + public FuzzyCMeansModel(Vector[] centers, DistanceMeasure measure) { + this.centers = Arrays.copyOf(centers, centers.length); + this.measure = measure; + } + + /** Distance measure used while clusterization. */ + public DistanceMeasure distanceMeasure() { + return measure; + } + + /** @inheritDoc */ + @Override public int clustersCount() { + return centers.length; + } + + /** @inheritDoc */ + @Override public Vector[] centers() { + return Arrays.copyOf(centers, centers.length); + } + + /** + * Predict closest center index for a given vector. + * + * @param val Vector. + * @return Index of the closest center or -1 if it can't be found. + */ + @Override public Integer predict(Vector val) { + int idx = -1; + double minDistance = Double.POSITIVE_INFINITY; + + for (int i = 0; i < centers.length; i++) { + double currDistance = measure.compute(val, centers[i]); + if (currDistance < minDistance) { + minDistance = currDistance; + idx = i; + } + } + + return idx; + } + + /** {@inheritDoc} */ + @Override public <P> void saveModel(Exporter<FuzzyCMeansModelFormat, P> exporter, P path) { + FuzzyCMeansModelFormat mdlData = new FuzzyCMeansModelFormat(centers, measure); + + exporter.save(mdlData, path); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/WeightedClusterer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/WeightedClusterer.java b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/WeightedClusterer.java index 55fb359..1688087 100644 --- a/modules/ml/src/main/java/org/apache/ignite/ml/clustering/WeightedClusterer.java +++ b/modules/ml/src/main/java/org/apache/ignite/ml/clustering/WeightedClusterer.java @@ -33,6 +33,6 @@ public interface WeightedClusterer<P, M extends Model> extends Clusterer<P, M> { * @param k count of centers. * @param weights Weights. */ - public KMeansModel cluster(P points, int k, List<Double> weights) throws + public M cluster(P points, int k, List<Double> weights) throws MathIllegalArgumentException, ConvergenceException; } http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/ClusteringTestSuite.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/ClusteringTestSuite.java b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/ClusteringTestSuite.java index b4cce5e..85f61fa 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/ClusteringTestSuite.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/ClusteringTestSuite.java @@ -27,7 +27,9 @@ import org.junit.runners.Suite; @Suite.SuiteClasses({ KMeansDistributedClustererTestSingleNode.class, KMeansDistributedClustererTestMultiNode.class, - KMeansLocalClustererTest.class + KMeansLocalClustererTest.class, + FuzzyCMeansDistributedClustererTest.class, + FuzzyCMeansLocalClustererTest.class }) public class ClusteringTestSuite { } http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClustererTest.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClustererTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClustererTest.java new file mode 100644 index 0000000..0cfa7b8 --- /dev/null +++ b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/FuzzyCMeansDistributedClustererTest.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.clustering; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.Random; +import org.apache.ignite.Ignite; +import org.apache.ignite.internal.util.IgniteUtils; +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.EuclideanDistance; +import org.apache.ignite.ml.math.StorageConstants; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; +import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; + +/** Tests that checks distributed Fuzzy C-Means clusterer. */ +public class FuzzyCMeansDistributedClustererTest extends GridCommonAbstractTest { + /** Number of nodes in grid. */ + private static final int NODE_COUNT = 3; + + /** Grid instance. */ + private Ignite ignite; + + /** Default constructor. */ + public FuzzyCMeansDistributedClustererTest() { + super(false); + } + + /** {@inheritDoc} */ + @Override protected void beforeTest() throws Exception { + ignite = grid(NODE_COUNT); + } + + /** {@inheritDoc} */ + @Override protected void beforeTestsStarted() throws Exception { + for (int i = 1; i <= NODE_COUNT; i++) + startGrid(i); + } + + /** {@inheritDoc} */ + @Override protected void afterTestsStopped() throws Exception { + stopAllGrids(); + } + + /** Test that algorithm gives correct results on a small sample - 4 centers on the plane. */ + public void testTwoDimensionsLittleData() { + IgniteUtils.setCurrentIgniteName(ignite.configuration().getIgniteInstanceName()); + + FuzzyCMeansDistributedClusterer clusterer = new FuzzyCMeansDistributedClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_MEMBERSHIPS, + 0.01, 500, null, 2, 50); + + double[][] points = new double[][]{{-10, -10}, {-9, -11}, {-10, -9}, {-11, -9}, + {10, 10}, {9, 11}, {10, 9}, {11, 9}, + {-10, 10}, {-9, 11}, {-10, 9}, {-11, 9}, + {10, -10}, {9, -11}, {10, -9}, {11, -9}}; + + SparseDistributedMatrix pntMatrix = new SparseDistributedMatrix(16, 2, + StorageConstants.ROW_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE); + for (int i = 0; i < 16; i++) + pntMatrix.setRow(i, points[i]); + + FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, 4); + + Vector[] centers = mdl.centers(); + Arrays.sort(centers, Comparator.comparing(vector -> Math.atan2(vector.get(1), vector.get(0)))); + + DistanceMeasure measure = mdl.distanceMeasure(); + + assertEquals(0, measure.compute(centers[0], new DenseLocalOnHeapVector(new double[]{-10, -10})), 1); + assertEquals(0, measure.compute(centers[1], new DenseLocalOnHeapVector(new double[]{10, -10})), 1); + assertEquals(0, measure.compute(centers[2], new DenseLocalOnHeapVector(new double[]{10, 10})), 1); + assertEquals(0, measure.compute(centers[3], new DenseLocalOnHeapVector(new double[]{-10, 10})), 1); + } + + /** Perform N tests each of which contains M random points placed around K centers on the plane. */ + public void testTwoDimensionsRandomlyPlacedPointsAndCenters() { + IgniteUtils.setCurrentIgniteName(ignite.configuration().getIgniteInstanceName()); + + final int numOfTests = 5; + + final double exponentialWeight = 2.0; + final double maxCentersDelta = 0.01; + final int maxIterations = 500; + final Long seed = 1L; + + DistanceMeasure measure = new EuclideanDistance(); + FuzzyCMeansDistributedClusterer distributedClusterer = new FuzzyCMeansDistributedClusterer(measure, + exponentialWeight, BaseFuzzyCMeansClusterer.StopCondition.STABLE_CENTERS, + maxCentersDelta, maxIterations, seed, 2, 50); + + for (int i = 0; i < numOfTests; i++) + performRandomTest(distributedClusterer, i); + } + + /** + * Test given clusterer on points placed randomly around vertexes of a regular polygon. + * + * @param distributedClusterer Tested clusterer. + * @param seed Seed for the random numbers generator. + */ + public void performRandomTest(FuzzyCMeansDistributedClusterer distributedClusterer, long seed) { + final int minNumCenters = 2; + final int maxNumCenters = 5; + final double maxRadius = 1000; + final int maxPoints = 1000; + final int minPoints = 300; + + Random random = new Random(seed); + + int numCenters = random.nextInt(maxNumCenters - minNumCenters) + minNumCenters; + + double[][] centers = new double[numCenters][2]; + + for (int i = 0; i < numCenters; i++) { + double radius = maxRadius; + double angle = Math.PI * 2.0 * i / numCenters; + + centers[i][0] = Math.cos(angle) * radius; + centers[i][1] = Math.sin(angle) * radius; + } + + int numPoints = minPoints + random.nextInt(maxPoints - minPoints); + + double[][] points = new double[numPoints][2]; + + for (int i = 0; i < numPoints; i++) { + int center = random.nextInt(numCenters); + double randomDouble = random.nextDouble(); + double radius = randomDouble * randomDouble * maxRadius / 10; + double angle = random.nextDouble() * Math.PI * 2.0; + + points[i][0] = centers[center][0] + Math.cos(angle) * radius; + points[i][1] = centers[center][1] + Math.sin(angle) * radius; + } + + SparseDistributedMatrix pntMatrix = new SparseDistributedMatrix(numPoints, 2, + StorageConstants.ROW_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE); + + for (int i = 0; i < numPoints; i++) + pntMatrix.setRow(i, points[i]); + + FuzzyCMeansModel mdl = distributedClusterer.cluster(pntMatrix, numCenters); + Vector[] computedCenters = mdl.centers(); + DistanceMeasure measure = mdl.distanceMeasure(); + + int cntr = numCenters; + + for (int i = 0; i < numCenters; i++) { + for (int j = 0; j < numCenters; j++) { + if (measure.compute(computedCenters[i], new DenseLocalOnHeapVector(centers[j])) < 100) { + cntr--; + break; + } + } + } + + assertEquals(0, cntr); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/b13f088b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClustererTest.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClustererTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClustererTest.java new file mode 100644 index 0000000..c58ffc7 --- /dev/null +++ b/modules/ml/src/test/java/org/apache/ignite/ml/clustering/FuzzyCMeansLocalClustererTest.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.clustering; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import org.apache.ignite.ml.math.DistanceMeasure; +import org.apache.ignite.ml.math.EuclideanDistance; +import org.apache.ignite.ml.math.Matrix; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException; +import org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** Tests that checks local Fuzzy C-Means clusterer. */ +public class FuzzyCMeansLocalClustererTest { + /** Test FCM on points that forms three clusters on the line. */ + @Test + public void equalWeightsOneDimension() { + BaseFuzzyCMeansClusterer clusterer = new FuzzyCMeansLocalClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_CENTERS, + 0.01, 10, null); + + double[][] points = new double[][]{{-10}, {-9}, {-8}, {-7}, + {7}, {8}, {9}, {10}, + {-1}, {0}, {1}}; + + Matrix pntMatrix = new DenseLocalOnHeapMatrix(points); + + FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, 3); + + Vector[] centers = mdl.centers(); + Arrays.sort(centers, Comparator.comparing(vector -> vector.getX(0))); + assertEquals(-8.5, centers[0].getX(0), 2); + assertEquals(0, centers[1].getX(0), 2); + assertEquals(8.5, centers[2].getX(0), 2); + } + + /** Test FCM on points that forms four clusters on the plane. */ + @Test + public void equalWeightsTwoDimensions() { + BaseFuzzyCMeansClusterer clusterer = new FuzzyCMeansLocalClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_CENTERS, + 0.01, 20, null); + + double[][] points = new double[][]{{-10, -10}, {-9, -11}, {-10, -9}, {-11, -9}, + {10, 10}, {9, 11}, {10, 9}, {11, 9}, + {-10, 10}, {-9, 11}, {-10, 9}, {-11, 9}, + {10, -10}, {9, -11}, {10, -9}, {11, -9}}; + + Matrix pntMatrix = new DenseLocalOnHeapMatrix(points); + + FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, 4); + Vector[] centers = mdl.centers(); + Arrays.sort(centers, Comparator.comparing(vector -> Math.atan2(vector.get(1), vector.get(0)))); + + DistanceMeasure measure = mdl.distanceMeasure(); + + assertEquals(0, measure.compute(centers[0], new DenseLocalOnHeapVector(new double[]{-10, -10})), 1); + assertEquals(0, measure.compute(centers[1], new DenseLocalOnHeapVector(new double[]{10, -10})), 1); + assertEquals(0, measure.compute(centers[2], new DenseLocalOnHeapVector(new double[]{10, 10})), 1); + assertEquals(0, measure.compute(centers[3], new DenseLocalOnHeapVector(new double[]{-10, 10})), 1); + } + + /** Test FCM on points which have the equal coordinates. */ + @Test + public void checkCentersOfTheSamePointsTwoDimensions() { + BaseFuzzyCMeansClusterer clusterer = new FuzzyCMeansLocalClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_MEMBERSHIPS, 0.01, 10, null); + + double[][] points = new double[][] {{3.3, 10}, {3.3, 10}, {3.3, 10}, {3.3, 10}, {3.3, 10}}; + + Matrix pntMatrix = new DenseLocalOnHeapMatrix(points); + + int k = 2; + FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, k); + Vector exp = new DenseLocalOnHeapVector(new double[] {3.3, 10}); + for (int i = 0; i < k; i++) { + Vector center = mdl.centers()[i]; + + for (int j = 0; j < 2; j++) + assertEquals(exp.getX(j), center.getX(j), 1); + } + } + + /** Test FCM on points located on the circle. */ + @Test + public void checkCentersLocationOnSphere() { + BaseFuzzyCMeansClusterer clusterer = new FuzzyCMeansLocalClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_CENTERS, 0.01, 100, null); + + int numOfPoints = 650; + double radius = 100.0; + double[][] points = new double [numOfPoints][2]; + + for (int i = 0; i < numOfPoints; i++) { + points[i][0] = Math.cos(Math.PI * 2 * i / numOfPoints) * radius; + points[i][1] = Math.sin(Math.PI * 2 * i / numOfPoints) * radius; + } + + Matrix pntMatrix = new DenseLocalOnHeapMatrix(points); + + int k = 10; + FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, k); + + Vector sum = mdl.centers()[0]; + for (int i = 1; i < k; i++) + sum = sum.plus(mdl.centers()[i]); + + assertEquals(0, sum.kNorm(1), 1); + } + + /** Test FCM on points that forms the line located on the plane. */ + @Test + public void test2DLineClustering() { + BaseFuzzyCMeansClusterer clusterer = new FuzzyCMeansLocalClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_CENTERS, 0.01, 50, null); + + double[][] points = new double[][]{{1, 2}, {3, 6}, {5, 10}}; + + Matrix pntMatrix = new DenseLocalOnHeapMatrix(points); + + int k = 2; + FuzzyCMeansModel mdl = clusterer.cluster(pntMatrix, k); + Vector[] centers = mdl.centers(); + Arrays.sort(centers, Comparator.comparing(vector -> vector.getX(0))); + + Vector[] exp = {new DenseLocalOnHeapVector(new double[]{1.5, 3}), + new DenseLocalOnHeapVector(new double[]{4.5, 9})}; + + for (int i = 0; i < k; i++) { + Vector center = centers[i]; + + for (int j = 0; j < 2; j++) + assertEquals(exp[i].getX(j), center.getX(j), 0.5); + } + } + + /** Test FCM on points that have different weights. */ + @Test + public void differentWeightsOneDimension() { + FuzzyCMeansLocalClusterer clusterer = new FuzzyCMeansLocalClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_CENTERS, + 0.01, 10, null); + + double[][] points = new double[][]{{1}, {2}, {3}, {4}, {5}, {6}}; + + DenseLocalOnHeapMatrix pntMatrix = new DenseLocalOnHeapMatrix(points); + ArrayList<Double> weights = new ArrayList<>(); + Collections.addAll(weights, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0); + + Vector[] centers1 = clusterer.cluster(pntMatrix, 2).centers(); + Vector[] centers2 = clusterer.cluster(pntMatrix, 2, weights).centers(); + Arrays.sort(centers1, Comparator.comparing(vector -> vector.getX(0))); + Arrays.sort(centers2, Comparator.comparing(vector -> vector.getX(0))); + + assertTrue(centers1[0].get(0) - centers2[0].get(0) > 0.5); + } + + /** Test FCM on illegal number of clusters. */ + @Test(expected = MathIllegalArgumentException.class) + public void testIllegalNumberOfClusters() { + FuzzyCMeansLocalClusterer clusterer = new FuzzyCMeansLocalClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_CENTERS, 0.01, 10, null); + double[][] points = new double[][]{{1}, {2}, {3}, {4}}; + + FuzzyCMeansModel cluster = clusterer.cluster(new DenseLocalOnHeapMatrix(points), 1); + } + + /** Test FCM on different numbers of points and weights. */ + @Test(expected = MathIllegalArgumentException.class) + public void testDifferentAmountsOfPointsAndWeights(){ + FuzzyCMeansLocalClusterer clusterer = new FuzzyCMeansLocalClusterer(new EuclideanDistance(), + 2, BaseFuzzyCMeansClusterer.StopCondition.STABLE_CENTERS, 0.01, 10, null); + double[][] points = new double[][]{{1}, {2}, {3}, {4}}; + + ArrayList<Double> weights = new ArrayList<>(); + Collections.addAll(weights, 1.0, 34.0, 2.5, 5.0, 0.5); + + FuzzyCMeansModel cluster = clusterer.cluster(new DenseLocalOnHeapMatrix(points), 2, weights); + } +}
