Github user huaxingao commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21513#discussion_r194237396

    --- Diff: python/pyspark/ml/clustering.py ---
    @@ -1156,6 +1157,204 @@ def getKeepLastCheckpoint(self):
             return self.getOrDefault(self.keepLastCheckpoint)


    +@inherit_doc
    +class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReadable,
    +                               JavaMLWritable):
    +    """
    +    .. note:: Experimental
    +
    +    Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
    +    `Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
    +    PIC finds a very low-dimensional embedding of a dataset using truncated power
    +    iteration on a normalized pair-wise similarity matrix of the data.
    +
    +    This class is not yet an Estimator/Transformer; use the `assignClusters` method to
    +    run the PowerIterationClustering algorithm.
    +
    +    .. seealso:: `Wikipedia on Spectral clustering \
    +        <http://en.wikipedia.org/wiki/Spectral_clustering>`_
    +
    +    >>> from pyspark.sql.types import DoubleType, LongType, StructField, StructType
    +    >>> import math
    +    >>> def genCircle(r, n):
    +    ...     points = []
    +    ...     for i in range(0, n):
    +    ...         theta = 2.0 * math.pi * i / n
    +    ...         points.append((r * math.cos(theta), r * math.sin(theta)))
    +    ...     return points
    +    >>> def sim(x, y):
    +    ...     dist = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
    +    ...     return math.exp(-dist / 2.0)
    +    >>> r1 = 1.0
    +    >>> n1 = 10
    +    >>> r2 = 4.0
    +    >>> n2 = 40
    +    >>> n = n1 + n2
    +    >>> points = genCircle(r1, n1) + genCircle(r2, n2)
    +    >>> data = [(i, j, sim(points[i], points[j])) for i in range(1, n) for j in range(0, i)]
    +    >>> rdd = sc.parallelize(data, 2)
    +    >>> schema = StructType([StructField("src", LongType(), False),
    +    ...                      StructField("dst", LongType(), True),
    +    ...                      StructField("weight", DoubleType(), True)])
    +    >>> df = spark.createDataFrame(rdd, schema)
    +    >>> pic = PowerIterationClustering()
    +    >>> assignments = pic.setK(2).setMaxIter(40).setWeightCol("weight").assignClusters(df)
    +    >>> result = sorted(assignments.collect(), key=lambda x: x.id)
    +    >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster
    +    True
    +    >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster
    +    True
    +    >>> pic_path = temp_path + "/pic"
    +    >>> pic.save(pic_path)
    +    >>> pic2 = PowerIterationClustering.load(pic_path)
    +    >>> pic2.getK()
    +    2
    +    >>> pic2.getMaxIter()
    +    40
    +    >>> assignments2 = pic2.assignClusters(df)
    +    >>> result2 = sorted(assignments2.collect(), key=lambda x: x.id)
    +    >>> result2[0].cluster == result2[1].cluster == result2[2].cluster == result2[3].cluster
    +    True
    +    >>> result2[4].cluster == result2[5].cluster == result2[6].cluster == result2[7].cluster
    +    True
    --- End diff --

    Thanks for the comments. Will make changes.
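[Editor's note] As background for the docstring above, here is a minimal sketch of the idea it describes: truncated power iteration on a row-normalized pair-wise affinity matrix. It uses NumPy rather than Spark, and it is an illustration only, not Spark's implementation; the function name `pic_embedding` and all numeric defaults are assumptions made for exposition.

    import numpy as np

    def pic_embedding(affinity, max_iter=40, tol=1e-6):
        """Illustrative 1-D PIC embedding: iterate v <- W v with W = D^-1 A."""
        a = np.asarray(affinity, dtype=float)
        # Row-normalize the non-negative affinity matrix A: W = D^-1 A,
        # where D is the diagonal matrix of row sums (degrees).
        w = a / a.sum(axis=1, keepdims=True)
        # The Lin & Cohen paper starts from the normalized degree vector.
        v = a.sum(axis=1)
        v /= v.sum()
        for _ in range(max_iter):
            v_next = w @ v
            v_next /= np.abs(v_next).sum()      # keep the L1 norm fixed
            if np.abs(v_next - v).max() < tol:  # simplified stopping rule
                return v_next
            v = v_next
        return v  # 1-D embedding; cluster it with e.g. k-means

On the two-circle affinity matrix built in the doctest, clustering this one-dimensional embedding with k-means (k=2) separates the inner and outer circles; `assignClusters` performs the distributed analogue of this loop plus the k-means step on a DataFrame of (src, dst, weight) edges.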