[GitHub] spark pull request #21119: [SPARK-19826][ML][PYTHON]add spark.ml Python API ...

viirya Fri, 27 Apr 2018 21:06:33 -0700

GitHub user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21119#discussion_r184838981
  
    --- Diff: python/pyspark/ml/clustering.py ---
    @@ -1156,6 +1156,204 @@ def getKeepLastCheckpoint(self):
             return self.getOrDefault(self.keepLastCheckpoint)
     
     
    +@inherit_doc
    +class PowerIterationClustering(HasMaxIter, HasPredictionCol, 
JavaTransformer, JavaParams,
    +                               JavaMLReadable, JavaMLWritable):
    +    """
    +    .. note:: Experimental
    +    Power Iteration Clustering (PIC), a scalable graph clustering 
algorithm developed by
    +    <a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From 
the abstract:
    +    PIC finds a very low-dimensional embedding of a dataset using 
truncated power
    +    iteration on a normalized pair-wise similarity matrix of the data.
    +
    +    PIC takes an affinity matrix between items (or vertices) as input.  An 
affinity matrix
    +    is a symmetric matrix whose entries are non-negative similarities 
between items.
    +    PIC takes this matrix (or graph) as an adjacency matrix.  
Specifically, each input row
    +    includes:
    +
    +     - :py:class:`idCol`: vertex ID
    +     - :py:class:`neighborsCol`: neighbors of vertex in :py:class:`idCol`
    +     - :py:class:`similaritiesCol`: non-negative weights (similarities) of 
edges between the
    +        vertex in :py:class:`idCol` and each neighbor in 
:py:class:`neighborsCol`
    +
    +    PIC returns a cluster assignment for each input vertex.  It appends a 
new column
    +    :py:class:`predictionCol` containing the cluster assignment in 
:py:class:`[0,k)` for
    +    each row (vertex).
    +
    +    Notes:
    +
    +     - [[PowerIterationClustering]] is a transformer with an expensive 
[[transform]] operation.
    +        Transform runs the iterative PIC algorithm to cluster the whole 
input dataset.
    +     - Input validation: This validates that similarities are non-negative 
but does NOT validate
    +        that the input matrix is symmetric.
    +
    +    @see <a href=http://en.wikipedia.org/wiki/Spectral_clustering>
    +    Spectral clustering (Wikipedia)</a>
    +
    +    >>> from pyspark.sql.types import ArrayType, DoubleType, LongType, 
StructField, StructType
    +    >>> similarities = [((long)(1), [0], [0.5]), ((long)(2), [0, 1], 
[0.7,0.5]), \
    +                        ((long)(3), [0, 1, 2], [0.9, 0.7, 0.5]), \
    +                        ((long)(4), [0, 1, 2, 3], [1.1, 0.9, 0.7,0.5]), \
    +                        ((long)(5), [0, 1, 2, 3, 4], [1.3, 1.1, 0.9, 
0.7,0.5])]
    +    >>> rdd = sc.parallelize(similarities, 2)
    +    >>> schema = StructType([StructField("id", LongType(), False), \
    +                 StructField("neighbors", ArrayType(LongType(), False), 
True), \
    +                 StructField("similarities", ArrayType(DoubleType(), 
False), True)])
    +    >>> df = spark.createDataFrame(rdd, schema)
    +    >>> pic = PowerIterationClustering()
    +    >>> result = pic.setK(2).setMaxIter(10).transform(df)
    +    >>> predictions = sorted(set([(i[0], i[1]) for i in 
result.select(result.id, result.prediction)
    +    ...     .collect()]), key=lambda x: x[0])
    +    >>> predictions[0]
    +    (1, 1)
    +    >>> predictions[1]
    +    (2, 1)
    +    >>> predictions[2]
    +    (3, 0)
    +    >>> predictions[3]
    +    (4, 0)
    +    >>> predictions[4]
    +    (5, 0)
    +    >>> pic_path = temp_path + "/pic"
    +    >>> pic.save(pic_path)
    +    >>> pic2 = PowerIterationClustering.load(pic_path)
    +    >>> pic2.getK()
    +    2
    +    >>> pic2.getMaxIter()
    +    10
    +    >>> pic3 = PowerIterationClustering(k=4, initMode="degree")
    +    >>> pic3.getIdCol()
    +    'id'
    +    >>> pic3.getK()
    +    4
    +    >>> pic3.getMaxIter()
    +    20
    +    >>> pic3.getInitMode()
    +    'degree'
    +
    +    .. versionadded:: 2.4.0
    +    """
    +
    +    k = Param(Params._dummy(), "k",
    +              "The number of clusters to create. Must be > 1.",
    +              typeConverter=TypeConverters.toInt)
    +    initMode = Param(Params._dummy(), "initMode",
    +                     "The initialization algorithm. This can be either " +
    +                     "'random' to use a random vector as vertex 
properties, or 'degree' to use " +
    +                     "a normalized sum of similarities with other 
vertices.  Supported options: " +
    +                     "'random' and 'degree'.",
    +                     typeConverter=TypeConverters.toString)
    +    idCol = Param(Params._dummy(), "idCol",
    +                  "Name of the input column for vertex IDs.",
    +                  typeConverter=TypeConverters.toString)
    +    neighborsCol = Param(Params._dummy(), "neighborsCol",
    +                         "Name of the input column for neighbors in the 
adjacency list " +
    +                         "representation.",
    +                         typeConverter=TypeConverters.toString)
    +    similaritiesCol = Param(Params._dummy(), "similaritiesCol",
    +                            "Name of the input column for non-negative 
weights (similarities) " +
    +                            "of edges between the vertex in `idCol` and 
each neighbor in " +
    +                            "`neighborsCol`",
    +                            typeConverter=TypeConverters.toString)
    +
    +    @keyword_only
    +    def __init__(self, predictionCol="prediction", k=2, maxIter=20, 
initMode="random",
    +                 idCol="id", neighborsCol="neighbors", 
similaritiesCol="similarities"):
    +        """
    +        __init__(self, predictionCol="prediction", k=2, maxIter=20, 
initMode="random",\
    +                 idCol="id", neighborsCol="neighbors", 
similaritiesCol="similarities")
    +        """
    +        super(PowerIterationClustering, self).__init__()
    +        self._java_obj = self._new_java_obj(
    +            "org.apache.spark.ml.clustering.PowerIterationClustering", 
self.uid)
    +        self._setDefault(k=2, maxIter=20, initMode="random", idCol="id", 
neighborsCol="neighbors",
    +                         similaritiesCol="similarities")
    +        kwargs = self._input_kwargs
    +        self.setParams(**kwargs)
    +
    +    @keyword_only
    +    @since("2.4.0")
    +    def setParams(self, predictionCol="prediction", k=2, maxIter=20, 
initMode="random",
    +                  idCol="id", neighborsCol="neighbors", 
similaritiesCol="similarities"):
    +        """
    +        setParams(self, predictionCol="prediction", k=2, maxIter=20, 
initMode="random",\
    +                  idCol="id", neighborsCol="neighbors", 
similaritiesCol="similarities")
    +        Sets params for PowerIterationClustering.
    +        """
    +        kwargs = self._input_kwargs
    +        return self._set(**kwargs)
    +
    +    @since("2.4.0")
    +    def setK(self, value):
    +        """
    +        Sets the value of :py:attr:`k`.
    +        """
    +        return self._set(k=value)
    +
    +    @since("2.4.0")
    +    def getK(self):
    +        """
    +        Gets the value of :py:attr:`k`.
    +        """
    +        return self.getOrDefault(self.k)
    +
    +    @since("2.4.0")
    +    def setInitMode(self, value):
    +        """
    +        Sets the value of :py:attr:`initMode`.
    +        """
    +        return self._set(initMode=value)
    +
    +    @since("2.4.0")
    +    def getInitMode(self):
    +        """
    +        Gets the value of `initMode`
    +        """
    +        return self.getOrDefault(self.initMode)
    +
    +    @since("2.4.0")
    +    def setIdCol(self, value):
    +        """
    +        Sets the value of :py:attr:`idCol`.
    +        """
    +        return self._set(idCol=value)
    +
    +    @since("2.4.0")
    +    def getIdCol(self):
    +        """
    +        Gets the value of :py:attr:`idCol`.
    +        """
    +        return self.getOrDefault(self.idCol)
    +
    +    @since("2.4.0")
    +    def setNeighborsCol(self, value):
    +        """
    +        Sets the value of :py:attr:`neighborsCol.
    +        """
    +        return self._set(neighborsCol=value)
    +
    +    @since("2.4.0")
    +    def getNeighborsCol(self):
    +        """
    +        Gets the value of :py:attr:`neighborsCol`.
    +        """
    +        return self.getOrDefault(self.neighborsCol)
    +
    +    @since("2.4.0")
    +    def setSimilaritiesCol(self, value):
    +        """
    +        Sets the value of :py:attr:`similaritiesCol`.
    +        """
    +        return self._set(similaritiesCol=value)
    +
    +    @since("2.4.0")
    +    def getSimilaritiesCol(self):
    +        """
    +        Gets the value of :py:attr:`similaritiesCol`.
    +        """
    +        return self.getOrDefault(self.binary)
    --- End diff --
    
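    `self.binary` -> `self.similaritiesCol`? `getSimilaritiesCol` should return the value of the `similaritiesCol` param; the class in this diff declares no `binary` param, so `self.binary` looks like a copy-paste leftover.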



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #21119: [SPARK-19826][ML][PYTHON]add spark.ml Python API ...

Reply via email to