[GitHub] spark pull request #19185: [Spark-21854] Added LogisticRegressionTrainingSum...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19185#discussion_r138045280 --- Diff: python/pyspark/ml/classification.py --- @@ -529,8 +529,11 @@ def summary(self): """ if self.hasSummary: java_blrt_summary = self._call_java("summary") -# Note: Once multiclass is added, update this to return correct summary -return BinaryLogisticRegressionTrainingSummary(java_blrt_summary) +if (self.numClasses == 2): +java_blrt_binarysummary = self._call_java("binarySummary") --- End diff -- Actually this is not necessary; we can just wrap ```java_lrt_summary``` with ```BinaryLogisticRegressionTrainingSummary```.
[GitHub] spark pull request #19185: [Spark-21854] Added LogisticRegressionTrainingSum...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19185#discussion_r138045342 --- Diff: python/pyspark/ml/classification.py --- @@ -529,8 +529,11 @@ def summary(self): """ if self.hasSummary: java_blrt_summary = self._call_java("summary") -# Note: Once multiclass is added, update this to return correct summary -return BinaryLogisticRegressionTrainingSummary(java_blrt_summary) +if (self.numClasses == 2): --- End diff -- ```if (self.numClasses <= 2)```
[GitHub] spark pull request #19185: [Spark-21854] Added LogisticRegressionTrainingSum...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19185#discussion_r138045070 --- Diff: python/pyspark/ml/classification.py --- @@ -529,8 +529,11 @@ def summary(self): """ if self.hasSummary: java_blrt_summary = self._call_java("summary") --- End diff -- Rename this to ```java_lrt_summary```, as it's not always _binary_ logistic regression.
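Taken together, the three comments above suggest a shape like the following for the Python ```summary``` property. This is only a rough sketch of the proposed change; it assumes the multinomial wrapper class introduced by this PR is named ```LogisticRegressionTrainingSummary```.

```python
@property
def summary(self):
    """
    Gets summary of the model trained on the training set.
    Throws an exception if no summary exists.
    """
    if self.hasSummary:
        java_lrt_summary = self._call_java("summary")
        if self.numClasses <= 2:
            # The same Java summary object can be wrapped directly;
            # no separate binarySummary call is needed here.
            return BinaryLogisticRegressionTrainingSummary(java_lrt_summary)
        return LogisticRegressionTrainingSummary(java_lrt_summary)
    raise RuntimeError("No training summary available for this %s" %
                       self.__class__.__name__)
```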
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r138025640 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("cluEval")) + + @Since("2.3.0") + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + @Since("2.3.0") + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + @Since("2.3.0") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + @Since("2.3.0") + def getMetricName: String = $(metricName) + + /** @group setParam */ + @Since("2.3.0") + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +// Silhouette is reasonable only when the number of clusters is grater then 1 +assert(dataset.select($(predictionCol)).distinct().count() > 1, --- End diff -- Move this check to L418, in case another unnecessary computation for most of the cases(cluster size > 1). See my comment at L418. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r138024385 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("cluEval")) + + @Since("2.3.0") + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + @Since("2.3.0") + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + @Since("2.3.0") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + @Since("2.3.0") + def getMetricName: String = $(metricName) + + /** @group setParam */ + @Since("2.3.0") + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +// Silhouette is reasonable only when the number of clusters is grater then 1 +assert(dataset.select($(predictionCol)).distinct().count() > 1, + "Number of clusters must be greater than one.") + +$(metricName) match { + case "squaredSilhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( +dataset, +$(predictionCol), +$(featuresCol) + ) +} + } +} + + +@Since("2.3.0") +object ClusteringEvaluator + extends Defaul
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r138025184 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("cluEval")) + + @Since("2.3.0") + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + @Since("2.3.0") + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + @Since("2.3.0") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + @Since("2.3.0") + def getMetricName: String = $(metricName) + + /** @group setParam */ + @Since("2.3.0") + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +// Silhouette is reasonable only when the number of clusters is grater then 1 +assert(dataset.select($(predictionCol)).distinct().count() > 1, + "Number of clusters must be greater than one.") + +$(metricName) match { + case "squaredSilhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( +dataset, +$(predictionCol), +$(featuresCol) + ) +} + } +} + + +@Since("2.3.0") +object ClusteringEvaluator + extends Defaul
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r138027427 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("cluEval")) + + @Since("2.3.0") + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + @Since("2.3.0") + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + @Since("2.3.0") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) --- End diff -- I'd suggest the metric name is ```silhouette```, since we may add silhouette for other distance, then we can add another param like ```distance``` to control that. The param ```metricName``` should not bind to any distance computation way. There are lots of other metrics for clustering algorithms, like [these](http://scikit-learn.org/stable/modules/classes.html#clustering-metrics) in sklearn. 
We would not add all of them to MLlib, but we may add some of them in the future. cc @jkbradley @MLnick @WeichenXu123
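To make the suggestion concrete, the split could look roughly like the sketch below. It is written in PySpark ```Param``` style purely for illustration; the actual change would live in the Scala evaluator, and the ```distance``` param name, its default, and its allowed values are hypothetical placeholders.

```python
from pyspark.ml.param import Param, Params, TypeConverters


class _ClusteringMetricParams(Params):
    # Keep metricName distance-agnostic ("silhouette") so that more clustering
    # metrics can be added later without baking a distance into the name.
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation (silhouette)",
                       typeConverter=TypeConverters.toString)
    # Hypothetical companion param controlling how the metric is computed.
    distance = Param(Params._dummy(), "distance",
                     "the distance measure to use for the metric, e.g. squaredEuclidean",
                     typeConverter=TypeConverters.toString)

    def __init__(self):
        super(_ClusteringMetricParams, self).__init__()
        self._setDefault(metricName="silhouette", distance="squaredEuclidean")
```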
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r138021102 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("cluEval")) + + @Since("2.3.0") + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + @Since("2.3.0") + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + @Since("2.3.0") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + @Since("2.3.0") + def getMetricName: String = $(metricName) + + /** @group setParam */ + @Since("2.3.0") + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +// Silhouette is reasonable only when the number of clusters is grater then 1 +assert(dataset.select($(predictionCol)).distinct().count() > 1, + "Number of clusters must be greater than one.") + +$(metricName) match { + case "squaredSilhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( +dataset, +$(predictionCol), +$(featuresCol) + ) +} + } +} + + +@Since("2.3.0") +object ClusteringEvaluator + extends Defaul
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r138024573 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("cluEval")) + + @Since("2.3.0") + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + @Since("2.3.0") + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + @Since("2.3.0") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + @Since("2.3.0") + def getMetricName: String = $(metricName) + + /** @group setParam */ + @Since("2.3.0") + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +// Silhouette is reasonable only when the number of clusters is grater then 1 +assert(dataset.select($(predictionCol)).distinct().count() > 1, + "Number of clusters must be greater than one.") + +$(metricName) match { + case "squaredSilhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( +dataset, +$(predictionCol), +$(featuresCol) + ) +} + } +} + + +@Since("2.3.0") +object ClusteringEvaluator + extends Defaul
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r138023290 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("cluEval")) + + @Since("2.3.0") + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + @Since("2.3.0") + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + @Since("2.3.0") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + @Since("2.3.0") + def getMetricName: String = $(metricName) + + /** @group setParam */ + @Since("2.3.0") + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +// Silhouette is reasonable only when the number of clusters is grater then 1 +assert(dataset.select($(predictionCol)).distinct().count() > 1, + "Number of clusters must be greater than one.") + +$(metricName) match { + case "squaredSilhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( +dataset, +$(predictionCol), +$(featuresCol) + ) +} + } +} + + +@Since("2.3.0") +object ClusteringEvaluator + extends Defaul
[GitHub] spark issue #19172: [SPARK-21856] Add probability and rawPrediction to MLPC ...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/19172 LGTM2, merged into master. Thanks.
[GitHub] spark pull request #19172: [SPARK-21856] Add probability and rawPrediction t...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19172#discussion_r137975691 --- Diff: python/pyspark/ml/tests.py --- @@ -1655,6 +1655,26 @@ def test_multinomial_logistic_regression_with_bound(self): np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4)) +class MultilayerPerceptronClassifierTest(SparkSessionTestCase): + +def test_multilayer_perceptron_classifier(self): --- End diff -- Rename to ```test_raw_and_probability_prediction```? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19185: [Spark-21854] Added LogisticRegressionTrainingSummary fo...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/19185 @gatorsmile Thanks.
[GitHub] spark issue #19176: [SPARK-21965] [SparkR] Add createOrReplaceGlobalTempView...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/19176 Oh sorry, I didn't notice that PR before. Let's discuss this issue in that JIRA; I'll close this PR. Thanks for all the kind reminders.
[GitHub] spark pull request #19176: [SPARK-21965] [SparkR] Add createOrReplaceGlobalT...
Github user yanboliang closed the pull request at: https://github.com/apache/spark/pull/19176
[GitHub] spark issue #19176: [SPARK-21965] [SparkR] Add createOrReplaceGlobalTempView...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/19176 cc @felixcheung
[GitHub] spark pull request #19172: [SPARK-21856] Add probability and rawPrediction t...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19172#discussion_r137929197 --- Diff: python/pyspark/ml/tests.py --- @@ -1655,6 +1655,25 @@ def test_multinomial_logistic_regression_with_bound(self): np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4)) +class MultilayerPerceptronClassifierTest(SparkSessionTestCase): + +def test_multilayer_perceptron_classifier(self): + +data_path = "data/mllib/sample_multiclass_classification_data.txt" --- End diff -- For other algorithms, I agree with you. However, a dataset that is too simple is not ideal for testing MLPC. This one is tiny (150 rows, 4 columns), so I think we can use it. Thanks.
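For reference, a minimal sketch of what the renamed test could look like, assuming the PR wires the new ```probability``` and ```rawPrediction``` columns through ```transform```; the layer sizes and assertions are illustrative only, and ```SparkSessionTestCase``` is the base class from the pyspark.ml.tests diff above.

```python
from pyspark.ml.classification import MultilayerPerceptronClassifier


class MultilayerPerceptronClassifierTest(SparkSessionTestCase):

    def test_raw_and_probability_prediction(self):
        # Tiny multiclass dataset: 150 rows, 4 features, 3 classes.
        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)
        mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                             blockSize=128, seed=123)
        model = mlp.fit(df)
        result = model.transform(df.limit(1)).head()
        # The new output columns should exist and be sized by the class count.
        self.assertTrue(result.prediction in [0.0, 1.0, 2.0])
        self.assertEqual(len(result.probability), 3)
        self.assertEqual(len(result.rawPrediction), 3)
```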
[GitHub] spark pull request #19172: [SPARK-21856] Add probability and rawPrediction t...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19172#discussion_r137929052 --- Diff: python/pyspark/ml/classification.py --- @@ -1425,11 +1425,13 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, - solver="l-bfgs", initialWeights=None): + solver="l-bfgs", initialWeights=None, probabilityCol="probability", + rawPredicitionCol="rawPrediction"): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ - solver="l-bfgs", initialWeights=None) + solver="l-bfgs", initialWeights=None, probabilityCol="probability", \ + rawPredicitionCol="rawPrediction") --- End diff -- @WeichenXu123 Actually these default values have been set in the base classes ```HasProbabilityCol``` and ```HasRawPredictionCol```, so we don't need to set them again here.
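For context, the shared-param mixins mentioned above already carry those defaults. A simplified sketch of the pattern used by the generated ```pyspark.ml.param.shared``` code (not the verbatim source):

```python
from pyspark.ml.param import Param, Params, TypeConverters


class HasProbabilityCol(Params):
    """Mixin for param probabilityCol (simplified from the generated shared params)."""

    probabilityCol = Param(Params._dummy(), "probabilityCol",
                           "Column name for predicted class conditional probabilities.",
                           typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasProbabilityCol, self).__init__()
        # The default is set once here, so estimators mixing this in do not
        # need to repeat probabilityCol="probability" in __init__ or setParams.
        self._setDefault(probabilityCol="probability")
```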
[GitHub] spark pull request #19172: [SPARK-21856] Add probability and rawPrediction t...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19172#discussion_r137929080 --- Diff: python/pyspark/ml/classification.py --- @@ -1442,11 +1444,13 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred @since("1.6.0") def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, - solver="l-bfgs", initialWeights=None): + solver="l-bfgs", initialWeights=None, probabilityCol="probability", + rawPredicitionCol="rawPrediction"): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ - solver="l-bfgs", initialWeights=None) + solver="l-bfgs", initialWeights=None, probabilityCol="probability", \ + rawPredicitionCol="rawPrediction"): --- End diff -- Align. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19176: [SPARK-21965] [SparkR] Add createOrReplaceGlobalT...
GitHub user yanboliang opened a pull request: https://github.com/apache/spark/pull/19176 [SPARK-21965] [SparkR] Add createOrReplaceGlobalTempView and dropGlobalTempView for SparkR ## What changes were proposed in this pull request? Add ```createOrReplaceGlobalTempView``` and ```dropGlobalTempView``` for SparkR. ## How was this patch tested? Unit tests. You can merge this pull request into a Git repository by running: $ git pull https://github.com/yanboliang/spark SPARK-21965 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19176.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19176 commit 859ab5e1a8071be8e7c3bf9617386aac2edcccea Author: Yanbo Liang Date: 2017-09-09T15:25:05Z Add createOrReplaceGlobalTempView and dropGlobalTempView for SparkR --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19173: [Minor] [SQL] Correct PySpark DataFrame doc.
GitHub user yanboliang opened a pull request: https://github.com/apache/spark/pull/19173 [Minor] [SQL] Correct PySpark DataFrame doc. ## What changes were proposed in this pull request? Correct PySpark DataFrame doc. ## How was this patch tested? Only doc change, no tests. You can merge this pull request into a Git repository by running: $ git pull https://github.com/yanboliang/spark df-doc Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19173.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19173 commit 7686a2ea047a8c2985c1dd902e9f9e4126448838 Author: Yanbo Liang Date: 2017-09-09T09:27:19Z Correct PySpark DataFrame doc. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137278367 --- Diff: mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala --- @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, SparkSession} + + +private[ml] case class ClusteringEvaluationTestData(features: Vector, label: Int) + +class ClusteringEvaluatorSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + test("params") { +ParamsSuite.checkParams(new ClusteringEvaluator) + } + + test("read/write") { +val evaluator = new ClusteringEvaluator() + .setPredictionCol("myPrediction") + .setFeaturesCol("myLabel") +testDefaultReadWrite(evaluator) + } + + /* +Use the following python code to load the data and evaluate it using scikit-learn package. + +from sklearn import datasets +from sklearn.metrics import silhouette_score +iris = datasets.load_iris() +round(silhouette_score(iris.data, iris.target, metric='sqeuclidean'), 10) + +0.6564679231 + */ + test("squared euclidean Silhouette") { +val iris = ClusteringEvaluatorSuite.irisDataset(spark) +val evaluator = new ClusteringEvaluator() +.setFeaturesCol("features") +.setPredictionCol("label") + +assert(evaluator.evaluate(iris) ~== 0.6564679231 relTol 1e-10) + } + --- End diff -- Yeah, I support to keep consistent result. Otherwise, any real value is a confused result. What do you think of it? Thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137253446 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( --- End diff -- Sorry if the comment was left by me. Anyway, I think we should add it, since this class is ```ClusteringEvaluator``` rather than silhouette metric, users should know which metric they are using. And we will support more metrics in the future. Thanks. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with the im...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18538 @mgaido91 I left some minor comments; otherwise, this looks good. Thanks.
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137239650 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) --- End diff -- ```@Since("2.3.0") ``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137240370 --- Diff: mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala --- @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, SparkSession} + + +private[ml] case class ClusteringEvaluationTestData(features: Vector, label: Int) + +class ClusteringEvaluatorSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + test("params") { +ParamsSuite.checkParams(new ClusteringEvaluator) + } + + test("read/write") { +val evaluator = new ClusteringEvaluator() + .setPredictionCol("myPrediction") + .setFeaturesCol("myLabel") +testDefaultReadWrite(evaluator) + } + + /* +Use the following python code to load the data and evaluate it using scikit-learn package. + +from sklearn import datasets +from sklearn.metrics import silhouette_score +iris = datasets.load_iris() +round(silhouette_score(iris.data, iris.target, metric='sqeuclidean'), 10) + +0.6564679231 + */ + test("squared euclidean Silhouette") { +val iris = ClusteringEvaluatorSuite.irisDataset(spark) +val evaluator = new ClusteringEvaluator() +.setFeaturesCol("features") +.setPredictionCol("label") + +assert(evaluator.evaluate(iris) ~== 0.6564679231 relTol 1e-10) --- End diff -- Check with tolerance 1e-5 is good enough. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137224923 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
+ * + * The Silhouette for each point `i` is defined as: + * + * + * $$ + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * $$ + * + * + * which can be rewritten as + * + * + * $$ + * s_{i}= \begin{cases} + * 1-\frac{a_{i}}{b_{i}} & \text{if } a_{i} \leq b_{i} \\ + * \frac{b_{i}}{a_{i}}-1 & \text{if } a_{i} \gt b_{i} \end{cases} + * $$ + * + * + * where `$a_{i}$` is the average dissimilarity of `i` with all other data + * within the same cluster, `$b_{i}$` is the lowest average dissimilarity + * of to any other cluster, of which `i` is not a member. + * `$a_{i}$` can be interpreted as as how well `i` is assigned to its cluster + * (the smaller the value, the better the assignment), while `$b_{i}$` is + * a measure of how well `i` has not been assigned to its "neighboring cluster", + * ie. the nearest cluster to `i`. + * + * Unfortunately, the naive implementation of the algorithm
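For intuition, the definition above can be computed directly from pairwise distances. The following is a naive O(N^2) NumPy reference sketch of the same formula, not the distributed formulation the Scala code implements; it assumes at least two clusters in the input.

```python
import numpy as np


def naive_silhouette(X, labels):
    """Mean silhouette with squared Euclidean distance, straight from the definition."""
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    n = len(X)
    # Pairwise squared Euclidean distances, O(N^2) in both time and memory.
    d = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    scores = []
    for i in range(n):
        same = labels == labels[i]
        same[i] = False
        if not same.any():
            scores.append(0.0)  # convention for singleton clusters
            continue
        a_i = d[i, same].mean()  # mean distance to the other points of i's own cluster
        b_i = min(d[i, labels == c].mean()  # lowest mean distance to any other cluster
                  for c in np.unique(labels) if c != labels[i])
        scores.append((b_i - a_i) / max(a_i, b_i))
    return float(np.mean(scores))
```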
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137239744 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) --- End diff -- ```@Since("2.3.0")``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137242981 --- Diff: mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala --- @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, SparkSession} + + +private[ml] case class ClusteringEvaluationTestData(features: Vector, label: Int) + +class ClusteringEvaluatorSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + test("params") { +ParamsSuite.checkParams(new ClusteringEvaluator) + } + + test("read/write") { +val evaluator = new ClusteringEvaluator() + .setPredictionCol("myPrediction") + .setFeaturesCol("myLabel") +testDefaultReadWrite(evaluator) + } + + /* +Use the following python code to load the data and evaluate it using scikit-learn package. + +from sklearn import datasets +from sklearn.metrics import silhouette_score +iris = datasets.load_iris() +round(silhouette_score(iris.data, iris.target, metric='sqeuclidean'), 10) + +0.6564679231 + */ + test("squared euclidean Silhouette") { +val iris = ClusteringEvaluatorSuite.irisDataset(spark) +val evaluator = new ClusteringEvaluator() +.setFeaturesCol("features") +.setPredictionCol("label") + +assert(evaluator.evaluate(iris) ~== 0.6564679231 relTol 1e-10) + } + --- End diff -- It's better to add another corner case: single cluster. We should guarantee it output consistent result with sklearn. You can just select one cluster from the iris dataset and test it. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
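A minimal sketch of the single-cluster corner case suggested here, reusing the `irisDataset` helper from the quoted suite. The exact expected score would still need to be established from a reference implementation, as was done for the full-dataset test, so only the valid range is asserted below.

```scala
import org.apache.spark.sql.functions.col

test("squared euclidean Silhouette: single cluster") {
  // Select one cluster from the iris dataset, as suggested in the review.
  val iris = ClusteringEvaluatorSuite.irisDataset(spark)
  val singleCluster = iris.filter(col("label") === 0)

  val evaluator = new ClusteringEvaluator()
    .setFeaturesCol("features")
    .setPredictionCol("label")

  // Placeholder assertion: the Silhouette is defined on [-1, 1]; the exact
  // expected value should be taken from a reference implementation.
  val score = evaluator.evaluate(singleCluster)
  assert(score >= -1.0 && score <= 1.0)
}
```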
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137180832 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137239772 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true --- End diff -- ```@Since("2.3.0")``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137178736 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137238642 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137226104 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137180329 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137226738 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137239566 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( --- End diff -- Could we add a param ```metricName``` like other evaluator? It can only support ```silhouette``` currently, but we may add other metric in the future. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
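A sketch of how the suggested ```metricName``` param could look inside `ClusteringEvaluator`, following the pattern used by the other evaluators. This assumes `Param` and `ParamValidators` are imported from `org.apache.spark.ml.param`; names and the doc string are illustrative, and only `silhouette` would be accepted for now.

```scala
/**
 * Param for metric name in evaluation. Only "silhouette" is supported for now.
 * @group param
 */
@Since("2.3.0")
val metricName: Param[String] = {
  val allowedValues = ParamValidators.inArray(Array("silhouette"))
  new Param[String](this, "metricName", "metric name in evaluation (silhouette)", allowedValues)
}

/** @group getParam */
@Since("2.3.0")
def getMetricName: String = $(metricName)

/** @group setParam */
@Since("2.3.0")
def setMetricName(value: String): this.type = set(metricName, value)

setDefault(metricName -> "silhouette")
```

`evaluate` could then dispatch on `$(metricName)`, which keeps the door open for additional metrics later without changing the public API.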
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137239933 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) --- End diff -- ```@Since("2.3.0")``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137180194 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137178071 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137226969 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137239478 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) --- End diff -- ```class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String)``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
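Collecting the `@Since("2.3.0")` annotations requested in this thread (the primary constructor and `uid` here, plus `copy`, `isLargerBetter`, and the companion's `load` in the earlier comments), the public surface would look roughly like the skeleton below. This is only an illustration of the requested annotations, not the merged code; the setters, `evaluate`, and the helper object body are elided.

```scala
@Experimental
@Since("2.3.0")
class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String)
  extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("cluEval"))

  @Since("2.3.0")
  override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap)

  @Since("2.3.0")
  override def isLargerBetter: Boolean = true

  // ... setters and evaluate() as in the quoted diff ...
}

object ClusteringEvaluator extends DefaultParamsReadable[ClusteringEvaluator] {

  @Since("2.3.0")
  override def load(path: String): ClusteringEvaluator = super.load(path)
}
```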
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137226318 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
+ * + * The Silhouette for each point `i` is defined as: + * + * + * $$ + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * $$ + * + * + * which can be rewritten as + * + * + * $$ + * s_{i}= \begin{cases} + * 1-\frac{a_{i}}{b_{i}} & \text{if } a_{i} \leq b_{i} \\ + * \frac{b_{i}}{a_{i}}-1 & \text{if } a_{i} \gt b_{i} \end{cases} + * $$ + * + * + * where `$a_{i}$` is the average dissimilarity of `i` with all other data + * within the same cluster, `$b_{i}$` is the lowest average dissimilarity + * of to any other cluster, of which `i` is not a member. + * `$a_{i}$` can be interpreted as as how well `i` is assigned to its cluster + * (the smaller the value, the better the assignment), while `$b_{i}$` is + * a measure of how well `i` has not been assigned to its "neighboring cluster", + * ie. the nearest cluster to `i`. + * + * Unfortunately, the naive implementation of the algorithm
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137242127 --- Diff: mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala --- @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, SparkSession} + + +private[ml] case class ClusteringEvaluationTestData(features: Vector, label: Int) + +class ClusteringEvaluatorSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + test("params") { +ParamsSuite.checkParams(new ClusteringEvaluator) + } + + test("read/write") { +val evaluator = new ClusteringEvaluator() + .setPredictionCol("myPrediction") + .setFeaturesCol("myLabel") +testDefaultReadWrite(evaluator) + } + + /* +Use the following python code to load the data and evaluate it using scikit-learn package. + +from sklearn import datasets +from sklearn.metrics import silhouette_score +iris = datasets.load_iris() +round(silhouette_score(iris.data, iris.target, metric='sqeuclidean'), 10) + +0.6564679231 + */ + test("squared euclidean Silhouette") { +val iris = ClusteringEvaluatorSuite.irisDataset(spark) +val evaluator = new ClusteringEvaluator() +.setFeaturesCol("features") +.setPredictionCol("label") + +assert(evaluator.evaluate(iris) ~== 0.6564679231 relTol 1e-10) + } + +} + +object ClusteringEvaluatorSuite { + def irisDataset(spark: SparkSession): DataFrame = { +import spark.implicits._ + +val irisCsvPath = Thread.currentThread() + .getContextClassLoader + .getResource("test-data/iris.csv") + .toString + +spark.sparkContext + .textFile(irisCsvPath) + .map { +line => + val splits = line.split(",") + ClusteringEvaluationTestData( +Vectors.dense(splits.take(splits.length-1).map(_.toDouble)), +splits(splits.length-1).toInt + ) + } + .toDF() --- End diff -- Can we store the test data as libsvm format rather than csv? Then we can use ```spark.read.format("libsvm").load(irisPath)``` to load it to a DataFrame with two columns: features and label. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
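As a rough sketch of the suggested change (assuming the iris data were checked in as a libsvm resource, e.g. a hypothetical ```test-data/iris.libsvm```), the helper in the suite would collapse to a single DataFrame read, with no manual parsing and no extra case class:

```
object ClusteringEvaluatorSuite {
  def irisDataset(spark: SparkSession): DataFrame = {
    // Hypothetical resource name; the PR currently ships iris.csv instead.
    val irisPath = Thread.currentThread()
      .getContextClassLoader
      .getResource("test-data/iris.libsvm")
      .toString

    // The libsvm data source yields exactly two columns: "label" and "features".
    spark.read.format("libsvm").load(irisPath)
  }
}
```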
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137239906 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator --- End diff -- ```@Since("2.3.0")``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137175816 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
+ * + * The Silhouette for each point `i` is defined as: + * + * + * $$ + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * $$ + * + * + * which can be rewritten as + * + * + * $$ + * s_{i}= \begin{cases} + * 1-\frac{a_{i}}{b_{i}} & \text{if } a_{i} \leq b_{i} \\ + * \frac{b_{i}}{a_{i}}-1 & \text{if } a_{i} \gt b_{i} \end{cases} + * $$ + * + * + * where `$a_{i}$` is the average dissimilarity of `i` with all other data + * within the same cluster, `$b_{i}$` is the lowest average dissimilarity + * of to any other cluster, of which `i` is not a member. --- End diff -- ```of to``` -> ```of `i` to``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r137178833 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * :: Experimental :: + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. 
+ * + * The Silhouette for each point `i` is defined as: + * + * + * $$ + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * $$ + * + * + * which can be rewritten as + * + * + * $$ + * s_{i}= \begin{cases} + * 1-\frac{a_{i}}{b_{i}} & \text{if } a_{i} \leq b_{i} \\ + * \frac{b_{i}}{a_{i}}-1 & \text{if } a_{i} \gt b_{i} \end{cases} + * $$ + * + * + * where `$a_{i}$` is the average dissimilarity of `i` with all other data + * within the same cluster, `$b_{i}$` is the lowest average dissimilarity + * of to any other cluster, of which `i` is not a member. + * `$a_{i}$` can be interpreted as as how well `i` is assigned to its cluster + * (the smaller the value, the better the assignment), while `$b_{i}$` is + * a measure of how well `i` has not been assigned to its "neighboring cluster", + * ie. the nearest cluster to `i`. + * + * Unfortunately, the naive implementation of the algorithm
[GitHub] spark pull request #8883: [SPARK-10884] [ML] Support prediction on single in...
Github user yanboliang closed the pull request at: https://github.com/apache/spark/pull/8883 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19020: [SPARK-3181] [ML] Implement huber loss for LinearRegress...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/19020 @MLnick @WeichenXu123 Thanks for your comments, also cc @jkbradley @hhbyyh @sethah, would you mind taking a look? Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19020: [SPARK-3181] [ML] Implement huber loss for Linear...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19020#discussion_r136515821 --- Diff: mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala --- @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.Vector + +/** + * HuberAggregator computes the gradient and loss for a huber loss function, + * as used in robust regression for samples in sparse or dense vector in an online fashion. + * + * The huber loss function based on: + * Art B. Owen (2006), A robust hybrid of lasso and ridge regression. + * (http://statweb.stanford.edu/~owen/reports/hhu.pdf) + * + * Two HuberAggregator can be merged together to have a summary of loss and gradient of + * the corresponding joint dataset. + * + * The huber loss function is given by + * + * + * $$ + * \begin{align} + * \min_{w, \sigma}\frac{1}{2n}{\sum_{i=1}^n\left(\sigma + + * H_m\left(\frac{X_{i}w - y_{i}}{\sigma}\right)\sigma\right) + \frac{1}{2}\alpha {||w||_2}^2} + * \end{align} + * $$ + * + * + * where + * + * + * $$ + * \begin{align} + * H_m(z) = \begin{cases} + *z^2, & \text {if } |z| < \epsilon, \\ + *2\epsilon|z| - \epsilon^2, & \text{otherwise} + *\end{cases} + * \end{align} + * $$ + * + * + * It is advised to set the parameter $\epsilon$ to 1.35 to achieve 95% statistical efficiency. + * + * @param fitIntercept Whether to fit an intercept term. + * @param m The shape parameter to control the amount of robustness. + * @param bcFeaturesStd The broadcast standard deviation values of the features. + * @param bcParameters including three parts: the regression coefficients corresponding + * to the features, the intercept (if fitIntercept is ture) + * and the scale parameter (sigma). + */ +private[ml] class HuberAggregator( +fitIntercept: Boolean, +m: Double, +bcFeaturesStd: Broadcast[Array[Double]])(bcParameters: Broadcast[Vector]) + extends DifferentiableLossAggregator[Instance, HuberAggregator] { + + protected override val dim: Int = bcParameters.value.size + private val numFeatures: Int = if (fitIntercept) dim - 2 else dim - 1 + + @transient private lazy val coefficients: Array[Double] = +bcParameters.value.toArray.slice(0, numFeatures) + private val sigma: Double = bcParameters.value(dim - 1) + + @transient private lazy val featuresStd = bcFeaturesStd.value + + /** + * Add a new training instance to this HuberAggregator, and update the loss and gradient + * of the objective function. + * + * @param instance The instance of data point to be added. + * @return This HuberAggregator object. 
+ */ + def add(instance: Instance): HuberAggregator = { +instance match { case Instance(label, weight, features) => + require(numFeatures == features.size, s"Dimensions mismatch when adding new sample." + +s" Expecting $numFeatures but got ${features.size}.") + require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0") + + if (weight == 0.0) return this + + val margin = { +var sum = 0.0 +features.foreachActive { (index, value) => + if (featuresStd(index) != 0.0 && value != 0.0) { +sum += coefficients(index) * (value / featuresStd(index)) + } +} +if (fitIntercept) sum += bcParameters.value(dim - 2) --- End diff -- ```coefficients``` doesn't contain ```intercept``` element, so we can't get
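For reference, a small self-contained sketch (toy values only, not the PR's code) of the parameter layout the diff's ```HuberAggregator``` assumes: the broadcast vector packs the feature coefficients first, then the intercept when ```fitIntercept``` is true, and the scale parameter sigma last, so the ```coefficients``` slice never contains the intercept.

```
// Layout: [coefficients ..., intercept (only if fitIntercept), sigma]
val fitIntercept = true
val parameters = Array(0.5, -0.3, 1.2, 0.8)                     // 2 coefficients, intercept, sigma
val dim = parameters.length
val numFeatures = if (fitIntercept) dim - 2 else dim - 1

val coefficients = parameters.slice(0, numFeatures)             // Array(0.5, -0.3), no intercept here
val intercept = if (fitIntercept) parameters(dim - 2) else 0.0  // 1.2
val sigma = parameters(dim - 1)                                 // 0.8
```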
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r136333104 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + * + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. --- End diff -- BTW, we have necessary docs at ```object SquaredEuclideanSilhouette``` to explain our proposed algorithm, so we can remove this. Usually we only refer to public publication. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r136332399 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + * + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. + * + * The Silhouette for each point `i` is defined as: + * + * + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * + * + * which can be rewritten as + * + * + * s_{i}=\left\{ \begin{tabular}{cc} + * $1-\frac{a_{i}}{b_{i}}$ & if $a_{i} \leq b_{i}$ \\ + * $\frac{b_{i}}{a_{i}}-1$ & if $a_{i} \gt b_{i}$ --- End diff -- 1, Remove ```private[evaluation]``` from ```object SquaredEuclideanSilhouette```. We only generate docs for public APIs, the doc of private APIs are used for developers to understand code. 2, ```cd docs``` 3, Run ```jekyll build``` 4, Then you can get API docs under ```docs/_site/api/scala/index.html```, try to search ```SquaredEuclideanSilhouette```. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well.
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r136306135 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + * + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. + * + * The Silhouette for each point `i` is defined as: + * + * + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * + * + * which can be rewritten as + * + * + * s_{i}=\left\{ \begin{tabular}{cc} + * $1-\frac{a_{i}}{b_{i}}$ & if $a_{i} \leq b_{i}$ \\ + * $\frac{b_{i}}{a_{i}}-1$ & if $a_{i} \gt b_{i}$ + * + * + * where `a(i)` is the average dissimilarity of `i` with all other data + * within the same cluster, `b(i)` is the lowest average dissimilarity + * of to any other cluster, of which `i` is not a member. + * `a(i)` can be interpreted as as how well `i` is assigned to its cluster + * (the smaller the value, the better the assignment), while `b(i)` is + * a measure of how well `i` has not been assigned to its "neighboring cluster", + * ie. the nearest clus
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r136304803 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + * + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. + * + * The Silhouette for each point `i` is defined as: + * + * + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * --- End diff -- The latex formula should be surrounded by $$, change here and other places as: ``` $$ s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} $$ ``` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r136305238 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + * + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. + * + * The Silhouette for each point `i` is defined as: + * + * + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * + * + * which can be rewritten as + * + * + * s_{i}=\left\{ \begin{tabular}{cc} + * $1-\frac{a_{i}}{b_{i}}$ & if $a_{i} \leq b_{i}$ \\ + * $\frac{b_{i}}{a_{i}}-1$ & if $a_{i} \gt b_{i}$ + * + * + * where `a(i)` is the average dissimilarity of `i` with all other data + * within the same cluster, `b(i)` is the lowest average dissimilarity + * of to any other cluster, of which `i` is not a member. + * `a(i)` can be interpreted as as how well `i` is assigned to its cluster + * (the smaller the value, the better the assignment), while `b(i)` is + * a measure of how well `i` has not been assigned to its "neighboring cluster", + * ie. the nearest clus
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r136305819 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * The metric computes the Silhouette measure + * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + * + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. 
+ */ +@Experimental +@Since("2.3.0") +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("cluEval")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + @Since("2.3.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.3.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + @Since("2.3.0") + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) +) + } +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. + * + * The Silhouette for each point `i` is defined as: + * + * + * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * + * + * which can be rewritten as + * + * + * s_{i}=\left\{ \begin{tabular}{cc} + * $1-\frac{a_{i}}{b_{i}}$ & if $a_{i} \leq b_{i}$ \\ + * $\frac{b_{i}}{a_{i}}-1$ & if $a_{i} \gt b_{i}$ --- End diff -- There is syntax error in this latex formula, I checked the generated doc and found it can't show correctly. Or you can paste this formula into http://www.hostmath.com/ to check. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working,
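For reference, a cleaned-up version of that piecewise formula, wrapped in $$ as suggested and using a cases environment instead of tabular (essentially the form the later revision of this scaladoc adopts):

```
$$
s_{i} = \begin{cases}
  1 - \frac{a_{i}}{b_{i}} & \text{if } a_{i} \leq b_{i} \\
  \frac{b_{i}}{a_{i}} - 1 & \text{if } a_{i} > b_{i}
\end{cases}
$$
```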
[GitHub] spark issue #17862: [SPARK-20602] [ML]Adding LBFGS optimizer and Squared_hin...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/17862 +1 @jkbradley for testing on large-scale datasets. @hhbyyh Do you have time to test it? If not, I can help. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #18610: [SPARK-21386] ML LinearRegression supports warm s...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18610#discussion_r136020810 --- Diff: mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala --- @@ -72,6 +72,22 @@ private[regression] trait LinearRegressionParams extends PredictorParams } /** + * Params for linear regression. + */ +private[regression] trait LinearRegressionParams extends LinearRegressionModelParams --- End diff -- The refactor here is not only for this PR: since a model usually has fewer params than its estimator, we make _EstimatorParams_ extend _ModelParams_ in other places as well, for example, _ALSParams_ extends _ALSModelParams_. I think this makes the class hierarchy clearer. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
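As a minimal sketch of that hierarchy (hypothetical trait names, not the actual Spark ones): the model params trait carries only what both the estimator and the fitted model need, and the estimator params trait extends it with training-only params, mirroring _ALSParams_ extending _ALSModelParams_.

```
// Hypothetical names for illustration only.
trait MyModelParams {
  // params needed by both the estimator and the fitted model
  def predictionCol: String = "prediction"
}

trait MyEstimatorParams extends MyModelParams {
  // training-only params that the fitted model never needs
  def maxIter: Int = 100
}
```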
[GitHub] spark pull request #18610: [SPARK-21386] ML LinearRegression supports warm s...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18610#discussion_r136018943 --- Diff: mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala --- @@ -226,6 +246,12 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String if (($(solver) == Auto && numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == Normal) { + + if (isSet(initialModel)) { +logWarning("Initial model will be ignored if fitting by normal solver. " + --- End diff -- Fair enough, I will update it after collecting all comments. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18610: [SPARK-21386] ML LinearRegression supports warm start fr...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18610 @hhbyyh We reached agreement that the initialModel should be of type ```[T <: Model[T]]``` at #9. I understand the scenario you mentioned, however, I think they are different scenarios: 1, If we set ```initialModel``` as a string path, we would need to guarantee that a correct model is stored at that path, but we can't guarantee that by only referring to the string path. 2, I agree that the scenario you mentioned above is really useful, but I think the requirement goes beyond the initial model. The following code snippet can meet that requirement under my framework: ``` val initialModel = LinearRegressionModel.load("path_lr_20iteration_reg0.001.model") val lr = new LinearRegression().setInitialModel(initialModel) val model = lr.fit(dataset) ``` 3, If users have a model in memory and want to use it as the initial model to train another one, they need to save the model first and then set the corresponding path as the initial model path. 4, For your comments about serializing the initial model, I agree with you. I'm OK with not saving ```initialModel``` when saving the estimator. What do you think? Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18998: [SPARK-21748][ML] Migrate the implementation of HashingT...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18998 @srowen You are right, mllib won't be removed before 3.0, but we don't expect to migrate everything at the last minute. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18998: [SPARK-21748][ML] Migrate the implementation of HashingT...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18998 @srowen Besides what @facaiy mentioned above, we will remove the spark.mllib package in the future, so all implementations in spark.mllib should be copied to spark.ml. Actually, lots of MLlib algorithms have already moved their implementations to spark.ml and keep wrappers in spark.mllib. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17014: [SPARK-18608][ML] Fix double-caching in ML algori...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/17014#discussion_r135740464 --- Diff: mllib/src/main/scala/org/apache/spark/ml/Predictor.scala --- @@ -85,6 +86,10 @@ abstract class Predictor[ M <: PredictionModel[FeaturesType, M]] extends Estimator[M] with PredictorParams { + protected[spark] var storageLevel = StorageLevel.NONE + + protected def handlePersistence = storageLevel == StorageLevel.NONE --- End diff -- +1 @WeichenXu123 We can't store the persistence level as a member variable of _Predictor_, since we may use one estimator to fit on the same dataset multiple times, and it may lead to duplicated persistence if we don't handle it carefully. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
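A minimal sketch of the alternative the comment argues for: decide persistence locally inside each training call from the input's current storage level, instead of keeping it as estimator state. This assumes ```dataset``` is the training ```Dataset[_]```; the optimizer loop is elided.

```
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.StorageLevel

def trainOnce(dataset: Dataset[_]): Unit = {
  // Local decision: only persist when the caller has not already cached the input.
  val handlePersistence = dataset.storageLevel == StorageLevel.NONE
  val instances = dataset.rdd
  if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
  try {
    // ... iterate over `instances` to fit the model ...
  } finally {
    if (handlePersistence) instances.unpersist()
  }
}
```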
[GitHub] spark issue #18902: [SPARK-21690][ML] one-pass imputer
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18902 @zhengruifeng It's a known issue that DataFrame-based operations are 2~3x slower than RDD-based ones, because of the deserialization cost. If we switch to an RDD-based method, we need to implement our own aggregator to calculate _mean_ and _median_, which needs much more code than calling the DataFrame API. BTW, DF uses a more compact structure that can reduce the memory footprint. From my perspective, I'd suggest keeping the current DF-based solution, as it will be 5~10x faster than the original implementation. @hhbyyh @MLnick What do you think about it? Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
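As a rough sketch of the kind of single-pass DataFrame aggregation being discussed (```df``` and the column names are assumptions for illustration), the means of all input columns can be collected in one job:

```
import org.apache.spark.sql.functions.{avg, col}

val inputCols = Seq("height", "weight", "age")   // hypothetical input columns
// One pass over `df`: a single aggregate row holding the mean of every input column.
val row = df.select(inputCols.map(c => avg(col(c))): _*).head()
val means = inputCols.indices.map(row.getDouble)
```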
[GitHub] spark pull request #19029: [SPARK-21818][ML][MLLIB] Fix bug of MultivariateO...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19029#discussion_r135216154 --- Diff: mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala --- @@ -440,7 +440,7 @@ private[ml] object WeightedLeastSquares { /** * Weighted population standard deviation of labels. */ -def bStd: Double = math.sqrt(bbSum / wSum - bBar * bBar) +def bStd: Double = math.sqrt(math.max(bbSum / wSum - bBar * bBar, 0.0)) --- End diff -- Please add comments here and below to clarify that we are preventing a negative value caused by numerical error. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
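A tiny standalone illustration of the issue the clamp guards against: for a constant label column the true variance is exactly zero, but computing it as E[b^2] - E[b]^2 in floating point can come out a hair below zero, and math.sqrt of a negative value yields NaN.

```
val labels = Array(0.1, 0.1, 0.1)             // constant labels: true variance is 0
val wSum = labels.length.toDouble
val bBar = labels.sum / wSum
val bbSum = labels.map(x => x * x).sum
val raw = bbSum / wSum - bBar * bBar          // may be a tiny negative number due to rounding
val bStd = math.sqrt(math.max(raw, 0.0))      // clamped to 0.0 instead of producing NaN
```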
[GitHub] spark issue #18315: [SPARK-21108] [ML] convert LinearSVC to aggregator frame...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18315 Merged into master. Thanks for all. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18315: [SPARK-21108] [ML] convert LinearSVC to aggregato...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r135172322 --- Diff: mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala --- @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { + + import DifferentiableLossAggregatorSuite.getClassificationSummarizers + + @transient var instances: Array[Instance] = _ + @transient var instancesConstantFeature: Array[Instance] = _ + + override def beforeAll(): Unit = { +super.beforeAll() +instances = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), + Instance(0.0, 0.3, Vectors.dense(4.0, 0.5)) +) +instancesConstantFeature = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.0, 1.0)), + Instance(1.0, 0.3, Vectors.dense(1.0, 0.5))) + } + + /** Get summary statistics for some data and create a new HingeAggregator. */ + private def getNewAggregator( + instances: Array[Instance], + coefficients: Vector, + fitIntercept: Boolean): HingeAggregator = { +val (featuresSummarizer, ySummarizer) = + DifferentiableLossAggregatorSuite.getClassificationSummarizers(instances) +val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) +val bcFeaturesStd = spark.sparkContext.broadcast(featuresStd) +val bcCoefficients = spark.sparkContext.broadcast(coefficients) +new HingeAggregator(bcFeaturesStd, fitIntercept)(bcCoefficients) + } + + test("aggregator add method input size") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) --- End diff -- Okay. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19029: [SPARK-21818][ML][MLLIB] Fix bug of MultivariateO...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19029#discussion_r134816423 --- Diff: mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala --- @@ -438,6 +438,10 @@ private[ml] object SummaryBuilderImpl extends Logging { while (i < len) { realVariance(i) = (currM2n(i) + deltaMean(i) * deltaMean(i) * weightSum(i) * (totalWeightSum - weightSum(i)) / totalWeightSum) / denominator + // Because of numerical error, it is possible to get negative real variance + if (realVariance(i) < 0.0) { --- End diff -- The computation of _variance_ may hit this numerical error. It seems ```WeightedLeastSquares``` also uses the same method to compute _variance_; will it have a similar issue? @WeichenXu123 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18315: [SPARK-21108] [ML] convert LinearSVC to aggregato...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r134718512 --- Diff: mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala --- @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { + + import DifferentiableLossAggregatorSuite.getClassificationSummarizers + + @transient var instances: Array[Instance] = _ + @transient var instancesConstantFeature: Array[Instance] = _ + + override def beforeAll(): Unit = { +super.beforeAll() +instances = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), + Instance(0.0, 0.3, Vectors.dense(4.0, 0.5)) +) +instancesConstantFeature = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.0, 1.0)), + Instance(1.0, 0.3, Vectors.dense(1.0, 0.5))) + } + + /** Get summary statistics for some data and create a new HingeAggregator. */ + private def getNewAggregator( + instances: Array[Instance], + coefficients: Vector, + fitIntercept: Boolean): HingeAggregator = { +val (featuresSummarizer, ySummarizer) = + DifferentiableLossAggregatorSuite.getClassificationSummarizers(instances) +val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) +val bcFeaturesStd = spark.sparkContext.broadcast(featuresStd) +val bcCoefficients = spark.sparkContext.broadcast(coefficients) +new HingeAggregator(bcFeaturesStd, fitIntercept)(bcCoefficients) + } + + test("aggregator add method input size") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) +val agg = getNewAggregator(instances, Vectors.dense(coefArray ++ interceptArray), + fitIntercept = true) +withClue("HingeAggregator features dimension must match coefficients dimension") { + intercept[IllegalArgumentException] { +agg.add(Instance(1.0, 1.0, Vectors.dense(2.0))) + } +} + } + + test("negative weight") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) --- End diff -- ditto --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. 
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18315: [SPARK-21108] [ML] convert LinearSVC to aggregato...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r134718447 --- Diff: mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala --- @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { + + import DifferentiableLossAggregatorSuite.getClassificationSummarizers + + @transient var instances: Array[Instance] = _ + @transient var instancesConstantFeature: Array[Instance] = _ + + override def beforeAll(): Unit = { +super.beforeAll() +instances = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), + Instance(0.0, 0.3, Vectors.dense(4.0, 0.5)) +) +instancesConstantFeature = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.0, 1.0)), + Instance(1.0, 0.3, Vectors.dense(1.0, 0.5))) + } + + /** Get summary statistics for some data and create a new HingeAggregator. */ + private def getNewAggregator( + instances: Array[Instance], + coefficients: Vector, + fitIntercept: Boolean): HingeAggregator = { +val (featuresSummarizer, ySummarizer) = + DifferentiableLossAggregatorSuite.getClassificationSummarizers(instances) +val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) +val bcFeaturesStd = spark.sparkContext.broadcast(featuresStd) +val bcCoefficients = spark.sparkContext.broadcast(coefficients) +new HingeAggregator(bcFeaturesStd, fitIntercept)(bcCoefficients) + } + + test("aggregator add method input size") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) --- End diff -- This is different from ```LogisticRegression``` which supports multi-classification, so ```interceptArray``` should be renamed to ```intercept```? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18315: [SPARK-21108] [ML] convert LinearSVC to aggregato...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r134717404 --- Diff: mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HingeAggregator.scala --- @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg._ + +/** + * HingeAggregator computes the gradient and loss for loss function ("hinge" or + * "squared_hinge", as used in binary classification for instances in sparse or dense --- End diff -- Yeah, I agree with you on having a separate ```SquaredHingeAggregator```. Then should we remove ```squared_hinge``` from here? BTW, you missed the right parenthesis here. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19011: [ML][MINOR] Make sharedParams update.
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/19011 Merged into master. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #19020: [SPARK-3181] [ML] Implement huber loss for LinearRegress...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/19020 @MLnick Yeah, I think we have reached an agreement in the JIRA discussion. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19020: [SPARK-3181] [ML] Implement huber loss for Linear...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19020#discussion_r134476397 --- Diff: mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala --- @@ -344,33 +408,58 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String } else { None } -val costFun = new RDDLossFunction(instances, getAggregatorFunc, regularization, - $(aggregationDepth)) -val optimizer = if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { - new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) -} else { - val standardizationParam = $(standardization) - def effectiveL1RegFun = (index: Int) => { -if (standardizationParam) { - effectiveL1RegParam +val costFun = $(loss) match { + case LeastSquares => +val getAggregatorFunc = new LeastSquaresAggregator(yStd, yMean, $(fitIntercept), + bcFeaturesStd, bcFeaturesMean)(_) +new RDDLossFunction(instances, getAggregatorFunc, regularization, $(aggregationDepth)) + case Huber => +val getAggregatorFunc = new HuberAggregator($(fitIntercept), $(m), bcFeaturesStd)(_) +new RDDLossFunction(instances, getAggregatorFunc, regularization, $(aggregationDepth)) --- End diff -- I think not. I tried to factor it out, but because ```LeastSquaresAggregator``` and ```HuberAggregator``` each pass in their own type, the compiler will complain. Maybe @sethah can give some suggestions? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14326: [SPARK-3181] [ML] Implement RobustRegression with huber ...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/14326 Please go to #19020 for reviewing and comments. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19020: [SPARK-3181] [ML] Implement huber loss for Linear...
GitHub user yanboliang opened a pull request: https://github.com/apache/spark/pull/19020 [SPARK-3181] [ML] Implement huber loss for LinearRegression. ## What changes were proposed in this pull request? The current implementation is a straightforward port of Python scikit-learn's HuberRegressor, so it produces the same result as that. Objective function: ![image](https://user-images.githubusercontent.com/1962026/29554124-9544d198-8750-11e7-8afa-33579ec419d5.png) ## How was this patch tested? Unit tests. You can merge this pull request into a Git repository by running: $ git pull https://github.com/yanboliang/spark spark-3181 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19020.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19020 commit 2ee705d350fed8927c7fa65f9f6e8b44223071ac Author: Yanbo Liang Date: 2017-08-20T05:45:36Z Implement HuberAggregator and add tests. commit 630f8b65ffa1b30c1dcb20122e0f2609b90284a5 Author: Yanbo Liang Date: 2017-08-21T13:43:28Z Implement huber loss for LinearRegression. commit 50eaee26712b05f321cae0777b2f3a12c4f1f4c0 Author: Yanbo Liang Date: 2017-08-22T03:21:42Z Update HuberAggregator and tests. commit 2891f9938fd8d32b55a6ac9a5848f9c594597c65 Author: Yanbo Liang Date: 2017-08-22T04:34:02Z Update params doc and check for illegal params. commit 91424712ecffca00abb6172de555d76c94a26400 Author: Yanbo Liang Date: 2017-08-22T07:25:27Z Update LinearRegression test suites. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
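Since the objective-function image above does not render in a plain-text digest, the scikit-learn HuberRegressor objective that the PR says it ports is roughly the following (quoted from memory, so treat it as an approximation): epsilon is the Huber threshold, alpha the L2 regularization strength, and sigma a jointly estimated scale parameter.

```
\min_{w,\sigma}\; \sum_{i=1}^{n}\left(\sigma + H_{\epsilon}\!\left(\frac{X_i w - y_i}{\sigma}\right)\sigma\right) + \alpha\,\|w\|_2^2,
\qquad
H_{\epsilon}(z) =
\begin{cases}
z^2 & \text{if } |z| < \epsilon,\\
2\epsilon|z| - \epsilon^2 & \text{otherwise.}
\end{cases}
```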
[GitHub] spark issue #14326: [SPARK-3181] [ML] Implement RobustRegression with huber ...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/14326 I'll close this PR and open a new one. Feel free to review and comment. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14326: [SPARK-3181] [ML] Implement RobustRegression with...
Github user yanboliang closed the pull request at: https://github.com/apache/spark/pull/14326 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #18992: [SPARK-19762][ML][FOLLOWUP]Add necessary comments to L2R...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18992 Merged into master. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #19011: [ML][MINOR] Make sharedParams update.
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/19011#discussion_r134232769 --- Diff: mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala --- @@ -154,7 +154,7 @@ private[ml] trait HasVarianceCol extends Params { } /** - * Trait for shared param threshold (default: 0.5). --- End diff -- The definition in ```SharedParamsCodeGen``` has no default value yet, since both ```LogisticRegression``` and ```LinearSVC``` extend ```HasThreshold``` but have different default values. ``` ParamDesc[Double]("threshold", "threshold in binary classification prediction, in range [0, 1]", isValid = "ParamValidators.inRange(0, 1)", finalMethods = false, finalFields = false), ``` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
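For context, the generated counterpart of that `ParamDesc` entry in `sharedParams.scala` looks roughly like this (paraphrased from memory, so treat it as an approximation); because no default is given here, each algorithm calls `setDefault(threshold -> ...)` itself with its own value.

```
package org.apache.spark.ml.param.shared

import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators}

private[ml] trait HasThreshold extends Params {
  /** Param for threshold in binary classification prediction, in range [0, 1]. */
  val threshold: DoubleParam = new DoubleParam(this, "threshold",
    "threshold in binary classification prediction, in range [0, 1]",
    ParamValidators.inRange(0, 1))

  /** No setDefault here: the concrete estimator supplies its own default. */
  def getThreshold: Double = $(threshold)
}
```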
[GitHub] spark pull request #19011: [ML][MINOR] Make sharedParams update.
GitHub user yanboliang opened a pull request: https://github.com/apache/spark/pull/19011 [ML][MINOR] Make sharedParams update. ## What changes were proposed in this pull request? ```sharedParams.scala``` is generated by ```SharedParamsCodeGen```, but it's currently out of date. Maybe someone updated ```sharedParams.scala``` manually; this PR fixes this issue. ## How was this patch tested? Offline check. You can merge this pull request into a Git repository by running: $ git pull https://github.com/yanboliang/spark sharedParams Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/19011.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #19011 commit 85307410cf3438f407999506614cfdd7349bcbf4 Author: Yanbo Liang Date: 2017-08-21T13:28:59Z Make sharedParams update. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r133961918 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.Experimental +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * At the moment, the supported metrics are: + * squaredSilhouette: silhouette measure using the squared Euclidean distance; + * cosineSilhouette: silhouette measure using the cosine distance. + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. 
+ */ +@Experimental +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("SquaredEuclideanSilhouette")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + def getMetricName: String = $(metricName) + + /** @group setParam */ + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +val metric: Double = $(metricName) match { + case "squaredSilhouette" => +SquaredEuclideanSilhouette.computeSquaredSilhouette( + dataset, + $(predictionCol), + $(featuresCol) +) +} +metric + } + +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + +private[evaluation] object SquaredEuclideanSilhouette { + + private[this] var kryoRegistrationPerformed: Boolean = false + + /** + * This method registers the class + * [[org.apache.spark.ml.evaluation.SquaredEuclideanSilhouette.ClusterStats]] + * for kryo serialization. + * + * @param sc `SparkContext` to be used + */ + def registerKryoClasses(sc: SparkContext): Unit = { +if (! kryoRegistrationPerformed) { + sc.getConf
[GitHub] spark issue #18992: [SPARK-19762][ML][FOLLOWUP]Add necessary comments to L2R...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18992 @sethah @srowen Thanks for your great contributions to #17094. I hope you don't mind: I found this annotation section was missing. I think it's very important to let users/developers understand how we handle _standardization_, so I added it back. Would you mind having a look? Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18992: [SPARK-19762][ML][FOLLOWUP]Add necessary comments...
GitHub user yanboliang opened a pull request: https://github.com/apache/spark/pull/18992 [SPARK-19762][ML][FOLLOWUP]Add necessary comments to L2Regularization. ## What changes were proposed in this pull request? MLlib LiR/LoR/SR always standardize the data during training to improve the rate of convergence regardless of whether _standardization_ is true or false. If _standardization_ is false, we perform reverse standardization by penalizing each component differently to get effectively the same objective function when the training dataset is not standardized. We should keep these comments in the code to let developers understand how we handle it correctly. ## How was this patch tested? Existing tests; only adding some comments in code. You can merge this pull request into a Git repository by running: $ git pull https://github.com/yanboliang/spark SPARK-19762 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/18992.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #18992 commit 88b20dc8db1a95d202faaf0eaad2c60f42ff603d Author: Yanbo Liang Date: 2017-08-18T10:43:20Z Add necessary comments to L2Regularization. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
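For readers who have not seen the comments being restored, the idea can be summarized loosely as follows (a paraphrase, not the exact wording of the patch): the optimizer always works on coefficients w-hat_j for the scaled features x_j / sigma_j, and when standardization = false the L2 term is evaluated on the coefficients mapped back to the original feature space, which is what "penalizing each component differently" refers to.

```
\text{standardization = true:}\quad \frac{\lambda}{2}\sum_j \hat{w}_j^{\,2}
\qquad\qquad
\text{standardization = false:}\quad \frac{\lambda}{2}\sum_j \left(\frac{\hat{w}_j}{\sigma_j}\right)^{2}
```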
[GitHub] spark issue #18980: Correct validateAndTransformSchema in GaussianMixture
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18980 +1 @srowen, this is a bug. @sharp-pixel Would you mind fixing both ```GaussianMixture``` and ```AFTSurvivalRegression```? It's better to file a JIRA first and add some unit tests. Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r133876325 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.Experimental +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * At the moment, the supported metrics are: + * squaredSilhouette: silhouette measure using the squared Euclidean distance; + * cosineSilhouette: silhouette measure using the cosine distance. + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. 
+ */ +@Experimental +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("SquaredEuclideanSilhouette")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + def getMetricName: String = $(metricName) + + /** @group setParam */ + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +val metric: Double = $(metricName) match { + case "squaredSilhouette" => +SquaredEuclideanSilhouette.computeSquaredSilhouette( + dataset, + $(predictionCol), + $(featuresCol) +) +} +metric + } + +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + +private[evaluation] object SquaredEuclideanSilhouette { + + private[this] var kryoRegistrationPerformed: Boolean = false + + /** + * This method registers the class + * [[org.apache.spark.ml.evaluation.SquaredEuclideanSilhouette.ClusterStats]] + * for kryo serialization. + * + * @param sc `SparkContext` to be used + */ + def registerKryoClasses(sc: SparkContext): Unit = { +if (! kryoRegistrationPerformed) { + sc.getConf
[GitHub] spark pull request #18538: [SPARK-14516][ML] Adding ClusteringEvaluator with...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18538#discussion_r133875990 --- Diff: mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala --- @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.annotation.Experimental +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, udf} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * At the moment, the supported metrics are: + * squaredSilhouette: silhouette measure using the squared Euclidean distance; + * cosineSilhouette: silhouette measure using the cosine distance. + * The implementation follows the proposal explained + * https://drive.google.com/file/d/0B0Hyo%5f%5fbG%5f3fdkNvSVNYX2E3ZU0/view";> + * in this document. 
+ */ +@Experimental +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("SquaredEuclideanSilhouette")) + + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) +new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams +) + } + + /** @group getParam */ + def getMetricName: String = $(metricName) + + /** @group setParam */ + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + override def evaluate(dataset: Dataset[_]): Double = { +SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) +SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + +val metric: Double = $(metricName) match { + case "squaredSilhouette" => +SquaredEuclideanSilhouette.computeSquaredSilhouette( + dataset, + $(predictionCol), + $(featuresCol) +) +} +metric + } + +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} + +private[evaluation] object SquaredEuclideanSilhouette { --- End diff -- Usually we should paste the formula here to explain how we compute ```Silhouette Coefficient``` by the high efficient distributed implementation. Because your design document is not a publication, so I think we need to move it from there, but you can simplify it. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does n
[GitHub] spark issue #18902: [SPARK-21690][ML] one-pass imputer
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18902 @zhengruifeng What does _the RDD-based one_ mean? Is it the code on master or the code in your former commit? Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #17862: [SPARK-20602] [ML]Adding LBFGS optimizer and Squared_hin...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/17862 +1 @WeichenXu123 IIRC softmax regression also includes a non-differentiable point, and we can use LBFGS to solve it as well. We can support _squared hinge loss_, which is a smooth function, in the future, so users can switch to _squared hinge loss_ if they hit that condition, which happens with little probability. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
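For reference, the standard definitions behind this discussion (not taken from the PR): with label y in {-1, +1} and margin m = y * w^T x,

```
L_{\text{hinge}}(m) = \max(0,\; 1 - m), \qquad
L_{\text{squared hinge}}(m) = \max(0,\; 1 - m)^2 .
```

The hinge loss is not differentiable at m = 1, while the squared hinge loss is differentiable there, which is why it is the safer choice for a pure LBFGS solver.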
[GitHub] spark issue #18315: [SPARK-21108] [ML] [WIP] convert LinearSVC to aggregator...
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18315 @hhbyyh I think it's ready to remove ```WIP``` from the PR title. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18315: [SPARK-21108] [ML] [WIP] convert LinearSVC to agg...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r133671265 --- Diff: mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala --- @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { + + import DifferentiableLossAggregatorSuite.getClassificationSummarizers + + @transient var instances: Array[Instance] = _ + + override def beforeAll(): Unit = { +super.beforeAll() +instances = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), + Instance(0.0, 0.3, Vectors.dense(4.0, 0.5)) +) + } + + /** Get summary statistics for some data and create a new HingeAggregator. */ + private def getNewAggregator( + instances: Array[Instance], + coefficients: Vector, + fitIntercept: Boolean): HingeAggregator = { +val (featuresSummarizer, ySummarizer) = + DifferentiableLossAggregatorSuite.getClassificationSummarizers(instances) +val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) +val bcFeaturesStd = spark.sparkContext.broadcast(featuresStd) +val bcCoefficients = spark.sparkContext.broadcast(coefficients) +new HingeAggregator(bcFeaturesStd, fitIntercept)(bcCoefficients) + } + + test("aggregator add method input size") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) +val agg = getNewAggregator(instances, Vectors.dense(coefArray ++ interceptArray), + fitIntercept = true) +withClue("HingeAggregator features dimension must match coefficients dimension") { + intercept[IllegalArgumentException] { +agg.add(Instance(1.0, 1.0, Vectors.dense(2.0))) + } +} + } + + test("negative weight") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) +val agg = getNewAggregator(instances, Vectors.dense(coefArray ++ interceptArray), + fitIntercept = true) +withClue("HingeAggregator does not support negative instance weights") { + intercept[IllegalArgumentException] { +agg.add(Instance(1.0, -1.0, Vectors.dense(2.0, 1.0))) + } +} + } + + test("check sizes binomial") { --- End diff -- Remove ```binomial```? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18315: [SPARK-21108] [ML] [WIP] convert LinearSVC to agg...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r133662950 --- Diff: mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala --- @@ -173,7 +174,7 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau test("sparse coefficients in SVCAggregator") { --- End diff -- ```SVCAggregator``` -> ```HingeAggregator```? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18315: [SPARK-21108] [ML] [WIP] convert LinearSVC to agg...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r133671815 --- Diff: mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala --- @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { + + import DifferentiableLossAggregatorSuite.getClassificationSummarizers + + @transient var instances: Array[Instance] = _ + + override def beforeAll(): Unit = { +super.beforeAll() +instances = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), + Instance(0.0, 0.3, Vectors.dense(4.0, 0.5)) +) + } + + /** Get summary statistics for some data and create a new HingeAggregator. 
*/ + private def getNewAggregator( + instances: Array[Instance], + coefficients: Vector, + fitIntercept: Boolean): HingeAggregator = { +val (featuresSummarizer, ySummarizer) = + DifferentiableLossAggregatorSuite.getClassificationSummarizers(instances) +val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) +val bcFeaturesStd = spark.sparkContext.broadcast(featuresStd) +val bcCoefficients = spark.sparkContext.broadcast(coefficients) +new HingeAggregator(bcFeaturesStd, fitIntercept)(bcCoefficients) + } + + test("aggregator add method input size") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) +val agg = getNewAggregator(instances, Vectors.dense(coefArray ++ interceptArray), + fitIntercept = true) +withClue("HingeAggregator features dimension must match coefficients dimension") { + intercept[IllegalArgumentException] { +agg.add(Instance(1.0, 1.0, Vectors.dense(2.0))) + } +} + } + + test("negative weight") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) +val agg = getNewAggregator(instances, Vectors.dense(coefArray ++ interceptArray), + fitIntercept = true) +withClue("HingeAggregator does not support negative instance weights") { + intercept[IllegalArgumentException] { +agg.add(Instance(1.0, -1.0, Vectors.dense(2.0, 1.0))) + } +} + } + + test("check sizes binomial") { +val rng = new scala.util.Random +val numFeatures = instances.head.features.size +val coefWithIntercept = Vectors.dense(Array.fill(numFeatures + 1)(rng.nextDouble)) +val coefWithoutIntercept = Vectors.dense(Array.fill(numFeatures)(rng.nextDouble)) +val aggIntercept = getNewAggregator(instances, coefWithIntercept, fitIntercept = true) +val aggNoIntercept = getNewAggregator(instances, coefWithoutIntercept, + fitIntercept = false) +instances.foreach(aggIntercept.add) +instances.foreach(aggNoIntercept.add) + +assert(aggIntercept.gradient.size === numFeatures + 1) +assert(aggNoIntercept.gradient.size === numFeatures) + } + + + test("check correctness binomial") { +val coefArray = Array(1.0, 2.0) +val intercept = 1.0 +val numFeatures = instances.head.features.size +val (featuresSummarizer, _) = getClassificationSummarizers(instances) +val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) +val weightSum = instances.map(_.weight).sum + +val agg = getNewAggregator(instances, Vectors.dense(coefArray ++ Array
[GitHub] spark pull request #18315: [SPARK-21108] [ML] [WIP] convert LinearSVC to agg...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r133665834 --- Diff: mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala --- @@ -219,8 +219,17 @@ class LinearSVC @Since("2.2.0") ( val featuresStd = summarizer.variance.toArray.map(math.sqrt) val regParamL2 = $(regParam) val bcFeaturesStd = instances.context.broadcast(featuresStd) - val costFun = new LinearSVCCostFun(instances, $(fitIntercept), -$(standardization), bcFeaturesStd, regParamL2, $(aggregationDepth)) + val regularization = if (regParamL2 != 0.0) { +val shouldApply = (idx: Int) => idx >= 0 && idx < numFeatures +Some(new L2Regularization(regParamL2, shouldApply, + if ($(standardization)) None else Some(featuresStd))) --- End diff -- Minor: the third argument ```applyFeaturesStd``` is semantically a function rather than an array: ``` private[ml] class L2Regularization( override val regParam: Double, shouldApply: Int => Boolean, applyFeaturesStd: Option[Int => Double]) extends DifferentiableRegularization[Vector] ``` In LiR and LoR, we use a function: ``` val getFeaturesStd = (j: Int) => if (j >= 0 && j < numFeatures) featuresStd(j) else 0.0 ``` I think either is OK, but it's better to keep this consistent with the other algorithms. We can either change this to use a function, or change the third argument of ```L2Regularization``` to ```Option[Array[Double]]```. I prefer the former way; what's your opinion? Thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18315: [SPARK-21108] [ML] [WIP] convert LinearSVC to agg...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r133671409 --- Diff: mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala --- @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { + + import DifferentiableLossAggregatorSuite.getClassificationSummarizers + + @transient var instances: Array[Instance] = _ + + override def beforeAll(): Unit = { +super.beforeAll() +instances = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), + Instance(0.0, 0.3, Vectors.dense(4.0, 0.5)) +) + } + + /** Get summary statistics for some data and create a new HingeAggregator. 
*/ + private def getNewAggregator( + instances: Array[Instance], + coefficients: Vector, + fitIntercept: Boolean): HingeAggregator = { +val (featuresSummarizer, ySummarizer) = + DifferentiableLossAggregatorSuite.getClassificationSummarizers(instances) +val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) +val bcFeaturesStd = spark.sparkContext.broadcast(featuresStd) +val bcCoefficients = spark.sparkContext.broadcast(coefficients) +new HingeAggregator(bcFeaturesStd, fitIntercept)(bcCoefficients) + } + + test("aggregator add method input size") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) +val agg = getNewAggregator(instances, Vectors.dense(coefArray ++ interceptArray), + fitIntercept = true) +withClue("HingeAggregator features dimension must match coefficients dimension") { + intercept[IllegalArgumentException] { +agg.add(Instance(1.0, 1.0, Vectors.dense(2.0))) + } +} + } + + test("negative weight") { +val coefArray = Array(1.0, 2.0) +val interceptArray = Array(2.0) +val agg = getNewAggregator(instances, Vectors.dense(coefArray ++ interceptArray), + fitIntercept = true) +withClue("HingeAggregator does not support negative instance weights") { + intercept[IllegalArgumentException] { +agg.add(Instance(1.0, -1.0, Vectors.dense(2.0, 1.0))) + } +} + } + + test("check sizes binomial") { +val rng = new scala.util.Random +val numFeatures = instances.head.features.size +val coefWithIntercept = Vectors.dense(Array.fill(numFeatures + 1)(rng.nextDouble)) +val coefWithoutIntercept = Vectors.dense(Array.fill(numFeatures)(rng.nextDouble)) +val aggIntercept = getNewAggregator(instances, coefWithIntercept, fitIntercept = true) +val aggNoIntercept = getNewAggregator(instances, coefWithoutIntercept, + fitIntercept = false) +instances.foreach(aggIntercept.add) +instances.foreach(aggNoIntercept.add) + +assert(aggIntercept.gradient.size === numFeatures + 1) +assert(aggNoIntercept.gradient.size === numFeatures) + } + + + test("check correctness binomial") { --- End diff -- Ditto. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- -
[GitHub] spark pull request #18315: [SPARK-21108] [ML] [WIP] convert LinearSVC to agg...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18315#discussion_r133664633
--- Diff: mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HingeAggregator.scala ---
@@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg._ + +/** + * HingeAggregator computes the gradient and loss for loss function ("hinge" or + * "squared_hinge", as used in binary classification for instances in sparse or dense
--- End diff --
It seems this only supports _hinge_ loss currently. BTW, if we support _squared hinge_ in the future, what is the best way: add a _loss function_ param to ```HingeAggregator```, or add a new ```SquaredHingeAggregator```? The latter would be clearer, but involves more code.
---
- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
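Whichever design is chosen, the piece that actually differs between hinge and squared hinge is the per-instance loss and its gradient with respect to the margin. A self-contained sketch of the two variants (illustrative only, not code from the PR; labels are assumed to be already mapped from {0, 1} to {-1, +1}):

```scala
// Returns (loss, dLoss/dMargin) for one instance.
// labelScaled is the label mapped to -1.0 or +1.0.
def hinge(labelScaled: Double, margin: Double): (Double, Double) = {
  val violation = 1.0 - labelScaled * margin
  if (violation > 0) (violation, -labelScaled) else (0.0, 0.0)
}

def squaredHinge(labelScaled: Double, margin: Double): (Double, Double) = {
  val violation = 1.0 - labelScaled * margin
  if (violation > 0) (violation * violation, -2.0 * labelScaled * violation) else (0.0, 0.0)
}
```

A _loss function_ param would branch between these two inside one aggregator, while a separate ```SquaredHingeAggregator``` would duplicate the surrounding add/merge logic in exchange for a clearer API.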
[GitHub] spark pull request #18902: [SPARK-21690][ML] one-pass imputer
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18902#discussion_r133649640
--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala ---
@@ -133,23 +133,45 @@ class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String) override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) val spark = dataset.sparkSession -import spark.implicits._ -val surrogates = $(inputCols).map { inputCol => - val ic = col(inputCol) - val filtered = dataset.select(ic.cast(DoubleType)) -.filter(ic.isNotNull && ic =!= $(missingValue) && !ic.isNaN) - if(filtered.take(1).length == 0) { -throw new SparkException(s"surrogate cannot be computed. " + - s"All the values in $inputCol are Null, Nan or missingValue(${$(missingValue)})") - } - val surrogate = $(strategy) match { -case Imputer.mean => filtered.select(avg(inputCol)).as[Double].first() -case Imputer.median => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001).head - } - surrogate + +val cols = $(inputCols).map { inputCol => + when(col(inputCol).equalTo($(missingValue)), null) +.when(col(inputCol).isNaN, null) +.otherwise(col(inputCol)) +.cast("double") +.as(inputCol) +} + +val results = $(strategy) match { + case Imputer.mean => +val row = dataset.select(cols.map(avg): _*).head()
--- End diff --
Add a comment here to clarify that the _avg_ function ignores _null_ values automatically.
---
- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
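As a quick illustration of that behavior (not part of the PR; runnable in a spark-shell where `spark` is in scope):

```scala
import spark.implicits._
import org.apache.spark.sql.functions.avg

// The None row becomes a null and is skipped by avg, so the mean is 2.0, not 4/3.
val df = Seq(Some(1.0), None, Some(3.0)).toDF("x")
df.select(avg("x")).head.getDouble(0)  // 2.0
```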
[GitHub] spark pull request #18902: [SPARK-21690][ML] one-pass imputer
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18902#discussion_r133649896
--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala ---
@@ -133,23 +133,45 @@ class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String) override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) val spark = dataset.sparkSession -import spark.implicits._ -val surrogates = $(inputCols).map { inputCol => - val ic = col(inputCol) - val filtered = dataset.select(ic.cast(DoubleType)) -.filter(ic.isNotNull && ic =!= $(missingValue) && !ic.isNaN) - if(filtered.take(1).length == 0) { -throw new SparkException(s"surrogate cannot be computed. " + - s"All the values in $inputCol are Null, Nan or missingValue(${$(missingValue)})") - } - val surrogate = $(strategy) match { -case Imputer.mean => filtered.select(avg(inputCol)).as[Double].first() -case Imputer.median => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001).head - } - surrogate + +val cols = $(inputCols).map { inputCol => + when(col(inputCol).equalTo($(missingValue)), null) +.when(col(inputCol).isNaN, null) +.otherwise(col(inputCol)) +.cast("double") +.as(inputCol) +} + +val results = $(strategy) match { + case Imputer.mean => +val row = dataset.select(cols.map(avg): _*).head() +Array.range(0, $(inputCols).length).map { i => + if (row.isNullAt(i)) { +Double.NaN + } else { +row.getDouble(i) + } +} + + case Imputer.median => +dataset.select(cols: _*).stat.approxQuantile($(inputCols), Array(0.5), 0.001)
--- End diff --
BTW, add a comment here to clarify that the _approxQuantile_ function ignores _null_ values automatically.
---
- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
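Similarly, a quick check of the null handling of `approxQuantile` (illustrative, not part of the PR; runnable in a spark-shell where `spark` is in scope):

```scala
import spark.implicits._

// The null row is dropped before the quantile is computed, so the median
// is taken over 1.0, 2.0, 3.0 only.
val df = Seq(Some(1.0), None, Some(2.0), Some(3.0)).toDF("x")
df.stat.approxQuantile("x", Array(0.5), 0.001).head  // 2.0
```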
[GitHub] spark issue #18902: [SPARK-21690][ML] one-pass imputer
Github user yanboliang commented on the issue: https://github.com/apache/spark/pull/18902
@hhbyyh @zhengruifeng I'm OK with the _convert to null_ approach. I think there is no extra pass over the data if we handle it this way, and the DataFrame/RDD functions that compute _mean/median_ ignore _null_ values. Thanks.
---
- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18902: [SPARK-21690][ML] one-pass imputer
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/18902#discussion_r133647271
--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala ---
@@ -133,23 +133,45 @@ class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String) override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) val spark = dataset.sparkSession -import spark.implicits._ -val surrogates = $(inputCols).map { inputCol => - val ic = col(inputCol) - val filtered = dataset.select(ic.cast(DoubleType)) -.filter(ic.isNotNull && ic =!= $(missingValue) && !ic.isNaN) - if(filtered.take(1).length == 0) { -throw new SparkException(s"surrogate cannot be computed. " + - s"All the values in $inputCol are Null, Nan or missingValue(${$(missingValue)})") - } - val surrogate = $(strategy) match { -case Imputer.mean => filtered.select(avg(inputCol)).as[Double].first() -case Imputer.median => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001).head - } - surrogate + +val cols = $(inputCols).map { inputCol => + when(col(inputCol).equalTo($(missingValue)), null) +.when(col(inputCol).isNaN, null) +.otherwise(col(inputCol)) +.cast("double") +.as(inputCol) +} + +val results = $(strategy) match { + case Imputer.mean => +val row = dataset.select(cols.map(avg): _*).head() +Array.range(0, $(inputCols).length).map { i => + if (row.isNullAt(i)) { +Double.NaN + } else { +row.getDouble(i) + } +} + + case Imputer.median => +dataset.select(cols: _*).stat.approxQuantile($(inputCols), Array(0.5), 0.001)
--- End diff --
Add an API annotation to clarify that the relative error of the _median_ is 0.001 when _strategy == median_.
---
- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
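To make the requested annotations concrete, the two branches of the diff with illustrative comments added might read as follows (the wording is a suggestion, not text from the PR):

```scala
val results = $(strategy) match {
  case Imputer.mean =>
    // avg ignores null automatically; a column whose values are all
    // null/NaN/missingValue yields a null here, hence the isNullAt check below.
    val row = dataset.select(cols.map(avg): _*).head()
    Array.range(0, $(inputCols).length).map { i =>
      if (row.isNullAt(i)) Double.NaN else row.getDouble(i)
    }

  case Imputer.median =>
    // approxQuantile ignores null automatically; the relative error of the
    // approximate median is fixed at 0.001 here.
    dataset.select(cols: _*).stat.approxQuantile($(inputCols), Array(0.5), 0.001)
}
```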