GitHub user wangmiao1981 commented on a diff in the pull request: https://github.com/apache/spark/pull/12402#discussion_r60694336 --- Diff: mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala --- @@ -104,6 +105,17 @@ class GaussianMixtureModel private[ml] ( @Since("2.0.0") def gaussians: Array[MultivariateGaussian] = parentModel.gaussians + /** + * Helper method used in Python + */ + def gaussiansDF: DataFrame = { --- End diff -- @jkbradley After changing to the above lines, I got some errors. It seems that I can't keep a SparkContext in the model to get the sqlContext: the model is captured by the prediction UDF's closure, and SparkContext is not serializable (see the "Caused by" entries at the end of the trace). Errors: execute, tree: Exchange hashpartitioning(gm_prediction#3, 200), None +- WholeStageCodegen : +- TungstenAggregate(key=[gm_prediction#3], functions=[(count(1),mode=Partial,isDistinct=false)], output=[gm_prediction#3,count#32L]) : +- Project [UDF(features#0) AS gm_prediction#3] : +- INPUT +- Scan ExistingRDD[features#0] org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree: Exchange hashpartitioning(gm_prediction#3, 200), None +- WholeStageCodegen : +- TungstenAggregate(key=[gm_prediction#3], functions=[(count(1),mode=Partial,isDistinct=false)], output=[gm_prediction#3,count#32L]) : +- Project [UDF(features#0) AS gm_prediction#3] : +- INPUT +- Scan ExistingRDD[features#0] at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:50) at org.apache.spark.sql.execution.exchange.ShuffleExchange.doExecute(ShuffleExchange.scala:109) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117) at 
org.apache.spark.sql.execution.InputAdapter.upstreams(WholeStageCodegen.scala:237) at org.apache.spark.sql.execution.aggregate.TungstenAggregate.upstreams(TungstenAggregate.scala:131) at org.apache.spark.sql.execution.WholeStageCodegen.doExecute(WholeStageCodegen.scala:352) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117) at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:230) at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:277) at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2099) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:53) at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2386) at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2098) at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2103) at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2103) at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2399) at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2103) at org.apache.spark.sql.Dataset.collect(Dataset.scala:2079) at org.apache.spark.ml.clustering.GaussianMixtureSummary.clusterSizes$lzycompute(GaussianMixture.scala:327) at org.apache.spark.ml.clustering.GaussianMixtureSummary.clusterSizes(GaussianMixture.scala:325) at 
org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply$mcV$sp(GaussianMixtureSuite.scala:102) at org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply(GaussianMixtureSuite.scala:73) at org.apache.spark.ml.clustering.GaussianMixtureSuite$$anonfun$4.apply(GaussianMixtureSuite.scala:73) at org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22) at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85) at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104) at org.scalatest.Transformer.apply(Transformer.scala:22) at org.scalatest.Transformer.apply(Transformer.scala:20) at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:166) at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:56) at org.scalatest.FunSuiteLike$class.invokeWithFixture$1(FunSuiteLike.scala:163) at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175) at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175) at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306) at org.scalatest.FunSuiteLike$class.runTest(FunSuiteLike.scala:175) at org.scalatest.FunSuite.runTest(FunSuite.scala:1555) at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208) at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208) at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:413) at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:401) at scala.collection.immutable.List.foreach(List.scala:381) at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401) at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:396) at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:483) at org.scalatest.FunSuiteLike$class.runTests(FunSuiteLike.scala:208) at org.scalatest.FunSuite.runTests(FunSuite.scala:1555) at org.scalatest.Suite$class.run(Suite.scala:1424) at 
org.scalatest.FunSuite.org$scalatest$FunSuiteLike$$super$run(FunSuite.scala:1555) at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212) at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212) at org.scalatest.SuperEngine.runImpl(Engine.scala:545) at org.scalatest.FunSuiteLike$class.run(FunSuiteLike.scala:212) at org.apache.spark.SparkFunSuite.org$scalatest$BeforeAndAfterAll$$super$run(SparkFunSuite.scala:28) at org.scalatest.BeforeAndAfterAll$class.liftedTree1$1(BeforeAndAfterAll.scala:257) at org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:256) at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:28) at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:55) at org.scalatest.tools.Runner$$anonfun$doRunRunRunDaDoRunRun$3.apply(Runner.scala:2563) at org.scalatest.tools.Runner$$anonfun$doRunRunRunDaDoRunRun$3.apply(Runner.scala:2557) at scala.collection.immutable.List.foreach(List.scala:381) at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:2557) at org.scalatest.tools.Runner$$anonfun$runOptionallyWithPassFailReporter$2.apply(Runner.scala:1044) at org.scalatest.tools.Runner$$anonfun$runOptionallyWithPassFailReporter$2.apply(Runner.scala:1043) at org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:2722) at org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:1043) at org.scalatest.tools.Runner$.run(Runner.scala:883) at org.scalatest.tools.Runner.run(Runner.scala) at org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.runScalaTest2(ScalaTestRunner.java:138) at org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.main(ScalaTestRunner.java:28) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at 
java.lang.reflect.Method.invoke(Method.java:606) at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144) Caused by: org.apache.spark.SparkException: Task not serializable at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:305) at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:295) at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:123) at org.apache.spark.SparkContext.clean(SparkContext.scala:1944) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:782) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:781) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:357) at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:781) at org.apache.spark.sql.execution.WholeStageCodegen.doExecute(WholeStageCodegen.scala:355) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:137) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:134) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:117) at org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:82) at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:118) at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:109) at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49) ... 
83 more Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext Serialization stack: - object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@6eaadd1d) - field (class: org.apache.spark.ml.clustering.GaussianMixtureModel, name: sc, type: class org.apache.spark.SparkContext) - object (class org.apache.spark.ml.clustering.GaussianMixtureModel, GaussianMixture_0e35e80dd8f8) - field (class: org.apache.spark.ml.clustering.GaussianMixtureModel$$anonfun$3, name: $outer, type: class org.apache.spark.ml.clustering.GaussianMixtureModel) - object (class org.apache.spark.ml.clustering.GaussianMixtureModel$$anonfun$3, <function1>) - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, name: func$2, type: interface scala.Function1) - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, <function1>) - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF, name: f, type: interface scala.Function1) - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF, UDF(features#0)) - field (class: org.apache.spark.sql.catalyst.expressions.Alias, name: child, type: class org.apache.spark.sql.catalyst.expressions.Expression) - object (class org.apache.spark.sql.catalyst.expressions.Alias, UDF(features#0) AS gm_prediction#3) - element of array (index: 0) - array (class [Ljava.lang.Object;, size 1) - field (class: scala.collection.mutable.ArrayBuffer, name: array, type: class [Ljava.lang.Object;) - object (class scala.collection.mutable.ArrayBuffer, ArrayBuffer(UDF(features#0) AS gm_prediction#3)) - field (class: org.apache.spark.sql.execution.Project, name: projectList, type: interface scala.collection.Seq) - object (class org.apache.spark.sql.execution.Project, Project [UDF(features#0) AS gm_prediction#3] +- INPUT ) - field (class: org.apache.spark.sql.execution.aggregate.TungstenAggregate, name: child, type: class org.apache.spark.sql.execution.SparkPlan) - 
object (class org.apache.spark.sql.execution.aggregate.TungstenAggregate, TungstenAggregate(key=[gm_prediction#3], functions=[(count(1),mode=Partial,isDistinct=false)], output=[gm_prediction#3,count#32L]) +- Project [UDF(features#0) AS gm_prediction#3] +- INPUT ) - element of array (index: 0) - array (class [Ljava.lang.Object;, size 3) - field (class: org.apache.spark.sql.execution.WholeStageCodegen$$anonfun$6, name: references$1, type: class [Ljava.lang.Object;) - object (class org.apache.spark.sql.execution.WholeStageCodegen$$anonfun$6, <function2>) at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40) at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46) at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100) at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:302) ... 103 more
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes to enable it, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org