[jira] [Commented] (SPARK-22427) StackOverflowError when using FPGrowth
[ https://issues.apache.org/jira/browse/SPARK-22427?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16259587#comment-16259587 ] yuhao yang commented on SPARK-22427: I tried with larger scale data but did not repro the issue. [~lyt] Can you please provide the reference for your dataset, or some size info? Thanks. > StackOverFlowError when using FPGrowth > -- > > Key: SPARK-22427 > URL: https://issues.apache.org/jira/browse/SPARK-22427 > Project: Spark > Issue Type: Bug > Components: ML, MLlib >Affects Versions: 2.2.0 > Environment: Centos Linux 3.10.0-327.el7.x86_64 > java 1.8.0.111 > spark 2.2.0 >Reporter: lyt > > code part: > val path = jobConfig.getString("hdfspath") > val vectordata = sc.sparkContext.textFile(path) > val finaldata = sc.createDataset(vectordata.map(obj => { > obj.split(" ") > }).filter(arr => arr.length > 0)).toDF("items") > val fpg = new FPGrowth() > > fpg.setMinSupport(minSupport).setItemsCol("items").setMinConfidence(minConfidence) > val train = fpg.fit(finaldata) > print(train.freqItemsets.count()) > print(train.associationRules.count()) > train.save("/tmp/FPGModel") > And encountered following exception: > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486) > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at > 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at scala.Option.foreach(Option.scala:257) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > at org.apache.spark.rdd.RDD.collect(RDD.scala:935) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429) > at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836) > at org.apache.spark.sql.Dataset.count(Dataset.scala:2429) > at DataMining.FPGrowth$.runJob(FPGrowth.scala:116) > at DataMining.testFPG$.main(FPGrowth.scala:36) > at DataMining.testFPG.main(FPGrowth.scala) > at 
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:755) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) > at org.apache.spark.d
[jira] [Commented] (SPARK-22427) StackOverflowError when using FPGrowth
[ https://issues.apache.org/jira/browse/SPARK-22427?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16249097#comment-16249097 ] lyt commented on SPARK-22427: - -Xss512m didn’t work 发自我的iPhone -- Original -- From: yuhao yang (JIRA) Date: 周一,11月 13,2017 8:03 上午 To: 452104285 <452104...@qq.com> Subject: Re: [jira] [Commented] (SPARK-22427) StackOverFlowError when usingFPGrowth [ https://issues.apache.org/jira/browse/SPARK-22427?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16249017#comment-16249017 ] yuhao yang commented on SPARK-22427: Hi [~lyt] does increasing stack size resolve your issue? If not I will look into it. -- This message was sent by Atlassian JIRA (v6.4.14#64029) > StackOverFlowError when using FPGrowth > -- > > Key: SPARK-22427 > URL: https://issues.apache.org/jira/browse/SPARK-22427 > Project: Spark > Issue Type: Bug > Components: ML, MLlib >Affects Versions: 2.2.0 > Environment: Centos Linux 3.10.0-327.el7.x86_64 > java 1.8.0.111 > spark 2.2.0 >Reporter: lyt > > code part: > val path = jobConfig.getString("hdfspath") > val vectordata = sc.sparkContext.textFile(path) > val finaldata = sc.createDataset(vectordata.map(obj => { > obj.split(" ") > }).filter(arr => arr.length > 0)).toDF("items") > val fpg = new FPGrowth() > > fpg.setMinSupport(minSupport).setItemsCol("items").setMinConfidence(minConfidence) > val train = fpg.fit(finaldata) > print(train.freqItemsets.count()) > print(train.associationRules.count()) > train.save("/tmp/FPGModel") > And encountered following exception: > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486) > at > 
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at scala.Option.foreach(Option.scala:257) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > at org.apache.spark.rdd.RDD.collect(RDD.scala:935) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429) > at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837) > at > 
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836) > at org.apache.spark.sql.Dataset.count(Dataset.scala:2429) > at DataMining.FPGrowth$.runJob(FPGrowth.scala:116) > at DataMining.testFPG$.main(FPGrowth.scala:36) > at DataMining.testFPG.main(FPGrowth.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethod
[jira] [Commented] (SPARK-22427) StackOverflowError when using FPGrowth
[ https://issues.apache.org/jira/browse/SPARK-22427?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16249017#comment-16249017 ] yuhao yang commented on SPARK-22427: Hi [~lyt] does increasing stack size resolve your issue? If not I will look into it. > StackOverFlowError when using FPGrowth > -- > > Key: SPARK-22427 > URL: https://issues.apache.org/jira/browse/SPARK-22427 > Project: Spark > Issue Type: Bug > Components: ML, MLlib >Affects Versions: 2.2.0 > Environment: Centos Linux 3.10.0-327.el7.x86_64 > java 1.8.0.111 > spark 2.2.0 >Reporter: lyt > > code part: > val path = jobConfig.getString("hdfspath") > val vectordata = sc.sparkContext.textFile(path) > val finaldata = sc.createDataset(vectordata.map(obj => { > obj.split(" ") > }).filter(arr => arr.length > 0)).toDF("items") > val fpg = new FPGrowth() > > fpg.setMinSupport(minSupport).setItemsCol("items").setMinConfidence(minConfidence) > val train = fpg.fit(finaldata) > print(train.freqItemsets.count()) > print(train.associationRules.count()) > train.save("/tmp/FPGModel") > And encountered following exception: > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486) > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at scala.Option.foreach(Option.scala:257) > at > 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > at org.apache.spark.rdd.RDD.collect(RDD.scala:935) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429) > at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836) > at org.apache.spark.sql.Dataset.count(Dataset.scala:2429) > at DataMining.FPGrowth$.runJob(FPGrowth.scala:116) > at DataMining.testFPG$.main(FPGrowth.scala:36) > at DataMining.testFPG.main(FPGrowth.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:755) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) > at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119) > at org.ap
[jira] [Commented] (SPARK-22427) StackOverflowError when using FPGrowth
[ https://issues.apache.org/jira/browse/SPARK-22427?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16240662#comment-16240662 ] lyt commented on SPARK-22427: - Maybe it's problem of DataFrame. This problem won't occur when using pure rdds. > StackOverFlowError when using FPGrowth > -- > > Key: SPARK-22427 > URL: https://issues.apache.org/jira/browse/SPARK-22427 > Project: Spark > Issue Type: Bug > Components: ML, MLlib >Affects Versions: 2.2.0 > Environment: Centos Linux 3.10.0-327.el7.x86_64 > java 1.8.0.111 > spark 2.2.0 >Reporter: lyt > > code part: > val path = jobConfig.getString("hdfspath") > val vectordata = sc.sparkContext.textFile(path) > val finaldata = sc.createDataset(vectordata.map(obj => { > obj.split(" ") > }).filter(arr => arr.length > 0)).toDF("items") > val fpg = new FPGrowth() > > fpg.setMinSupport(minSupport).setItemsCol("items").setMinConfidence(minConfidence) > val train = fpg.fit(finaldata) > print(train.freqItemsets.count()) > print(train.associationRules.count()) > train.save("/tmp/FPGModel") > And encountered following exception: > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486) > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at scala.Option.foreach(Option.scala:257) > at > 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > at org.apache.spark.rdd.RDD.collect(RDD.scala:935) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429) > at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836) > at org.apache.spark.sql.Dataset.count(Dataset.scala:2429) > at DataMining.FPGrowth$.runJob(FPGrowth.scala:116) > at DataMining.testFPG$.main(FPGrowth.scala:36) > at DataMining.testFPG.main(FPGrowth.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:755) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) > at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119) > at org.apache.spark.deploy.Sp
[jira] [Commented] (SPARK-22427) StackOverflowError when using FPGrowth
[ https://issues.apache.org/jira/browse/SPARK-22427?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16239824#comment-16239824 ] Liang-Chi Hsieh commented on SPARK-22427: - >From a rough glance, looks like the error didn't be thrown inside ml.FPGrowth? {code} ... at org.apache.spark.sql.Dataset.count(Dataset.scala:2429) at DataMining.FPGrowth$.runJob(FPGrowth.scala:116) at DataMining.testFPG$.main(FPGrowth.scala:36) at DataMining.testFPG.main(FPGrowth.scala) ... {code} > StackOverFlowError when using FPGrowth > -- > > Key: SPARK-22427 > URL: https://issues.apache.org/jira/browse/SPARK-22427 > Project: Spark > Issue Type: Bug > Components: ML, MLlib >Affects Versions: 2.2.0 > Environment: Centos Linux 3.10.0-327.el7.x86_64 > java 1.8.0.111 > spark 2.2.0 >Reporter: lyt > > code part: > val path = jobConfig.getString("hdfspath") > val vectordata = sc.sparkContext.textFile(path) > val finaldata = sc.createDataset(vectordata.map(obj => { > obj.split(" ") > }).filter(arr => arr.length > 0)).toDF("items") > val fpg = new FPGrowth() > > fpg.setMinSupport(minSupport).setItemsCol("items").setMinConfidence(minConfidence) > val train = fpg.fit(finaldata) > print(train.freqItemsets.count()) > print(train.associationRules.count()) > train.save("/tmp/FPGModel") > And encountered following exception: > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486) > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486) > at > 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at scala.Option.foreach(Option.scala:257) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > at org.apache.spark.rdd.RDD.collect(RDD.scala:935) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429) > at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836) > at org.apache.spark.sql.Dataset.count(Dataset.scala:2429) > at DataMining.FPGrowth$.runJob(FPGrowth.scala:116) > at 
DataMining.testFPG$.main(FPGrowth.scala:36) > at DataMining.testFPG.main(FPGrowth.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:755) > at > org.apache.spark.
[jira] [Commented] (SPARK-22427) StackOverflowError when using FPGrowth
[ https://issues.apache.org/jira/browse/SPARK-22427?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16237174#comment-16237174 ] yuhao yang commented on SPARK-22427: Could you please try to increase the stack size, E.g. with -Xss10m ? > StackOverFlowError when using FPGrowth > -- > > Key: SPARK-22427 > URL: https://issues.apache.org/jira/browse/SPARK-22427 > Project: Spark > Issue Type: Bug > Components: ML, MLlib >Affects Versions: 2.2.0 > Environment: Centos Linux 3.10.0-327.el7.x86_64 > java 1.8.0.111 > spark 2.2.0 >Reporter: lyt >Priority: Normal > > code part: > val path = jobConfig.getString("hdfspath") > val vectordata = sc.sparkContext.textFile(path) > val finaldata = sc.createDataset(vectordata.map(obj => { > obj.split(" ") > }).filter(arr => arr.length > 0)).toDF("items") > val fpg = new FPGrowth() > > fpg.setMinSupport(minSupport).setItemsCol("items").setMinConfidence(minConfidence) > val train = fpg.fit(finaldata) > print(train.freqItemsets.count()) > print(train.associationRules.count()) > train.save("/tmp/FPGModel") > And encountered following exception: > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486) > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at scala.Option.foreach(Option.scala:257) > at > 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > at org.apache.spark.rdd.RDD.collect(RDD.scala:935) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429) > at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836) > at org.apache.spark.sql.Dataset.count(Dataset.scala:2429) > at DataMining.FPGrowth$.runJob(FPGrowth.scala:116) > at DataMining.testFPG$.main(FPGrowth.scala:36) > at DataMining.testFPG.main(FPGrowth.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:755) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) > at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119) >
[jira] [Commented] (SPARK-22427) StackOverflowError when using FPGrowth
[ https://issues.apache.org/jira/browse/SPARK-22427?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16237112#comment-16237112 ] Kazuaki Ishizaki commented on SPARK-22427: -- Thank you for reporting an issue. Could you please attach the data file? Or, can you write data size with a part of example data? > StackOverFlowError when using FPGrowth > -- > > Key: SPARK-22427 > URL: https://issues.apache.org/jira/browse/SPARK-22427 > Project: Spark > Issue Type: Bug > Components: ML, MLlib >Affects Versions: 2.2.0 > Environment: Centos Linux 3.10.0-327.el7.x86_64 > java 1.8.0.111 > spark 2.2.0 >Reporter: lyt >Priority: Normal > > code part: > val path = jobConfig.getString("hdfspath") > val vectordata = sc.sparkContext.textFile(path) > val finaldata = sc.createDataset(vectordata.map(obj => { > obj.split(" ") > }).filter(arr => arr.length > 0)).toDF("items") > val fpg = new FPGrowth() > > fpg.setMinSupport(minSupport).setItemsCol("items").setMinConfidence(minConfidence) > val train = fpg.fit(finaldata) > print(train.freqItemsets.count()) > print(train.associationRules.count()) > train.save("/tmp/FPGModel") > And encountered following exception: > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486) > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at > 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at scala.Option.foreach(Option.scala:257) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087) > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > at org.apache.spark.rdd.RDD.collect(RDD.scala:935) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430) > at > org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429) > at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836) > at org.apache.spark.sql.Dataset.count(Dataset.scala:2429) > at DataMining.FPGrowth$.runJob(FPGrowth.scala:116) > at DataMining.testFPG$.main(FPGrowth.scala:36) > at DataMining.testFPG.main(FPGrowth.scala) > at 
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:755) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) >