[ https://issues.apache.org/jira/browse/SPARK-23215?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Nikhil updated SPARK-23215:
---------------------------
Description:

Performing a groupByKey operation followed by reduceGroups on a Dataset results in java.lang.ArrayIndexOutOfBoundsException.

*Input data (spark_issue.csv):*
1,nikhil
2,amit
3,rajeev
1,nikhil
2,amit2
3,rajeev2

*Code:*
{code:java}
import java.io.IOException;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SparkIndexOutOfBoundsIssue {

    public static final String CSV_FORMAT =
            "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat";

    public static void main(String[] args) throws IOException {
        String path = "spark_issue.csv";
        SparkSession session = SparkSession.builder()
                .master("local").appName("Test").getOrCreate();

        // Schema of the input CSV: two nullable string columns.
        StructType schema1 = DataTypes.createStructType(new StructField[] {
                DataTypes.createStructField("distinct_id", DataTypes.StringType, true),
                DataTypes.createStructField("show_name", DataTypes.StringType, true) });

        // A different, single-column schema used for the rows built in reduceGroups.
        StructType schema2 = DataTypes.createStructType(new StructField[] {
                DataTypes.createStructField("colum", DataTypes.StringType, true) });

        Dataset<Row> dataset = session.read().format(CSV_FORMAT)
                .option("header", false).schema(schema1)
                .load("/home/gabbar/Documents/v18/src/v18-gender-age-spark/src/spark_issue1.csv");
        System.out.println("COUNT1: " + dataset.count());

        dataset
                .groupByKey((MapFunction<Row, Row>) row -> {
                    // Key each row by (distinct_id, show_name), i.e. the whole row.
                    String[] arr = new String[2];
                    arr[0] = row.getAs(row.fieldIndex("distinct_id"));
                    arr[1] = row.getAs(row.fieldIndex("show_name"));
                    return new GenericRowWithSchema(arr, schema1);
                }, RowEncoder.apply(schema1))
                .reduceGroups((a, b) -> {
                    // Builds a one-field row, although the Dataset's values
                    // are encoded with schema1 (two fields).
                    Object[] obj = new Object[1];
                    obj[0] = "testdata";
                    return new GenericRowWithSchema(obj, schema2);
                })
                .collect();
    }
}
{code}

*Error stacktrace:*
{code}
2018-01-25 15:24:43,371 [Executor task launch worker-0] INFO org.apache.spark.storage.ShuffleBlockFetcherIterator - Getting 1 non-empty blocks out of 1 blocks
2018-01-25 15:24:43,371 [Executor task launch worker-0] INFO org.apache.spark.storage.ShuffleBlockFetcherIterator - Started 0 remote fetches in 0 ms
2018-01-25 15:24:43,379 [Executor task launch worker-0] ERROR org.apache.spark.executor.Executor - Exception in task 90.0 in stage 3.0 (TID 199)
java.lang.ArrayIndexOutOfBoundsException: 1
    at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:200)
    at org.apache.spark.sql.Row$class.isNullAt(Row.scala:185)
    at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:192)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:85)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
2018-01-25 15:24:43,389 [dispatcher-event-loop-3] INFO org.apache.spark.scheduler.TaskSetManager - Starting task 123.0 in stage 3.0 (TID 200, localhost, partition 123, ANY, 5275 bytes)
2018-01-25 15:24:43,390 [Executor task launch worker-0] INFO org.apache.spark.executor.Executor - Running task 123.0 in stage 3.0 (TID 200)
2018-01-25 15:24:43,391 [task-result-getter-3] WARN org.apache.spark.scheduler.TaskSetManager - Lost task 90.0 in stage 3.0 (TID 199, localhost): java.lang.ArrayIndexOutOfBoundsException: 1
    at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:200)
    at org.apache.spark.sql.Row$class.isNullAt(Row.scala:185)
    at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:192)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:85)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
2018-01-25 15:24:43,391 [Executor task launch worker-0] INFO org.apache.spark.storage.ShuffleBlockFetcherIterator - Getting 1 non-empty blocks out of 1 blocks
2018-01-25 15:24:43,392 [Executor task launch worker-0] INFO org.apache.spark.storage.ShuffleBlockFetcherIterator - Started 0 remote fetches in 1 ms
2018-01-25 15:24:43,392 [task-result-getter-3] ERROR org.apache.spark.scheduler.TaskSetManager - Task 90 in stage 3.0 failed 1 times; aborting job
2018-01-25 15:24:43,395 [Executor task launch worker-0] INFO org.apache.spark.executor.Executor - Finished task 123.0 in stage 3.0 (TID 200). 2359 bytes result sent to driver
2018-01-25 15:24:43,395 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Cancelling stage 3
2018-01-25 15:24:43,397 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Stage 3 was cancelled
2018-01-25 15:24:43,397 [task-result-getter-0] INFO org.apache.spark.scheduler.TaskSetManager - Finished task 123.0 in stage 3.0 (TID 200) in 8 ms on localhost (197/200)
2018-01-25 15:24:43,397 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - ResultStage 3 (collect at SparkIndexOutOfBoundsIssue.java:49) failed in 1.257 s
2018-01-25 15:24:43,397 [task-result-getter-0] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Removed TaskSet 3.0, whose tasks have all completed, from pool
2018-01-25 15:24:43,398 [main] INFO org.apache.spark.scheduler.DAGScheduler - Job 1 failed: collect at SparkIndexOutOfBoundsIssue.java:49, took 1.365428 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 90 in stage 3.0 failed 1 times, most recent failure: Lost task 90.0 in stage 3.0 (TID 199, localhost): java.lang.ArrayIndexOutOfBoundsException: 1
    at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:200)
    at org.apache.spark.sql.Row$class.isNullAt(Row.scala:185)
    at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:192)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:85)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1884)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1897)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1911)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:893)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:892)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:290)
    at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2183)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
    at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2532)
    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2182)
    at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2187)
    at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collect$1.apply(Dataset.scala:2187)
    at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2545)
    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2187)
    at org.apache.spark.sql.Dataset.collect(Dataset.scala:2163)
    at v18.age.videowatched.SparkIndexOutOfBoundsIssue.main(SparkIndexOutOfBoundsIssue.java:49)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 1
    at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:200)
    at org.apache.spark.sql.Row$class.isNullAt(Row.scala:185)
    at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:192)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:85)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
2018-01-25 15:24:43,401 [Thread-1] INFO org.apache.spark.SparkContext - Invoking stop() from shutdown hook
2018-01-25 15:24:43,403 [Thread-1] INFO org.spark_project.jetty.server.ServerConnector - Stopped ServerConnector@1b2c4efb{HTTP/1.1}{0.0.0.0:4041}
2018-01-25 15:24:43,404 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2002348{/stages/stage/kill,null,UNAVAILABLE}
2018-01-25 15:24:43,404 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@3700ec9c{/api,null,UNAVAILABLE}
2018-01-25 15:24:43,404 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@51972dc7{/,null,UNAVAILABLE}
2018-01-25 15:24:43,404 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@294e5088{/static,null,UNAVAILABLE}
2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2c532cd8{/executors/threadDump/json,null,UNAVAILABLE}
2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@5f6722d3{/executors/threadDump,null,UNAVAILABLE}
2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@39a8312f{/executors/json,null,UNAVAILABLE}
2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2c5529ab{/executors,null,UNAVAILABLE}
2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@6492fab5{/environment/json,null,UNAVAILABLE}
2018-01-25 15:24:43,405 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@72c28d64{/environment,null,UNAVAILABLE}
2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2807bdeb{/storage/rdd/json,null,UNAVAILABLE}
2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@18920cc{/storage/rdd,null,UNAVAILABLE}
2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@6ebf0f36{/storage/json,null,UNAVAILABLE}
2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@2e8e8225{/storage,null,UNAVAILABLE}
2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@1b765a2c{/stages/pool/json,null,UNAVAILABLE}
2018-01-25 15:24:43,406 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@7103cb56{/stages/pool,null,UNAVAILABLE}
2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@178213b{/stages/stage/json,null,UNAVAILABLE}
2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@613a8ee1{/stages/stage,null,UNAVAILABLE}
2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@41dd05a{/stages/json,null,UNAVAILABLE}
2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@67c2e933{/stages,null,UNAVAILABLE}
2018-01-25 15:24:43,407 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@fade1fc{/jobs/job/json,null,UNAVAILABLE}
2018-01-25 15:24:43,408 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@7ca33c24{/jobs/job,null,UNAVAILABLE}
2018-01-25 15:24:43,408 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@61f05988{/jobs/json,null,UNAVAILABLE}
2018-01-25 15:24:43,408 [Thread-1] INFO org.spark_project.jetty.server.handler.ContextHandler - Stopped o.s.j.s.ServletContextHandler@59f63e24{/jobs,null,UNAVAILABLE}
2018-01-25 15:24:43,409 [Thread-1] INFO org.apache.spark.ui.SparkUI - Stopped Spark web UI at http://192.168.1.8:4041
2018-01-25 15:24:43,416 [dispatcher-event-loop-9] INFO org.apache.spark.MapOutputTrackerMasterEndpoint - MapOutputTrackerMasterEndpoint stopped!
2018-01-25 15:24:43,424 [Thread-1] INFO org.apache.spark.storage.memory.MemoryStore - MemoryStore cleared
2018-01-25 15:24:43,424 [Thread-1] INFO org.apache.spark.storage.BlockManager - BlockManager stopped
2018-01-25 15:24:43,425 [Thread-1] INFO org.apache.spark.storage.BlockManagerMaster - BlockManagerMaster stopped
2018-01-25 15:24:43,427 [dispatcher-event-loop-1] INFO org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint - OutputCommitCoordinator stopped!
2018-01-25 15:24:43,428 [Thread-1] INFO org.apache.spark.SparkContext - Successfully stopped SparkContext
2018-01-25 15:24:43,428 [Thread-1] INFO org.apache.spark.util.ShutdownHookManager - Shutdown hook called
2018-01-25 15:24:43,429 [Thread-1] INFO org.apache.spark.util.ShutdownHookManager - Deleting directory /tmp/spark-978612f4-1e96-4366-9d14-ebe54627e6d4
{code}

*Note:* If we change the data so that all rows are distinct over the two columns of the input CSV, the code above does not throw the error.
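
*Likely mechanism (a sketch inferred from the trace, not verified against Spark internals):* The top frames (GenericRow.get called from Row.isNullAt at index 1) match what happens when the engine reads a two-field value out of the one-field row built in reduceGroups: the grouped Dataset's values are declared as schema1 via RowEncoder.apply(schema1), but the reducer returns a GenericRowWithSchema backed by a single-element array, so the generated code's null check on field index 1 (schema1's "show_name") indexes past the end of the backing array. Since reduceGroups only invokes the reduce function for keys that occur more than once, fully distinct input never produces the mismatched row, which agrees with the *Note:* above. The indexing failure alone can be reproduced with the classes already used in the repro code:
{code:java}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class AioobeMechanismSketch {
    public static void main(String[] args) {
        // One-field schema, mirroring schema2 from the reproduction code.
        StructType schema2 = DataTypes.createStructType(new StructField[] {
                DataTypes.createStructField("colum", DataTypes.StringType, true) });

        // The kind of row the reducer returns: backed by a one-element array.
        GenericRowWithSchema oneFieldRow =
                new GenericRowWithSchema(new Object[] { "testdata" }, schema2);

        // Probing field index 1 (as generated code does for schema1's second
        // column) delegates isNullAt(1) -> get(1), which indexes past the end
        // of the backing array: java.lang.ArrayIndexOutOfBoundsException: 1
        oneFieldRow.isNullAt(1);
    }
}
{code}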
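*Possible workaround (untested sketch, assuming the goal is simply one row per group):* keep the reducer's output shape consistent with the encoder declared on the grouped Dataset. The fragment below reuses `dataset` and `schema1` from the reproduction code above; the only change is that the reducer returns one of its schema1-shaped inputs instead of a schema2 row.
{code:java}
// Same pipeline as in the reproduction code, but the reduced row always
// matches the declared encoder (schema1, two string fields).
dataset
        .groupByKey((MapFunction<Row, Row>) row -> {
            String[] arr = new String[2];
            arr[0] = row.getAs(row.fieldIndex("distinct_id"));
            arr[1] = row.getAs(row.fieldIndex("show_name"));
            return new GenericRowWithSchema(arr, schema1);
        }, RowEncoder.apply(schema1))
        .reduceGroups((a, b) -> a) // keep the first row seen for each key
        .collect();
{code}
If a one-column result is actually wanted, declaring it explicitly (for example via mapGroups with RowEncoder.apply(schema2)) keeps the encoder and the produced rows in agreement. Either way, a schema mismatch like this should arguably fail with a clearer error than ArrayIndexOutOfBoundsException.
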
> Dataset Grouping: Index out of bounds error
> -------------------------------------------
>
>                 Key: SPARK-23215
>                 URL: https://issues.apache.org/jira/browse/SPARK-23215
>             Project: Spark
>          Issue Type: Bug
>          Components: Spark Core
>    Affects Versions: 2.1.0, 2.2.0
>            Reporter: Nikhil
>            Priority: Major
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)