Howdy, I'm a relative novice at Spark/Scala, and I'm puzzled by some behavior I'm seeing in two of my three local Spark/Scala environments (Scala for Jupyter and Scala IDE) but not the third (Spark Shell). The code below throws a "Task not serializable" error (full stack trace included) in the first two environments but executes successfully in the third. I'm not sure how to go about troubleshooting the two failing environments, so any assistance is greatly appreciated.
Code: //get file val logFile = "s3n://file" val logData = sc.textFile(logFile) // header val header = logData.first // filter out header val sample = logData.filter(!_.contains(header)).map { line => line.replaceAll("['\"]","").substring(0,line.length()-1) }.takeSample(false,100,12L) Stack Trace: org.apache.spark.SparkException: Task not serializable org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:315) org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:305) org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:132) org.apache.spark.SparkContext.clean(SparkContext.scala:1893) org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:311) org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:310) org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) org.apache.spark.rdd.RDD.withScope(RDD.scala:286) org.apache.spark.rdd.RDD.filter(RDD.scala:310) cmd6$$user$$anonfun$3.apply(Main.scala:134) cmd6$$user$$anonfun$3.apply(Main.scala:133) java.io.NotSerializableException: org.apache.spark.SparkConf Serialization stack: - object not serializable (class: org.apache.spark.SparkConf, value: org.apache.spark.SparkConf@309ed441) - field (class: cmd2$$user, name: conf, type: class org.apache.spark.SparkConf) - object (class cmd2$$user, cmd2$$user@75a88665) - field (class: cmd6, name: $ref$cmd2, type: class cmd2$$user) - object (class cmd6, cmd6@5e9e8f0b) - field (class: cmd6$$user, name: $outer, type: class cmd6) - object (class cmd6$$user, cmd6$$user@692f81c) - field (class: cmd6$$user$$anonfun$3, name: $outer, type: class cmd6$$user) - object (class cmd6$$user$$anonfun$3, <function0>) - field (class: cmd6$$user$$anonfun$3$$anonfun$apply$1, name: $outer, type: class cmd6$$user$$anonfun$3) - object (class cmd6$$user$$anonfun$3$$anonfun$apply$1, <function1>) 
org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40) org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47) org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:81) org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:312) org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:305) org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:132) org.apache.spark.SparkContext.clean(SparkContext.scala:1893) org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:311) org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:310) org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) org.apache.spark.rdd.RDD.withScope(RDD.scala:286) org.apache.spark.rdd.RDD.filter(RDD.scala:310) cmd6$$user$$anonfun$3.apply(Main.scala:134) cmd6$$user$$anonfun$3.apply(Main.scala:133) Thanks, Balaji