Howdy,

I'm a relative novice with Spark/Scala, and I'm puzzled by some behavior I'm
seeing in two of my local Spark/Scala environments (Scala for Jupyter and
Scala IDE) but not in the third (Spark Shell). The code below throws the
stack trace shown further down in the first two environments but executes
successfully in the Spark Shell. I'm not sure how to go about troubleshooting
the first two environments, so any assistance is greatly appreciated.

Code:

// get file
val logFile = "s3n://file"
val logData = sc.textFile(logFile)
// header
val header = logData.first
// filter out header
val sample = logData.filter(!_.contains(header)).map { line =>
  line.replaceAll("['\"]", "").substring(0, line.length() - 1)
}.takeSample(false, 100, 12L)
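
In case it helps to reproduce, here is a self-contained sketch of the same
logic as a standalone app. The SparkConf/SparkContext setup, app name, master
setting, and the S3 path are placeholders I've added for illustration; in the
Jupyter and Scala IDE environments the context is created for me, so this is
only an approximation of what actually runs there. The filter/map/takeSample
calls are identical to the snippet above.

import org.apache.spark.{SparkConf, SparkContext}

object HeaderFilterSample {
  def main(args: Array[String]): Unit = {
    // placeholder setup; my real environments provide sc for me
    val conf = new SparkConf().setAppName("HeaderFilterSample").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // placeholder path
    val logFile = "s3n://file"
    val logData = sc.textFile(logFile)

    // grab the header line on the driver
    val header = logData.first

    // drop the header, strip quotes, trim the trailing character, then sample
    val sample = logData.filter(!_.contains(header)).map { line =>
      line.replaceAll("['\"]", "").substring(0, line.length() - 1)
    }.takeSample(false, 100, 12L)

    sample.foreach(println)
    sc.stop()
  }
}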

Stack Trace:

org.apache.spark.SparkException: Task not serializable
        org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:315)
        org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:305)
        org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:132)
        org.apache.spark.SparkContext.clean(SparkContext.scala:1893)
        org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:311)
        org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:310)
        org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
        org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
        org.apache.spark.rdd.RDD.withScope(RDD.scala:286)
        org.apache.spark.rdd.RDD.filter(RDD.scala:310)
        cmd6$$user$$anonfun$3.apply(Main.scala:134)
        cmd6$$user$$anonfun$3.apply(Main.scala:133)
java.io.NotSerializableException: org.apache.spark.SparkConf
Serialization stack:
        - object not serializable (class: org.apache.spark.SparkConf, value: org.apache.spark.SparkConf@309ed441)
        - field (class: cmd2$$user, name: conf, type: class org.apache.spark.SparkConf)
        - object (class cmd2$$user, cmd2$$user@75a88665)
        - field (class: cmd6, name: $ref$cmd2, type: class cmd2$$user)
        - object (class cmd6, cmd6@5e9e8f0b)
        - field (class: cmd6$$user, name: $outer, type: class cmd6)
        - object (class cmd6$$user, cmd6$$user@692f81c)
        - field (class: cmd6$$user$$anonfun$3, name: $outer, type: class cmd6$$user)
        - object (class cmd6$$user$$anonfun$3, <function0>)
        - field (class: cmd6$$user$$anonfun$3$$anonfun$apply$1, name: $outer, type: class cmd6$$user$$anonfun$3)
        - object (class cmd6$$user$$anonfun$3$$anonfun$apply$1, <function1>)
        org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
        org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
        org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:81)
        org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:312)
        org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:305)
        org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:132)
        org.apache.spark.SparkContext.clean(SparkContext.scala:1893)
        org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:311)
        org.apache.spark.rdd.RDD$$anonfun$filter$1.apply(RDD.scala:310)
        org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
        org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
        org.apache.spark.rdd.RDD.withScope(RDD.scala:286)
        org.apache.spark.rdd.RDD.filter(RDD.scala:310)
        cmd6$$user$$anonfun$3.apply(Main.scala:134)
        cmd6$$user$$anonfun$3.apply(Main.scala:133)

Thanks,
Balaji