I am trying to parse data from an XML file with Spark using the Databricks
spark-xml library.

Here is my code:

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

/**
 * Reads a patent-application XML file with the Databricks spark-xml data
 * source and prints the `doc-number` of every
 * `us-bibliographic-data-application` row.
 *
 * Note: unused/duplicate imports from the original (java.text.Format,
 * logical.With, lit, udf, concat_ws, scala.sys.process) were dropped; the
 * remaining ones are all referenced below.
 */
object printschema {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("printschema").setMaster("local")
    // Raise the truncation limit so wide XML-derived schemas print fully.
    conf.set("spark.debug.maxToStringFields", "10000000")
    val context = new SparkContext(conf)
    val sqlContext = new SQLContext(context) // fixed typo: was `sqlCotext`
    import sqlContext.implicits._

    val df = sqlContext.read.format("com.databricks.spark.xml")
      .option("rowTag", "us-bibliographic-data-application")
      .option("treatEmptyValuesAsNulls", true)
      .load("/Users/praveen/Desktop/ipa0105.xml")

    // Both columns come from the same nested XML path; cast to String so
    // numeric-looking doc-numbers keep any leading zeros.
    val q1 = df
      .withColumn("document",
        $"application-reference.document-id.doc-number".cast(sql.types.StringType))
      .withColumn("document_number",
        $"application-reference.document-id.doc-number".cast(sql.types.StringType))
      .select("document", "document_number")

    // BUG FIX for the reported "works in IDE, NULL under spark-submit":
    // `for (l <- q1) { println(...) }` desugars to Dataset.foreach, which
    // runs on the EXECUTORS; in local mode (IDE) executors share the driver
    // JVM so the output appears, but under spark-submit it goes to executor
    // logs instead of the driver console. Collect the rows to the driver
    // before printing. (Fine here; for very large results use take(n) or
    // toLocalIterator instead of collect.)
    for (row <- q1.collect()) {
      println((row.get(0), row.get(1)))
    }
  }
}


When I run the code from Scala IDE / IntelliJ IDEA it works fine, and here is
my output:

(14789882,14789882)(14755945,14755945)(14755919,14755919)(14755034,14755034)

But when I build a jar and run it with spark-submit, it simply prints null
values:

OUTPUT :

NULL,NULL
NULL,NULL
NULL,NULL


Please help me out.

Thanks in advance.

Reply via email to