Spark 1.4.1 (Scala, DataFrame API): select returns an empty result set when TimestampType (and NullType) fields are used in a StructType schema
SQL context available as sqlContext. > > scala> import org.apache.spark.sql.hive.HiveContext > import org.apache.spark.sql.hive.HiveContext > > scala> import org.apache.spark.sql.hive.orc._ > import org.apache.spark.sql.hive.orc._ > > scala> val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc) > 15/12/28 03:34:57 WARN SparkConf: The configuration key > 'spark.yarn.applicationMaster.waitTries' has been deprecated as of Spark > 1.3 and and may be removed in the future. Please use the new key > 'spark.yarn.am.waitTime' instead. > 15/12/28 03:34:57 INFO HiveContext: Initializing execution hive, version > 0.13.1 > hiveContext: org.apache.spark.sql.hive.HiveContext = > org.apache.spark.sql.hive.HiveContext@3413fbe > > scala> import org.apache.spark.sql.types.{StructType, StructField, > StringType, IntegerType,FloatType ,LongType ,TimestampType,NullType }; > import org.apache.spark.sql.types.{StructType, StructField, StringType, > IntegerType, FloatType, LongType, TimestampType, NullType} > > scala> val loandepoSchema = StructType(Seq( > | StructField("COLUMN1", StringType, true), > | StructField("COLUMN2", StringType , true), > | StructField("COLUMN3", TimestampType , true), > | StructField("COLUMN4", TimestampType , true), > | StructField("COLUMN5", StringType , true), > | StructField("COLUMN6", StringType, true), > | StructField("COLUMN7", IntegerType, true), > | StructField("COLUMN8", IntegerType, true), > | StructField("COLUMN9", StringType, true), > | StructField("COLUMN10", IntegerType, true), > | StructField("COLUMN11", IntegerType, true), > | StructField("COLUMN12", IntegerType, true), > | StructField("COLUMN13", StringType, true), > | StructField("COLUMN14", StringType, true), > | StructField("COLUMN15", StringType, true), > | StructField("COLUMN16", StringType, true), > | StructField("COLUMN17", StringType, true), > | StructField("COLUMN18", StringType, true), > | StructField("COLUMN19", StringType, true), > | StructField("COLUMN20", StringType, 
true), > | StructField("COLUMN21", StringType, true), > | StructField("COLUMN22", StringType, true))) > loandepoSchema: org.apache.spark.sql.types.StructType = > StructType(StructField(COLUMN1,StringType,true), > StructField(COLUMN2,StringType,true), > StructField(COLUMN3,TimestampType,true), > StructField(COLUMN4,TimestampType,true), > StructField(COLUMN5,StringType,true), StructField(COLUMN6,StringType,true), > StructField(COLUMN7,IntegerType,true), > StructField(COLUMN8,IntegerType,true), > StructField(COLUMN9,StringType,true), > StructField(COLUMN10,IntegerType,true), > StructField(COLUMN11,IntegerType,true), > StructField(COLUMN12,IntegerType,true), > StructField(COLUMN13,StringType,true), > StructField(COLUMN14,StringType,true), > StructField(COLUMN15,StringType,true), > StructField(COLUMN16,StringType,true), > StructField(COLUMN17,StringType,true), > StructField(COLUMN18,StringType,true), StructField(COLUMN19,Strin... > scala> val lonadepodf = > hiveContext.read.format("com.databricks.spark.csv").option("header", > "true").schema(loandepoSchema).load("/tmp/TestDivya/loandepo_10K.csv") > 15/12/28 03:37:52 INFO HiveContext: Initializing HiveMetastoreConnection > version 0.13.1 using Spark classes. > lonadepodf: org.apache.spark.sql.DataFrame = [COLUMN1: string, COLUMN2: > string, COLUMN3: timestamp, COLUMN4: timestamp, COLUMN5: string, COLUMN6: > string, COLUMN7: int, COLUMN8: int, COLUMN9: string, COLUMN10: int, > COLUMN11: int, COLUMN12: int, COLUMN13: string, COLUMN14: string, COLUMN15: > string, COLUMN16: string, COLUMN17: string, COLUMN18: string, COLUMN19: > string, COLUMN20: string, COLUMN21: string, COLUMN22: string] > > scala> lonadepodf.select("COLUMN1").show(10) > 15/12/28 03:38:01 INFO MemoryStore: ensureFreeSpace(216384) called with > curMem=0, maxMem=278302556 > 15/12/28 03:38:01 INFO MemoryStore: Block broadcast_0 stored as values in > memory (estimated size 211.3 KB, free 265.2 MB) > > ... 
> 15/12/28 03:38:07 INFO DAGScheduler: ResultStage 2 (show at :33) > finished in 0.653 s > 15/12/28 03:38:07 INFO YarnScheduler: Removed TaskSet 2.0, whose tasks > have all completed, from pool > 15/12/28 03:38:07 INFO DAGScheduler: Job 2 finished: show at :33, > took 0.669388 s > +---+ > |COLUMN1| > +---+ > +---+ > > > scala> val loandepoSchema = StructType(Seq( > | StructField("COLUMN1", StringType, true), > | StructField("COLUMN2", StringType , true), > | StructField("COLUMN3", StringType , true), > | StructField("COLUMN4", StringType , true), > | StructField("COLUMN5", StringType , true), > | StructField("COLUMN6", StringType, true), > | StructField("COLUMN7", StringType, true), > | StructField("COLUMN8", StringType, true), > | StructField("COLUMN9", StringType, true), > |
Spark 1.4.1 (Scala, DataFrame API): select returns an empty result set when TimestampType (and NullType) fields are used in a StructType schema
> > SQL context available as sqlContext. > > scala> import org.apache.spark.sql.hive.HiveContext > import org.apache.spark.sql.hive.HiveContext > > scala> import org.apache.spark.sql.hive.orc._ > import org.apache.spark.sql.hive.orc._ > > scala> val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc) > 15/12/28 03:34:57 WARN SparkConf: The configuration key > 'spark.yarn.applicationMaster.waitTries' has been deprecated as of Spark > 1.3 and and may be removed in the future. Please use the new key > 'spark.yarn.am.waitTime' instead. > 15/12/28 03:34:57 INFO HiveContext: Initializing execution hive, version > 0.13.1 > hiveContext: org.apache.spark.sql.hive.HiveContext = > org.apache.spark.sql.hive.HiveContext@3413fbe > > scala> import org.apache.spark.sql.types.{StructType, StructField, > StringType, IntegerType,FloatType ,LongType ,TimestampType,NullType }; > import org.apache.spark.sql.types.{StructType, StructField, StringType, > IntegerType, FloatType, LongType, TimestampType, NullType} > > scala> val loandepoSchema = StructType(Seq( > | StructField("COLUMN1", StringType, true), > | StructField("COLUMN2", StringType , true), > | StructField("COLUMN3", TimestampType , true), > | StructField("COLUMN4", TimestampType , true), > | StructField("COLUMN5", StringType , true), > | StructField("COLUMN6", StringType, true), > | StructField("COLUMN7", IntegerType, true), > | StructField("COLUMN8", IntegerType, true), > | StructField("COLUMN9", StringType, true), > | StructField("COLUMN10", IntegerType, true), > | StructField("COLUMN11", IntegerType, true), > | StructField("COLUMN12", IntegerType, true), > | StructField("COLUMN13", StringType, true), > | StructField("COLUMN14", StringType, true), > | StructField("COLUMN15", StringType, true), > | StructField("COLUMN16", StringType, true), > | StructField("COLUMN17", StringType, true), > | StructField("COLUMN18", StringType, true), > | StructField("COLUMN19", StringType, true), > | StructField("COLUMN20", StringType, 
true), > | StructField("COLUMN21", StringType, true), > | StructField("COLUMN22", StringType, true))) > loandepoSchema: org.apache.spark.sql.types.StructType = > StructType(StructField(COLUMN1,StringType,true), > StructField(COLUMN2,StringType,true), > StructField(COLUMN3,TimestampType,true), > StructField(COLUMN4,TimestampType,true), > StructField(COLUMN5,StringType,true), StructField(COLUMN6,StringType,true), > StructField(COLUMN7,IntegerType,true), > StructField(COLUMN8,IntegerType,true), > StructField(COLUMN9,StringType,true), > StructField(COLUMN10,IntegerType,true), > StructField(COLUMN11,IntegerType,true), > StructField(COLUMN12,IntegerType,true), > StructField(COLUMN13,StringType,true), > StructField(COLUMN14,StringType,true), > StructField(COLUMN15,StringType,true), > StructField(COLUMN16,StringType,true), > StructField(COLUMN17,StringType,true), > StructField(COLUMN18,StringType,true), StructField(COLUMN19,Strin... > scala> val lonadepodf = > hiveContext.read.format("com.databricks.spark.csv").option("header", > "true").schema(loandepoSchema).load("/tmp/TestDivya/loandepo_10K.csv") > 15/12/28 03:37:52 INFO HiveContext: Initializing HiveMetastoreConnection > version 0.13.1 using Spark classes. > lonadepodf: org.apache.spark.sql.DataFrame = [COLUMN1: string, COLUMN2: > string, COLUMN3: timestamp, COLUMN4: timestamp, COLUMN5: string, COLUMN6: > string, COLUMN7: int, COLUMN8: int, COLUMN9: string, COLUMN10: int, > COLUMN11: int, COLUMN12: int, COLUMN13: string, COLUMN14: string, COLUMN15: > string, COLUMN16: string, COLUMN17: string, COLUMN18: string, COLUMN19: > string, COLUMN20: string, COLUMN21: string, COLUMN22: string] > > scala> lonadepodf.select("COLUMN1").show(10) > 15/12/28 03:38:01 INFO MemoryStore: ensureFreeSpace(216384) called with > curMem=0, maxMem=278302556 > 15/12/28 03:38:01 INFO MemoryStore: Block broadcast_0 stored as values in > memory (estimated size 211.3 KB, free 265.2 MB) > > ... 
> 15/12/28 03:38:07 INFO DAGScheduler: ResultStage 2 (show at :33) > finished in 0.653 s > 15/12/28 03:38:07 INFO YarnScheduler: Removed TaskSet 2.0, whose tasks > have all completed, from pool > 15/12/28 03:38:07 INFO DAGScheduler: Job 2 finished: show at :33, > took 0.669388 s > +---+ > |COLUMN1| > +---+ > +---+ > > > scala> val loandepoSchema = StructType(Seq( > | StructField("COLUMN1", StringType, true), > | StructField("COLUMN2", StringType , true), > | StructField("COLUMN3", StringType , true), > | StructField("COLUMN4", StringType , true), > | StructField("COLUMN5", StringType , true), > | StructField("COLUMN6", StringType, true), > | StructField("COLUMN7", StringType, true), > | StructField("COLUMN8", StringType, true), > | StructField("COLUMN9", StringType, true), > |