Hi, I have 2 files which come from a CSV import of 2 Oracle tables.
F1 has 46730613 rows and F2 has 3386740 rows. I build 2 tables with Spark. Table F1 joins with table F2 on c1 = d1. All keys in F2.d1 exist in F1.c1, so I expect to retrieve 46730613 rows, but it returns only 3437 rows. // --- begin code --- val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.createSchemaRDD val rddFile = sc.textFile("hdfs://referential/F1/part-*") case class F1(c1:String, c2:String,c3:Double, c3:String, c5: String) val stkrdd = rddFile.map(x => x.split("|")).map(f => F1(f(44),f(3),f(10).toDouble, "",f(2))) stkrdd.registerAsTable("F1") sqlContext.cacheTable("F1") val prdfile = sc.textFile("hdfs://referential/F2/part-*") case class F2(d1: String, d2:String, d3:String,d4:String) val productrdd = prdfile.map(x => x.split("|")).map(f => F2(f(0),f(2),f(101),f(3))) productrdd.registerAsTable("F2") sqlContext.cacheTable("F2") val resrdd = sqlContext.sql("Select count(*) from F1, F2 where F1.c1 = F2.d1 ").count() // --- end of code --- Does anybody know what I missed? Thanks -- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/Spark-SQL-Join-returns-less-rows-that-expected-tp19731.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org