[ https://issues.apache.org/jira/browse/SPARK-24233?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
smohr003 updated SPARK-24233:
-----------------------------
Description:
I know that I can use the wildcard * to read all subfolders, but I am trying to use .par and .schema to speed up the read process.

    val absolutePath = "adl://datalakename.azuredatalakestore.net/testU/"
    Seq((1, "one"), (2, "two")).toDF("k", "v").write.mode("overwrite").parquet(absolutePath + "1")
    Seq((11, "one"), (22, "two")).toDF("k", "v").write.mode("overwrite").parquet(absolutePath + "2")
    Seq((111, "one"), (222, "two")).toDF("k", "v").write.mode("overwrite").parquet(absolutePath + "3")
    Seq((1111, "one"), (2222, "two")).toDF("k", "v").write.mode("overwrite").parquet(absolutePath + "4")
    Seq((2, "one"), (2, "two")).toDF("k", "v").write.mode("overwrite").parquet(absolutePath + "5")

    import java.net.URI
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}
    import org.apache.spark.sql.DataFrame   // needed for the readDir return type

    def readDir(path: String): DataFrame = {
      val fs = FileSystem.get(new URI(path), new Configuration())
      val subDir = fs.listStatus(new Path(path)).map(i => i.getPath.toString)
      var df = spark.read.parquet(subDir.head)
      val dfSchema = df.schema
      subDir.tail.par.foreach(p =>
        df = df.union(spark.read.schema(dfSchema).parquet(p))
               .select(df.columns.head, df.columns.tail: _*))
      df
    }

    val dfAll = readDir(absolutePath)
    dfAll.count

The count of the resulting dfAll is 4; in this example it should be 10.


> union operation on read of dataframe does not produce correct result
> ---------------------------------------------------------------------
>
>                 Key: SPARK-24233
>                 URL: https://issues.apache.org/jira/browse/SPARK-24233
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.1.0
>            Reporter: smohr003
>            Priority: Major
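For comparison, here is a minimal sketch of an alternative (not part of the original report; readDirReduce is a hypothetical name): it performs the same per-subfolder reads, but combines them with a parallel map and a reduce instead of reassigning the shared var df from inside .par.foreach, so no two threads overwrite each other's union.

    import java.net.URI
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}
    import org.apache.spark.sql.DataFrame

    def readDirReduce(path: String): DataFrame = {
      val fs = FileSystem.get(new URI(path), new Configuration())
      // List the immediate subfolders of the given directory, as in readDir above
      val subDirs = fs.listStatus(new Path(path)).map(_.getPath.toString)
      // Take the schema from the first subfolder so every read uses the same column layout
      val schema = spark.read.parquet(subDirs.head).schema
      subDirs.par                                        // parallel collection of subfolder paths
        .map(p => spark.read.schema(schema).parquet(p))  // one DataFrame per subfolder
        .reduce(_ union _)                               // combine without shared mutable state
    }

spark.read.parquet also accepts multiple paths as varargs, so passing all subfolder paths in a single call is another way to avoid the per-path unions entirely.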
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org