GitHub user michalsenkyr opened a pull request: https://github.com/apache/spark/pull/22646
Support for nested JavaBean arrays, lists and maps in createDataFrame

## What changes were proposed in this pull request?

Continuing from #22527, this PR seeks to add support for beans in array, list and map fields when creating DataFrames from Java beans.

## How was this patch tested?

Appropriate unit tests were amended. Also manually tested in Spark shell:

```
scala> import scala.beans.BeanProperty
import scala.beans.BeanProperty

scala> class Nested(@BeanProperty var i: Int) extends Serializable
defined class Nested

scala> class Test(@BeanProperty var array: Array[Nested], @BeanProperty var list: java.util.List[Nested], @BeanProperty var map: java.util.Map[Integer, Nested]) extends Serializable
defined class Test

scala> import scala.collection.JavaConverters._
import scala.collection.JavaConverters._

scala> val array = Array(new Nested(1))
array: Array[Nested] = Array(Nested@757ad227)

scala> val list = Seq(new Nested(2), new Nested(3)).asJava
list: java.util.List[Nested] = [Nested@633dce39, Nested@4dd28982]

scala> val map = Map(Int.box(1) -> new Nested(4), Int.box(2) -> new Nested(5)).asJava
map: java.util.Map[Integer,Nested] = {1=Nested@57421e4e, 2=Nested@5a75bad4}

scala> val df = spark.createDataFrame(Seq(new Test(array, list, map)).asJava, classOf[Test])
df: org.apache.spark.sql.DataFrame = [array: array<struct<i:int>>, list: array<struct<i:int>> ... 1 more field]

scala> df.show()
+-----+----------+--------------------+
|array|      list|                 map|
+-----+----------+--------------------+
|[[1]]|[[2], [3]]|[1 -> [4], 2 -> [5]]|
+-----+----------+--------------------+
```
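(An illustrative aside, not part of the original PR description.) The nested values produced above should be reachable with the standard Spark `Column` accessors `getItem` and `getField`, for example:

```
import org.apache.spark.sql.functions.col

// Drill into the converted nested beans: the first array element's `i`,
// the second list element's `i`, and the map value under key 1.
df.select(
  col("array").getItem(0).getField("i"), // expect 1
  col("list").getItem(1).getField("i"),  // expect 3
  col("map").getItem(1).getField("i")    // expect 4
).show()
```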
Previous behavior:

```
scala> val df = spark.createDataFrame(Seq(new Test(array, list, map)).asJava, classOf[Test])
java.lang.IllegalArgumentException: The value (Nested@3dedc8b8) of the type (Nested) cannot be converted to struct<i:int>
  at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:262)
  at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:238)
  at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:103)
  at org.apache.spark.sql.catalyst.CatalystTypeConverters$ArrayConverter$$anonfun$toCatalystImpl$1.apply(CatalystTypeConverters.scala:162)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
  at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
  at org.apache.spark.sql.catalyst.CatalystTypeConverters$ArrayConverter.toCatalystImpl(CatalystTypeConverters.scala:162)
  at org.apache.spark.sql.catalyst.CatalystTypeConverters$ArrayConverter.toCatalystImpl(CatalystTypeConverters.scala:154)
  at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:103)
  at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter$2.apply(CatalystTypeConverters.scala:396)
  at org.apache.spark.sql.SQLContext$$anonfun$createStructConverter$1$1$$anonfun$apply$1.apply(SQLContext.scala:1114)
  at org.apache.spark.sql.SQLContext$$anonfun$createStructConverter$1$1$$anonfun$apply$1.apply(SQLContext.scala:1113)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
  at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
  at org.apache.spark.sql.SQLContext$$anonfun$createStructConverter$1$1.apply(SQLContext.scala:1113)
  at org.apache.spark.sql.SQLContext$$anonfun$createStructConverter$1$1.apply(SQLContext.scala:1108)
  at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
  at scala.collection.Iterator$class.toStream(Iterator.scala:1320)
  at scala.collection.AbstractIterator.toStream(Iterator.scala:1334)
  at scala.collection.TraversableOnce$class.toSeq(TraversableOnce.scala:298)
  at scala.collection.AbstractIterator.toSeq(Iterator.scala:1334)
  at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:423)
  ... 51 elided
```

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/michalsenkyr/spark SPARK-25654

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/spark/pull/22646.patch

To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message:

    This closes #22646

----

commit b477d070ae4cac75add60fd1fa7c474c0e6113ac
Author: Michal Senkyr <mike.senkyr@...>
Date:   2018-10-02T18:24:56Z

    Add array, list and map support to SQLContext.beansToRow
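For readers skimming the thread, a rough sketch of what nested-bean support amounts to: converting beans to Rows recursively, including beans found inside array, list and map fields, rather than only at the top level. This is an illustration only, not the patch itself: `NestedBeanConverter`, `beanToRow` and `convert` are made-up names, and the actual change lives in `SQLContext.beansToRow` (per the commit above) and is driven by the schema inferred from the bean class rather than the runtime type tests used here.

```
import java.beans.Introspector

import scala.collection.JavaConverters._

import org.apache.spark.sql.Row

// Hypothetical standalone converter, for illustration only.
object NestedBeanConverter {
  // Convert a bean instance to a Row by invoking its getters, recursing
  // into nested beans found in array, java.util.List and java.util.Map fields.
  def beanToRow(bean: Any): Row = {
    val props = Introspector
      .getBeanInfo(bean.getClass, classOf[Object]) // stop class excludes getClass
      .getPropertyDescriptors
      .filter(_.getReadMethod != null)
    Row.fromSeq(props.toSeq.map(p => convert(p.getReadMethod.invoke(bean))))
  }

  // Pass strings, numbers and booleans through; walk containers element by
  // element; treat any other reference type as a nested bean.
  private def convert(value: Any): Any = value match {
    case null                     => null
    case arr: Array[_]            => arr.map(convert)
    case list: java.util.List[_]  => list.asScala.map(convert)
    case map: java.util.Map[_, _] =>
      map.asScala.map { case (k, v) => convert(k) -> convert(v) }.toMap
    case s: String                => s
    case n: java.lang.Number      => n
    case b: java.lang.Boolean     => b
    case bean                     => beanToRow(bean)
  }
}
```

Applied to the `Test` bean from the session above, `beanToRow` would yield a Row containing an array of Rows, a sequence of Rows and an Integer-to-Row map, matching the `array<struct<i:int>>` and map schema reported by `createDataFrame`.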