GitHub user michalsenkyr opened a pull request:

    https://github.com/apache/spark/pull/22646

    Support for nested JavaBean arrays, lists and maps in createDataFrame

    ## What changes were proposed in this pull request?
    
    Continuing from #22527, this PR adds support for beans nested in array, list and map fields when creating DataFrames from Java beans.
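    
    For context, here is a minimal Java sketch of the kind of nested bean this change targets (the class, field, and app names are illustrative, not taken from the patch):
    
    ```
    import java.io.Serializable;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    
    public class NestedBeanExample {
    
        // A simple bean used as the element type of the container fields below.
        public static class Nested implements Serializable {
            private int i;
            public Nested() {}
            public Nested(int i) { this.i = i; }
            public int getI() { return i; }
            public void setI(int i) { this.i = i; }
        }
    
        // A bean whose array, list and map fields hold other beans -- the case
        // previously rejected by createDataFrame and enabled by this change.
        public static class Outer implements Serializable {
            private Nested[] array;
            private List<Nested> list;
            private Map<Integer, Nested> map;
            public Nested[] getArray() { return array; }
            public void setArray(Nested[] array) { this.array = array; }
            public List<Nested> getList() { return list; }
            public void setList(List<Nested> list) { this.list = list; }
            public Map<Integer, Nested> getMap() { return map; }
            public void setMap(Map<Integer, Nested> map) { this.map = map; }
        }
    
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .master("local[*]").appName("nested-bean-demo").getOrCreate();
    
            Outer bean = new Outer();
            bean.setArray(new Nested[] { new Nested(1) });
            bean.setList(Arrays.asList(new Nested(2), new Nested(3)));
            Map<Integer, Nested> m = new HashMap<>();
            m.put(1, new Nested(4));
            m.put(2, new Nested(5));
            bean.setMap(m);
    
            // The nested beans inside the containers are converted to struct<i:int>.
            Dataset<Row> df = spark.createDataFrame(Arrays.asList(bean), Outer.class);
            df.show();
    
            spark.stop();
        }
    }
    ```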
    
    ## How was this patch tested?
    
    Appropriate unit tests were amended.
    
    Also manually tested in the Spark shell:
    
    ```
    scala> import scala.beans.BeanProperty
    import scala.beans.BeanProperty
    
    scala> class Nested(@BeanProperty var i: Int) extends Serializable
    defined class Nested
    
    scala> class Test(@BeanProperty var array: Array[Nested], @BeanProperty var list: java.util.List[Nested], @BeanProperty var map: java.util.Map[Integer, Nested]) extends Serializable
    defined class Test
    defined class Test
    
    scala> import scala.collection.JavaConverters._
    import scala.collection.JavaConverters._
    
    scala> val array = Array(new Nested(1))
    array: Array[Nested] = Array(Nested@757ad227)
    
    scala> val list = Seq(new Nested(2), new Nested(3)).asJava
    list: java.util.List[Nested] = [Nested@633dce39, Nested@4dd28982]
    
    scala> val map = Map(Int.box(1) -> new Nested(4), Int.box(2) -> new Nested(5)).asJava
    map: java.util.Map[Integer,Nested] = {1=Nested@57421e4e, 2=Nested@5a75bad4}
    
    scala> val df = spark.createDataFrame(Seq(new Test(array, list, map)).asJava, classOf[Test])
    df: org.apache.spark.sql.DataFrame = [array: array<struct<i:int>>, list: array<struct<i:int>> ... 1 more field]
    
    scala> df.show()
    +-----+----------+--------------------+
    |array|      list|                 map|
    +-----+----------+--------------------+
    |[[1]]|[[2], [3]]|[1 -> [4], 2 -> [5]]|
    +-----+----------+--------------------+
    ```
    
    Previous behavior:
    
    ```
    scala> val df = spark.createDataFrame(Seq(new Test(array, list, map)).asJava, classOf[Test])
    java.lang.IllegalArgumentException: The value (Nested@3dedc8b8) of the type (Nested) cannot be converted to struct<i:int>
      at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:262)
      at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:238)
      at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:103)
      at org.apache.spark.sql.catalyst.CatalystTypeConverters$ArrayConverter$$anonfun$toCatalystImpl$1.apply(CatalystTypeConverters.scala:162)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
      at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
      at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
      at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
      at org.apache.spark.sql.catalyst.CatalystTypeConverters$ArrayConverter.toCatalystImpl(CatalystTypeConverters.scala:162)
      at org.apache.spark.sql.catalyst.CatalystTypeConverters$ArrayConverter.toCatalystImpl(CatalystTypeConverters.scala:154)
      at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:103)
      at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter$2.apply(CatalystTypeConverters.scala:396)
      at org.apache.spark.sql.SQLContext$$anonfun$createStructConverter$1$1$$anonfun$apply$1.apply(SQLContext.scala:1114)
      at org.apache.spark.sql.SQLContext$$anonfun$createStructConverter$1$1$$anonfun$apply$1.apply(SQLContext.scala:1113)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
      at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
      at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
      at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
      at org.apache.spark.sql.SQLContext$$anonfun$createStructConverter$1$1.apply(SQLContext.scala:1113)
      at org.apache.spark.sql.SQLContext$$anonfun$createStructConverter$1$1.apply(SQLContext.scala:1108)
      at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
      at scala.collection.Iterator$class.toStream(Iterator.scala:1320)
      at scala.collection.AbstractIterator.toStream(Iterator.scala:1334)
      at scala.collection.TraversableOnce$class.toSeq(TraversableOnce.scala:298)
      at scala.collection.AbstractIterator.toSeq(Iterator.scala:1334)
      at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:423)
      ... 51 elided
    ```

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/michalsenkyr/spark SPARK-25654

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/spark/pull/22646.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

    This closes #22646
    
----
commit b477d070ae4cac75add60fd1fa7c474c0e6113ac
Author: Michal Senkyr <mike.senkyr@...>
Date:   2018-10-02T18:24:56Z

    Add array, list and map support to SQLContext.beansToRow

----

