[ https://issues.apache.org/jira/browse/SPARK-15547?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Cheng Lian updated SPARK-15547: ------------------------------- Description: The following Spark shell snippet reproduces this issue: {code} case class ClassData(a: String, b: Long) case class NestedStruct(f: ClassData) val data = s"""{'f': {'b': 1, 'a': 'foo', 'c': 'extra'}} |{'f': {'b': 2, 'a': 'bar', 'c': 'extra'}} |""".stripMargin.trim.split("\n") val df = spark.read.json(sc.parallelize(data)) val ds = df.as[NestedStruct] {code} Exception thrown: {noformat} org.apache.spark.sql.AnalysisException: Try to map struct<a:string,b:bigint,c:string> to Tuple2, but failed as the number of fields does not line up. - Input schema: struct<f:struct<a:string,b:bigint,c:string>> - Target schema: struct<f:struct<a:string,b:bigint>>; at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.org$apache$spark$sql$catalyst$encoders$ExpressionEncoder$$fail$1(ExpressionEncoder.scala:267) at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$$anonfun$validate$3.apply(ExpressionEncoder.scala:311) at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$$anonfun$validate$3.apply(ExpressionEncoder.scala:307) at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99) at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99) at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230) at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40) at scala.collection.mutable.HashMap.foreach(HashMap.scala:99) at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.validate(ExpressionEncoder.scala:307) at org.apache.spark.sql.Dataset.<init>(Dataset.scala:201) at org.apache.spark.sql.Dataset.<init>(Dataset.scala:168) at org.apache.spark.sql.Dataset$.apply(Dataset.scala:57) at org.apache.spark.sql.Dataset.as(Dataset.scala:360) ... 
39 elided {noformat} We can work around this exception by removing inner nested field {{c}}: {code} val data = s"""{"f": {"b": 1, "a": "foo"}} |{"f": {"b": 2, "a": "bar"}} |""".stripMargin.trim.split("\n") {code} Dataset is just a "view" of its underlying logical plan. The encoder and logical plan of the Dataset may have different field number and/or field order. This is OK as long as all fields referred by the encoder exist in the logical plan. This is because encoder de/serializer expressions are resolved by name rather than ordinal. However, as illustrated above, length of an inner nested struct must align with field number of the corresponding case class/Java bean. was: The following Spark shell snippet reproduces this issue: {code} case class ClassData(a: String, b: Long) case class NestedStruct(f: ClassData) val data = s"""{"f": {"b": 1, "a": "foo", "c": "extra"}} |{"f": {"b": 2, "a": "bar", "c": "extra"}} |""".stripMargin.trim.split("\n") val df = spark.read.json(sc.parallelize(data)) val ds = df.as[NestedStruct] {code} Exception thrown: {noformat} org.apache.spark.sql.AnalysisException: Try to map struct<a:string,b:bigint,c:string> to Tuple2, but failed as the number of fields does not line up. 
- Input schema: struct<f:struct<a:string,b:bigint,c:string>> - Target schema: struct<f:struct<a:string,b:bigint>>; at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.org$apache$spark$sql$catalyst$encoders$ExpressionEncoder$$fail$1(ExpressionEncoder.scala:267) at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$$anonfun$validate$3.apply(ExpressionEncoder.scala:311) at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$$anonfun$validate$3.apply(ExpressionEncoder.scala:307) at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99) at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99) at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230) at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40) at scala.collection.mutable.HashMap.foreach(HashMap.scala:99) at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.validate(ExpressionEncoder.scala:307) at org.apache.spark.sql.Dataset.<init>(Dataset.scala:201) at org.apache.spark.sql.Dataset.<init>(Dataset.scala:168) at org.apache.spark.sql.Dataset$.apply(Dataset.scala:57) at org.apache.spark.sql.Dataset.as(Dataset.scala:360) ... 39 elided {noformat} We can work around this exception by removing inner nested field {{c}}: {code} val data = s"""{"f": {"b": 1, "a": "foo"}} |{"f": {"b": 2, "a": "bar"}} |""".stripMargin.trim.split("\n") {code} Dataset is just a "view" of its underlying logical plan. The encoder and logical plan of the Dataset may have different field number and/or field order. This is OK as long as all fields referred by the encoder exist in the logical plan. This is because encoder de/serializer expressions are resolved by name rather than ordinal. However, as illustrated above, length of an inner nested struct must align with field number of the corresponding case class/Java bean. 
> Encoder validation is too strict for inner nested structs > --------------------------------------------------------- > > Key: SPARK-15547 > URL: https://issues.apache.org/jira/browse/SPARK-15547 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 1.6.1, 2.0.0 > Reporter: Cheng Lian > Assignee: Cheng Lian > > The following Spark shell snippet reproduces this issue: > {code} > case class ClassData(a: String, b: Long) > case class NestedStruct(f: ClassData) > val data = > s"""{'f': {'b': 1, 'a': 'foo', 'c': 'extra'}} > |{'f': {'b': 2, 'a': 'bar', 'c': 'extra'}} > |""".stripMargin.trim.split("\n") > val df = spark.read.json(sc.parallelize(data)) > val ds = df.as[NestedStruct] > {code} > Exception thrown: > {noformat} > org.apache.spark.sql.AnalysisException: Try to map > struct<a:string,b:bigint,c:string> to Tuple2, but failed as the number of > fields does not line up. > - Input schema: struct<f:struct<a:string,b:bigint,c:string>> > - Target schema: struct<f:struct<a:string,b:bigint>>; > at > org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.org$apache$spark$sql$catalyst$encoders$ExpressionEncoder$$fail$1(ExpressionEncoder.scala:267) > at > org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$$anonfun$validate$3.apply(ExpressionEncoder.scala:311) > at > org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$$anonfun$validate$3.apply(ExpressionEncoder.scala:307) > at > scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99) > at > scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99) > at > scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230) > at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40) > at scala.collection.mutable.HashMap.foreach(HashMap.scala:99) > at > org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.validate(ExpressionEncoder.scala:307) > at org.apache.spark.sql.Dataset.<init>(Dataset.scala:201) > at 
org.apache.spark.sql.Dataset.<init>(Dataset.scala:168) > at org.apache.spark.sql.Dataset$.apply(Dataset.scala:57) > at org.apache.spark.sql.Dataset.as(Dataset.scala:360) > ... 39 elided > {noformat} > We can work around this exception by removing inner nested field {{c}}: > {code} > val data = > s"""{"f": {"b": 1, "a": "foo"}} > |{"f": {"b": 2, "a": "bar"}} > |""".stripMargin.trim.split("\n") > {code} > Dataset is just a "view" of its underlying logical plan. The encoder and > logical plan of the Dataset may have different field number and/or field > order. This is OK as long as all fields referred by the encoder exist in the > logical plan. This is because encoder de/serializer expressions are resolved > by name rather than ordinal. > However, as illustrated above, length of an inner nested struct must align > with field number of the corresponding case class/Java bean. -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org