I'm trying to support Parquet I/O for DataFrames that contain a UDT (for t-digests). The UDT is defined here:
https://github.com/erikerlandson/isarn-sketches-spark/blob/feature/pyspark/src/main/scala/org/apache/spark/isarnproject/sketches/udt/TDigestUDT.scala#L37

I can read and write using 'objectFile', but when I try to use '...write.parquet(...)' I'm getting failures I can't make sense of. The full stack dump is here:
https://gist.github.com/erikerlandson/054652fc2d34ef896717124991196c0e

The following is the first portion of the dump. The associated error message is: "failure: `TimestampType' expected but `{' found"

scala> val data = sc.parallelize(Seq(1,2,3,4,5)).toDF("x")
data: org.apache.spark.sql.DataFrame = [x: int]

scala> val udaf = tdigestUDAF[Double].maxDiscrete(10)
udaf: org.isarnproject.sketches.udaf.TDigestUDAF[Double] = TDigestUDAF(0.5,10)

scala> val agg = data.agg(udaf($"x").alias("tdigest"))
agg: org.apache.spark.sql.DataFrame = [tdigest: tdigest]

scala> agg.show()
+--------------------+
|             tdigest|
+--------------------+
|TDigestSQL(TDiges...|
+--------------------+

scala> agg.write.parquet("/tmp/agg.parquet")
2017-07-30 13:32:13 ERROR Utils:91 - Aborting task
java.lang.IllegalArgumentException: Unsupported dataType: {"type":"struct","fields":[{"name":"tdigest","type":{"type":"udt","class":"org.apache.spark.isarnproject.sketches.udt.TDigestUDT$","pyClass":"isarnproject.sketches.udt.tdigest.TDigestUDT","sqlType":{"type":"struct","fields":[{"name":"delta","type":"double","nullable":false,"metadata":{}},{"name":"maxDiscrete","type":"integer","nullable":false,"metadata":{}},{"name":"nclusters","type":"integer","nullable":false,"metadata":{}},{"name":"clustX","type":{"type":"array","elementType":"double","containsNull":false},"nullable":false,"metadata":{}},{"name":"clustM","type":{"type":"array","elementType":"double","containsNull":false},"nullable":false,"metadata":{}}]}},"nullable":true,"metadata":{}}]}, [1.1] failure: `TimestampType' expected but `{' found