Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19246#discussion_r139874732 --- Diff: python/pyspark/sql/types.py --- @@ -410,6 +410,24 @@ def __init__(self, name, dataType, nullable=True, metadata=None): self.dataType = dataType self.nullable = nullable self.metadata = metadata or {} + self.needConversion = dataType.needConversion + self.toInternal = dataType.toInternal + self.fromInternal = dataType.fromInternal + + def __getstate__(self): + """Return state values to be pickled.""" + return (self.name, self.dataType, self.nullable, self.metadata) + + def __setstate__(self, state): + """Restore state from the unpickled state values.""" + name, dataType, nullable, metadata = state + self.name = name + self.dataType = dataType + self.nullable = nullable + self.metadata = metadata + self.needConversion = dataType.needConversion --- End diff -- At the current master, https://github.com/apache/spark/commit/718bbc939037929ef5b8f4b4fe10aadfbab4408e **Before** ``` ./build/mvn -DskipTests -Psparkr -Phive -Phive-thriftserver clean package find . 
-name "*.pyc" -exec rm -f {} \; sync && sudo purge ./bin/pyspark --conf spark.python.profile=true ``` ```python df = spark.range(10000000).selectExpr("id as id0", "id as id1", "id as id2", "id as id3", "id as id4", "id as id5", "id as id6", "id as id7", "id as id8", "id as id9", "struct(id) as s").cache() df.count() df.rdd.map(lambda x: x).count() sc.show_profiles() ``` ``` ============================================================ Profile of RDD<id=13> ============================================================ 220158736 function calls (210148475 primitive calls) in 379.599 seconds Ordered by: internal time, cumulative time ncalls tottime percall cumtime percall filename:lineno(function) 30000000/20000000 88.015 0.000 263.471 0.000 types.py:623(fromInternal) 20000000 83.744 0.000 130.714 0.000 types.py:1421(_create_row) 683 62.466 0.091 358.079 0.524 {cPickle.loads} 20000000 21.786 0.000 285.257 0.000 types.py:1418(<lambda>) 10000000 18.998 0.000 18.998 0.000 {zip} 20000000 16.783 0.000 32.260 0.000 types.py:1469(__new__) 30045761 16.197 0.000 16.197 0.000 {isinstance} 20000000 15.477 0.000 15.477 0.000 {built-in method __new__ of type object at 0x10db7b428} 20000000 14.710 0.000 14.710 0.000 types.py:1553(__setattr__) 10000008 14.361 0.000 377.376 0.000 rdd.py:1040(<genexpr>) 20000000 9.984 0.000 9.984 0.000 types.py:1417(_create_row_inbound_converter) 10000000 9.579 0.000 19.590 0.000 types.py:440(fromInternal) ... ``` **After** ``` curl -O https://patch-diff.githubusercontent.com/raw/apache/spark/pull/19246.patch git apply 19246.patch git diff ./build/mvn -DskipTests -Psparkr -Phive -Phive-thriftserver clean package find . 
-name "*.pyc" -exec rm -f {} \; sync && sudo purge ./bin/pyspark --conf spark.python.profile=true ``` ```python df = spark.range(10000000).selectExpr("id as id0", "id as id1", "id as id2", "id as id3", "id as id4", "id as id5", "id as id6", "id as id7", "id as id8", "id as id9", "struct(id) as s").cache() df.count() df.rdd.map(lambda x: x).count() sc.show_profiles() ``` ``` ============================================================ Profile of RDD<id=13> ============================================================ 210149857 function calls (200139596 primitive calls) in 385.988 seconds Ordered by: internal time, cumulative time ncalls tottime percall cumtime percall filename:lineno(function) 30000000/20000000 92.012 0.000 265.554 0.000 types.py:632(fromInternal) 20000000 87.470 0.000 137.298 0.000 types.py:1430(_create_row) 683 65.402 0.096 364.590 0.534 {cPickle.loads} 20000000 22.989 0.000 288.543 0.000 types.py:1427(<lambda>) 10000000 19.146 0.000 19.146 0.000 {zip} 20000000 17.881 0.000 33.933 0.000 types.py:1478(__new__) 30045761 17.121 0.000 17.121 0.000 {isinstance} 20000000 16.052 0.000 16.052 0.000 {built-in method __new__ of type object at 0x10153d428} 20000000 15.894 0.000 15.894 0.000 types.py:1562(__setattr__) 10000008 14.938 0.000 383.739 0.000 rdd.py:1040(<genexpr>) 20000000 10.214 0.000 10.214 0.000 types.py:1426(_create_row_inbound_converter) 16 2.248 0.140 385.986 24.124 {sum} 1374 2.228 0.002 2.228 0.002 {method 'read' of 'file' objects} ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org