Github user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18945#discussion_r140412632

--- Diff: python/pyspark/sql/dataframe.py ---
@@ -1761,12 +1761,37 @@ def toPandas(self):
                 raise ImportError("%s\n%s" % (e.message, msg))
         else:
             dtype = {}
+            columns_with_null_int = set()
+            def null_handler(rows, columns_with_null_int):
+                for row in rows:
+                    row = row.asDict()
+                    for column in columns_with_null_int:
+                        val = row[column]
+                        dt = dtype[column]
+                        if val is not None:
+                            if abs(val) > 16777216:  # Max value before np.float32 loses precision.
+                                val = np.float64(val)
+                                dt = np.float64
+                                dtype[column] = np.float64
+                            else:
+                                val = np.float32(val)
+                                if dt not in (np.float32, np.float64):
+                                    dt = np.float32
+                                    dtype[column] = np.float32
+                        row[column] = val
+                    row = Row(**row)
+                    yield row
+            row_handler = lambda x,y: x
             for field in self.schema:
                 pandas_type = _to_corrected_pandas_type(field.dataType)
+                if pandas_type in (np.int8, np.int16, np.int32) and field.nullable:
+                    columns_with_null_int.add(field.name)
+                    row_handler = null_handler
+                    pandas_type = np.float32
--- End diff --

I don't think this is a correct fix.
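For context on the two behaviors the patch is working around, here is a minimal standalone sketch (not part of the PR; it assumes only numpy and pandas are installed). It shows why a nullable int column gets upcast to float on the pandas side, and why the patch's 16777216 (2**24) threshold marks where np.float32 stops being exact:

    import numpy as np
    import pandas as pd

    # NumPy integer dtypes have no missing-value sentinel, so a nullable
    # IntegerType column must be upcast to a float dtype to hold NaN.
    s = pd.Series([1, None, 3])
    print(s.dtype)  # float64 -- pandas falls back to float for the None

    # 2**24 is the largest integer magnitude np.float32 represents exactly
    # (24-bit significand); beyond it, neighboring integers collide.
    print(np.float32(2**24) == np.float32(2**24 + 1))  # True: 16777217 rounds down
    print(np.float64(2**24) == np.float64(2**24 + 1))  # False: exact in float64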