Ruifeng Zheng created SPARK-54579:
-------------------------------------
Summary: createDataFrame incorrectly handles NaN in pandas DataFrame
Key: SPARK-54579
URL: https://issues.apache.org/jira/browse/SPARK-54579
Project: Spark
Issue Type: Improvement
Components: Connect, PySpark
Affects Versions: 4.2.0
Reporter: Ruifeng Zheng
NaN and None in a pandas DataFrame are handled inconsistently: Spark Classic preserves NaN as a NaN value when Arrow is disabled but converts it to NULL when Arrow is enabled, while Spark Connect always converts NaN to NULL regardless of the Arrow setting.

In Spark Classic:
{code:java}
In [16]: import numpy as np
In [17]: import pandas as pd
In [18]: pdf1 = pd.DataFrame({"x": np.array([1.0, np.nan, None])})
In [19]: pdf2 = pd.DataFrame({"x": [1.0, np.nan, None]})
In [20]: pdf1
Out[20]:
      x
0   1.0
1   NaN
2  None
In [21]: pdf2
Out[21]:
     x
0  1.0
1  NaN
2  NaN
In [22]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'false')
In [23]: spark.createDataFrame(pdf1).show()
+----+
| x|
+----+
| 1.0|
| NaN|
|NULL|
+----+
In [24]: spark.createDataFrame(pdf2).show()
+---+
| x|
+---+
|1.0|
|NaN|
|NaN|
+---+
In [25]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
In [26]: spark.createDataFrame(pdf1).show()
+----+
| x|
+----+
| 1.0|
|NULL|
|NULL|
+----+
In [27]: spark.createDataFrame(pdf2).show()
+----+
| x|
+----+
| 1.0|
|NULL|
|NULL|
+----+
{code}
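Note that pdf1 and pdf2 already differ before Spark is involved: wrapping the values in np.array falls back to object dtype (None is not a float), while a plain Python list is coerced to float64, which turns None into NaN. A minimal check:
{code:java}
import numpy as np
import pandas as pd

pdf1 = pd.DataFrame({"x": np.array([1.0, np.nan, None])})  # object dtype
pdf2 = pd.DataFrame({"x": [1.0, np.nan, None]})            # float64 dtype

print(pdf1["x"].dtype)  # object: 1.0, nan, and None kept as distinct objects
print(pdf2["x"].dtype)  # float64: None was already coerced to NaN
{code}
So for pdf2 the NaN/None distinction is already gone on the pandas side; the inconsistency above is purely in how createDataFrame maps NaN to Spark values.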
In Spark Connect, NaN is always converted to NULL, and toggling the Arrow config has no effect:
{code:java}
In [1]: import numpy as np
In [2]: import pandas as pd
In [3]: pdf1 = pd.DataFrame({"x": np.array([1.0, np.nan, float("nan"), None])})
In [4]: pdf2 = pd.DataFrame({"x": [1.0, np.nan, float("nan"), None]})
In [5]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
In [6]: spark.createDataFrame(pdf1).show()
+----+
| x|
+----+
| 1.0|
|NULL|
|NULL|
|NULL|
+----+
In [7]: spark.createDataFrame(pdf2).show()
+----+
| x|
+----+
| 1.0|
|NULL|
|NULL|
|NULL|
+----+
In [8]: spark
Out[8]: <pyspark.sql.connect.session.SparkSession at 0x1288c4980>
In [9]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'false')
In [10]: spark.createDataFrame(pdf1).show()
+----+
| x|
+----+
| 1.0|
|NULL|
|NULL|
|NULL|
+----+
In [11]: spark.createDataFrame(pdf2).show()
+----+
| x|
+----+
| 1.0|
|NULL|
|NULL|
|NULL|
+----+
{code}
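For reference, Spark SQL itself distinguishes NaN from NULL for DOUBLE columns, so silently converting pandas NaN to NULL is lossy. A quick sanity check (assuming an active spark session):
{code:java}
from pyspark.sql import functions as F

df = spark.sql(
    "SELECT CAST('NaN' AS DOUBLE) AS x UNION ALL SELECT CAST(NULL AS DOUBLE)"
)
df.select("x", F.isnan("x").alias("is_nan"), F.isnull("x").alias("is_null")).show()
# The NaN row reports is_nan=true / is_null=false, and the NULL row the
# opposite, so collapsing NaN into NULL is observable from SQL.
{code}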