Ruifeng Zheng created SPARK-54579:
-------------------------------------

             Summary: createDataFrame incorrectly handles NaN in pandas DataFrame
                 Key: SPARK-54579
                 URL: https://issues.apache.org/jira/browse/SPARK-54579
             Project: Spark
          Issue Type: Improvement
          Components: Connect, PySpark
    Affects Versions: 4.2.0
            Reporter: Ruifeng Zheng


In Spark Classic:
{code:python}
In [18]: pdf1 = pd.DataFrame({"x": np.array([1.0, np.nan, None])})
In [19]: pdf2 = pd.DataFrame({"x": [1.0, np.nan, None]})

In [20]: pdf1
Out[20]:
      x
0   1.0
1   NaN
2  None

In [21]: pdf2
Out[21]:
     x
0  1.0
1  NaN
2  NaN

In [22]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'false')
In [23]: spark.createDataFrame(pdf1).show()
+----+
|   x|
+----+
| 1.0|
| NaN|
|NULL|
+----+

In [24]: spark.createDataFrame(pdf2).show()
+---+
|  x|
+---+
|1.0|
|NaN|
|NaN|
+---+

In [25]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
In [26]: spark.createDataFrame(pdf1).show()
+----+
|   x|
+----+
| 1.0|
|NULL|
|NULL|
+----+

In [27]: spark.createDataFrame(pdf2).show()
+----+
|   x|
+----+
| 1.0|
|NULL|
|NULL|
+----+ {code}
 

In Spark Connect:
{code:python}
In [1]: import numpy as np
In [2]: import pandas as pd

In [3]: pdf1 = pd.DataFrame({"x": np.array([1.0, np.nan, float("nan"), None])})
In [4]: pdf2 = pd.DataFrame({"x": [1.0, np.nan, float("nan"), None]})

In [5]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
In [6]: spark.createDataFrame(pdf1).show()
+----+
|   x|
+----+
| 1.0|
|NULL|
|NULL|
|NULL|
+----+

In [7]: spark.createDataFrame(pdf2).show()
+----+
|   x|
+----+
| 1.0|
|NULL|
|NULL|
|NULL|
+----+

In [8]: spark
Out[8]: <pyspark.sql.connect.session.SparkSession at 0x1288c4980>

In [9]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'false')

In [10]: spark.createDataFrame(pdf1).show()
+----+
|   x|
+----+
| 1.0|
|NULL|
|NULL|
|NULL|
+----+

In [11]: spark.createDataFrame(pdf2).show()
+----+
|   x|
+----+
| 1.0|
|NULL|
|NULL|
|NULL|
+----+ {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to