[ 
https://issues.apache.org/jira/browse/SPARK-54579?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18042443#comment-18042443
 ] 

Ruifeng Zheng commented on SPARK-54579:
---------------------------------------

RCA:

 
{code:java}
In [18]: pdf1
Out[18]:
      x
0   1.0
1   NaN
2   NaN
3  None

In [19]: pdf1.x
Out[19]:
0     1.0
1     NaN
2     NaN
3    None
Name: x, dtype: object

In [20]: pa.Array.from_pandas(pdf1.x)
Out[20]:
<pyarrow.lib.DoubleArray object at 0x118b239a0>
[
  1,
  null,
  null,
  null
]

In [21]: pa.array([1.0, float("nan"), None])
Out[21]:
<pyarrow.lib.DoubleArray object at 0x119569cc0>
[
  1,
  nan,
  null
]{code}

> createDataFrame incorrectly handles NaN in pandas DataFrame
> -----------------------------------------------------------
>
>                 Key: SPARK-54579
>                 URL: https://issues.apache.org/jira/browse/SPARK-54579
>             Project: Spark
>          Issue Type: Bug
>          Components: Connect, PySpark
>    Affects Versions: 4.2.0
>            Reporter: Ruifeng Zheng
>            Priority: Major
>
> In Spark Classic:
> {code:java}
> In [18]: pdf1 = pd.DataFrame({"x": np.array([1.0, np.nan, None])})
> In [19]: pdf2 = pd.DataFrame({"x": [1.0, np.nan, None]})
> In [20]: pdf1
> Out[20]:
>       x
> 0   1.0
> 1   NaN
> 2  None
> In [21]: pdf2
> Out[21]:
>      x
> 0  1.0
> 1  NaN
> 2  NaN
> In [22]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'false')
> In [23]: spark.createDataFrame(pdf1).show()
> +----+
> |   x|
> +----+
> | 1.0|
> | NaN|
> |NULL|
> +----+
> In [24]: spark.createDataFrame(pdf2).show()
> +---+
> |  x|
> +---+
> |1.0|
> |NaN|
> |NaN|
> +---+
> In [25]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
> In [26]: spark.createDataFrame(pdf1).show()
> +----+
> |   x|
> +----+
> | 1.0|
> |NULL|
> |NULL|
> +----+
> In [27]: spark.createDataFrame(pdf2).show()
> +----+
> |   x|
> +----+
> | 1.0|
> |NULL|
> |NULL|
> +----+ {code}
>  
> In Spark Connect:
> {code:java}
> In [1]: import numpy as np
> In [2]: import pandas as pd
> In [3]: pdf1 = pd.DataFrame({"x": np.array([1.0, np.nan, float("nan"), 
> None])})
> In [4]: pdf2 = pd.DataFrame({"x": [1.0, np.nan, float("nan"), None]})
> In [5]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
> In [6]: spark.createDataFrame(pdf1).show()
> +----+
> |   x|
> +----+
> | 1.0|
> |NULL|
> |NULL|
> |NULL|
> +----+
> In [7]: spark.createDataFrame(pdf2).show()
> +----+
> |   x|
> +----+
> | 1.0|
> |NULL|
> |NULL|
> |NULL|
> +----+
> In [8]: spark
> Out[8]: <pyspark.sql.connect.session.SparkSession at 0x1288c4980>
> In [9]: spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'false')
> In [10]: spark.createDataFrame(pdf1).show()
> +----+
> |   x|
> +----+
> | 1.0|
> |NULL|
> |NULL|
> |NULL|
> +----+
> In [11]: spark.createDataFrame(pdf2).show()
> +----+
> |   x|
> +----+
> | 1.0|
> |NULL|
> |NULL|
> |NULL|
> +----+ {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to