[ https://issues.apache.org/jira/browse/SPARK-52266?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Xinrong Meng updated SPARK-52266: --------------------------------- Description: {code:java} >>> pdf = pd.DataFrame({"a": ["x"], "b": [0]}) >>> pdf a b 0 x 0 >>> psdf = ps.from_pandas(pdf) Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/Users/xinrong.meng/spark/python/pyspark/pandas/namespace.py", line 187, in from_pandas return DataFrame(pobj) File "/Users/xinrong.meng/spark/python/pyspark/pandas/frame.py", line 573, in __init__ internal = InternalFrame.from_pandas(pdf) File "/Users/xinrong.meng/spark/python/pyspark/pandas/internal.py", line 1480, in from_pandas ) = InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=prefer_timestamp_ntz) File "/Users/xinrong.meng/spark/python/pyspark/pandas/internal.py", line 1581, in prepare_pandas_frame spark_type = infer_pd_series_spark_type(reset_index[col], dtype, prefer_timestamp_ntz) File "/Users/xinrong.meng/spark/python/pyspark/pandas/typedef/typehints.py", line 368, in infer_pd_series_spark_type return from_arrow_type(pa.Array.from_pandas(pser).type, prefer_timestamp_ntz) File "pyarrow/array.pxi", line 1115, in pyarrow.lib.Array.from_pandas File "pyarrow/array.pxi", line 339, in pyarrow.lib.array File "pyarrow/array.pxi", line 85, in pyarrow.lib._ndarray_to_array File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status pyarrow.lib.ArrowTypeError: Input object was not a NumPy array >>> {code} {code:java} >>> pd.__version__ '2.2.3' >>> pa.__version__ '15.0.2' >>> np.__version__ '2.0.2' {code} was: {code:java} >>> pdf = pd.DataFrame({"a": ["x"], "b": [0]}) >>> pdf a b 0 x 0 >>> psdf = ps.from_pandas(pdf) Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/Users/xinrong.meng/spark/python/pyspark/pandas/namespace.py", line 187, in from_pandas return DataFrame(pobj) File "/Users/xinrong.meng/spark/python/pyspark/pandas/frame.py", line 573, in __init__ internal = InternalFrame.from_pandas(pdf) File 
"/Users/xinrong.meng/spark/python/pyspark/pandas/internal.py", line 1480, in from_pandas ) = InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=prefer_timestamp_ntz) File "/Users/xinrong.meng/spark/python/pyspark/pandas/internal.py", line 1581, in prepare_pandas_frame spark_type = infer_pd_series_spark_type(reset_index[col], dtype, prefer_timestamp_ntz) File "/Users/xinrong.meng/spark/python/pyspark/pandas/typedef/typehints.py", line 368, in infer_pd_series_spark_type return from_arrow_type(pa.Array.from_pandas(pser).type, prefer_timestamp_ntz) File "pyarrow/array.pxi", line 1115, in pyarrow.lib.Array.from_pandas File "pyarrow/array.pxi", line 339, in pyarrow.lib.array File "pyarrow/array.pxi", line 85, in pyarrow.lib._ndarray_to_array File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status pyarrow.lib.ArrowTypeError: Input object was not a NumPy array >>> {code} > Arrow fails to infer the schema with string and int column when creating a > DataFrame > ------------------------------------------------------------------------------------ > > Key: SPARK-52266 > URL: https://issues.apache.org/jira/browse/SPARK-52266 > Project: Spark > Issue Type: Sub-task > Components: PS > Affects Versions: 4.1.0 > Reporter: Xinrong Meng > Priority: Major > > {code:java} > >>> pdf = pd.DataFrame({"a": ["x"], "b": [0]}) > >>> pdf > a b > 0 x 0 > >>> psdf = ps.from_pandas(pdf) > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "/Users/xinrong.meng/spark/python/pyspark/pandas/namespace.py", line > 187, in from_pandas > return DataFrame(pobj) > File "/Users/xinrong.meng/spark/python/pyspark/pandas/frame.py", line 573, > in __init__ > internal = InternalFrame.from_pandas(pdf) > File "/Users/xinrong.meng/spark/python/pyspark/pandas/internal.py", line > 1480, in from_pandas > ) = InternalFrame.prepare_pandas_frame(pdf, > prefer_timestamp_ntz=prefer_timestamp_ntz) > File "/Users/xinrong.meng/spark/python/pyspark/pandas/internal.py", line > 1581, 
in prepare_pandas_frame > spark_type = infer_pd_series_spark_type(reset_index[col], dtype, > prefer_timestamp_ntz) > File > "/Users/xinrong.meng/spark/python/pyspark/pandas/typedef/typehints.py", line > 368, in infer_pd_series_spark_type > return from_arrow_type(pa.Array.from_pandas(pser).type, > prefer_timestamp_ntz) > File "pyarrow/array.pxi", line 1115, in pyarrow.lib.Array.from_pandas > File "pyarrow/array.pxi", line 339, in pyarrow.lib.array > File "pyarrow/array.pxi", line 85, in pyarrow.lib._ndarray_to_array > File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status > pyarrow.lib.ArrowTypeError: Input object was not a NumPy array > >>> > {code} > > {code:java} > >>> pd.__version__ > '2.2.3' > >>> pa.__version__ > '15.0.2' > >>> np.__version__ > '2.0.2' {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org