itholic commented on code in PR #42793: URL: https://github.com/apache/spark/pull/42793#discussion_r1319275507
########## python/pyspark/pandas/typedef/typehints.py: ########## @@ -487,23 +487,23 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp ... pass >>> inferred = infer_return_type(func) >>> inferred.dtypes - [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)] + [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False, categories_dtype=int64)] Review Comment: The `dtype` of categories is added to `__repr__`: https://github.com/pandas-dev/pandas/issues/52179. ########## python/pyspark/pandas/frame.py: ########## @@ -10530,12 +10530,12 @@ def stack(self) -> DataFrameOrSeries: kg m cat 1.0 2.0 dog 3.0 4.0 - >>> df_multi_level_cols2.stack().sort_index() # doctest: +SKIP - height weight - cat kg NaN 1.0 - m 2.0 NaN - dog kg NaN 3.0 - m 4.0 NaN + >>> df_multi_level_cols2.stack().sort_index() Review Comment: Bug fixed in Pandas: https://github.com/pandas-dev/pandas/issues/53786. ########## python/pyspark/pandas/groupby.py: ########## @@ -311,7 +311,14 @@ def aggregate( i for i, gkey in enumerate(self._groupkeys) if gkey._psdf is not self._psdf ) if len(should_drop_index) > 0: - psdf = psdf.reset_index(level=should_drop_index, drop=True) + drop = not any( + [ + isinstance(func_or_funcs[gkey.name], list) + for gkey in self._groupkeys + if gkey.name in func_or_funcs + ] + ) + psdf = psdf.reset_index(level=should_drop_index, drop=drop) Review Comment: Bug fixed in Pandas: https://github.com/pandas-dev/pandas/issues/52849. 
########## python/pyspark/pandas/tests/test_stats.py: ########## @@ -273,7 +268,18 @@ def test_skew_kurt_numerical_stability(self): self.assert_eq(psdf.kurt(), pdf.kurt(), almost=True) def test_dataframe_corr(self): - pdf = makeMissingDataframe(0.3, 42) + pdf = pd.DataFrame( + index=[ + "".join( + np.random.choice( + list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10 + ) + ) + for _ in range(30) + ], + columns=list("ABCD"), + dtype="float64", + ) Review Comment: The testing util `makeMissingDataframe` is removed. ########## python/pyspark/pandas/tests/frame/test_reshaping.py: ########## @@ -291,7 +291,8 @@ def test_stack(self): psdf_multi_level_cols2 = ps.from_pandas(pdf_multi_level_cols2) self.assert_eq( - psdf_multi_level_cols2.stack().sort_index(), pdf_multi_level_cols2.stack().sort_index() + psdf_multi_level_cols2.stack().sort_index()[["weight", "height"]], + pdf_multi_level_cols2.stack().sort_index()[["weight", "height"]], Review Comment: This is just for handling the column order: **DataFrame** ```python >>> pdf weight height kg m cat 1.0 2.0 dog 3.0 4.0 ``` **DataFrame.stack() in Pandas 1.5.3** ```python >>> pdf.stack() weight height cat kg 1.0 NaN m NaN 2.0 dog kg 3.0 NaN m NaN 4.0 ``` **DataFrame.stack() in Pandas 2.1.0** ```python >>> pdf.stack() weight height cat kg 1.0 NaN m NaN 2.0 dog kg 3.0 NaN m NaN 4.0 ``` I think this may be a minor bug in Pandas, so I reported it to the Pandas community to make sure. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org