BryanCutler commented on a change in pull request #28025: [SPARK-31186][PySpark][SQL] toPandas should not fail on duplicate column names URL: https://github.com/apache/spark/pull/28025#discussion_r398731133
########## File path: python/pyspark/sql/pandas/conversion.py ########## @@ -132,25 +132,35 @@ def toPandas(self): # Below is toPandas without Arrow optimization. pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) - dtype = {} - for field in self.schema: + dtype = [None] * len(self.schema) + for fieldIdx in range(len(self.schema)): + field = self.schema[fieldIdx] + pandas_col = pdf.iloc[:, fieldIdx] + pandas_type = PandasConversionMixin._to_corrected_pandas_type(field.dataType) # SPARK-21766: if an integer field is nullable and has null values, it can be # inferred by pandas as float column. Once we convert the column with NaN back # to integer type e.g., np.int16, we will hit exception. So we use the inferred # float type, not the corrected type from the schema in this case. if pandas_type is not None and \ not(isinstance(field.dataType, IntegralType) and field.nullable and - pdf[field.name].isnull().any()): - dtype[field.name] = pandas_type + pandas_col.isnull().any()): + dtype[fieldIdx] = pandas_type # Ensure we fall back to nullable numpy types, even when whole column is null: - if isinstance(field.dataType, IntegralType) and pdf[field.name].isnull().any(): - dtype[field.name] = np.float64 - if isinstance(field.dataType, BooleanType) and pdf[field.name].isnull().any(): - dtype[field.name] = np.object + if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any(): + dtype[fieldIdx] = np.float64 + if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any(): + dtype[fieldIdx] = np.object + + df = pd.DataFrame() + for index, t in enumerate(dtype): + if t is not None: + series = pdf.iloc[:, index].astype(t, copy=False) + else: + series = pdf.iloc[:, index] + df.insert(index, self.schema[index].name, series, allow_duplicates=True) Review comment: Does this make a copy of the data? 
It seems to go into a `make_block` method, but I can't tell for sure whether that performs an allocation. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org