Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19646#discussion_r148778328 --- Diff: python/pyspark/sql/session.py --- @@ -416,6 +417,50 @@ def _createFromLocal(self, data, schema): data = [schema.toInternal(row) for row in data] return self._sc.parallelize(data), schema + def _getNumpyRecordDtypes(self, rec): + """ + Used when converting a pandas.DataFrame to Spark using to_records(), this will correct + the dtypes of records so they can be properly loaded into Spark. + :param rec: a numpy record to check dtypes + :return corrected dtypes for a numpy.record or None if no correction needed + """ + import numpy as np + cur_dtypes = rec.dtype + col_names = cur_dtypes.names + record_type_list = [] + has_rec_fix = False + for i in xrange(len(cur_dtypes)): + curr_type = cur_dtypes[i] + # If type is a datetime64 timestamp, convert to microseconds + # NOTE: if dtype is M8[ns] then np.record.tolist() will output values as longs, + # this conversion will lead to an output of py datetime objects, see SPARK-22417 + if curr_type == np.dtype('M8[ns]'): + curr_type = 'M8[us]' + has_rec_fix = True + record_type_list.append((str(col_names[i]), curr_type)) + return record_type_list if has_rec_fix else None + + def _convertFromPandas(self, pdf, schema): + """ + Convert a pandas.DataFrame to list of records that can be used to make a DataFrame + :return tuple of list of records and schema + """ + # Convert pandas.DataFrame to list of numpy records + np_records = pdf.to_records(index=False) --- End diff -- thanks! I also tried the data type: ``` >>> pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)]}).dtypes ts datetime64[ns] dtype: object >>> pd.DataFrame({"d": [pd.Timestamp.now().date()]}).dtypes d object dtype: object ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org