Github user ueshin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18664#discussion_r147023750

    --- Diff: python/pyspark/sql/types.py ---
    @@ -1619,11 +1619,38 @@ def to_arrow_type(dt):
             arrow_type = pa.decimal(dt.precision, dt.scale)
         elif type(dt) == StringType:
             arrow_type = pa.string()
    +    elif type(dt) == DateType:
    +        arrow_type = pa.date32()
    +    elif type(dt) == TimestampType:
    +        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
    +        arrow_type = pa.timestamp('us', tz='UTC')
         else:
             raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
         return arrow_type
     
     
    +def _check_dataframe_localize_timestamps(df):
    +    """ Convert timezone aware timestamps to timezone-naive in local time
    +    """
    +    from pandas.api.types import is_datetime64tz_dtype
    +    for column, series in df.iteritems():
    +        # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
    +        if is_datetime64tz_dtype(series.dtype):
    +            df[column] = series.dt.tz_convert('tzlocal()').dt.tz_localize(None)
    +    return df
    +
    +
    +def _check_series_convert_timestamps_internal(s):
    +    """ Convert a tz-naive timestamp in local tz to UTC normalized for Spark internal storage
    +    """
    +    from pandas.api.types import is_datetime64_dtype
    +    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
    +    if is_datetime64_dtype(s.dtype):
    +        return s.dt.tz_localize('tzlocal()').dt.tz_convert('UTC')
    +    else:
    +        return s
    --- End diff --
    
    I meant: if `is_datetime64tz_dtype(s.dtype)` (note the **tz**) is true but the series has a strange timezone like `tzlocal()`, I thought we would need `s.dt.tz_convert('UTC')`.
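A minimal sketch of what `_check_series_convert_timestamps_internal` could look like with the extra branch the comment suggests, assuming a series may also arrive already tz-aware (e.g. previously localized with `tzlocal()`). The `elif` branch is the reviewer's suggestion, not the code as it stands in the diff:

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

    def _check_series_convert_timestamps_internal(s):
        """ Convert timestamps to UTC, normalized for Spark internal storage """
        # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
        if is_datetime64_dtype(s.dtype):
            # tz-naive: interpret the values as local time, then normalize to UTC
            return s.dt.tz_localize('tzlocal()').dt.tz_convert('UTC')
        elif is_datetime64tz_dtype(s.dtype):
            # already tz-aware (possibly a "strange" tz such as tzlocal()):
            # a plain conversion to UTC suffices, per the review comment
            return s.dt.tz_convert('UTC')
        else:
            return s

This keeps the tz-naive path from the diff unchanged while also covering series that were localized upstream, so UTC normalization no longer silently skips tz-aware input.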