Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/19607#discussion_r149582142 --- Diff: python/pyspark/sql/types.py --- @@ -1629,37 +1629,82 @@ def to_arrow_type(dt): return arrow_type -def _check_dataframe_localize_timestamps(pdf): +def _check_dataframe_localize_timestamps(pdf, timezone): """ - Convert timezone aware timestamps to timezone-naive in local time + Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone :param pdf: pandas.DataFrame - :return pandas.DataFrame where any timezone aware columns have be converted to tz-naive + :param timezone: the timezone to convert. if None then use local timezone + :return pandas.DataFrame where any timezone aware columns have been converted to tz-naive """ from pandas.api.types import is_datetime64tz_dtype + tz = timezone or 'tzlocal()' for column, series in pdf.iteritems(): # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if is_datetime64tz_dtype(series.dtype): - pdf[column] = series.dt.tz_convert('tzlocal()').dt.tz_localize(None) + pdf[column] = series.dt.tz_convert(tz).dt.tz_localize(None) return pdf -def _check_series_convert_timestamps_internal(s): +def _check_series_convert_timestamps_internal(s, timezone): """ - Convert a tz-naive timestamp in local tz to UTC normalized for Spark internal storage + Convert a tz-naive timestamp in the specified timezone or local timezone to UTC normalized for + Spark internal storage + :param s: a pandas.Series + :param timezone: the timezone to convert. if None then use local timezone :return pandas.Series where if it is a timestamp, has been UTC normalized without a time zone """ from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype # TODO: handle nested timestamps, such as ArrayType(TimestampType())? 
if is_datetime64_dtype(s.dtype): - return s.dt.tz_localize('tzlocal()').dt.tz_convert('UTC') + tz = timezone or 'tzlocal()' + return s.dt.tz_localize(tz).dt.tz_convert('UTC') elif is_datetime64tz_dtype(s.dtype): return s.dt.tz_convert('UTC') else: return s +def _check_series_convert_timestamps_localize(s, timezone): + """ + Convert timestamp to timezone-naive in the specified timezone or local timezone + + :param s: a pandas.Series + :param timezone: the timezone to convert. if None then use local timezone + :return pandas.Series where if it is a timestamp, has been converted to tz-naive + """ + import pandas as pd + try: + from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype + tz = timezone or 'tzlocal()' + # TODO: handle nested timestamps, such as ArrayType(TimestampType())? + if is_datetime64tz_dtype(s.dtype): + return s.dt.tz_convert(tz).dt.tz_localize(None) + elif is_datetime64_dtype(s.dtype) and timezone is not None: + # `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT. + return s.apply(lambda ts: ts.tz_localize('tzlocal()').tz_convert(tz).tz_localize(None) + if ts is not pd.NaT else pd.NaT) + else: + return s + except ImportError: --- End diff -- We will be able to remove this block if we decide to support only Pandas >=0.19.2.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org