This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 64e31684c9 Python: Add year, month, day, and hour transforms (#5462)
64e31684c9 is described below
commit 64e31684c9806f91370a89a4b0692a9e73967a33
Author: jun-he <[email protected]>
AuthorDate: Mon Aug 29 06:31:15 2022 -0700
Python: Add year, month, day, and hour transforms (#5462)
* add time (year, month, day, hour) transforms
* address the comments
* update the PR to address the comments
---
python/pyiceberg/transforms.py | 227 ++++++++++++++++++++++++++++++++++++-
python/pyiceberg/utils/datetime.py | 42 +++++++
python/tests/test_transforms.py | 185 ++++++++++++++++++++++++++++++
3 files changed, 451 insertions(+), 3 deletions(-)
diff --git a/python/pyiceberg/transforms.py b/python/pyiceberg/transforms.py
index fdb4978d60..270918e67b 100644
--- a/python/pyiceberg/transforms.py
+++ b/python/pyiceberg/transforms.py
@@ -18,6 +18,7 @@
import base64
import struct
from abc import ABC, abstractmethod
+from enum import IntEnum
from functools import singledispatch
from typing import (
Any,
@@ -58,6 +59,10 @@ IDENTITY = "identity"
VOID = "void"
BUCKET = "bucket"
TRUNCATE = "truncate"
+YEAR = "year"
+MONTH = "month"
+DAY = "day"
+HOUR = "hour"
BUCKET_PARSER = ParseNumberFromBrackets(BUCKET)
TRUNCATE_PARSER = ParseNumberFromBrackets(TRUNCATE)
@@ -92,6 +97,14 @@ class Transform(IcebergBaseModel, ABC, Generic[S, T]):
return BucketTransform(num_buckets=BUCKET_PARSER.match(v))
elif v.startswith(TRUNCATE):
return TruncateTransform(width=TRUNCATE_PARSER.match(v))
+ elif v == YEAR:
+ return YearTransform()
+ elif v == MONTH:
+ return MonthTransform()
+ elif v == DAY:
+ return DayTransform()
+ elif v == HOUR:
+ return HourTransform()
else:
return UnknownTransform(transform=v)
return v
@@ -141,7 +154,6 @@ class BucketTransform(Transform[S, int]):
num_buckets (int): The number of buckets.
"""
- _source_type: IcebergType = PrivateAttr()
_num_buckets: PositiveInt = PrivateAttr()
def __init__(self, num_buckets: int, **data: Any):
@@ -215,6 +227,217 @@ class BucketTransform(Transform[S, int]):
return f"BucketTransform(num_buckets={self._num_buckets})"
+class TimeResolution(IntEnum):
+ YEAR = 6
+ MONTH = 5
+ WEEK = 4
+ DAY = 3
+ HOUR = 2
+ MINUTE = 1
+ SECOND = 0
+
+
+class TimeTransform(Transform[S, int], Singleton):
+ @property
+ @abstractmethod
+ def granularity(self) -> TimeResolution:
+ ...
+
+ def satisfies_order_of(self, other: Transform) -> bool:
+ return self.granularity <= other.granularity if hasattr(other, "granularity") else False
+
+ def result_type(self, source: IcebergType) -> IcebergType:
+ return IntegerType()
+
+ @property
+ def dedup_name(self) -> str:
+ return "time"
+
+ @property
+ def preserves_order(self) -> bool:
+ return True
+
+
+class YearTransform(TimeTransform):
+ """Transforms a datetime value into a year value.
+
+ Example:
+ >>> transform = YearTransform()
+ >>> transform.transform(TimestampType())(1512151975038194)
+ 47
+ """
+
+ __root__: Literal["year"] = Field(default="year")
+
+ def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+ source_type = type(source)
+ if source_type == DateType:
+
+ def year_func(v):
+ return datetime.days_to_years(v)
+
+ elif source_type in {TimestampType, TimestamptzType}:
+
+ def year_func(v):
+ return datetime.micros_to_years(v)
+
+ else:
+ raise ValueError(f"Cannot apply year transform for type: {source}")
+
+ return lambda v: year_func(v) if v is not None else None
+
+ def can_transform(self, source: IcebergType) -> bool:
+ return type(source) in {
+ DateType,
+ TimestampType,
+ TimestamptzType,
+ }
+
+ @property
+ def granularity(self) -> TimeResolution:
+ return TimeResolution.YEAR
+
+ def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+ return datetime.to_human_year(value) if isinstance(value, int) else "null"
+
+ def __repr__(self) -> str:
+ return "YearTransform()"
+
+
+class MonthTransform(TimeTransform):
+ """Transforms a datetime value into a month value.
+
+ Example:
+ >>> transform = MonthTransform()
+ >>> transform.transform(DateType())(17501)
+ 575
+ """
+
+ __root__: Literal["month"] = Field(default="month")
+
+ def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+ source_type = type(source)
+ if source_type == DateType:
+
+ def month_func(v):
+ return datetime.days_to_months(v)
+
+ elif source_type in {TimestampType, TimestamptzType}:
+
+ def month_func(v):
+ return datetime.micros_to_months(v)
+
+ else:
+ raise ValueError(f"Cannot apply month transform for type: {source}")
+
+ return lambda v: month_func(v) if v else None
+
+ def can_transform(self, source: IcebergType) -> bool:
+ return type(source) in {
+ DateType,
+ TimestampType,
+ TimestamptzType,
+ }
+
+ @property
+ def granularity(self) -> TimeResolution:
+ return TimeResolution.MONTH
+
+ def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+ return datetime.to_human_month(value) if isinstance(value, int) else "null"
+
+ def __repr__(self) -> str:
+ return "MonthTransform()"
+
+
+class DayTransform(TimeTransform):
+ """Transforms a datetime value into a day value.
+
+ Example:
+ >>> transform = MonthTransform()
+ >>> transform.transform(DateType())(17501)
+ 17501
+ """
+
+ __root__: Literal["day"] = Field(default="day")
+
+ def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+ source_type = type(source)
+ if source_type == DateType:
+
+ def day_func(v):
+ return v
+
+ elif source_type in {TimestampType, TimestamptzType}:
+
+ def day_func(v):
+ return datetime.micros_to_days(v)
+
+ else:
+ raise ValueError(f"Cannot apply day transform for type: {source}")
+
+ return lambda v: day_func(v) if v else None
+
+ def can_transform(self, source: IcebergType) -> bool:
+ return type(source) in {
+ DateType,
+ TimestampType,
+ TimestamptzType,
+ }
+
+ def result_type(self, source: IcebergType) -> IcebergType:
+ return DateType()
+
+ @property
+ def granularity(self) -> TimeResolution:
+ return TimeResolution.DAY
+
+ def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+ return datetime.to_human_day(value) if isinstance(value, int) else "null"
+
+ def __repr__(self) -> str:
+ return "DayTransform()"
+
+
+class HourTransform(TimeTransform):
+ """Transforms a datetime value into a hour value.
+
+ Example:
+ >>> transform = HourTransform()
+ >>> transform.transform(TimestampType())(1512151975038194)
+ 420042
+ """
+
+ __root__: Literal["hour"] = Field(default="hour")
+
+ def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+ if type(source) in {TimestampType, TimestamptzType}:
+
+ def hour_func(v):
+ return datetime.micros_to_hours(v)
+
+ else:
+ raise ValueError(f"Cannot apply hour transform for type: {source}")
+
+ return lambda v: hour_func(v) if v else None
+
+ def can_transform(self, source: IcebergType) -> bool:
+ return type(source) in {
+ TimestampType,
+ TimestamptzType,
+ }
+
+ @property
+ def granularity(self) -> TimeResolution:
+ return TimeResolution.HOUR
+
+ def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+ return datetime.to_human_hour(value) if isinstance(value, int) else "null"
+
+ def __repr__(self) -> str:
+ return "HourTransform()"
+
+
def _base64encode(buffer: bytes) -> str:
"""Converts bytes to base64 string"""
return base64.b64encode(buffer).decode("ISO-8859-1")
@@ -230,7 +453,6 @@ class IdentityTransform(Transform[S, S]):
"""
__root__: Literal["identity"] = Field(default="identity")
- _source_type: IcebergType = PrivateAttr()
def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[S]]:
return lambda v: v
@@ -389,7 +611,6 @@ class UnknownTransform(Transform):
"""
__root__: Literal["unknown"] = Field(default="unknown")
- _source_type: IcebergType = PrivateAttr()
_transform: str = PrivateAttr()
def __init__(self, transform: str, **data: Any):
diff --git a/python/pyiceberg/utils/datetime.py b/python/pyiceberg/utils/datetime.py
index 9e491ce9ce..5bfcf772fe 100644
--- a/python/pyiceberg/utils/datetime.py
+++ b/python/pyiceberg/utils/datetime.py
@@ -98,11 +98,26 @@ def micros_to_timestamptz(micros: int):
return EPOCH_TIMESTAMPTZ + dt
+def to_human_year(year_ordinal: int) -> str:
+ """Converts a DateType value to human string"""
+ return f"{EPOCH_TIMESTAMP.year + year_ordinal:0=4d}"
+
+
+def to_human_month(month_ordinal: int) -> str:
+ """Converts a DateType value to human string"""
+ return f"{EPOCH_TIMESTAMP.year + month_ordinal // 12:0=4d}-{1 + month_ordinal % 12:0=2d}"
+
+
def to_human_day(day_ordinal: int) -> str:
"""Converts a DateType value to human string"""
return (EPOCH_DATE + timedelta(days=day_ordinal)).isoformat()
+def to_human_hour(hour_ordinal: int) -> str:
+ """Converts a DateType value to human string"""
+ return (EPOCH_TIMESTAMP + timedelta(hours=hour_ordinal)).isoformat("-", "hours")
+
+
def to_human_time(micros_from_midnight: int) -> str:
"""Converts a TimeType value to human string"""
return micros_to_time(micros_from_midnight).isoformat()
@@ -116,3 +131,30 @@ def to_human_timestamptz(timestamp_micros: int) -> str:
def to_human_timestamp(timestamp_micros: int) -> str:
"""Converts a TimestampType value to human string"""
return (EPOCH_TIMESTAMP + timedelta(microseconds=timestamp_micros)).isoformat()
+
+
+def micros_to_hours(timestamp: int) -> int:
+ """Converts a timestamp in microseconds to a date in hours"""
+ return int((datetime.utcfromtimestamp(timestamp // 1_000_000) - EPOCH_TIMESTAMP).total_seconds() / 3600)
+
+
+def days_to_months(days: int) -> int:
+ """Creates a date from the number of days from 1970-01-01"""
+ d = days_to_date(days)
+ return (d.year - EPOCH_DATE.year) * 12 + (d.month - EPOCH_DATE.month)
+
+
+def micros_to_months(timestamp: int) -> int:
+ dt = micros_to_timestamp(timestamp)
+ return (dt.year - EPOCH_TIMESTAMP.year) * 12 + (dt.month - EPOCH_TIMESTAMP.month) - (1 if dt.day < EPOCH_TIMESTAMP.day else 0)
+
+
+def days_to_years(days: int) -> int:
+ return days_to_date(days).year - EPOCH_DATE.year
+
+
+def micros_to_years(timestamp: int) -> int:
+ dt = micros_to_timestamp(timestamp)
+ return (dt.year - EPOCH_TIMESTAMP.year) - (
+ 1 if dt.month < EPOCH_TIMESTAMP.month or (dt.month == EPOCH_TIMESTAMP.month and dt.day < EPOCH_TIMESTAMP.day) else 0
+ )
diff --git a/python/tests/test_transforms.py b/python/tests/test_transforms.py
index bdebae94e4..eec3e4aebe 100644
--- a/python/tests/test_transforms.py
+++ b/python/tests/test_transforms.py
@@ -24,11 +24,15 @@ import pytest
from pyiceberg import transforms
from pyiceberg.transforms import (
BucketTransform,
+ DayTransform,
+ HourTransform,
IdentityTransform,
+ MonthTransform,
Transform,
TruncateTransform,
UnknownTransform,
VoidTransform,
+ YearTransform,
)
from pyiceberg.types import (
BinaryType,
@@ -139,6 +143,125 @@ def test_string_with_surrogate_pair():
assert bucket_transform(string_with_surrogate_pair) == mmh3.hash(as_bytes)
+@pytest.mark.parametrize(
+ "date,date_transform,expected",
+ [
+ (47, YearTransform(), "2017"),
+ (575, MonthTransform(), "2017-12"),
+ (17501, DayTransform(), "2017-12-01"),
+ ],
+)
+def test_date_to_human_string(date, date_transform, expected):
+ assert date_transform.to_human_string(DateType(), date) == expected
+
+
+@pytest.mark.parametrize(
+ "date_transform",
+ [
+ YearTransform(),
+ MonthTransform(),
+ DayTransform(),
+ ],
+)
+def test_none_date_to_human_string(date_transform):
+ assert date_transform.to_human_string(DateType(), None) == "null"
+
+
+def test_hour_to_human_string():
+ assert HourTransform().to_human_string(TimestampType(), None) == "null"
+ assert HourTransform().to_human_string(TimestampType(), 420042) == "2017-12-01-18"
+
+
+@pytest.mark.parametrize(
+ "negative_value,time_transform,expected",
+ [
+ (-1, YearTransform(), "1969"),
+ (-1, MonthTransform(), "1969-12"),
+ (-1, DayTransform(), "1969-12-31"),
+ (-1, HourTransform(), "1969-12-31-23"),
+ ],
+)
+def test_negative_value_to_human_string(negative_value, time_transform, expected):
+ assert time_transform.to_human_string(TimestampType(), negative_value) == expected
+
+
+@pytest.mark.parametrize(
+ "type_var",
+ [
+ DateType(),
+ TimestampType(),
+ TimestamptzType(),
+ ],
+)
+def test_time_methods(type_var):
+ assert YearTransform().can_transform(type_var)
+ assert MonthTransform().can_transform(type_var)
+ assert DayTransform().can_transform(type_var)
+ assert YearTransform().preserves_order
+ assert MonthTransform().preserves_order
+ assert DayTransform().preserves_order
+ assert YearTransform().result_type(type_var) == IntegerType()
+ assert MonthTransform().result_type(type_var) == IntegerType()
+ assert DayTransform().result_type(type_var) == DateType()
+ assert YearTransform().dedup_name == "time"
+ assert MonthTransform().dedup_name == "time"
+ assert DayTransform().dedup_name == "time"
+
+
+@pytest.mark.parametrize(
+ "transform,type_var,value,expected",
+ [
+ (DayTransform(), DateType(), 17501, 17501),
+ (DayTransform(), DateType(), -1, -1),
+ (MonthTransform(), DateType(), 17501, 575),
+ (MonthTransform(), DateType(), -1, -1),
+ (YearTransform(), DateType(), 17501, 47),
+ (YearTransform(), DateType(), -1, -1),
+ (YearTransform(), TimestampType(), 1512151975038194, 47),
+ (YearTransform(), TimestampType(), -1, -1),
+ (MonthTransform(), TimestamptzType(), 1512151975038194, 575),
+ (MonthTransform(), TimestamptzType(), -1, -1),
+ (DayTransform(), TimestampType(), 1512151975038194, 17501),
+ (DayTransform(), TimestampType(), -1, -1),
+ ],
+)
+def test_time_apply_method(transform, type_var, value, expected):
+ assert transform.transform(type_var)(value) == expected
+
+
+@pytest.mark.parametrize(
+ "type_var",
+ [
+ TimestampType(),
+ TimestamptzType(),
+ ],
+)
+def test_hour_method(type_var):
+ assert HourTransform().can_transform(type_var)
+ assert HourTransform().result_type(type_var) == IntegerType()
+ assert HourTransform().transform(type_var)(1512151975038194) == 420042
+ assert HourTransform().dedup_name == "time"
+
+
+@pytest.mark.parametrize(
+ "transform,other_transform",
+ [
+ (YearTransform(), MonthTransform()),
+ (YearTransform(), DayTransform()),
+ (YearTransform(), HourTransform()),
+ (MonthTransform(), DayTransform()),
+ (MonthTransform(), HourTransform()),
+ (DayTransform(), HourTransform()),
+ ],
+)
+def test_satisfies_order_of_method(transform, other_transform):
+ assert transform.satisfies_order_of(transform)
+ assert other_transform.satisfies_order_of(transform)
+ assert not transform.satisfies_order_of(other_transform)
+ assert not transform.satisfies_order_of(VoidTransform())
+ assert not other_transform.satisfies_order_of(IdentityTransform())
+
+
@pytest.mark.parametrize(
"type_var,value,expected",
[
@@ -318,3 +441,65 @@ def test_void_transform_str():
def test_void_transform_repr():
assert repr(VoidTransform()) == "VoidTransform()"
+
+
+def test_year_transform_serialize():
+ assert YearTransform().json() == '"year"'
+
+
+def test_year_transform_deserialize():
+ transform = TestType.parse_raw('"year"').__root__
+ assert transform == YearTransform()
+
+
+def test_month_transform_serialize():
+ assert MonthTransform().json() == '"month"'
+
+
+def test_month_transform_deserialize():
+ transform = TestType.parse_raw('"month"').__root__
+ assert transform == MonthTransform()
+
+
+def test_day_transform_serialize():
+ assert DayTransform().json() == '"day"'
+
+
+def test_day_transform_deserialize():
+ transform = TestType.parse_raw('"day"').__root__
+ assert transform == DayTransform()
+
+
+def test_hour_transform_serialize():
+ assert HourTransform().json() == '"hour"'
+
+
+def test_hour_transform_deserialize():
+ transform = TestType.parse_raw('"hour"').__root__
+ assert transform == HourTransform()
+
+
+@pytest.mark.parametrize(
+ "transform,transform_str",
+ [
+ (YearTransform(), "year"),
+ (MonthTransform(), "month"),
+ (DayTransform(), "day"),
+ (HourTransform(), "hour"),
+ ],
+)
+def test_datetime_transform_str(transform, transform_str):
+ assert str(transform) == transform_str
+
+
+@pytest.mark.parametrize(
+ "transform,transform_repr",
+ [
+ (YearTransform(), "YearTransform()"),
+ (MonthTransform(), "MonthTransform()"),
+ (DayTransform(), "DayTransform()"),
+ (HourTransform(), "HourTransform()"),
+ ],
+)
+def test_datetime_transform_repr(transform, transform_repr):
+ assert repr(transform) == transform_repr