This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 64e31684c9 Python: Add year, month, day, and hour transforms (#5462)
64e31684c9 is described below

commit 64e31684c9806f91370a89a4b0692a9e73967a33
Author: jun-he <[email protected]>
AuthorDate: Mon Aug 29 06:31:15 2022 -0700

    Python: Add year, month, day, and hour transforms (#5462)
    
    * add time (year, month, day, hour) transforms
    
    * address the comments
    
    * update the PR to address the comments
---
 python/pyiceberg/transforms.py     | 227 ++++++++++++++++++++++++++++++++++++-
 python/pyiceberg/utils/datetime.py |  42 +++++++
 python/tests/test_transforms.py    | 185 ++++++++++++++++++++++++++++++
 3 files changed, 451 insertions(+), 3 deletions(-)

diff --git a/python/pyiceberg/transforms.py b/python/pyiceberg/transforms.py
index fdb4978d60..270918e67b 100644
--- a/python/pyiceberg/transforms.py
+++ b/python/pyiceberg/transforms.py
@@ -18,6 +18,7 @@
 import base64
 import struct
 from abc import ABC, abstractmethod
+from enum import IntEnum
 from functools import singledispatch
 from typing import (
     Any,
@@ -58,6 +59,10 @@ IDENTITY = "identity"
 VOID = "void"
 BUCKET = "bucket"
 TRUNCATE = "truncate"
+YEAR = "year"
+MONTH = "month"
+DAY = "day"
+HOUR = "hour"
 
 BUCKET_PARSER = ParseNumberFromBrackets(BUCKET)
 TRUNCATE_PARSER = ParseNumberFromBrackets(TRUNCATE)
@@ -92,6 +97,14 @@ class Transform(IcebergBaseModel, ABC, Generic[S, T]):
                 return BucketTransform(num_buckets=BUCKET_PARSER.match(v))
             elif v.startswith(TRUNCATE):
                 return TruncateTransform(width=TRUNCATE_PARSER.match(v))
+            elif v == YEAR:
+                return YearTransform()
+            elif v == MONTH:
+                return MonthTransform()
+            elif v == DAY:
+                return DayTransform()
+            elif v == HOUR:
+                return HourTransform()
             else:
                 return UnknownTransform(transform=v)
         return v
@@ -141,7 +154,6 @@ class BucketTransform(Transform[S, int]):
       num_buckets (int): The number of buckets.
     """
 
-    _source_type: IcebergType = PrivateAttr()
     _num_buckets: PositiveInt = PrivateAttr()
 
     def __init__(self, num_buckets: int, **data: Any):
@@ -215,6 +227,217 @@ class BucketTransform(Transform[S, int]):
         return f"BucketTransform(num_buckets={self._num_buckets})"
 
 
+class TimeResolution(IntEnum):
+    """Time units ranked by coarseness: the coarser the unit, the larger its value."""
+
+    # TimeTransform.satisfies_order_of compares these integer values directly.
+    YEAR = 6
+    MONTH = 5
+    WEEK = 4
+    DAY = 3
+    HOUR = 2
+    MINUTE = 1
+    SECOND = 0
+
+
+class TimeTransform(Transform[S, int], Singleton):
+    """Shared base of the year, month, day and hour transforms.
+
+    Concrete subclasses declare their resolution through ``granularity``.
+    """
+
+    @property
+    @abstractmethod
+    def granularity(self) -> TimeResolution:
+        """The time resolution this transform produces."""
+        ...
+
+    def satisfies_order_of(self, other: Transform) -> bool:
+        """Return True if ``other`` has a granularity and it is at least as coarse as this one.
+
+        Transforms without a ``granularity`` attribute never satisfy the check.
+        """
+        if not hasattr(other, "granularity"):
+            return False
+        return self.granularity <= other.granularity
+
+    def result_type(self, source: IcebergType) -> IcebergType:
+        """Return the result type; an integer unless a subclass overrides it."""
+        return IntegerType()
+
+    @property
+    def dedup_name(self) -> str:
+        """Name shared by all time transforms."""
+        return "time"
+
+    @property
+    def preserves_order(self) -> bool:
+        """Always True for time transforms."""
+        return True
+
+
+class YearTransform(TimeTransform):
+    """Transforms a date or timestamp value into a year ordinal.
+
+    Example:
+        >>> transform = YearTransform()
+        >>> transform.transform(TimestampType())(1512151975038194)
+        47
+    """
+
+    __root__: Literal["year"] = Field(default="year")
+
+    def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+        """Build the year function for ``source``.
+
+        Raises:
+            ValueError: If ``source`` is not a date or timestamp type.
+        """
+        source_type = type(source)
+        if source_type == DateType:
+
+            def year_func(v):
+                return datetime.days_to_years(v)
+
+        elif source_type in {TimestampType, TimestamptzType}:
+
+            def year_func(v):
+                return datetime.micros_to_years(v)
+
+        else:
+            raise ValueError(f"Cannot apply year transform for type: {source}")
+
+        # Only None is passed through; 0 is a regular value and gets transformed.
+        return lambda v: year_func(v) if v is not None else None
+
+    def can_transform(self, source: IcebergType) -> bool:
+        """Return True for date, timestamp and timestamptz sources."""
+        return type(source) in {DateType, TimestampType, TimestamptzType}
+
+    @property
+    def granularity(self) -> TimeResolution:
+        return TimeResolution.YEAR
+
+    def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+        """Render a year ordinal as text, or "null" for anything that is not an int."""
+        if isinstance(value, int):
+            return datetime.to_human_year(value)
+        return "null"
+
+    def __repr__(self) -> str:
+        return "YearTransform()"
+
+
+class MonthTransform(TimeTransform):
+    """Transforms a date or timestamp value into a month ordinal.
+
+    Example:
+        >>> transform = MonthTransform()
+        >>> transform.transform(DateType())(17501)
+        575
+    """
+
+    __root__: Literal["month"] = Field(default="month")
+
+    def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+        """Build the month function for ``source``.
+
+        Raises:
+            ValueError: If ``source`` is not a date or timestamp type.
+        """
+        source_type = type(source)
+        if source_type == DateType:
+
+            def month_func(v):
+                return datetime.days_to_months(v)
+
+        elif source_type in {TimestampType, TimestamptzType}:
+
+            def month_func(v):
+                return datetime.micros_to_months(v)
+
+        else:
+            raise ValueError(f"Cannot apply month transform for type: {source}")
+
+        # Test against None explicitly: 0 is a valid input and a plain
+        # truthiness check would wrongly turn it into None.
+        return lambda v: month_func(v) if v is not None else None
+
+    def can_transform(self, source: IcebergType) -> bool:
+        """Return True for date, timestamp and timestamptz sources."""
+        return type(source) in {DateType, TimestampType, TimestamptzType}
+
+    @property
+    def granularity(self) -> TimeResolution:
+        return TimeResolution.MONTH
+
+    def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+        """Render a month ordinal as text, or "null" for anything that is not an int."""
+        if isinstance(value, int):
+            return datetime.to_human_month(value)
+        return "null"
+
+    def __repr__(self) -> str:
+        return "MonthTransform()"
+
+
+class DayTransform(TimeTransform):
+    """Transforms a date or timestamp value into a day ordinal.
+
+    Example:
+        >>> transform = DayTransform()
+        >>> transform.transform(DateType())(17501)
+        17501
+    """
+
+    __root__: Literal["day"] = Field(default="day")
+
+    def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+        """Build the day function for ``source``.
+
+        Raises:
+            ValueError: If ``source`` is not a date or timestamp type.
+        """
+        source_type = type(source)
+        if source_type == DateType:
+
+            # Date inputs are returned unchanged.
+            def day_func(v):
+                return v
+
+        elif source_type in {TimestampType, TimestamptzType}:
+
+            def day_func(v):
+                return datetime.micros_to_days(v)
+
+        else:
+            raise ValueError(f"Cannot apply day transform for type: {source}")
+
+        # Test against None explicitly: 0 is a valid input and a plain
+        # truthiness check would wrongly turn it into None.
+        return lambda v: day_func(v) if v is not None else None
+
+    def can_transform(self, source: IcebergType) -> bool:
+        """Return True for date, timestamp and timestamptz sources."""
+        return type(source) in {DateType, TimestampType, TimestamptzType}
+
+    def result_type(self, source: IcebergType) -> IcebergType:
+        """Day results are reported as a date type rather than an integer."""
+        return DateType()
+
+    @property
+    def granularity(self) -> TimeResolution:
+        return TimeResolution.DAY
+
+    def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+        """Render a day ordinal as text, or "null" for anything that is not an int."""
+        if isinstance(value, int):
+            return datetime.to_human_day(value)
+        return "null"
+
+    def __repr__(self) -> str:
+        return "DayTransform()"
+
+
+class HourTransform(TimeTransform):
+    """Transforms a timestamp value into an hour ordinal.
+
+    Example:
+        >>> transform = HourTransform()
+        >>> transform.transform(TimestampType())(1512151975038194)
+        420042
+    """
+
+    __root__: Literal["hour"] = Field(default="hour")
+
+    def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+        """Build the hour function for ``source``.
+
+        Raises:
+            ValueError: If ``source`` is not a timestamp or timestamptz type.
+        """
+        if type(source) in {TimestampType, TimestamptzType}:
+
+            def hour_func(v):
+                return datetime.micros_to_hours(v)
+
+        else:
+            raise ValueError(f"Cannot apply hour transform for type: {source}")
+
+        # Test against None explicitly: 0 is a valid input and a plain
+        # truthiness check would wrongly turn it into None.
+        return lambda v: hour_func(v) if v is not None else None
+
+    def can_transform(self, source: IcebergType) -> bool:
+        """Return True for timestamp and timestamptz sources only."""
+        return type(source) in {TimestampType, TimestamptzType}
+
+    @property
+    def granularity(self) -> TimeResolution:
+        return TimeResolution.HOUR
+
+    def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+        """Render an hour ordinal as text, or "null" for anything that is not an int."""
+        if isinstance(value, int):
+            return datetime.to_human_hour(value)
+        return "null"
+
+    def __repr__(self) -> str:
+        return "HourTransform()"
+
+
 def _base64encode(buffer: bytes) -> str:
     """Converts bytes to base64 string"""
     return base64.b64encode(buffer).decode("ISO-8859-1")
@@ -230,7 +453,6 @@ class IdentityTransform(Transform[S, S]):
     """
 
     __root__: Literal["identity"] = Field(default="identity")
-    _source_type: IcebergType = PrivateAttr()
 
     def transform(self, source: IcebergType) -> Callable[[Optional[S]], 
Optional[S]]:
         return lambda v: v
@@ -389,7 +611,6 @@ class UnknownTransform(Transform):
     """
 
     __root__: Literal["unknown"] = Field(default="unknown")
-    _source_type: IcebergType = PrivateAttr()
     _transform: str = PrivateAttr()
 
     def __init__(self, transform: str, **data: Any):
diff --git a/python/pyiceberg/utils/datetime.py b/python/pyiceberg/utils/datetime.py
index 9e491ce9ce..5bfcf772fe 100644
--- a/python/pyiceberg/utils/datetime.py
+++ b/python/pyiceberg/utils/datetime.py
@@ -98,11 +98,26 @@ def micros_to_timestamptz(micros: int):
     return EPOCH_TIMESTAMPTZ + dt
 
 
+def to_human_year(year_ordinal: int) -> str:
+    """Converts a year ordinal (offset from the epoch year) to a zero-padded year string"""
+    year = EPOCH_TIMESTAMP.year + year_ordinal
+    return f"{year:0=4d}"
+
+
+def to_human_month(month_ordinal: int) -> str:
+    """Converts a month ordinal (months counted from January of the epoch year) to a YYYY-MM string"""
+    # divmod floors, so negative ordinals land in the preceding year
+    year_offset, month_index = divmod(month_ordinal, 12)
+    return f"{EPOCH_TIMESTAMP.year + year_offset:0=4d}-{1 + month_index:0=2d}"
+
+
 def to_human_day(day_ordinal: int) -> str:
     """Converts a DateType value to human string"""
     return (EPOCH_DATE + timedelta(days=day_ordinal)).isoformat()
 
 
+def to_human_hour(hour_ordinal: int) -> str:
+    """Converts an hour ordinal (hours from EPOCH_TIMESTAMP) to a date-hour string"""
+    moment = EPOCH_TIMESTAMP + timedelta(hours=hour_ordinal)
+    # "-" separates date and time; the "hours" timespec drops minutes and seconds
+    return moment.isoformat("-", "hours")
+
+
 def to_human_time(micros_from_midnight: int) -> str:
     """Converts a TimeType value to human string"""
     return micros_to_time(micros_from_midnight).isoformat()
@@ -116,3 +131,30 @@ def to_human_timestamptz(timestamp_micros: int) -> str:
 def to_human_timestamp(timestamp_micros: int) -> str:
     """Converts a TimestampType value to human string"""
     return (EPOCH_TIMESTAMP + 
timedelta(microseconds=timestamp_micros)).isoformat()
+
+
+def micros_to_hours(timestamp: int) -> int:
+    """Converts a timestamp in microseconds from the epoch to whole hours from the epoch"""
+    # Floor division keeps pre-epoch values in the preceding hour (-1 micros -> hour -1);
+    # truncating a float quotient with int() would round them toward zero instead.
+    return timestamp // 3_600_000_000
+
+
+def days_to_months(days: int) -> int:
+    """Returns the number of months between EPOCH_DATE and the date obtained from days_to_date"""
+    date_value = days_to_date(days)
+    year_delta = date_value.year - EPOCH_DATE.year
+    month_delta = date_value.month - EPOCH_DATE.month
+    return year_delta * 12 + month_delta
+
+
+def micros_to_months(timestamp: int) -> int:
+    """Returns the number of months between EPOCH_TIMESTAMP and the timestamp obtained from micros_to_timestamp"""
+    moment = micros_to_timestamp(timestamp)
+    months = (moment.year - EPOCH_TIMESTAMP.year) * 12 + (moment.month - EPOCH_TIMESTAMP.month)
+    # One month is taken off when the day-of-month precedes the epoch's day-of-month
+    if moment.day < EPOCH_TIMESTAMP.day:
+        months -= 1
+    return months
+
+
+def days_to_years(days: int) -> int:
+    """Returns the difference between the year of the date obtained from days_to_date and the EPOCH_DATE year"""
+    date_value = days_to_date(days)
+    return date_value.year - EPOCH_DATE.year
+
+
+def micros_to_years(timestamp: int) -> int:
+    """Returns the number of years between EPOCH_TIMESTAMP and the timestamp obtained from micros_to_timestamp"""
+    moment = micros_to_timestamp(timestamp)
+    years = moment.year - EPOCH_TIMESTAMP.year
+    # One year is taken off when the month/day falls before the epoch's month/day
+    before_epoch_month_day = moment.month < EPOCH_TIMESTAMP.month or (
+        moment.month == EPOCH_TIMESTAMP.month and moment.day < EPOCH_TIMESTAMP.day
+    )
+    if before_epoch_month_day:
+        years -= 1
+    return years
diff --git a/python/tests/test_transforms.py b/python/tests/test_transforms.py
index bdebae94e4..eec3e4aebe 100644
--- a/python/tests/test_transforms.py
+++ b/python/tests/test_transforms.py
@@ -24,11 +24,15 @@ import pytest
 from pyiceberg import transforms
 from pyiceberg.transforms import (
     BucketTransform,
+    DayTransform,
+    HourTransform,
     IdentityTransform,
+    MonthTransform,
     Transform,
     TruncateTransform,
     UnknownTransform,
     VoidTransform,
+    YearTransform,
 )
 from pyiceberg.types import (
     BinaryType,
@@ -139,6 +143,125 @@ def test_string_with_surrogate_pair():
     assert bucket_transform(string_with_surrogate_pair) == mmh3.hash(as_bytes)
 
 
+@pytest.mark.parametrize(
+    "date,date_transform,expected",
+    [
+        (47, YearTransform(), "2017"),
+        (575, MonthTransform(), "2017-12"),
+        (17501, DayTransform(), "2017-12-01"),
+    ],
+)
+def test_date_to_human_string(date, date_transform, expected):
+    # Year, month and day ordinals render as the expected human-readable string
+    assert date_transform.to_human_string(DateType(), date) == expected
+
+
+@pytest.mark.parametrize(
+    "date_transform",
+    [
+        YearTransform(),
+        MonthTransform(),
+        DayTransform(),
+    ],
+)
+def test_none_date_to_human_string(date_transform):
+    # A missing value is rendered as the literal "null"
+    assert date_transform.to_human_string(DateType(), None) == "null"
+
+
+def test_hour_to_human_string():
+    # Covers both the missing-value rendering and a regular hour ordinal
+    assert HourTransform().to_human_string(TimestampType(), None) == "null"
+    assert HourTransform().to_human_string(TimestampType(), 420042) == "2017-12-01-18"
+
+
+@pytest.mark.parametrize(
+    "negative_value,time_transform,expected",
+    [
+        (-1, YearTransform(), "1969"),
+        (-1, MonthTransform(), "1969-12"),
+        (-1, DayTransform(), "1969-12-31"),
+        (-1, HourTransform(), "1969-12-31-23"),
+    ],
+)
+def test_negative_value_to_human_string(negative_value, time_transform, expected):
+    # Ordinals before the epoch render as the unit immediately preceding it
+    assert time_transform.to_human_string(TimestampType(), negative_value) == expected
+
+
+@pytest.mark.parametrize(
+    "type_var",
+    [
+        DateType(),
+        TimestampType(),
+        TimestamptzType(),
+    ],
+)
+def test_time_methods(type_var):
+    # Year, month and day transforms accept every date/timestamp source type
+    assert YearTransform().can_transform(type_var)
+    assert MonthTransform().can_transform(type_var)
+    assert DayTransform().can_transform(type_var)
+    assert YearTransform().preserves_order
+    assert MonthTransform().preserves_order
+    assert DayTransform().preserves_order
+    # Only the day transform reports a date result type
+    assert YearTransform().result_type(type_var) == IntegerType()
+    assert MonthTransform().result_type(type_var) == IntegerType()
+    assert DayTransform().result_type(type_var) == DateType()
+    assert YearTransform().dedup_name == "time"
+    assert MonthTransform().dedup_name == "time"
+    assert DayTransform().dedup_name == "time"
+
+
+@pytest.mark.parametrize(
+    "transform,type_var,value,expected",
+    [
+        (DayTransform(), DateType(), 17501, 17501),
+        (DayTransform(), DateType(), -1, -1),
+        (MonthTransform(), DateType(), 17501, 575),
+        (MonthTransform(), DateType(), -1, -1),
+        (YearTransform(), DateType(), 17501, 47),
+        (YearTransform(), DateType(), -1, -1),
+        (YearTransform(), TimestampType(), 1512151975038194, 47),
+        (YearTransform(), TimestampType(), -1, -1),
+        (MonthTransform(), TimestamptzType(), 1512151975038194, 575),
+        (MonthTransform(), TimestamptzType(), -1, -1),
+        (DayTransform(), TimestampType(), 1512151975038194, 17501),
+        (DayTransform(), TimestampType(), -1, -1),
+    ],
+)
+def test_time_apply_method(transform, type_var, value, expected):
+    # Applies each transform to a post-epoch and a pre-epoch value
+    assert transform.transform(type_var)(value) == expected
+
+
+@pytest.mark.parametrize(
+    "type_var",
+    [
+        TimestampType(),
+        TimestamptzType(),
+    ],
+)
+def test_hour_method(type_var):
+    # The hour transform accepts both timestamp flavours and yields integers
+    assert HourTransform().can_transform(type_var)
+    assert HourTransform().result_type(type_var) == IntegerType()
+    assert HourTransform().transform(type_var)(1512151975038194) == 420042
+    assert HourTransform().dedup_name == "time"
+
+
+@pytest.mark.parametrize(
+    "transform,other_transform",
+    [
+        (YearTransform(), MonthTransform()),
+        (YearTransform(), DayTransform()),
+        (YearTransform(), HourTransform()),
+        (MonthTransform(), DayTransform()),
+        (MonthTransform(), HourTransform()),
+        (DayTransform(), HourTransform()),
+    ],
+)
+def test_satisfies_order_of_method(transform, other_transform):
+    # In every pair, `transform` is the coarser one and `other_transform` the finer one
+    assert transform.satisfies_order_of(transform)
+    assert other_transform.satisfies_order_of(transform)
+    assert not transform.satisfies_order_of(other_transform)
+    # Transforms that are not time transforms never satisfy the ordering
+    assert not transform.satisfies_order_of(VoidTransform())
+    assert not other_transform.satisfies_order_of(IdentityTransform())
+
+
 @pytest.mark.parametrize(
     "type_var,value,expected",
     [
@@ -318,3 +441,65 @@ def test_void_transform_str():
 
 def test_void_transform_repr():
     assert repr(VoidTransform()) == "VoidTransform()"
+
+
+def test_year_transform_serialize():
+    # The year transform serializes to the bare JSON string "year"
+    serialized = YearTransform().json()
+    assert serialized == '"year"'
+
+
+def test_year_transform_deserialize():
+    # Parsing the JSON string "year" through TestType yields a YearTransform
+    parsed = TestType.parse_raw('"year"').__root__
+    assert parsed == YearTransform()
+
+
+def test_month_transform_serialize():
+    # The month transform serializes to the bare JSON string "month"
+    serialized = MonthTransform().json()
+    assert serialized == '"month"'
+
+
+def test_month_transform_deserialize():
+    # Parsing the JSON string "month" through TestType yields a MonthTransform
+    parsed = TestType.parse_raw('"month"').__root__
+    assert parsed == MonthTransform()
+
+
+def test_day_transform_serialize():
+    # The day transform serializes to the bare JSON string "day"
+    serialized = DayTransform().json()
+    assert serialized == '"day"'
+
+
+def test_day_transform_deserialize():
+    # Parsing the JSON string "day" through TestType yields a DayTransform
+    parsed = TestType.parse_raw('"day"').__root__
+    assert parsed == DayTransform()
+
+
+def test_hour_transform_serialize():
+    # The hour transform serializes to the bare JSON string "hour"
+    serialized = HourTransform().json()
+    assert serialized == '"hour"'
+
+
+def test_hour_transform_deserialize():
+    # Parsing the JSON string "hour" through TestType yields an HourTransform
+    parsed = TestType.parse_raw('"hour"').__root__
+    assert parsed == HourTransform()
+
+
+@pytest.mark.parametrize(
+    "transform,transform_str",
+    [
+        (YearTransform(), "year"),
+        (MonthTransform(), "month"),
+        (DayTransform(), "day"),
+        (HourTransform(), "hour"),
+    ],
+)
+def test_datetime_transform_str(transform, transform_str):
+    # str() of each time transform is its lowercase transform name
+    assert str(transform) == transform_str
+
+
+@pytest.mark.parametrize(
+    "transform,transform_repr",
+    [
+        (YearTransform(), "YearTransform()"),
+        (MonthTransform(), "MonthTransform()"),
+        (DayTransform(), "DayTransform()"),
+        (HourTransform(), "HourTransform()"),
+    ],
+)
+def test_datetime_transform_repr(transform, transform_repr):
+    # repr() of each time transform is its constructor call
+    assert repr(transform) == transform_repr

Reply via email to