This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 9d19ef7c V3: Introduce `timestamp_ns` and `timestamptz_ns` (#1632)
9d19ef7c is described below
commit 9d19ef7cbb81e4784f91d1bf2944b8ee7e64b8a1
Author: Sung Yun <[email protected]>
AuthorDate: Sun Mar 23 15:12:46 2025 -0400
V3: Introduce `timestamp_ns` and `timestamptz_ns` (#1632)
Fixes: https://github.com/apache/iceberg-python/issues/1552
- [x] Add TimestampNanoType and TimestampTzNanoType
- [x] Add Readers and Writers
- [x] Enhance Transforms
- [x] Add String Expressions parsing for nanoseconds timestamps
- [x] Add format-version compatibility check for each type
- [x] Run compatibility check on TableMetadata creation
- [x] Unit tests
python native `datetime` module does not have support for nanoseconds.
We'll need to update our internal date time representations to use a
different library. numpy? arrow?
---
pyiceberg/avro/reader.py | 18 ++++++
pyiceberg/avro/resolver.py | 24 ++++++++
pyiceberg/avro/writer.py | 12 ++++
pyiceberg/conversions.py | 17 +++++-
pyiceberg/io/pyarrow.py | 14 +++++
pyiceberg/schema.py | 41 +++++++++++++
pyiceberg/table/metadata.py | 7 ++-
pyiceberg/transforms.py | 98 +++++++++++++++++++++++++------
pyiceberg/types.py | 51 +++++++++++++++-
pyiceberg/utils/datetime.py | 95 ++++++++++++++++++++++++++++++
pyiceberg/utils/schema_conversion.py | 8 ++-
tests/avro/test_reader.py | 12 ++++
tests/avro/test_writer.py | 12 ++++
tests/conftest.py | 5 +-
tests/integration/test_reads.py | 37 ------------
tests/table/test_metadata.py | 111 +++++++++++++++++++++++++++++++++++
tests/test_conversions.py | 8 +++
tests/test_transforms.py | 61 ++++++++++++++++++-
tests/utils/test_datetime.py | 65 +++++++++++++++++++-
19 files changed, 627 insertions(+), 69 deletions(-)
diff --git a/pyiceberg/avro/reader.py b/pyiceberg/avro/reader.py
index 21f5d807..4c028ed7 100644
--- a/pyiceberg/avro/reader.py
+++ b/pyiceberg/avro/reader.py
@@ -175,6 +175,14 @@ class TimestampReader(IntegerReader):
"""
+class TimestampNanoReader(IntegerReader):
+ """Reads a nanosecond granularity timestamp from the stream.
+
+ Long is decoded as python integer which represents
+ the number of nanoseconds from the unix epoch, 1 January 1970.
+ """
+
+
class TimestamptzReader(IntegerReader):
"""Reads a microsecond granularity timestamptz from the stream.
@@ -185,6 +193,16 @@ class TimestamptzReader(IntegerReader):
"""
+class TimestamptzNanoReader(IntegerReader):
+ """Reads a nanosecond granularity timestamptz from the stream.
+
+ Long is decoded as python integer which represents
+ the number of nanoseconds from the unix epoch, 1 January 1970.
+
+ Adjusted to UTC.
+ """
+
+
class StringReader(Reader):
def read(self, decoder: BinaryDecoder) -> str:
return decoder.read_utf8()
diff --git a/pyiceberg/avro/resolver.py b/pyiceberg/avro/resolver.py
index 004af8bd..9ed111ff 100644
--- a/pyiceberg/avro/resolver.py
+++ b/pyiceberg/avro/resolver.py
@@ -44,7 +44,9 @@ from pyiceberg.avro.reader import (
StringReader,
StructReader,
TimeReader,
+ TimestampNanoReader,
TimestampReader,
+ TimestamptzNanoReader,
TimestamptzReader,
UnknownReader,
UUIDReader,
@@ -64,6 +66,8 @@ from pyiceberg.avro.writer import (
OptionWriter,
StringWriter,
StructWriter,
+ TimestampNanoWriter,
+ TimestamptzNanoWriter,
TimestamptzWriter,
TimestampWriter,
TimeWriter,
@@ -99,7 +103,9 @@ from pyiceberg.types import (
PrimitiveType,
StringType,
StructType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UnknownType,
@@ -184,9 +190,15 @@ class
ConstructWriter(SchemaVisitorPerPrimitiveType[Writer]):
def visit_timestamp(self, timestamp_type: TimestampType) -> Writer:
return TimestampWriter()
+ def visit_timestamp_ns(self, timestamp_ns_type: TimestampNanoType) ->
Writer:
+ return TimestampNanoWriter()
+
def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> Writer:
return TimestamptzWriter()
+ def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType)
-> Writer:
+ return TimestamptzNanoWriter()
+
def visit_string(self, string_type: StringType) -> Writer:
return StringWriter()
@@ -332,9 +344,15 @@ class
WriteSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Writer]):
def visit_timestamp(self, timestamp_type: TimestampType, partner:
Optional[IcebergType]) -> Writer:
return TimestampWriter()
+ def visit_timestamp_ns(self, timestamp_ns_type: TimestampNanoType,
partner: Optional[IcebergType]) -> Writer:
+ return TimestampNanoWriter()
+
def visit_timestamptz(self, timestamptz_type: TimestamptzType, partner:
Optional[IcebergType]) -> Writer:
return TimestamptzWriter()
+ def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType,
partner: Optional[IcebergType]) -> Writer:
+ return TimestamptzNanoWriter()
+
def visit_string(self, string_type: StringType, partner:
Optional[IcebergType]) -> Writer:
return StringWriter()
@@ -465,9 +483,15 @@ class
ReadSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Reader]):
def visit_timestamp(self, timestamp_type: TimestampType, partner:
Optional[IcebergType]) -> Reader:
return TimestampReader()
+ def visit_timestamp_ns(self, timestamp_ns_type: TimestampNanoType,
partner: Optional[IcebergType]) -> Reader:
+ return TimestampNanoReader()
+
def visit_timestamptz(self, timestamptz_type: TimestamptzType, partner:
Optional[IcebergType]) -> Reader:
return TimestamptzReader()
+ def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType,
partner: Optional[IcebergType]) -> Reader:
+ return TimestamptzNanoReader()
+
def visit_string(self, string_type: StringType, partner:
Optional[IcebergType]) -> Reader:
return StringReader()
diff --git a/pyiceberg/avro/writer.py b/pyiceberg/avro/writer.py
index 51d2d1bc..80e96b04 100644
--- a/pyiceberg/avro/writer.py
+++ b/pyiceberg/avro/writer.py
@@ -95,12 +95,24 @@ class TimestampWriter(Writer):
encoder.write_int(val)
+@dataclass(frozen=True)
+class TimestampNanoWriter(Writer):
+ def write(self, encoder: BinaryEncoder, val: int) -> None:
+ encoder.write_int(val)
+
+
@dataclass(frozen=True)
class TimestamptzWriter(Writer):
def write(self, encoder: BinaryEncoder, val: int) -> None:
encoder.write_int(val)
+@dataclass(frozen=True)
+class TimestamptzNanoWriter(Writer):
+ def write(self, encoder: BinaryEncoder, val: int) -> None:
+ encoder.write_int(val)
+
+
@dataclass(frozen=True)
class StringWriter(Writer):
def write(self, encoder: BinaryEncoder, val: Any) -> None:
diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py
index ed5c6f7a..fe208b4a 100644
--- a/pyiceberg/conversions.py
+++ b/pyiceberg/conversions.py
@@ -55,7 +55,9 @@ from pyiceberg.types import (
LongType,
PrimitiveType,
StringType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UnknownType,
@@ -66,6 +68,7 @@ from pyiceberg.utils.datetime import (
date_str_to_days,
date_to_days,
datetime_to_micros,
+ datetime_to_nanos,
days_to_date,
micros_to_time,
micros_to_timestamp,
@@ -127,7 +130,9 @@ def _(primitive_type: BooleanType, value_str: str) ->
Union[int, float, str, uui
@partition_to_py.register(DateType)
@partition_to_py.register(TimeType)
@partition_to_py.register(TimestampType)
+@partition_to_py.register(TimestampNanoType)
@partition_to_py.register(TimestamptzType)
+@partition_to_py.register(TimestamptzNanoType)
@handle_none
def _(primitive_type: PrimitiveType, value_str: str) -> int:
"""Convert a string to an integer value.
@@ -213,12 +218,20 @@ def _(_: PrimitiveType, value: int) -> bytes:
@to_bytes.register(TimestampType)
@to_bytes.register(TimestamptzType)
-def _(_: TimestampType, value: Union[datetime, int]) -> bytes:
+def _(_: PrimitiveType, value: Union[datetime, int]) -> bytes:
if isinstance(value, datetime):
value = datetime_to_micros(value)
return _LONG_STRUCT.pack(value)
+@to_bytes.register(TimestampNanoType)
+@to_bytes.register(TimestamptzNanoType)
+def _(_: PrimitiveType, value: Union[datetime, int]) -> bytes:
+ if isinstance(value, datetime):
+ value = datetime_to_nanos(value)
+ return _LONG_STRUCT.pack(value)
+
+
@to_bytes.register(DateType)
def _(_: DateType, value: Union[date, int]) -> bytes:
if isinstance(value, date):
@@ -319,6 +332,8 @@ def _(_: PrimitiveType, b: bytes) -> int:
@from_bytes.register(TimeType)
@from_bytes.register(TimestampType)
@from_bytes.register(TimestamptzType)
+@from_bytes.register(TimestampNanoType)
+@from_bytes.register(TimestamptzNanoType)
def _(_: PrimitiveType, b: bytes) -> int:
return _LONG_STRUCT.unpack(b)[0]
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index d9f84a42..88be6aba 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -163,7 +163,9 @@ from pyiceberg.types import (
PrimitiveType,
StringType,
StructType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UnknownType,
@@ -662,9 +664,15 @@ class
_ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]):
def visit_timestamp(self, _: TimestampType) -> pa.DataType:
return pa.timestamp(unit="us")
+ def visit_timestamp_ns(self, _: TimestampNanoType) -> pa.DataType:
+ return pa.timestamp(unit="ns")
+
def visit_timestamptz(self, _: TimestamptzType) -> pa.DataType:
return pa.timestamp(unit="us", tz="UTC")
+ def visit_timestamptz_ns(self, _: TimestamptzNanoType) -> pa.DataType:
+ return pa.timestamp(unit="ns", tz="UTC")
+
def visit_string(self, _: StringType) -> pa.DataType:
return pa.large_string()
@@ -1894,9 +1902,15 @@ class
PrimitiveToPhysicalType(SchemaVisitorPerPrimitiveType[str]):
def visit_timestamp(self, timestamp_type: TimestampType) -> str:
return "INT64"
+ def visit_timestamp_ns(self, timestamp_type: TimestampNanoType) -> str:
+ return "INT64"
+
def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> str:
return "INT64"
+ def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType)
-> str:
+ return "INT64"
+
def visit_string(self, string_type: StringType) -> str:
return "BYTE_ARRAY"
diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py
index 7f6cfe99..51762c57 100644
--- a/pyiceberg/schema.py
+++ b/pyiceberg/schema.py
@@ -57,7 +57,9 @@ from pyiceberg.types import (
PrimitiveType,
StringType,
StructType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UnknownType,
@@ -362,6 +364,21 @@ class Schema(IcebergBaseModel):
f"Cannot add field {field.name} as an identifier field:
must not be nested in an optional field {parent}"
)
+ def check_format_version_compatibility(self, format_version: int) -> None:
+ """Check that the schema is compatible for the given table format
version.
+
+ Args:
+ format_version: The Iceberg table format version.
+
+ Raises:
+ ValueError: If the schema is not compatible for the format version.
+ """
+ for field in self._lazy_id_to_field.values():
+ if format_version < field.field_type.minimum_format_version():
+ raise ValueError(
+ f"{field.field_type} is only supported in
{field.field_type.minimum_format_version()} or higher. Current format version
is: {format_version}"
+ )
+
class SchemaVisitor(Generic[T], ABC):
def before_field(self, field: NestedField) -> None:
@@ -522,8 +539,12 @@ class
PrimitiveWithPartnerVisitor(SchemaWithPartnerVisitor[P, T]):
return self.visit_time(primitive, primitive_partner)
elif isinstance(primitive, TimestampType):
return self.visit_timestamp(primitive, primitive_partner)
+ elif isinstance(primitive, TimestampNanoType):
+ return self.visit_timestamp_ns(primitive, primitive_partner)
elif isinstance(primitive, TimestamptzType):
return self.visit_timestamptz(primitive, primitive_partner)
+ elif isinstance(primitive, TimestamptzNanoType):
+ return self.visit_timestamptz_ns(primitive, primitive_partner)
elif isinstance(primitive, StringType):
return self.visit_string(primitive, primitive_partner)
elif isinstance(primitive, UUIDType):
@@ -573,10 +594,18 @@ class
PrimitiveWithPartnerVisitor(SchemaWithPartnerVisitor[P, T]):
def visit_timestamp(self, timestamp_type: TimestampType, partner:
Optional[P]) -> T:
"""Visit a TimestampType."""
+ @abstractmethod
+ def visit_timestamp_ns(self, timestamp_ns_type: TimestampNanoType,
partner: Optional[P]) -> T:
+ """Visit a TimestampNanoType."""
+
@abstractmethod
def visit_timestamptz(self, timestamptz_type: TimestamptzType, partner:
Optional[P]) -> T:
"""Visit a TimestamptzType."""
+ @abstractmethod
+ def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType,
partner: Optional[P]) -> T:
+ """Visit a TimestamptzNanoType."""
+
@abstractmethod
def visit_string(self, string_type: StringType, partner: Optional[P]) -> T:
"""Visit a StringType."""
@@ -706,8 +735,12 @@ class SchemaVisitorPerPrimitiveType(SchemaVisitor[T], ABC):
return self.visit_time(primitive)
elif isinstance(primitive, TimestampType):
return self.visit_timestamp(primitive)
+ elif isinstance(primitive, TimestampNanoType):
+ return self.visit_timestamp_ns(primitive)
elif isinstance(primitive, TimestamptzType):
return self.visit_timestamptz(primitive)
+ elif isinstance(primitive, TimestamptzNanoType):
+ return self.visit_timestamptz_ns(primitive)
elif isinstance(primitive, StringType):
return self.visit_string(primitive)
elif isinstance(primitive, UUIDType):
@@ -759,10 +792,18 @@ class SchemaVisitorPerPrimitiveType(SchemaVisitor[T],
ABC):
def visit_timestamp(self, timestamp_type: TimestampType) -> T:
"""Visit a TimestampType."""
+ @abstractmethod
+ def visit_timestamp_ns(self, timestamp_type: TimestampNanoType) -> T:
+ """Visit a TimestampNanoType."""
+
@abstractmethod
def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> T:
"""Visit a TimestamptzType."""
+ @abstractmethod
+ def visit_timestamptz_ns(self, timestamptz_ns_type: TimestamptzNanoType)
-> T:
+ """Visit a TimestamptzNanoType."""
+
@abstractmethod
def visit_string(self, string_type: StringType) -> T:
"""Visit a StringType."""
diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py
index d5ce7656..abe3d8ef 100644
--- a/pyiceberg/table/metadata.py
+++ b/pyiceberg/table/metadata.py
@@ -578,6 +578,11 @@ def new_table_metadata(
) -> TableMetadata:
from pyiceberg.table import TableProperties
+ # Remove format-version so it does not get persisted
+ format_version = int(properties.pop(TableProperties.FORMAT_VERSION,
TableProperties.DEFAULT_FORMAT_VERSION))
+
+ schema.check_format_version_compatibility(format_version)
+
fresh_schema = assign_fresh_schema_ids(schema)
fresh_partition_spec = assign_fresh_partition_spec_ids(partition_spec,
schema, fresh_schema)
fresh_sort_order = assign_fresh_sort_order_ids(sort_order, schema,
fresh_schema)
@@ -585,8 +590,6 @@ def new_table_metadata(
if table_uuid is None:
table_uuid = uuid.uuid4()
- # Remove format-version so it does not get persisted
- format_version = int(properties.pop(TableProperties.FORMAT_VERSION,
TableProperties.DEFAULT_FORMAT_VERSION))
if format_version == 1:
return TableMetadataV1(
location=location,
diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py
index 4a652a2f..fd908592 100644
--- a/pyiceberg/transforms.py
+++ b/pyiceberg/transforms.py
@@ -73,7 +73,9 @@ from pyiceberg.types import (
IntegerType,
LongType,
StringType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UUIDType,
@@ -290,6 +292,8 @@ class BucketTransform(Transform[S, int]):
TimeType,
TimestampType,
TimestamptzType,
+ TimestampNanoType,
+ TimestamptzNanoType,
DecimalType,
StringType,
FixedType,
@@ -323,6 +327,18 @@ class BucketTransform(Transform[S, int]):
return mmh3.hash(struct.pack("<q", v))
+ elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
+
+ def hash_func(v: Any) -> int:
+ # In order to bucket TimestampNano the same as Timestamp
+ # convert to micros before hashing.
+ if isinstance(v, py_datetime.datetime):
+ v = datetime.datetime_to_micros(v)
+ else:
+ v = datetime.nanos_to_micros(v)
+
+ return mmh3.hash(struct.pack("<q", v))
+
elif isinstance(source, (IntegerType, LongType)):
def hash_func(v: Any) -> int:
@@ -457,13 +473,20 @@ class YearTransform(TimeTransform[S]):
return datetime.micros_to_years(v)
+ elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
+
+ def year_func(v: Any) -> int:
+ # python datetime has no nanoseconds support.
+ # nanosecond datetimes will be expressed as int as a workaround
+ return datetime.nanos_to_years(v)
+
else:
raise ValueError(f"Cannot apply year transform for type: {source}")
return lambda v: year_func(v) if v is not None else None
def can_transform(self, source: IcebergType) -> bool:
- return isinstance(source, (DateType, TimestampType, TimestamptzType))
+ return isinstance(source, (DateType, TimestampType, TimestamptzType,
TimestampNanoType, TimestamptzNanoType))
@property
def granularity(self) -> TimeResolution:
@@ -481,15 +504,19 @@ class YearTransform(TimeTransform[S]):
import pyarrow.compute as pc
if isinstance(source, DateType):
- epoch = datetime.EPOCH_DATE
+ epoch = pa.scalar(datetime.EPOCH_DATE)
elif isinstance(source, TimestampType):
- epoch = datetime.EPOCH_TIMESTAMP
+ epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
elif isinstance(source, TimestamptzType):
- epoch = datetime.EPOCH_TIMESTAMPTZ
+ epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
+ elif isinstance(source, TimestampNanoType):
+ epoch =
pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
+ elif isinstance(source, TimestamptzNanoType):
+ epoch =
pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
else:
raise ValueError(f"Cannot apply year transform for type: {source}")
- return lambda v: pc.years_between(pa.scalar(epoch), v) if v is not
None else None
+ return lambda v: pc.years_between(epoch, v) if v is not None else None
class MonthTransform(TimeTransform[S]):
@@ -520,13 +547,20 @@ class MonthTransform(TimeTransform[S]):
return datetime.micros_to_months(v)
+ elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
+
+ def month_func(v: Any) -> int:
+ # python datetime has no nanoseconds support.
+ # nanosecond datetimes will be expressed as int as a workaround
+ return datetime.nanos_to_months(v)
+
else:
raise ValueError(f"Cannot apply month transform for type:
{source}")
return lambda v: month_func(v) if v is not None else None
def can_transform(self, source: IcebergType) -> bool:
- return isinstance(source, (DateType, TimestampType, TimestamptzType))
+ return isinstance(source, (DateType, TimestampType, TimestamptzType,
TimestampNanoType, TimestamptzNanoType))
@property
def granularity(self) -> TimeResolution:
@@ -544,17 +578,21 @@ class MonthTransform(TimeTransform[S]):
import pyarrow.compute as pc
if isinstance(source, DateType):
- epoch = datetime.EPOCH_DATE
+ epoch = pa.scalar(datetime.EPOCH_DATE)
elif isinstance(source, TimestampType):
- epoch = datetime.EPOCH_TIMESTAMP
+ epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
elif isinstance(source, TimestamptzType):
- epoch = datetime.EPOCH_TIMESTAMPTZ
+ epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
+ elif isinstance(source, TimestampNanoType):
+ epoch =
pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
+ elif isinstance(source, TimestamptzNanoType):
+ epoch =
pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
else:
raise ValueError(f"Cannot apply month transform for type:
{source}")
def month_func(v: pa.Array) -> pa.Array:
return pc.add(
- pc.multiply(pc.years_between(pa.scalar(epoch), v),
pa.scalar(12)),
+ pc.multiply(pc.years_between(epoch, v), pa.scalar(12)),
pc.add(pc.month(v), pa.scalar(-1)),
)
@@ -589,13 +627,20 @@ class DayTransform(TimeTransform[S]):
return datetime.micros_to_days(v)
+ elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
+
+ def day_func(v: Any) -> int:
+ # python datetime has no nanoseconds support.
+ # nanosecond datetimes will be expressed as int as a workaround
+ return datetime.nanos_to_days(v)
+
else:
raise ValueError(f"Cannot apply day transform for type: {source}")
return lambda v: day_func(v) if v is not None else None
def can_transform(self, source: IcebergType) -> bool:
- return isinstance(source, (DateType, TimestampType, TimestamptzType))
+ return isinstance(source, (DateType, TimestampType, TimestamptzType,
TimestampNanoType, TimestamptzNanoType))
def result_type(self, source: IcebergType) -> IcebergType:
"""Return the result type of a day transform.
@@ -621,15 +666,19 @@ class DayTransform(TimeTransform[S]):
import pyarrow.compute as pc
if isinstance(source, DateType):
- epoch = datetime.EPOCH_DATE
+ epoch = pa.scalar(datetime.EPOCH_DATE)
elif isinstance(source, TimestampType):
- epoch = datetime.EPOCH_TIMESTAMP
+ epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
elif isinstance(source, TimestamptzType):
- epoch = datetime.EPOCH_TIMESTAMPTZ
+ epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
+ elif isinstance(source, TimestampNanoType):
+ epoch =
pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
+ elif isinstance(source, TimestamptzNanoType):
+ epoch =
pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
else:
raise ValueError(f"Cannot apply day transform for type: {source}")
- return lambda v: pc.days_between(pa.scalar(epoch), v) if v is not None
else None
+ return lambda v: pc.days_between(epoch, v) if v is not None else None
class HourTransform(TimeTransform[S]):
@@ -652,13 +701,20 @@ class HourTransform(TimeTransform[S]):
return datetime.micros_to_hours(v)
+ elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
+
+            def hour_func(v: Any) -> int:
+                # Must be named hour_func: the lambda returned by this method
+                # calls hour_func(v), so any other name leaves it unbound for
+                # nanosecond timestamp sources.
+                # python datetime has no nanoseconds support.
+                # nanosecond datetimes will be expressed as int as a workaround
+                return datetime.nanos_to_hours(v)
+
else:
raise ValueError(f"Cannot apply hour transform for type: {source}")
return lambda v: hour_func(v) if v is not None else None
def can_transform(self, source: IcebergType) -> bool:
- return isinstance(source, (TimestampType, TimestamptzType))
+ return isinstance(source, (TimestampType, TimestamptzType,
TimestampNanoType, TimestamptzNanoType))
@property
def granularity(self) -> TimeResolution:
@@ -676,13 +732,17 @@ class HourTransform(TimeTransform[S]):
import pyarrow.compute as pc
if isinstance(source, TimestampType):
- epoch = datetime.EPOCH_TIMESTAMP
+ epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
elif isinstance(source, TimestamptzType):
- epoch = datetime.EPOCH_TIMESTAMPTZ
+ epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
+ elif isinstance(source, TimestampNanoType):
+ epoch =
pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
+ elif isinstance(source, TimestamptzNanoType):
+ epoch =
pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
else:
raise ValueError(f"Cannot apply hour transform for type: {source}")
- return lambda v: pc.hours_between(pa.scalar(epoch), v) if v is not
None else None
+ return lambda v: pc.hours_between(epoch, v) if v is not None else None
def _base64encode(buffer: bytes) -> str:
diff --git a/pyiceberg/types.py b/pyiceberg/types.py
index 456f9ad8..8e83b011 100644
--- a/pyiceberg/types.py
+++ b/pyiceberg/types.py
@@ -53,7 +53,7 @@ from pydantic import (
from pydantic_core.core_schema import ValidatorFunctionWrapHandler
from pyiceberg.exceptions import ValidationError
-from pyiceberg.typedef import IcebergBaseModel, IcebergRootModel, L
+from pyiceberg.typedef import IcebergBaseModel, IcebergRootModel, L,
TableVersion
from pyiceberg.utils.parsing import ParseNumberFromBrackets
from pyiceberg.utils.singleton import Singleton
@@ -140,6 +140,10 @@ class IcebergType(IcebergBaseModel):
return TimestampType()
if v == "timestamptz":
return TimestamptzType()
+ if v == "timestamp_ns":
+ return TimestampNanoType()
+ if v == "timestamptz_ns":
+ return TimestamptzNanoType()
if v == "date":
return DateType()
if v == "time":
@@ -177,6 +181,10 @@ class IcebergType(IcebergBaseModel):
def is_struct(self) -> bool:
return isinstance(self, StructType)
+ def minimum_format_version(self) -> TableVersion:
+ """Minimum Iceberg format version after which this type is
supported."""
+ return 1
+
class PrimitiveType(Singleton, IcebergRootModel[str], IcebergType):
"""Base class for all Iceberg Primitive Types."""
@@ -703,6 +711,44 @@ class TimestamptzType(PrimitiveType):
root: Literal["timestamptz"] = Field(default="timestamptz")
+class TimestampNanoType(PrimitiveType):
+ """A TimestampNano data type in Iceberg can be represented using an
instance of this class.
+
+ TimestampNanos in Iceberg have nanosecond precision and include a date and
a time of day without a timezone.
+
+ Example:
+ >>> column_foo = TimestampNanoType()
+ >>> isinstance(column_foo, TimestampNanoType)
+ True
+ >>> column_foo
+ TimestampNanoType()
+ """
+
+ root: Literal["timestamp_ns"] = Field(default="timestamp_ns")
+
+ def minimum_format_version(self) -> TableVersion:
+ return 3
+
+
+class TimestamptzNanoType(PrimitiveType):
+ """A TimestamptzNano data type in Iceberg can be represented using an
instance of this class.
+
+ TimestamptzNanos in Iceberg are stored as UTC and include a date and a
time of day with a timezone.
+
+ Example:
+ >>> column_foo = TimestamptzNanoType()
+ >>> isinstance(column_foo, TimestamptzNanoType)
+ True
+ >>> column_foo
+ TimestamptzNanoType()
+ """
+
+ root: Literal["timestamptz_ns"] = Field(default="timestamptz_ns")
+
+ def minimum_format_version(self) -> TableVersion:
+ return 3
+
+
class StringType(PrimitiveType):
"""A String data type in Iceberg can be represented using an instance of
this class.
@@ -765,3 +811,6 @@ class UnknownType(PrimitiveType):
"""
root: Literal["unknown"] = Field(default="unknown")
+
+ def minimum_format_version(self) -> TableVersion:
+ return 3
diff --git a/pyiceberg/utils/datetime.py b/pyiceberg/utils/datetime.py
index 0cb6926e..46bbb32d 100644
--- a/pyiceberg/utils/datetime.py
+++ b/pyiceberg/utils/datetime.py
@@ -29,8 +29,10 @@ from datetime import (
EPOCH_DATE = date.fromisoformat("1970-01-01")
EPOCH_TIMESTAMP = datetime.fromisoformat("1970-01-01T00:00:00.000000")
ISO_TIMESTAMP = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(.\d{1,6})?")
+ISO_TIMESTAMP_NANO =
re.compile(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(.\d{1,6})?(\d{1,3})?")
EPOCH_TIMESTAMPTZ = datetime.fromisoformat("1970-01-01T00:00:00.000000+00:00")
ISO_TIMESTAMPTZ =
re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(.\d{1,6})?[-+]\d{2}:\d{2}")
+ISO_TIMESTAMPTZ_NANO =
re.compile(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(.\d{1,6})?(\d{1,3})?([-+]\d{2}:\d{2})")
def micros_to_days(timestamp: int) -> int:
@@ -91,6 +93,59 @@ def timestamp_to_micros(timestamp_str: str) -> int:
raise ValueError(f"Invalid timestamp without zone: {timestamp_str} (must
be ISO-8601)")
+def time_str_to_nanos(time_str: str) -> int:
+ """Convert an ISO-8601 formatted time to nanoseconds from midnight."""
+ return time_to_nanos(time.fromisoformat(time_str))
+
+
+def time_to_nanos(t: time) -> int:
+ """Convert a datetime.time object to nanoseconds from midnight."""
+ # python datetime and time doesn't have nanoseconds support yet
+ # https://github.com/python/cpython/issues/59648
+ return ((((t.hour * 60 + t.minute) * 60) + t.second) * 1_000_000 +
t.microsecond) * 1_000
+
+
+def datetime_to_nanos(dt: datetime) -> int:
+ """Convert a datetime to nanoseconds from 1970-01-01T00:00:00.000000000."""
+ # python datetime and time doesn't have nanoseconds support yet
+ # https://github.com/python/cpython/issues/59648
+ if dt.tzinfo:
+ delta = dt - EPOCH_TIMESTAMPTZ
+ else:
+ delta = dt - EPOCH_TIMESTAMP
+ return ((delta.days * 86400 + delta.seconds) * 1_000_000 +
delta.microseconds) * 1_000
+
+
+def timestamp_to_nanos(timestamp_str: str) -> int:
+    """Convert an ISO-8601 formatted timestamp without zone to nanoseconds from 1970-01-01T00:00:00.000000000.
+
+    Args:
+        timestamp_str: Timestamp without a zone offset, with up to nine fractional second digits.
+
+    Returns:
+        Nanoseconds elapsed since the epoch.
+
+    Raises:
+        ValueError: If a zone offset is present or the string does not match the expected format.
+    """
+    if match := ISO_TIMESTAMP_NANO.fullmatch(timestamp_str):
+        # Python datetime does not have native nanoseconds support,
+        # hence the sub-microsecond digits are extracted and added manually.
+        # Group 3 carries the digits after the sixth fractional digit; right-pad
+        # it to three digits so ".1234567" contributes 700 ns rather than 7 ns.
+        ns_str = (match.group(3) or "").ljust(3, "0")
+        ms_str = match.group(2) or ""
+        timestamp_str_without_ns_str = match.group(1) + ms_str
+        return datetime_to_nanos(datetime.fromisoformat(timestamp_str_without_ns_str)) + int(ns_str)
+    if ISO_TIMESTAMPTZ_NANO.fullmatch(timestamp_str):
+        # When we can match a timestamp with a zone, we can give a more specific error
+        raise ValueError(f"Zone offset provided, but not expected: {timestamp_str}")
+    raise ValueError(f"Invalid timestamp without zone: {timestamp_str} (must be ISO-8601)")
+
+
+def timestamptz_to_nanos(timestamptz_str: str) -> int:
+    """Convert an ISO-8601 formatted timestamp with zone to nanoseconds from 1970-01-01T00:00:00.000000000+00:00.
+
+    Args:
+        timestamptz_str: Timestamp with a zone offset, with up to nine fractional second digits.
+
+    Returns:
+        Nanoseconds elapsed since the epoch.
+
+    Raises:
+        ValueError: If the zone offset is missing or the string does not match the expected format.
+    """
+    if match := ISO_TIMESTAMPTZ_NANO.fullmatch(timestamptz_str):
+        # Python datetime does not have native nanoseconds support,
+        # hence the sub-microsecond digits are extracted and added manually.
+        # Group 3 carries the digits after the sixth fractional digit; right-pad
+        # it to three digits so ".1234567" contributes 700 ns rather than 7 ns.
+        ns_str = (match.group(3) or "").ljust(3, "0")
+        ms_str = match.group(2) or ""
+        timestamptz_str_without_ns_str = match.group(1) + ms_str + match.group(4)
+        return datetime_to_nanos(datetime.fromisoformat(timestamptz_str_without_ns_str)) + int(ns_str)
+    if ISO_TIMESTAMP_NANO.fullmatch(timestamptz_str):
+        # When we can match a timestamp without a zone, we can give a more specific error.
+        # This must test the zone-less pattern: the zoned pattern already failed above.
+        raise ValueError(f"Missing zone offset: {timestamptz_str} (must be ISO-8601)")
+    raise ValueError(f"Invalid timestamp with zone: {timestamptz_str} (must be ISO-8601)")
+
+
def datetime_to_millis(dt: datetime) -> int:
"""Convert a datetime to milliseconds from 1970-01-01T00:00:00.000000."""
if dt.tzinfo:
@@ -184,3 +239,43 @@ def days_to_years(days: int) -> int:
def micros_to_years(micros: int) -> int:
return micros_to_timestamp(micros).year - EPOCH_TIMESTAMP.year
+
+
+def nanos_to_timestamp(nanos: int) -> datetime:
+ """Convert nanoseconds from epoch to a microsecond timestamp."""
+ dt = timedelta(microseconds=nanos_to_micros(nanos))
+ return EPOCH_TIMESTAMP + dt
+
+
+def nanos_to_years(nanos: int) -> int:
+ return nanos_to_timestamp(nanos).year - EPOCH_TIMESTAMP.year
+
+
+def nanos_to_months(nanos: int) -> int:
+ dt = nanos_to_timestamp(nanos)
+ return (dt.year - EPOCH_TIMESTAMP.year) * 12 + (dt.month -
EPOCH_TIMESTAMP.month)
+
+
+def nanos_to_days(nanos: int) -> int:
+ """Convert a timestamp in nanoseconds to a date in days."""
+ return timedelta(microseconds=nanos // 1000).days
+
+
+def nanos_to_time(nanos: int) -> time:
+ """Convert a timestamp in nanoseconds to a microsecond precision time."""
+ micros = nanos_to_micros(nanos)
+ micros, microseconds = divmod(micros, 1000000)
+ micros, seconds = divmod(micros, 60)
+ micros, minutes = divmod(micros, 60)
+ hours = micros
+ return time(hour=hours, minute=minutes, second=seconds,
microsecond=microseconds)
+
+
+def nanos_to_hours(nanos: int) -> int:
+    """Convert a timestamp in nanoseconds to hours from 1970-01-01T00:00.
+
+    Floor division is used, so timestamps before the epoch round towards
+    the earlier hour.
+    """
+    # 3_600 seconds per hour * 1_000_000_000 nanoseconds per second
+    return nanos // 3_600_000_000_000
+
+
+def nanos_to_micros(nanos: int) -> int:
+ """Convert a nanoseconds timestamp to microsecond timestamp by dropping
precision."""
+ return nanos // 1000
diff --git a/pyiceberg/utils/schema_conversion.py
b/pyiceberg/utils/schema_conversion.py
index 17eb2051..6959380d 100644
--- a/pyiceberg/utils/schema_conversion.py
+++ b/pyiceberg/utils/schema_conversion.py
@@ -605,13 +605,17 @@ class
ConvertSchemaToAvro(SchemaVisitorPerPrimitiveType[AvroType]):
return {"type": "long", "logicalType": "time-micros"}
def visit_timestamp(self, timestamp_type: TimestampType) -> AvroType:
- # Iceberg only supports micro's
return {"type": "long", "logicalType": "timestamp-micros",
"adjust-to-utc": False}
+ def visit_timestamp_ns(self, timestamp_type: TimestampType) -> AvroType:
+ return {"type": "long", "logicalType": "timestamp-nanos",
"adjust-to-utc": False}
+
def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> AvroType:
- # Iceberg only supports micro's
return {"type": "long", "logicalType": "timestamp-micros",
"adjust-to-utc": True}
+ def visit_timestamptz_ns(self, timestamptz_type: TimestamptzType) ->
AvroType:
+ return {"type": "long", "logicalType": "timestamp-nanos",
"adjust-to-utc": True}
+
def visit_string(self, string_type: StringType) -> AvroType:
return "string"
diff --git a/tests/avro/test_reader.py b/tests/avro/test_reader.py
index c713201b..3fdd3bbd 100644
--- a/tests/avro/test_reader.py
+++ b/tests/avro/test_reader.py
@@ -35,7 +35,9 @@ from pyiceberg.avro.reader import (
StringReader,
StructReader,
TimeReader,
+ TimestampNanoReader,
TimestampReader,
+ TimestamptzNanoReader,
TimestamptzReader,
UnknownReader,
UUIDReader,
@@ -58,7 +60,9 @@ from pyiceberg.types import (
NestedField,
StringType,
StructType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UnknownType,
@@ -313,10 +317,18 @@ def test_timestamp_reader() -> None:
assert construct_reader(TimestampType()) == TimestampReader()
def test_timestamp_ns_reader() -> None:
    """construct_reader returns a TimestampNanoReader for TimestampNanoType."""
    reader = construct_reader(TimestampNanoType())
    assert reader == TimestampNanoReader()
+
+
def test_timestamptz_reader() -> None:
assert construct_reader(TimestamptzType()) == TimestamptzReader()
def test_timestamptz_ns_reader() -> None:
    """construct_reader returns a TimestamptzNanoReader for TimestamptzNanoType."""
    reader = construct_reader(TimestamptzNanoType())
    assert reader == TimestamptzNanoReader()
+
+
def test_string_reader() -> None:
assert construct_reader(StringType()) == StringReader()
diff --git a/tests/avro/test_writer.py b/tests/avro/test_writer.py
index 951be7e7..3114b97d 100644
--- a/tests/avro/test_writer.py
+++ b/tests/avro/test_writer.py
@@ -33,6 +33,8 @@ from pyiceberg.avro.writer import (
FloatWriter,
IntegerWriter,
StringWriter,
+ TimestampNanoWriter,
+ TimestamptzNanoWriter,
TimestamptzWriter,
TimestampWriter,
TimeWriter,
@@ -55,7 +57,9 @@ from pyiceberg.types import (
NestedField,
StringType,
StructType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UnknownType,
@@ -113,10 +117,18 @@ def test_timestamp_writer() -> None:
assert construct_writer(TimestampType()) == TimestampWriter()
def test_timestamp_ns_writer() -> None:
    """construct_writer returns a TimestampNanoWriter for TimestampNanoType."""
    writer = construct_writer(TimestampNanoType())
    assert writer == TimestampNanoWriter()
+
+
def test_timestamptz_writer() -> None:
assert construct_writer(TimestamptzType()) == TimestamptzWriter()
def test_timestamptz_ns_writer() -> None:
    """construct_writer returns a TimestamptzNanoWriter for TimestamptzNanoType."""
    writer = construct_writer(TimestamptzNanoType())
    assert writer == TimestamptzNanoWriter()
+
+
def test_string_writer() -> None:
assert construct_writer(StringType()) == StringWriter()
diff --git a/tests/conftest.py b/tests/conftest.py
index 1cbb3cfa..6444b7b2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -920,10 +920,9 @@ EXAMPLE_TABLE_METADATA_V3 = {
{"id": 1, "name": "x", "required": True, "type": "long"},
{"id": 2, "name": "y", "required": True, "type": "long",
"doc": "comment"},
{"id": 3, "name": "z", "required": True, "type": "long"},
- # TODO: Add unknown, timestamp(tz)_ns
{"id": 4, "name": "u", "required": True, "type": "unknown"},
- # {"id": 5, "name": "ns", "required": True, "type":
"timestamp_ns"},
- # {"id": 6, "name": "nstz", "required": True, "type":
"timestamptz_ns"},
+ {"id": 5, "name": "ns", "required": True, "type":
"timestamp_ns"},
+ {"id": 6, "name": "nstz", "required": True, "type":
"timestamptz_ns"},
],
},
],
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 1eb3500a..ee5f8a25 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -56,7 +56,6 @@ from pyiceberg.types import (
NestedField,
StringType,
TimestampType,
- UnknownType,
)
from pyiceberg.utils.concurrent import ExecutorFactory
@@ -979,39 +978,3 @@ def test_scan_with_datetime(catalog: Catalog) -> None:
df = table.scan(row_filter=LessThan("datetime", yesterday)).to_pandas()
assert len(df) == 0
-
-
-@pytest.mark.integration
-@pytest.mark.parametrize("catalog",
[pytest.lazy_fixture("session_catalog_hive")])
-def test_read_unknown_type(catalog: Catalog) -> None:
- identifier = "default.test_table_read_unknown_type"
- arrow_table = pa.Table.from_pydict(
- {
- "int": [1, 2],
- "string": ["a", "b"],
- "unknown": [None, None],
- },
- schema=pa.schema(
- [
- pa.field("int", pa.int32(), nullable=True),
- pa.field("string", pa.string(), nullable=True),
- pa.field("unknown", pa.null(), nullable=True),
- ],
- ),
- )
-
- try:
- catalog.drop_table(identifier)
- except NoSuchTableError:
- pass
-
- tbl = catalog.create_table(
- identifier,
- schema=arrow_table.schema,
- )
-
- tbl.append(arrow_table)
-
- assert tbl.schema().find_type("unknown") == UnknownType()
- result_table = tbl.scan().to_arrow()
- assert result_table["unknown"].to_pylist() == [None, None]
diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py
index d2ee5c31..a8410cff 100644
--- a/tests/table/test_metadata.py
+++ b/tests/table/test_metadata.py
@@ -48,8 +48,12 @@ from pyiceberg.types import (
LongType,
MapType,
NestedField,
+ PrimitiveType,
StringType,
StructType,
+ TimestampNanoType,
+ TimestamptzNanoType,
+ UnknownType,
)
@@ -765,3 +769,110 @@ def test_make_metadata_fresh() -> None:
)
assert actual.model_dump() == expected.model_dump()
+
+
def test_new_table_metadata_with_v3_schema() -> None:
    """new_table_metadata accepts nanosecond timestamp columns at format version 3.

    The ids supplied on the input schema, partition spec and sort order differ
    from the ones asserted on the resulting metadata, which is compared against
    a hand-built TableMetadataV3.
    """
    input_fields = [
        NestedField(field_id=10, name="foo", field_type=StringType(), required=False),
        NestedField(field_id=22, name="bar", field_type=IntegerType(), required=True),
        NestedField(field_id=33, name="baz", field_type=BooleanType(), required=False),
        NestedField(field_id=34, name="qux", field_type=TimestampNanoType(), required=False),
        NestedField(field_id=35, name="quux", field_type=TimestamptzNanoType(), required=False),
    ]
    schema = Schema(*input_fields, schema_id=10, identifier_field_ids=[22])

    bar_partition = PartitionField(source_id=22, field_id=1022, transform=IdentityTransform(), name="bar")
    partition_spec = PartitionSpec(bar_partition, spec_id=10)

    foo_sort = SortField(
        source_id=10,
        transform=IdentityTransform(),
        direction=SortDirection.ASC,
        null_order=NullOrder.NULLS_LAST,
    )
    sort_order = SortOrder(foo_sort, order_id=10)

    actual = new_table_metadata(
        schema=schema,
        partition_spec=partition_spec,
        sort_order=sort_order,
        location="s3://some_v1_location/",
        properties={"format-version": "3"},
    )

    # Expected objects: same columns, but with the ids the metadata is expected to carry.
    expected_fields = [
        NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
        NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True),
        NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False),
        NestedField(field_id=4, name="qux", field_type=TimestampNanoType(), required=False),
        NestedField(field_id=5, name="quux", field_type=TimestamptzNanoType(), required=False),
    ]
    expected_schema = Schema(*expected_fields, schema_id=0, identifier_field_ids=[2])

    expected_spec = PartitionSpec(
        PartitionField(source_id=2, field_id=1000, transform=IdentityTransform(), name="bar"),
    )

    expected_sort_order = SortOrder(
        SortField(
            source_id=1,
            transform=IdentityTransform(),
            direction=SortDirection.ASC,
            null_order=NullOrder.NULLS_LAST,
        ),
        order_id=1,
    )

    expected = TableMetadataV3(
        location="s3://some_v1_location/",
        # Generated values are copied from the actual result so they compare equal.
        table_uuid=actual.table_uuid,
        last_updated_ms=actual.last_updated_ms,
        last_column_id=5,
        schemas=[expected_schema],
        schema_=expected_schema,
        current_schema_id=0,
        partition_spec=[field.model_dump() for field in expected_spec.fields],
        partition_specs=[expected_spec],
        default_spec_id=0,
        last_partition_id=1000,
        properties={},
        current_snapshot_id=None,
        snapshots=[],
        snapshot_log=[],
        metadata_log=[],
        sort_orders=[expected_sort_order],
        default_sort_order_id=1,
        refs={},
        format_version=3,
    )

    assert actual.model_dump() == expected.model_dump()
    assert actual.schemas == [expected_schema]
    assert actual.partition_specs == [expected_spec]
    assert actual.sort_orders == [expected_sort_order]
+
+
@pytest.mark.parametrize(
    "field_type",
    [
        TimestampNanoType(),
        TimestamptzNanoType(),
        UnknownType(),
    ],
)
def test_new_table_metadata_format_v2_with_v3_schema_fails(field_type: PrimitiveType) -> None:
    """new_table_metadata raises ValueError when these types are used with format version 2."""
    qux_field = NestedField(field_id=34, name="qux", field_type=field_type, required=False)
    schema = Schema(qux_field, schema_id=10)

    qux_partition = PartitionField(source_id=34, field_id=1022, transform=IdentityTransform(), name="qux")
    partition_spec = PartitionSpec(qux_partition, spec_id=10)

    qux_sort = SortField(
        source_id=34,
        transform=IdentityTransform(),
        direction=SortDirection.ASC,
        null_order=NullOrder.NULLS_LAST,
    )
    sort_order = SortOrder(qux_sort, order_id=34)

    expected_error = f"{field_type} is only supported in 3 or higher. Current format version is: 2"
    with pytest.raises(ValueError, match=expected_error):
        new_table_metadata(
            schema=schema,
            partition_spec=partition_spec,
            sort_order=sort_order,
            location="s3://some_v1_location/",
            properties={"format-version": "2"},
        )
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 0eafb966..2ee0ba3d 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -43,6 +43,10 @@ Notes:
- Stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte
little-endian long
- 400000L is 0...110|00011010|10000000 in binary
- 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... ,
00000000 -> 0
+ TimestampNano:
+ - Stored as nanoseconds from 1970-01-01 00:00:00.000000000 in an
8-byte little-endian long
+ - 400000000L is 00010111|11010111|10000100|00000000 in binary
+ - 00000000 -> 0, 10000100 -> 132 (-124), 11010111 -> 215 (-41),
00010111 -> 23, ... , 00000000 -> 0
String:
- Stored as UTF-8 bytes (without length)
- 'A' -> 65, 'B' -> 66, 'C' -> 67
@@ -99,7 +103,9 @@ from pyiceberg.types import (
LongType,
PrimitiveType,
StringType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UUIDType,
@@ -266,6 +272,8 @@ def
test_partition_to_py_raise_on_incorrect_precision_or_scale(
(TimestamptzType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000),
(TimestampType(), b"\x80\x1a\x06\x00\x00\x00\x00\x00", 400000),
(TimestampType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000),
+ (TimestampNanoType(), b"\00\x84\xd7\x17\x00\x00\x00\x00", 400000000),
+ (TimestamptzNanoType(), b"\00\x84\xd7\x17\x00\x00\x00\x00", 400000000),
(StringType(), b"ABC", "ABC"),
(StringType(), b"foo", "foo"),
(
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
index 8987f8b1..d22c94cc 100644
--- a/tests/test_transforms.py
+++ b/tests/test_transforms.py
@@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=eval-used,protected-access,redefined-outer-name
-from datetime import date
+from datetime import date, datetime
from decimal import Decimal
from typing import Annotated, Any, Callable, Optional, Union
from uuid import UUID
@@ -24,6 +24,7 @@ from uuid import UUID
import mmh3 as mmh3
import pyarrow as pa
import pytest
+import pytz
from pydantic import (
BeforeValidator,
PlainSerializer,
@@ -103,7 +104,9 @@ from pyiceberg.types import (
NestedField,
PrimitiveType,
StringType,
+ TimestampNanoType,
TimestampType,
+ TimestamptzNanoType,
TimestamptzType,
TimeType,
UnknownType,
@@ -114,7 +117,9 @@ from pyiceberg.utils.datetime import (
date_to_days,
time_str_to_micros,
timestamp_to_micros,
+ timestamp_to_nanos,
timestamptz_to_micros,
+ timestamptz_to_nanos,
)
@@ -142,6 +147,26 @@ from pyiceberg.utils.datetime import (
("iceberg", StringType(), 1210000089),
(UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), UUIDType(), 1488055340),
(b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", UUIDType(), 1488055340),
+ (
+ timestamp_to_nanos("2017-11-16T22:31:08.000001"),
+ TimestampNanoType(),
+ -1207196810,
+ ),
+ (
+ timestamp_to_nanos("2017-11-16T22:31:08.000001001"),
+ TimestampNanoType(),
+ -1207196810,
+ ),
+ (
+ timestamptz_to_nanos("2017-11-16T14:31:08.000001-08:00"),
+ TimestamptzNanoType(),
+ -1207196810,
+ ),
+ (
+ timestamptz_to_nanos("2017-11-16T14:31:08.000001001-08:00"),
+ TimestamptzNanoType(),
+ -1207196810,
+ ),
],
)
def test_bucket_hash_values(test_input: Any, test_type: PrimitiveType,
expected: Any) -> None:
@@ -1603,7 +1628,7 @@ def test_ymd_pyarrow_transforms(
]
else:
with pytest.raises(ValueError):
-
transform.pyarrow_transform(DateType())(arrow_table_date_timestamps[source_col])
+
transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col])
@pytest.mark.parametrize(
@@ -1629,6 +1654,38 @@ def test_bucket_pyarrow_transforms(
assert expected == transform.pyarrow_transform(source_type)(input_arr)
+# pyiceberg_core currently does not support bucket transform on timestamp_ns
and timestamptz_ns
+# https://github.com/apache/iceberg-rust/issues/1110
@pytest.mark.parametrize(
    "source_type, input_arr, num_buckets",
    [
        (
            TimestampNanoType(),
            pa.array(
                [datetime(1970, 1, 1, 0, 0, 0), datetime(2025, 2, 26, 1, 2, 3)],
                type=pa.timestamp(unit="ns"),
            ),
            10,
        ),
        (
            TimestamptzNanoType(),
            pa.array(
                [datetime(1970, 1, 1, 0, 0, 0), datetime(2025, 2, 26, 1, 2, 3)],
                type=pa.timestamp(unit="ns", tz=pytz.timezone("Etc/GMT+10")),
            ),
            10,
        ),
    ],
)
def test_unsupported_bucket_pyarrow_transform(
    source_type: PrimitiveType,
    input_arr: Union[pa.Array, pa.ChunkedArray],
    num_buckets: int,
) -> None:
    """Bucketing a nanosecond timestamp array through pyarrow_transform raises ValueError."""
    bucket: Transform[Any, Any] = BucketTransform(num_buckets=num_buckets)
    with pytest.raises(ValueError) as exc_info:
        bucket.pyarrow_transform(source_type)(input_arr)

    error_text = str(exc_info.value)
    assert "FeatureUnsupported => Unsupported data type for bucket transform" in error_text
+
+
@pytest.mark.parametrize(
"source_type, input_arr, expected, width",
[
diff --git a/tests/utils/test_datetime.py b/tests/utils/test_datetime.py
index ac7ba545..6f6f4a91 100644
--- a/tests/utils/test_datetime.py
+++ b/tests/utils/test_datetime.py
@@ -14,12 +14,21 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-from datetime import datetime, timezone, tzinfo
+from datetime import datetime, time, timezone, tzinfo
import pytest
import pytz
-from pyiceberg.utils.datetime import datetime_to_millis, millis_to_datetime
+from pyiceberg.utils.datetime import (
+ datetime_to_millis,
+ datetime_to_nanos,
+ millis_to_datetime,
+ nanos_to_micros,
+ time_str_to_nanos,
+ time_to_nanos,
+ timestamp_to_nanos,
+ timestamptz_to_nanos,
+)
timezones = [
pytz.timezone("Etc/GMT"),
@@ -71,3 +80,55 @@ def test_datetime_tz_to_millis(tz: tzinfo) -> None:
def test_millis_to_datetime() -> None:
assert millis_to_datetime(1690971805918) == datetime(2023, 8, 2, 10, 23,
25, 918000)
+
+
@pytest.mark.parametrize(
    "time_str, nanos",
    [
        ("00:00:00+00:00", 0),
        ("20:21:44.375612-05:00", 73304375612000),
    ],
)
def test_time_str_to_nanos(time_str: str, nanos: int) -> None:
    """time_str_to_nanos returns the expected nanosecond value for each time string."""
    actual = time_str_to_nanos(time_str)
    assert actual == nanos
+
+
@pytest.mark.parametrize(
    "time_, nanos",
    [
        (time(0, 0, 0), 0),
        (time(20, 21, 44, 375612, tzinfo=pytz.timezone("Etc/GMT-5")), 73304375612000),
    ],
)
def test_time_to_nanos(time_: time, nanos: int) -> None:
    """time_to_nanos returns the expected nanosecond value for each time object."""
    actual = time_to_nanos(time_)
    assert actual == nanos
+
+
@pytest.mark.parametrize(
    "datetime_, nanos",
    [
        (datetime(1970, 1, 1, 0, 0, 0), 0),
        (
            datetime(2025, 2, 23, 20, 21, 44, 375612, tzinfo=pytz.timezone("Etc/GMT-5")),
            1740324104375612000,
        ),
    ],
)
def test_datetime_to_nanos(datetime_: datetime, nanos: int) -> None:
    """datetime_to_nanos returns the expected nanosecond value for each datetime."""
    actual = datetime_to_nanos(datetime_)
    assert actual == nanos
+
+
@pytest.mark.parametrize(
    "timestamp, nanos",
    [
        ("1970-01-01T00:00:00", 0),
        ("2025-02-23T20:21:44.375612", 1740342104375612000),
        # Nine fractional digits: the trailing nanosecond must be preserved.
        ("2025-02-23T20:21:44.375612001", 1740342104375612001),
    ],
)
def test_timestamp_to_nanos(timestamp: str, nanos: int) -> None:
    """timestamp_to_nanos returns the expected nanosecond value for each zone-less string."""
    actual = timestamp_to_nanos(timestamp)
    assert actual == nanos
+
+
@pytest.mark.parametrize(
    "timestamp, nanos",
    [
        ("1970-01-01T00:00:00+00:00", 0),
        ("2025-02-23T16:21:44.375612-04:00", 1740342104375612000),
        # Nine fractional digits: the trailing nanosecond must be preserved.
        ("2025-02-23T16:21:44.375612001-04:00", 1740342104375612001),
    ],
)
def test_timestamptz_to_nanos(timestamp: str, nanos: int) -> None:
    """timestamptz_to_nanos returns the expected nanosecond value for each offset string."""
    actual = timestamptz_to_nanos(timestamp)
    assert actual == nanos
+
+
@pytest.mark.parametrize(
    "nanos, micros",
    [
        (1510871468000001001, 1510871468000001),
        # Negative input: the expected value is rounded toward negative infinity.
        (-1510871468000001001, -1510871468000002),
    ],
)
def test_nanos_to_micros(nanos: int, micros: int) -> None:
    """nanos_to_micros returns the expected microsecond value for each input."""
    actual = nanos_to_micros(nanos)
    assert actual == micros