This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new c06e3203 Add JSON single value encoding (#1805)
c06e3203 is described below
commit c06e32034fbd32d5aab9b2ef19ad2d253141479d
Author: Fokko Driesprong <[email protected]>
AuthorDate: Wed Mar 19 12:37:37 2025 +0100
Add JSON single value encoding (#1805)
# Rationale for this change
Adds support for encoding built-in Python values into JSON single values,
and for decoding them back, according to the spec:
https://iceberg.apache.org/spec/#json-single-value-serialization
# Are these changes tested?
Yes, with both expected/actual and round-trip tests.
# Are there any user-facing changes?
No
<!-- In the case of user-facing changes, please add the changelog label.
-->
---
pyiceberg/conversions.py | 219 +++++++++++++++++++++++++++++++++++++++++++++-
tests/test_conversions.py | 49 +++++++++++
2 files changed, 266 insertions(+), 2 deletions(-)
diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py
index 7c1455d4..ed5c6f7a 100644
--- a/pyiceberg/conversions.py
+++ b/pyiceberg/conversions.py
@@ -20,6 +20,7 @@ This module enables:
- Converting partition strings to built-in python objects.
- Converting a value to a byte buffer.
- Converting a byte buffer to a value.
+ - Converting a value to and from its JSON single-value serialization.
Note:
Conversion logic varies based on the PrimitiveType implementation.
Therefore conversion functions
@@ -28,6 +29,7 @@ Note:
implementations that share the same conversion logic, registrations can be
stacked.
"""
+import codecs
import uuid
from datetime import date, datetime, time
from decimal import Decimal
@@ -60,7 +62,23 @@ from pyiceberg.types import (
UUIDType,
strtobool,
)
-from pyiceberg.utils.datetime import date_to_days, datetime_to_micros,
time_to_micros
+from pyiceberg.utils.datetime import (
+ date_str_to_days,
+ date_to_days,
+ datetime_to_micros,
+ days_to_date,
+ micros_to_time,
+ micros_to_timestamp,
+ micros_to_timestamptz,
+ time_str_to_micros,
+ time_to_micros,
+ timestamp_to_micros,
+ timestamptz_to_micros,
+ to_human_day,
+ to_human_time,
+ to_human_timestamp,
+ to_human_timestamptz,
+)
from pyiceberg.utils.decimal import decimal_to_bytes, unscaled_to_decimal
_BOOL_STRUCT = Struct("<?")
@@ -283,7 +301,7 @@ def from_bytes(primitive_type: PrimitiveType, b: bytes) ->
L: # type: ignore
primitive_type (PrimitiveType): An implementation of the PrimitiveType
base class.
b (bytes): The bytes to convert.
"""
- raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not
supported: {str(b)}")
+ raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not
supported: {b!r}")
@from_bytes.register(BooleanType)
@@ -336,3 +354,200 @@ def _(primitive_type: DecimalType, buf: bytes) -> Decimal:
@from_bytes.register(UnknownType)
def _(type_: UnknownType, buf: bytes) -> None:
return None
+
+
+@singledispatch # type: ignore
+def to_json(primitive_type: PrimitiveType, val: Any) -> L: # type: ignore
+ """Convert built-in python values into JSON value types.
+
+ https://iceberg.apache.org/spec/#json-single-value-serialization
+
+ Args:
+ primitive_type (PrimitiveType): An implementation of the PrimitiveType
base class.
+ val (Any): The arbitrary built-in value to convert into the right form
+ """
+ raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not
supported: {val}")
+
+
+@to_json.register(BooleanType)
+def _(_: BooleanType, val: bool) -> bool:
+ """Python bool automatically converts into a JSON bool."""
+ return val
+
+
+@to_json.register(IntegerType)
+@to_json.register(LongType)
+def _(_: Union[IntegerType, LongType], val: int) -> int:
+ """Python int automatically converts to a JSON int."""
+ return val
+
+
+@to_json.register(DateType)
+def _(_: DateType, val: Union[date, int]) -> str:
+ """JSON date is string encoded."""
+ if isinstance(val, date):
+ val = date_to_days(val)
+ return to_human_day(val)
+
+
+@to_json.register(TimeType)
+def _(_: TimeType, val: Union[int, time]) -> str:
+ """Python time or microseconds since midnight serializes into an ISO8601
time."""
+ if isinstance(val, time):
+ val = time_to_micros(val)
+ return to_human_time(val)
+
+
+@to_json.register(TimestampType)
+def _(_: PrimitiveType, val: Union[int, datetime]) -> str:
+ """Python datetime (without timezone) or microseconds since epoch
serializes into an ISO8601 timestamp."""
+ if isinstance(val, datetime):
+ val = datetime_to_micros(val)
+
+ return to_human_timestamp(val)
+
+
+@to_json.register(TimestamptzType)
+def _(_: TimestamptzType, val: Union[int, datetime]) -> str:
+ """Python datetime (with timezone) or microseconds since epoch serializes
into an ISO8601 timestamp."""
+ if isinstance(val, datetime):
+ val = datetime_to_micros(val)
+ return to_human_timestamptz(val)
+
+
+@to_json.register(FloatType)
+@to_json.register(DoubleType)
+def _(_: Union[FloatType, DoubleType], val: float) -> float:
+ """Float serializes into JSON float."""
+ return val
+
+
+@to_json.register(StringType)
+def _(_: StringType, val: str) -> str:
+ """Python string serializes into JSON string."""
+ return val
+
+
+@to_json.register(FixedType)
+def _(t: FixedType, b: bytes) -> str:
+ """Python bytes serializes into hexadecimal encoded string."""
+ if len(t) != len(b):
+ raise ValueError(f"FixedType has length {len(t)}, which is different
from the value: {len(b)}")
+
+ return codecs.encode(b, "hex").decode(UTF8)
+
+
+@to_json.register(BinaryType)
+def _(_: BinaryType, b: bytes) -> str:
+ """Python bytes serializes into hexadecimal encoded string."""
+ return codecs.encode(b, "hex").decode(UTF8)
+
+
+@to_json.register(DecimalType)
+def _(_: DecimalType, val: Decimal) -> str:
+ """Python decimal serializes into string.
+
+ Stores the string representation of the decimal value, specifically, for
+ values with a positive scale, the number of digits to the right of the
+ decimal point is used to indicate scale, for values with a negative scale,
+ the scientific notation is used and the exponent must equal the negated
scale.
+ """
+ return str(val)
+
+
+@to_json.register(UUIDType)
+def _(_: UUIDType, val: uuid.UUID) -> str:
+ """Serialize into a JSON string."""
+ return str(val)
+
+
+@singledispatch # type: ignore
+def from_json(primitive_type: PrimitiveType, val: Any) -> L: # type: ignore
+ """Convert JSON value types into built-in python values.
+
+ https://iceberg.apache.org/spec/#json-single-value-serialization
+
+ Args:
+ primitive_type (PrimitiveType): An implementation of the PrimitiveType
base class.
+ val (Any): The arbitrary JSON value to convert into the right form
+ """
+ raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not
supported: {str(val)}")
+
+
+@from_json.register(BooleanType)
+def _(_: BooleanType, val: bool) -> bool:
+ """JSON bool automatically converts into a Python bool."""
+ return val
+
+
+@from_json.register(IntegerType)
+@from_json.register(LongType)
+def _(_: Union[IntegerType, LongType], val: int) -> int:
+ """JSON int automatically converts to a Python int."""
+ return val
+
+
+@from_json.register(DateType)
+def _(_: DateType, val: str) -> date:
+ """JSON date is string encoded."""
+ return days_to_date(date_str_to_days(val))
+
+
+@from_json.register(TimeType)
+def _(_: TimeType, val: str) -> time:
+ """JSON ISO8601 string into Python time."""
+ return micros_to_time(time_str_to_micros(val))
+
+
+@from_json.register(TimestampType)
+def _(_: PrimitiveType, val: str) -> datetime:
+ """JSON ISO8601 string into Python datetime."""
+ return micros_to_timestamp(timestamp_to_micros(val))
+
+
+@from_json.register(TimestamptzType)
+def _(_: TimestamptzType, val: str) -> datetime:
+ """JSON ISO8601 string into Python datetime."""
+ return micros_to_timestamptz(timestamptz_to_micros(val))
+
+
+@from_json.register(FloatType)
+@from_json.register(DoubleType)
+def _(_: Union[FloatType, DoubleType], val: float) -> float:
+ """JSON float deserializes into a Python float."""
+ return val
+
+
+@from_json.register(StringType)
+def _(_: StringType, val: str) -> str:
+ """JSON string deserializes into a Python string."""
+ return val
+
+
+@from_json.register(FixedType)
+def _(t: FixedType, val: str) -> bytes:
+ """JSON hexadecimal encoded string into bytes."""
+ b = codecs.decode(val.encode(UTF8), "hex")
+
+ if len(t) != len(b):
+ raise ValueError(f"FixedType has length {len(t)}, which is different
from the value: {len(b)}")
+
+ return b
+
+
+@from_json.register(BinaryType)
+def _(_: BinaryType, val: str) -> bytes:
+ """JSON hexadecimal encoded string into bytes."""
+ return codecs.decode(val.encode(UTF8), "hex")
+
+
+@from_json.register(DecimalType)
+def _(_: DecimalType, val: str) -> Decimal:
+ """Convert JSON string into a Python Decimal."""
+ return Decimal(val)
+
+
+@from_json.register(UUIDType)
+def _(_: UUIDType, val: str) -> uuid.UUID:
+ """Convert JSON string into Python UUID."""
+ return uuid.UUID(val)
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index f57998aa..0eafb966 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -545,3 +545,52 @@ def test_datetime_obj_to_bytes(primitive_type:
PrimitiveType, value: Union[datet
bytes_from_value = conversions.to_bytes(primitive_type, value)
assert bytes_from_value == expected_bytes
+
+
[email protected](
+ "primitive_type, value, expected",
+ [
+ (BooleanType(), True, True),
+ (IntegerType(), 34, 34),
+ (LongType(), 34, 34),
+ (FloatType(), 1.0, 1.0),
+ (DoubleType(), 1.0, 1.0),
+ (DecimalType(9, 4), Decimal("123.4500"), "123.4500"),
+ (DecimalType(9, 0), Decimal("2"), "2"),
+ (DecimalType(9, -20), Decimal("2E+20"), "2E+20"),
+ (DateType(), date(2017, 11, 16), "2017-11-16"),
+ (TimeType(), time(22, 31, 8, 123456), "22:31:08.123456"),
+ (TimestampType(), datetime(2017, 11, 16, 22, 31, 8, 123456),
"2017-11-16T22:31:08.123456"),
+ (TimestamptzType(), datetime(2017, 11, 16, 22, 31, 8, 123456,
tzinfo=timezone.utc), "2017-11-16T22:31:08.123456+00:00"),
+ (StringType(), "iceberg", "iceberg"),
+ (BinaryType(), b"\x01\x02\x03\xff", "010203ff"),
+ (FixedType(4), b"\x01\x02\x03\xff", "010203ff"),
+ ],
+)
+def test_json_single_serialization(primitive_type: PrimitiveType, value: Any,
expected: Any) -> None:
+ json_val = conversions.to_json(primitive_type, value)
+ assert json_val == expected
+
+
[email protected](
+ "primitive_type, value",
+ [
+ (BooleanType(), True),
+ (IntegerType(), 34),
+ (LongType(), 34),
+ (FloatType(), 1.0),
+ (DoubleType(), 1.0),
+ (DecimalType(9, 4), Decimal("123.4500")),
+ (DecimalType(9, 0), Decimal("2")),
+ (DecimalType(9, -20), Decimal("2E+20")),
+ (DateType(), date(2017, 11, 16)),
+ (TimeType(), time(22, 31, 8, 123456)),
+ (TimestampType(), datetime(2017, 11, 16, 22, 31, 8, 123456)),
+ (TimestamptzType(), datetime(2017, 11, 16, 22, 31, 8, 123456,
tzinfo=timezone.utc)),
+ (StringType(), "iceberg"),
+ (BinaryType(), b"\x01\x02\x03\xff"),
+ (FixedType(4), b"\x01\x02\x03\xff"),
+ ],
+)
+def test_json_serialize_roundtrip(primitive_type: PrimitiveType, value: Any)
-> None:
+ assert value == conversions.from_json(primitive_type,
conversions.to_json(primitive_type, value))