This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new c06e3203 Add JSON single value encoding (#1805)
c06e3203 is described below

commit c06e32034fbd32d5aab9b2ef19ad2d253141479d
Author: Fokko Driesprong <[email protected]>
AuthorDate: Wed Mar 19 12:37:37 2025 +0100

    Add JSON single value encoding (#1805)
    
    # Rationale for this change
    
    Adds support for encoding built-in Python values into JSON encoded
    values, according to the spec:
    https://iceberg.apache.org/spec/#json-single-value-serialization
    
    # Are these changes tested?
    
    Yes, with both expected/actual and round-trip tests.
    
    # Are there any user-facing changes?
    
    No
    <!-- In the case of user-facing changes, please add the changelog label.
    -->
---
 pyiceberg/conversions.py  | 219 +++++++++++++++++++++++++++++++++++++++++++++-
 tests/test_conversions.py |  49 +++++++++++
 2 files changed, 266 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py
index 7c1455d4..ed5c6f7a 100644
--- a/pyiceberg/conversions.py
+++ b/pyiceberg/conversions.py
@@ -20,6 +20,7 @@ This module enables:
     - Converting partition strings to built-in python objects.
     - Converting a value to a byte buffer.
     - Converting a byte buffer to a value.
+    - Converting a value to and from its JSON single-value serialization.
 
 Note:
     Conversion logic varies based on the PrimitiveType implementation. 
Therefore conversion functions
@@ -28,6 +29,7 @@ Note:
     implementations that share the same conversion logic, registrations can be 
stacked.
 """
 
+import codecs
 import uuid
 from datetime import date, datetime, time
 from decimal import Decimal
@@ -60,7 +62,23 @@ from pyiceberg.types import (
     UUIDType,
     strtobool,
 )
-from pyiceberg.utils.datetime import date_to_days, datetime_to_micros, 
time_to_micros
+from pyiceberg.utils.datetime import (
+    date_str_to_days,
+    date_to_days,
+    datetime_to_micros,
+    days_to_date,
+    micros_to_time,
+    micros_to_timestamp,
+    micros_to_timestamptz,
+    time_str_to_micros,
+    time_to_micros,
+    timestamp_to_micros,
+    timestamptz_to_micros,
+    to_human_day,
+    to_human_time,
+    to_human_timestamp,
+    to_human_timestamptz,
+)
 from pyiceberg.utils.decimal import decimal_to_bytes, unscaled_to_decimal
 
 _BOOL_STRUCT = Struct("<?")
@@ -283,7 +301,7 @@ def from_bytes(primitive_type: PrimitiveType, b: bytes) -> 
L:  # type: ignore
         primitive_type (PrimitiveType): An implementation of the PrimitiveType 
base class.
         b (bytes): The bytes to convert.
     """
-    raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not 
supported: {str(b)}")
+    raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not 
supported: {b!r}")
 
 
 @from_bytes.register(BooleanType)
@@ -336,3 +354,200 @@ def _(primitive_type: DecimalType, buf: bytes) -> Decimal:
 @from_bytes.register(UnknownType)
 def _(type_: UnknownType, buf: bytes) -> None:
     return None
+
+
+@singledispatch  # type: ignore
+def to_json(primitive_type: PrimitiveType, val: Any) -> L:  # type: ignore
+    """Convert built-in python values into JSON value types.
+
+    https://iceberg.apache.org/spec/#json-single-value-serialization
+
+    Args:
+        primitive_type (PrimitiveType): An implementation of the PrimitiveType 
base class.
+        val (Any): The arbitrary built-in value to convert into the right form
+    """
+    raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not 
supported: {val}")
+
+
+@to_json.register(BooleanType)
+def _(_: BooleanType, val: bool) -> bool:
+    """Python bool automatically converts into a JSON bool."""
+    return val
+
+
+@to_json.register(IntegerType)
+@to_json.register(LongType)
+def _(_: Union[IntegerType, LongType], val: int) -> int:
+    """Python int automatically converts to a JSON int."""
+    return val
+
+
+@to_json.register(DateType)
+def _(_: DateType, val: Union[date, int]) -> str:
+    """JSON date is string encoded."""
+    if isinstance(val, date):
+        val = date_to_days(val)
+    return to_human_day(val)
+
+
+@to_json.register(TimeType)
+def _(_: TimeType, val: Union[int, time]) -> str:
+    """Python time or microseconds since midnight serializes into an ISO8601 time."""
+    if isinstance(val, time):
+        val = time_to_micros(val)
+    return to_human_time(val)
+
+
+@to_json.register(TimestampType)
+def _(_: PrimitiveType, val: Union[int, datetime]) -> str:
+    """Python datetime (without timezone) or microseconds since epoch 
serializes into an ISO8601 timestamp."""
+    if isinstance(val, datetime):
+        val = datetime_to_micros(val)
+
+    return to_human_timestamp(val)
+
+
+@to_json.register(TimestamptzType)
+def _(_: TimestamptzType, val: Union[int, datetime]) -> str:
+    """Python datetime (with timezone) or microseconds since epoch serializes 
into an ISO8601 timestamp."""
+    if isinstance(val, datetime):
+        val = datetime_to_micros(val)
+    return to_human_timestamptz(val)
+
+
+@to_json.register(FloatType)
+@to_json.register(DoubleType)
+def _(_: Union[FloatType, DoubleType], val: float) -> float:
+    """Float serializes into JSON float."""
+    return val
+
+
+@to_json.register(StringType)
+def _(_: StringType, val: str) -> str:
+    """Python string serializes into JSON string."""
+    return val
+
+
+@to_json.register(FixedType)
+def _(t: FixedType, b: bytes) -> str:
+    """Python bytes serializes into hexadecimal encoded string."""
+    if len(t) != len(b):
+        raise ValueError(f"FixedType has length {len(t)}, which is different 
from the value: {len(b)}")
+
+    return codecs.encode(b, "hex").decode(UTF8)
+
+
+@to_json.register(BinaryType)
+def _(_: BinaryType, b: bytes) -> str:
+    """Python bytes serializes into hexadecimal encoded string."""
+    return codecs.encode(b, "hex").decode(UTF8)
+
+
+@to_json.register(DecimalType)
+def _(_: DecimalType, val: Decimal) -> str:
+    """Python decimal serializes into string.
+
+    Stores the string representation of the decimal value, specifically, for
+    values with a positive scale, the number of digits to the right of the
+    decimal point is used to indicate scale, for values with a negative scale,
+    the scientific notation is used and the exponent must equal the negated 
scale.
+    """
+    return str(val)
+
+
+@to_json.register(UUIDType)
+def _(_: UUIDType, val: uuid.UUID) -> str:
+    """Serialize into a JSON string."""
+    return str(val)
+
+
+@singledispatch  # type: ignore
+def from_json(primitive_type: PrimitiveType, val: Any) -> L:  # type: ignore
+    """Convert JSON value types into built-in python values.
+
+    https://iceberg.apache.org/spec/#json-single-value-serialization
+
+    Args:
+        primitive_type (PrimitiveType): An implementation of the PrimitiveType 
base class.
+        val (Any): The arbitrary JSON value to convert into the right form
+    """
+    raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not 
supported: {str(val)}")
+
+
+@from_json.register(BooleanType)
+def _(_: BooleanType, val: bool) -> bool:
+    """JSON bool automatically converts into a Python bool."""
+    return val
+
+
+@from_json.register(IntegerType)
+@from_json.register(LongType)
+def _(_: Union[IntegerType, LongType], val: int) -> int:
+    """JSON int automatically converts to a Python int."""
+    return val
+
+
+@from_json.register(DateType)
+def _(_: DateType, val: str) -> date:
+    """JSON date is string encoded."""
+    return days_to_date(date_str_to_days(val))
+
+
+@from_json.register(TimeType)
+def _(_: TimeType, val: str) -> time:
+    """JSON ISO8601 string into Python time."""
+    return micros_to_time(time_str_to_micros(val))
+
+
+@from_json.register(TimestampType)
+def _(_: PrimitiveType, val: str) -> datetime:
+    """JSON ISO8601 string into Python datetime."""
+    return micros_to_timestamp(timestamp_to_micros(val))
+
+
+@from_json.register(TimestamptzType)
+def _(_: TimestamptzType, val: str) -> datetime:
+    """JSON ISO8601 string into Python datetime."""
+    return micros_to_timestamptz(timestamptz_to_micros(val))
+
+
+@from_json.register(FloatType)
+@from_json.register(DoubleType)
+def _(_: Union[FloatType, DoubleType], val: float) -> float:
+    """JSON float deserializes into a Python float."""
+    return val
+
+
+@from_json.register(StringType)
+def _(_: StringType, val: str) -> str:
+    """JSON string deserializes into a Python string."""
+    return val
+
+
+@from_json.register(FixedType)
+def _(t: FixedType, val: str) -> bytes:
+    """JSON hexadecimal encoded string into bytes."""
+    b = codecs.decode(val.encode(UTF8), "hex")
+
+    if len(t) != len(b):
+        raise ValueError(f"FixedType has length {len(t)}, which is different 
from the value: {len(b)}")
+
+    return b
+
+
+@from_json.register(BinaryType)
+def _(_: BinaryType, val: str) -> bytes:
+    """JSON hexadecimal encoded string into bytes."""
+    return codecs.decode(val.encode(UTF8), "hex")
+
+
+@from_json.register(DecimalType)
+def _(_: DecimalType, val: str) -> Decimal:
+    """Convert JSON string into a Python Decimal."""
+    return Decimal(val)
+
+
+@from_json.register(UUIDType)
+def _(_: UUIDType, val: str) -> uuid.UUID:
+    """Convert JSON string into Python UUID."""
+    return uuid.UUID(val)
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index f57998aa..0eafb966 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -545,3 +545,52 @@ def test_datetime_obj_to_bytes(primitive_type: 
PrimitiveType, value: Union[datet
     bytes_from_value = conversions.to_bytes(primitive_type, value)
 
     assert bytes_from_value == expected_bytes
+
+
+@pytest.mark.parametrize(
+    "primitive_type, value, expected",
+    [
+        (BooleanType(), True, True),
+        (IntegerType(), 34, 34),
+        (LongType(), 34, 34),
+        (FloatType(), 1.0, 1.0),
+        (DoubleType(), 1.0, 1.0),
+        (DecimalType(9, 4), Decimal("123.4500"), "123.4500"),
+        (DecimalType(9, 0), Decimal("2"), "2"),
+        (DecimalType(9, -20), Decimal("2E+20"), "2E+20"),
+        (DateType(), date(2017, 11, 16), "2017-11-16"),
+        (TimeType(), time(22, 31, 8, 123456), "22:31:08.123456"),
+        (TimestampType(), datetime(2017, 11, 16, 22, 31, 8, 123456), 
"2017-11-16T22:31:08.123456"),
+        (TimestamptzType(), datetime(2017, 11, 16, 22, 31, 8, 123456, 
tzinfo=timezone.utc), "2017-11-16T22:31:08.123456+00:00"),
+        (StringType(), "iceberg", "iceberg"),
+        (BinaryType(), b"\x01\x02\x03\xff", "010203ff"),
+        (FixedType(4), b"\x01\x02\x03\xff", "010203ff"),
+    ],
+)
+def test_json_single_serialization(primitive_type: PrimitiveType, value: Any, 
expected: Any) -> None:
+    json_val = conversions.to_json(primitive_type, value)
+    assert json_val == expected
+
+
+@pytest.mark.parametrize(
+    "primitive_type, value",
+    [
+        (BooleanType(), True),
+        (IntegerType(), 34),
+        (LongType(), 34),
+        (FloatType(), 1.0),
+        (DoubleType(), 1.0),
+        (DecimalType(9, 4), Decimal("123.4500")),
+        (DecimalType(9, 0), Decimal("2")),
+        (DecimalType(9, -20), Decimal("2E+20")),
+        (DateType(), date(2017, 11, 16)),
+        (TimeType(), time(22, 31, 8, 123456)),
+        (TimestampType(), datetime(2017, 11, 16, 22, 31, 8, 123456)),
+        (TimestamptzType(), datetime(2017, 11, 16, 22, 31, 8, 123456, 
tzinfo=timezone.utc)),
+        (StringType(), "iceberg"),
+        (BinaryType(), b"\x01\x02\x03\xff"),
+        (FixedType(4), b"\x01\x02\x03\xff"),
+    ],
+)
+def test_json_serialize_roundtrip(primitive_type: PrimitiveType, value: Any) 
-> None:
+    assert value == conversions.from_json(primitive_type, 
conversions.to_json(primitive_type, value))

Reply via email to