This is an automated email from the ASF dual-hosted git repository.
Fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 75cd77fda Fix precision loss in large integral string conversions
(#3405)
75cd77fda is described below
commit 75cd77fda94a6bdd242c8d892d2bae5db595653c
Author: Minh Vu <[email protected]>
AuthorDate: Mon May 25 22:03:47 2026 +0200
Fix precision loss in large integral string conversions (#3405)
## Summary
Fixes precision loss when converting large integral strings in two
runtime paths:
- `StringLiteral.to(IntegerType/LongType)`
- `partition_to_py(...)` for integral and time-based partition values
backed by integers
## Root cause
Both paths converted the string through `float` before converting to `int`,
which loses precision for integers outside the range in which an IEEE-754
double represents every integer exactly (magnitudes above 2^53).
That caused valid 64-bit integers such as `LongType.max` and
`9007199254740993` (2^53 + 1) to be corrupted.
## What changed
- Replaced `int(float(...))` with exact integer parsing in
`partition_to_py`
- For `StringLiteral.to(IntegerType/LongType)`, numeric strings are now
parsed with `Decimal` instead of `float`, so integral values convert
exactly while fractional numeric strings retain the existing truncation
behavior
- Added regression tests for `LongType.max` and `9007199254740993`
## Validation
- `uv run pytest tests/expressions/test_literals.py tests/test_conversions.py`
Closes #3404.
---
pyiceberg/conversions.py | 2 +-
pyiceberg/expressions/literals.py | 19 +++++++++++++------
tests/expressions/test_literals.py | 33 +++++++++++++++++++++++++++++++++
tests/test_conversions.py | 3 +++
4 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py
index 42d996f75..268cbb93e 100644
--- a/pyiceberg/conversions.py
+++ b/pyiceberg/conversions.py
@@ -143,7 +143,7 @@ def _(primitive_type: PrimitiveType, value_str: str) -> int:
_, _, exponent = Decimal(value_str).as_tuple()
if exponent != 0: # Raise if there are digits to the right of the decimal
raise ValueError(f"Cannot convert partition value, value cannot have fractional digits for {primitive_type} partition")
- return int(float(value_str))
+ return int(value_str)
@partition_to_py.register(FloatType)
diff --git a/pyiceberg/expressions/literals.py b/pyiceberg/expressions/literals.py
index 5bf70990b..50c6d2d61 100644
--- a/pyiceberg/expressions/literals.py
+++ b/pyiceberg/expressions/literals.py
@@ -68,6 +68,13 @@ from pyiceberg.utils.singleton import Singleton
UUID_BYTES_LENGTH = 16
+def _parse_numeric_string(value: str) -> Decimal:
+ number = Decimal(value)
+ if not number.is_finite():
+ raise ValueError(f"Cannot convert non-finite numeric string: {value}")
+ return number
+
+
class Literal(IcebergRootModel[L], Generic[L], ABC): # type: ignore
"""Literal which has a value and can be converted between types."""
@@ -555,27 +562,27 @@ class StringLiteral(Literal[str]):
@to.register(IntegerType)
def _(self, type_var: IntegerType) -> Literal[int]:
try:
- number = int(float(self.value))
+ number = _parse_numeric_string(self.value)
if IntegerType.max < number:
return IntAboveMax()
elif IntegerType.min > number:
return IntBelowMin()
- return LongLiteral(number)
- except ValueError as e:
+ return LongLiteral(int(number))
+ except (ArithmeticError, OverflowError, ValueError) as e:
raise ValueError(f"Could not convert {self.value} into a {type_var}") from e
@to.register(LongType)
def _(self, type_var: LongType) -> Literal[int]:
try:
- long_value = int(float(self.value))
+ long_value = _parse_numeric_string(self.value)
if LongType.max < long_value:
return LongAboveMax()
elif LongType.min > long_value:
return LongBelowMin()
else:
- return LongLiteral(long_value)
- except (TypeError, ValueError) as e:
+ return LongLiteral(int(long_value))
+ except (ArithmeticError, OverflowError, TypeError, ValueError) as e:
raise ValueError(f"Could not convert {self.value} into a {type_var}") from e
@to.register(DateType)
diff --git a/tests/expressions/test_literals.py b/tests/expressions/test_literals.py
index c3ace5d36..7a129685b 100644
--- a/tests/expressions/test_literals.py
+++ b/tests/expressions/test_literals.py
@@ -39,6 +39,7 @@ from pyiceberg.expressions.literals import (
IntAboveMax,
IntBelowMin,
Literal,
+ LongAboveMax,
LongLiteral,
StringLiteral,
TimeLiteral,
@@ -845,6 +846,38 @@ def test_string_to_int_min_value() -> None:
assert isinstance(literal(str(IntegerType.min - 1)).to(IntegerType()), IntBelowMin)
+def test_string_to_long_max_value_without_precision_loss() -> None:
+ assert literal(str(LongType.max)).to(LongType()) == literal(LongType.max)
+
+
+def test_string_to_long_large_integer_without_precision_loss() -> None:
+ assert literal("9007199254740993").to(LongType()) == literal(9007199254740993)
+
+
+def test_string_to_long_decimal_like_integer_without_precision_loss() -> None:
+ assert literal("9007199254740993.0").to(LongType()) == literal(9007199254740993)
+
+
+def test_string_to_long_scientific_notation_integer_without_precision_loss() -> None:
+ assert literal("9007199254740993e0").to(LongType()) == literal(9007199254740993)
+
+
+def test_string_to_long_max_decimal_like_integer_without_precision_loss() -> None:
+ assert literal(f"{LongType.max}.0").to(LongType()) == literal(LongType.max)
+
+
+def test_string_to_integer_scientific_notation_without_regression() -> None:
+ assert literal("1e3").to(IntegerType()) == literal(1000)
+
+
+def test_string_to_integer_large_scientific_notation_above_max() -> None:
+ assert isinstance(literal("1e1000000").to(IntegerType()), IntAboveMax)
+
+
+def test_string_to_long_large_scientific_notation_above_max() -> None:
+ assert isinstance(literal("1e1000000").to(LongType()), LongAboveMax)
+
+
def test_string_to_integer_type_invalid_value() -> None:
with pytest.raises(ValueError) as e:
_ = literal("abc").to(IntegerType())
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index e38bdbd6f..9b73b2db8 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -160,8 +160,11 @@ def test_unscaled_to_decimal(unscaled: int, scale: int, expected_result: Decimal
(IntegerType(), "1", 1),
(IntegerType(), "9999", 9999),
(LongType(), "123456789", 123456789),
+ (LongType(), "9007199254740993", 9007199254740993),
+ (LongType(), str(LongType.max), LongType.max),
(FloatType(), "1.1", 1.1),
(DoubleType(), "99999.9", 99999.9),
+ (TimestampNanoType(), "9007199254740993", 9007199254740993),
(DecimalType(5, 2), "123.45", Decimal("123.45")),
(StringType(), "foo", "foo"),
(UUIDType(), "f79c3e09-677c-4bbd-a479-3f349cb785e7", uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")),