Fokko commented on code in PR #5124:
URL: https://github.com/apache/iceberg/pull/5124#discussion_r918360957
##########
python/pyiceberg/transforms.py:
##########
@@ -130,99 +161,58 @@ def apply(self, value: Optional[S]) -> Optional[int]:
def result_type(self, source: IcebergType) -> IcebergType:
return IntegerType()
- @abstractmethod
- def can_transform(self, source: IcebergType) -> bool:
- pass
-
- def __repr__(self) -> str:
- return f"transforms.bucket(source_type={repr(self._source_type)},
num_buckets={self._num_buckets})"
-
-
-class BucketNumberTransform(BaseBucketTransform):
- """Transforms a value of IntegerType, LongType, DateType, TimeType,
TimestampType, or TimestamptzType
- into a bucket partition value
-
- Example:
- >>> transform = BucketNumberTransform(LongType(), 100)
- >>> transform.apply(81068000000)
- 59
- """
-
def can_transform(self, source: IcebergType) -> bool:
- return type(source) in {IntegerType, DateType, LongType, TimeType,
TimestampType, TimestamptzType}
-
- def hash(self, value) -> int:
- return mmh3.hash(struct.pack("<q", value))
-
-
-class BucketDecimalTransform(BaseBucketTransform):
- """Transforms a value of DecimalType into a bucket partition value.
-
- Example:
- >>> transform = BucketDecimalTransform(DecimalType(9, 2), 100)
- >>> transform.apply(Decimal("14.20"))
- 59
- """
-
- def can_transform(self, source: IcebergType) -> bool:
- return isinstance(source, DecimalType)
-
- def hash(self, value: Decimal) -> int:
- return mmh3.hash(decimal_to_bytes(value))
-
-
-class BucketStringTransform(BaseBucketTransform):
- """Transforms a value of StringType into a bucket partition value.
-
- Example:
- >>> transform = BucketStringTransform(StringType(), 100)
- >>> transform.apply("iceberg")
- 89
- """
-
- def can_transform(self, source: IcebergType) -> bool:
- return isinstance(source, StringType)
-
- def hash(self, value: str) -> int:
- return mmh3.hash(value)
-
-
-class BucketBytesTransform(BaseBucketTransform):
- """Transforms a value of FixedType or BinaryType into a bucket partition
value.
-
- Example:
- >>> transform = BucketBytesTransform(BinaryType(), 100)
- >>> transform.apply(b"\\x00\\x01\\x02\\x03")
- 41
- """
-
- def can_transform(self, source: IcebergType) -> bool:
- return type(source) in {FixedType, BinaryType}
-
- def hash(self, value: bytes) -> int:
- return mmh3.hash(value)
+ return type(source) in {
+ IntegerType,
+ DateType,
+ LongType,
+ TimeType,
+ TimestampType,
+ TimestamptzType,
+ DecimalType,
+ StringType,
+ FixedType,
+ BinaryType,
+ UUIDType,
+ }
+
+ def hash_function(self, source: IcebergType, bucket: bool = True) ->
Callable[[Optional[Any]], Optional[int]]:
+ source_type = type(source)
+ if source_type in {IntegerType, LongType, DateType, TimeType,
TimestampType, TimestamptzType}:
+
+ def hash_func(v):
+ return mmh3.hash(struct.pack("<q", v))
+
+ elif source_type == DecimalType:
+
+ def hash_func(v):
+ return mmh3.hash(decimal_to_bytes(v))
+
+ elif source_type in {StringType, FixedType, BinaryType}:
+
+ def hash_func(v):
+ return mmh3.hash(v)
+
+ elif source_type == UUIDType:
+
+ def hash_func(v):
Review Comment:
We re-create these functions each time we call the `transform()` method. We
could also define them at the top level, but then we would define a lot of
functions in module scope that we don't use all the time. That might consume a
bit more memory, but it shouldn't burn any more CPU cycles.
##########
python/pyiceberg/transforms.py:
##########
@@ -312,6 +293,28 @@ def source_type(self) -> IcebergType:
def width(self) -> int:
return self._width
+ def hash_function(self, source: IcebergType) -> Callable[[Optional[S]],
Optional[S]]:
+ source_type = type(source)
+ if source_type in {IntegerType, LongType}:
+
+ def truncate_func(v):
+ return v - v % self._width
+
+ elif source_type in {StringType, BinaryType}:
+
+ def truncate_func(v):
+ return v[0 : min(self._width, len(v))]
Review Comment:
Yes, I like it a lot.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]