the-other-tim-brown commented on code in PR #728: URL: https://github.com/apache/incubator-xtable/pull/728#discussion_r2296924772
########## xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetConversionSource.java: ########## @@ -0,0 +1,434 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.parquet; + +import static org.apache.xtable.GenericTable.getTableName; +import static org.apache.xtable.model.storage.TableFormat.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.sql.Timestamp; +import java.time.Duration; +import java.time.Instant; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.time.temporal.ChronoUnit; +import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import lombok.Builder; +import lombok.Value; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.*; +import 
org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.MetadataBuilder; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import org.apache.hudi.client.HoodieReadClient; +import org.apache.hudi.common.config.HoodieMetadataConfig; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import org.apache.xtable.GenericTable; +import org.apache.xtable.conversion.*; +import org.apache.xtable.conversion.ConversionConfig; +import org.apache.xtable.conversion.ConversionController; +import org.apache.xtable.conversion.ConversionSourceProvider; +import org.apache.xtable.conversion.SourceTable; +import org.apache.xtable.conversion.TargetTable; +import org.apache.xtable.hudi.HudiTestUtil; +import org.apache.xtable.model.sync.SyncMode; + +public class TestParquetConversionSource { Review Comment: We will need to either move this into the integration tests by renaming it from `Test` to `IT`, or annotate it with `@Execution(SAME_THREAD)`, so that it will not try to start a Spark session in the same JVM as the other tests like `TestDeltaSync`. This looks like the reason why `TestDeltaSync` fails when run as part of the full suite but not when run in isolation. ########## xtable-core/src/main/java/org/apache/xtable/parquet/ParquetSchemaExtractor.java: ########## @@ -295,15 +298,23 @@ public InternalSchema toInternalSchema(Type schema, String parentPath) { .fieldId(fieldId == null ? 
null : fieldId.intValue()) .build()); } - if (currentRepetition != Repetition.REPEATED - && schema.asGroupType().getName() != "list" + // RECORD Type (non-nullable elements) + if (schema.asGroupType().getName() != "list" && !Arrays.asList("key_value", "map").contains(schema.asGroupType().getName())) { return InternalSchema.builder() .name(schema.getName()) .comment(null) + // .recordKeyFields(subFields) // necessary for Hudi metadata .dataType(InternalType.RECORD) .fields(subFields) - .isNullable(isNullable(schema.asGroupType())) + .isNullable( + isNullable(schema.asGroupType())) // false isNullable(schema.asGroupType()) (TODO causing Review Comment: Right now this is returning that the top level record schema is nullable which should not be the case. Do we need some special handling for this case? ########## xtable-core/src/test/java/org/apache/xtable/ITConversionController.java: ########## @@ -188,6 +306,10 @@ private ConversionSourceProvider<?> getConversionSourceProvider(String sourceTab throw new IllegalArgumentException("Unsupported source format: " + sourceTableFormat); } } + /* + test for Parquet file conversion Review Comment: Let's remove these changes then if they are not required ########## xtable-core/src/main/java/org/apache/xtable/parquet/ParquetStatsExtractor.java: ########## @@ -112,17 +119,59 @@ public static Map<ColumnDescriptor, List<ColumnStat>> getStatsForFile(ParquetMet .totalSize(columnMetaData.getTotalSize()) .range( Range.vector( - columnMetaData.getStatistics().genericGetMin(), - columnMetaData.getStatistics().genericGetMax())) + columnMetaData.getPrimitiveType().getPrimitiveTypeName() + == PrimitiveType.PrimitiveTypeName + .BINARY // TODO how about DECIMAL, JSON, BSON + // and ENUM logicalTypes? + ? columnMetaData + .getPrimitiveType() + .getLogicalTypeAnnotation() + != null + ? columnMetaData + .getPrimitiveType() + .getLogicalTypeAnnotation() + .toString() + .equals("STRING") + ? 
new String( + ((Binary) + columnMetaData + .getStatistics() + .genericGetMin()) + .getBytes(), + StandardCharsets.UTF_8) + : columnMetaData.getStatistics().genericGetMin() + : columnMetaData.getStatistics().genericGetMin() + : columnMetaData + .getStatistics() + .genericGetMin(), // if stats are string convert to + // litteraly a string stat and Review Comment: Let's move this conversion logic into a helper method and then add a new GH Issue for handling any other logical types. ########## xtable-core/src/test/java/org/apache/xtable/parquet/TestSparkParquetTable.java: ########## @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.parquet; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.parquet.example.data.Group; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.apache.spark.api.java.JavaSparkContext; + +import org.apache.xtable.GenericTable; + +public class TestSparkParquetTable implements GenericTable<Group, String> { Review Comment: Is this being used by the tests? ########## xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergColumnStatsConverter.java: ########## @@ -50,7 +50,8 @@ public static IcebergColumnStatsConverter getInstance() { return INSTANCE; } - public Metrics toIceberg(Schema schema, long totalRowCount, List<ColumnStat> fieldColumnStats) { + public Metrics toIceberg( Review Comment: The changes to this file and others that are just formatting in the Iceberg and Delta paths should be reverted to minimize the diff to just the parts that are essential for review. 
########## xtable-core/src/main/java/org/apache/xtable/parquet/ParquetStatsExtractor.java: ########## @@ -132,19 +181,23 @@ public static InternalDataFile toInternalDataFile(Configuration hadoopConf, Path try { FileSystem fs = FileSystem.get(hadoopConf); file = fs.getFileStatus(parentPath); - // InputPartitionFields partitionInfo = initPartitionInfo(); footer = parquetMetadataExtractor.readParquetMetadata(hadoopConf, parentPath); MessageType schema = parquetMetadataExtractor.getSchema(footer); columnStatsForAFile = getColumnStatsForaFile(footer); - // partitionValues = partitionExtractor.createPartitionValues( - // partitionInfo); + partitionValues = + partitionValueExtractor.extractPartitionValues( + partitionSpecExtractor.spec( + partitionValueExtractor.extractSchemaForParquetPartitions( + parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()), + file.getPath().toString())), + parentPath.toString()); } catch (java.io.IOException e) { Review Comment: I think this is just hiding the exception currently and we need to fix this. ########## xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java: ########## @@ -28,4 +28,8 @@ public static String getPartitionPath(Path tableBasePath, Path filePath) { int endIndex = pathStr.length() - fileName.length() - 1; return endIndex <= startIndex ? "" : pathStr.substring(startIndex, endIndex); } + + public static String getPartitionPathValue(Path tableBasePath, Path filePath) { + return getPartitionPath(tableBasePath, filePath).split("=")[1]; + } Review Comment: What if there are multiple `=` in the path, as in `/some/path/year=2025/month=08/`? For a partition path like `year=2025/month=08`, `split("=")[1]` would return `2025/month` rather than a single partition value. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscribe@xtable.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org