yihua commented on code in PR #18385:
URL: https://github.com/apache/hudi/pull/18385#discussion_r3251202126
##########
hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/helpers/TestCloudObjectsSelectorCommon.java:
##########
@@ -200,6 +206,59 @@ void loadDatasetWithNestedSchemaAndCoalesceAliases()
throws IOException {
Assertions.assertEquals(expectedSchema, result.get().schema(), "output
dataset schema should match source schema");
}
+ @Test
+ void parquetMixedSchemasMergedByDefault(@TempDir Path tempDir) {
+ String p1 = tempDir.resolve("part1").toString();
+ String p2 = tempDir.resolve("part2").toString();
+
+ StructType schema1 = DataTypes.createStructType(Arrays.asList(
+ DataTypes.createStructField("id", DataTypes.IntegerType, true),
+ DataTypes.createStructField("b", DataTypes.StringType, true)));
+
sparkSession.createDataFrame(Collections.singletonList(RowFactory.create(1,
"x")), schema1)
+ .write().parquet(p1);
+
+ StructType schema2 = DataTypes.createStructType(Arrays.asList(
+ DataTypes.createStructField("id", DataTypes.IntegerType, true),
+ DataTypes.createStructField("c", DataTypes.IntegerType, true)));
+
sparkSession.createDataFrame(Collections.singletonList(RowFactory.create(1,
99)), schema2)
+ .write().parquet(p2);
+
+ CloudObjectsSelectorCommon cloudObjectsSelectorCommon = new
CloudObjectsSelectorCommon(new TypedProperties());
+ List<CloudObjectMetadata> input = Arrays.asList(
+ new CloudObjectMetadata(p1, 1L),
+ new CloudObjectMetadata(p2, 1L));
+ Option<Dataset<Row>> result =
cloudObjectsSelectorCommon.loadAsDataset(sparkSession, input, "parquet",
Option.empty(), 1);
+ Assertions.assertTrue(result.isPresent());
+ Dataset<Row> ds = result.get();
+ Assertions.assertEquals(2, ds.count());
+ Set<String> colNames =
Arrays.stream(ds.schema().fields()).map(StructField::name).collect(Collectors.toSet());
+ Assertions.assertTrue(colNames.contains("b"));
+ Assertions.assertTrue(colNames.contains("c"));
+ }
+
+ /**
+ * Verifies that the format-gating predicate for the cloud-incremental mergeSchema option recognises
+ * Parquet and ORC and rejects everything else. End-to-end ORC ingestion is not exercised here because
+ * {@code hudi-utilities} pulls in {@code orc-core-nohive} while Spark 3.x's ORC writer expects the
+ * regular {@code orc-core}; that classpath conflict makes {@code sparkSession.write().orc(...)} fail
+ * with {@code NoSuchFieldError: type} in this module's tests. The end-to-end behaviour for ORC is
+ * covered by Parquet's tests via the shared helper, plus this predicate test for the format dispatch.
+ */
+ @Test
+ void isParquetOrOrcFileFormatRecognisesBothFormats() {
+
Assertions.assertTrue(CloudObjectsSelectorCommon.isParquetOrOrcFileFormat("parquet"));
Review Comment:
Use static imports for `Assertions.assertTrue` and `Assertions.assertFalse` instead of qualifying each call with `Assertions.`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]