HuangZhenQiu commented on code in PR #18022:
URL: https://github.com/apache/hudi/pull/18022#discussion_r2744301255
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java:
##########
@@ -221,32 +231,117 @@ public DataStream<RowData>
produceDataStream(StreamExecutionEnvironment execEnv)
TypeInformation<RowData> typeInfo =
(TypeInformation<RowData>)
TypeInfoDataTypeConverter.fromDataTypeToTypeInfo(getProducedDataType());
OptionsInference.setupSourceTasks(conf, execEnv.getParallelism());
- if (conf.get(FlinkOptions.READ_AS_STREAMING)) {
- StreamReadMonitoringFunction monitoringFunction = new
StreamReadMonitoringFunction(
- conf, FilePathUtils.toFlinkPath(path), tableRowType,
maxCompactionMemoryInBytes, partitionPruner);
- InputFormat<RowData, ?> inputFormat = getInputFormat(true);
- OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData>
factory = StreamReadOperator.factory((MergeOnReadInputFormat) inputFormat);
- SingleOutputStreamOperator<MergeOnReadInputSplit>
monitorOperatorStream = execEnv.addSource(monitoringFunction,
getSourceOperatorName("split_monitor"))
- .uid(Pipelines.opUID("split_monitor", conf))
- .setParallelism(1)
- .setMaxParallelism(1);
-
- DataStream<MergeOnReadInputSplit> sourceWithKey =
addFileDistributionStrategy(monitorOperatorStream);
-
- SingleOutputStreamOperator<RowData> streamReadSource = sourceWithKey
- .transform("split_reader", typeInfo, factory)
- .uid(Pipelines.opUID("split_reader", conf))
- .setParallelism(conf.get(FlinkOptions.READ_TASKS));
- return new DataStreamSource<>(streamReadSource);
+
+ if (conf.get(FlinkOptions.READ_SOURCE_V2_ENABLED)) {
+ return produceNewSourceDataStream(execEnv);
} else {
- InputFormatSourceFunctionAdapter<RowData> func = new
InputFormatSourceFunctionAdapter<>(getInputFormat(), typeInfo);
- DataStreamSource<RowData> source = execEnv.addSource(func,
asSummaryString(), typeInfo);
- return
source.name(getSourceOperatorName("bounded_source")).setParallelism(conf.get(FlinkOptions.READ_TASKS));
+ return produceLegacySourceDataStream(execEnv, typeInfo);
}
}
};
}
+ /**
+ * Produces a DataStream using the new FLIP-27 HoodieSource.
+ *
+ * @param execEnv the stream execution environment
+ * @return the configured DataStream
+ */
+ private DataStream<RowData>
produceNewSourceDataStream(StreamExecutionEnvironment execEnv) {
+ HoodieSource<RowData> hoodieSource = createHoodieSource();
+ DataStreamSource<RowData> source = execEnv.fromSource(
+ hoodieSource, WatermarkStrategy.noWatermarks(), "hudi_source");
+ return
source.name(getSourceOperatorName("hudi_source")).uid(Pipelines.opUID("hudi_source",
conf)).setParallelism(conf.get(FlinkOptions.READ_TASKS));
+ }
+
+ /**
+ * Produces a DataStream using the legacy source implementation.
+ *
+ * @param execEnv the stream execution environment
+ * @param typeInfo type information for RowData
+ * @return the configured DataStream
+ */
+ private DataStream<RowData> produceLegacySourceDataStream(
+ StreamExecutionEnvironment execEnv,
+ TypeInformation<RowData> typeInfo) {
+ if (conf.get(FlinkOptions.READ_AS_STREAMING)) {
+ StreamReadMonitoringFunction monitoringFunction = new
StreamReadMonitoringFunction(
+ conf, FilePathUtils.toFlinkPath(path), tableRowType,
maxCompactionMemoryInBytes, partitionPruner);
Review Comment:
Will handle it in a follow-up PR
[#18052](https://github.com/apache/hudi/issues/18052)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]