cshuo commented on code in PR #18406:
URL: https://github.com/apache/hudi/pull/18406#discussion_r3008038977
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/reader/HoodieSourceSplitReader.java:
##########
@@ -54,21 +56,30 @@
public class HoodieSourceSplitReader<T> implements
SplitReader<HoodieRecordWithPosition<T>, HoodieSourceSplit> {
private static final Logger LOG =
LoggerFactory.getLogger(HoodieSourceSplitReader.class);
+ /** Sentinel value indicating that no row limit has been pushed down. */
+ public static final long NO_LIMIT = -1L;
+
private final SerializableComparator<HoodieSourceSplit> splitComparator;
private final Queue<HoodieSourceSplit> splits;
private final FlinkStreamReadMetrics readerMetrics;
private final SplitReaderFunction<T> readerFunction;
+ /** Total row limit pushed down from the planner; {@code NO_LIMIT} means unlimited. */
+ private final long limit;
+ /** Number of records emitted so far across all splits. */
+ private long totalReadCount = 0;
private transient HoodieSourceSplit currentSplit;
private transient CloseableIterator<RecordsWithSplitIds<HoodieRecordWithPosition<T>>> currentReader;
Review Comment:
Unused variable; can it be removed?
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/reader/function/HoodieSplitReaderFunction.java:
##########
@@ -71,28 +62,26 @@ public HoodieSplitReaderFunction(
String mergeType,
List<ExpressionPredicates.Predicate> predicates,
boolean emitDelete) {
-
+ super(configuration, predicates, internalSchemaManager, emitDelete);
ValidationUtils.checkArgument(tableSchema != null, "tableSchema can't be null");
ValidationUtils.checkArgument(requiredSchema != null, "requiredSchema can't be null");
ValidationUtils.checkArgument(internalSchemaManager != null, "internalSchemaManager can't be null");
this.tableSchema = tableSchema;
this.requiredSchema = requiredSchema;
- this.internalSchemaManager = internalSchemaManager;
- this.configuration = configuration;
- this.writeConfig = FlinkWriteClients.getHoodieClientConfig(configuration);
- this.predicates = predicates;
this.mergeType = mergeType;
- this.emitDelete = emitDelete;
}
@Override
public RecordsWithSplitIds<HoodieRecordWithPosition<RowData>> read(HoodieSourceSplit split) {
final String splitId = split.splitId();
- HoodieTableMetaClient metaClient = StreamerUtil.metaClientForReader(configuration, getHadoopConf());
+ HoodieTableMetaClient metaClient = StreamerUtil.metaClientForReader(conf, getHadoopConf());
try {
+ if (fileGroupReader != null) {
Review Comment:
We do not need to close the file group reader here, since the underlying file
group reader is closed when the `RecordsWithSplitIds` batch is finished and
`RecordsWithSplitIds#recycle()` is called.
Besides, closing it here is not safe either: the split fetcher runs in a
separate thread and may fetch more than one `RecordsWithSplitIds` into a queue,
so the second `RecordsWithSplitIds` can be fetched before the first one is
finished.
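For illustration, a minimal sketch of tying the close to `recycle()` (the class
and field names below are hypothetical, not the PR's actual implementation):
```
// Hypothetical sketch: the batch owns the reader and releases it in recycle(),
// which the framework invokes only after this batch has been fully consumed.
import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;

import java.util.Collections;
import java.util.Iterator;
import java.util.Set;

class CloseOnRecycleRecords<E> implements RecordsWithSplitIds<E> {
  private final String splitId;
  private final Iterator<E> records;
  private final AutoCloseable underlyingReader; // e.g. the file group reader
  private boolean splitReturned;

  CloseOnRecycleRecords(String splitId, Iterator<E> records, AutoCloseable underlyingReader) {
    this.splitId = splitId;
    this.records = records;
    this.underlyingReader = underlyingReader;
  }

  @Override
  public String nextSplit() {
    // Hand out the split id once; returning null ends this batch.
    if (splitReturned) {
      return null;
    }
    splitReturned = true;
    return splitId;
  }

  @Override
  public E nextRecordFromSplit() {
    return records.hasNext() ? records.next() : null;
  }

  @Override
  public Set<String> finishedSplits() {
    return records.hasNext() ? Collections.emptySet() : Collections.singleton(splitId);
  }

  @Override
  public void recycle() {
    // Safe release point: even with a prefetching fetcher thread, recycle()
    // runs only after the consumer has drained this batch.
    try {
      underlyingReader.close();
    } catch (Exception e) {
      throw new RuntimeException("Failed to close the underlying reader", e);
    }
  }
}
```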
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/reader/HoodieSourceSplitReader.java:
##########
@@ -54,21 +56,30 @@
public class HoodieSourceSplitReader<T> implements
SplitReader<HoodieRecordWithPosition<T>, HoodieSourceSplit> {
private static final Logger LOG =
LoggerFactory.getLogger(HoodieSourceSplitReader.class);
+ /** Sentinel value indicating that no row limit has been pushed down. */
+ public static final long NO_LIMIT = -1L;
+
private final SerializableComparator<HoodieSourceSplit> splitComparator;
private final Queue<HoodieSourceSplit> splits;
private final FlinkStreamReadMetrics readerMetrics;
private final SplitReaderFunction<T> readerFunction;
+ /** Total row limit pushed down from the planner; {@code NO_LIMIT} means unlimited. */
+ private final long limit;
+ /** Number of records emitted so far across all splits. */
+ private long totalReadCount = 0;
Review Comment:
Can we add a `RecordLimiter` utility class that provides methods like
`increment()` and `reachLimit()`? Then add an `Option<RecordLimiter>` field
here to make things clearer.
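A minimal sketch of what such a utility could look like (hypothetical code,
following the naming suggested above):
```
// Hypothetical RecordLimiter sketch: bundles the pushed-down limit with the
// running count, so the reader only has to call increment()/reachLimit().
public class RecordLimiter implements java.io.Serializable {
  private final long limit;    // total row limit pushed down from the planner
  private long readCount = 0L; // records emitted so far across all splits

  public RecordLimiter(long limit) {
    this.limit = limit;
  }

  /** Counts one emitted record. */
  public void increment() {
    readCount++;
  }

  /** Returns true once the pushed-down limit has been reached. */
  public boolean reachLimit() {
    return readCount >= limit;
  }
}
```
An empty `Option<RecordLimiter>` would then stand for "no limit pushed down",
which also removes the need for the `NO_LIMIT` sentinel.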
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/reader/function/AbstractSplitReaderFunction.java:
##########
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.source.reader.function;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.table.data.RowData;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.configuration.HadoopConfigurations;
+import org.apache.hudi.source.ExpressionPredicates;
+import org.apache.hudi.table.format.InternalSchemaManager;
+import org.apache.hudi.util.FlinkWriteClients;
+import org.apache.hudi.util.Lazy;
+
+import java.util.List;
+
+/**
+ * Abstract implementation of SplitReaderFunction.
+ */
+public abstract class AbstractSplitReaderFunction implements SplitReaderFunction<RowData> {
+
+ protected final Configuration conf;
+ protected final InternalSchemaManager internalSchemaManager;
+ protected final List<ExpressionPredicates.Predicate> predicates;
+ protected final boolean emitDelete;
+ private transient Lazy<HoodieWriteConfig> writeConfig;
+ private transient Lazy<org.apache.hadoop.conf.Configuration> hadoopConf;
+
+ public AbstractSplitReaderFunction(
+ Configuration conf,
+ List<ExpressionPredicates.Predicate> predicates,
+ InternalSchemaManager internalSchemaManager,
+ boolean emitDelete) {
+ this.conf = conf;
+ this.predicates = predicates;
+ this.internalSchemaManager = internalSchemaManager;
+ this.emitDelete = emitDelete;
+ }
+
+ protected HoodieWriteConfig getWriteConfig() {
+ if (writeConfig == null) {
+ writeConfig = Lazy.lazily(() -> FlinkWriteClients.getHoodieClientConfig(conf));
Review Comment:
We can directly initialize the lazy variables in the constructor, e.g., put the
following code after line 52:
```
this.writeConfig = Lazy.lazily(() -> FlinkWriteClients.getHoodieClientConfig(conf));
this.hadoopConf = Lazy.lazily(() -> HadoopConfigurations.getHadoopConf(conf));
```
Then the `getWriteConfig` and `getHadoopConf` methods are no longer needed and
can be replaced by `this.writeConfig.get()` / `this.hadoopConf.get()`.
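For illustration, a sketch of the resulting constructor (this assumes the two
fields are no longer declared `transient`, since constructor-initialized
transient fields would not survive Java serialization):
```
public AbstractSplitReaderFunction(
    Configuration conf,
    List<ExpressionPredicates.Predicate> predicates,
    InternalSchemaManager internalSchemaManager,
    boolean emitDelete) {
  this.conf = conf;
  this.predicates = predicates;
  this.internalSchemaManager = internalSchemaManager;
  this.emitDelete = emitDelete;
  // The lazy holders are cheap to create; the expensive work still only
  // runs on the first get().
  this.writeConfig = Lazy.lazily(() -> FlinkWriteClients.getHoodieClientConfig(conf));
  this.hadoopConf = Lazy.lazily(() -> HadoopConfigurations.getHadoopConf(conf));
}
```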
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/reader/function/HoodieSplitReaderFunction.java:
##########
@@ -71,28 +62,26 @@ public HoodieSplitReaderFunction(
String mergeType,
List<ExpressionPredicates.Predicate> predicates,
boolean emitDelete) {
-
+ super(configuration, predicates, internalSchemaManager, emitDelete);
ValidationUtils.checkArgument(tableSchema != null, "tableSchema can't be null");
ValidationUtils.checkArgument(requiredSchema != null, "requiredSchema can't be null");
ValidationUtils.checkArgument(internalSchemaManager != null, "internalSchemaManager can't be null");
this.tableSchema = tableSchema;
this.requiredSchema = requiredSchema;
- this.internalSchemaManager = internalSchemaManager;
- this.configuration = configuration;
- this.writeConfig = FlinkWriteClients.getHoodieClientConfig(configuration);
- this.predicates = predicates;
this.mergeType = mergeType;
- this.emitDelete = emitDelete;
}
@Override
public RecordsWithSplitIds<HoodieRecordWithPosition<RowData>> read(HoodieSourceSplit split) {
final String splitId = split.splitId();
- HoodieTableMetaClient metaClient = StreamerUtil.metaClientForReader(configuration, getHadoopConf());
+ HoodieTableMetaClient metaClient = StreamerUtil.metaClientForReader(conf, getHadoopConf());
try {
+ if (fileGroupReader != null) {
+ fileGroupReader.close();
+ }
this.fileGroupReader = createFileGroupReader(split, metaClient);
- final ClosableIterator<RowData> recordIterator = fileGroupReader.getClosableIterator();
+ ClosableIterator<RowData> recordIterator = fileGroupReader.getClosableIterator();
Review Comment:
Unnecessary change? Only the `final` modifier was dropped here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]