manuzhang commented on code in PR #14948:
URL: https://github.com/apache/iceberg/pull/14948#discussion_r3245896179
##########
spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java:
##########
@@ -94,21 +97,40 @@ public InputPartition[] planInputPartitions() {
InputPartition[] partitions = new InputPartition[taskGroups.size()];
for (int index = 0; index < taskGroups.size(); index++) {
+ ScanTaskGroup<?> taskGroup = taskGroups.get(index);
+
partitions[index] =
new SparkInputPartition(
groupingKeyType,
- taskGroups.get(index),
+ taskGroup,
tableBroadcast,
fileIOBroadcast,
projectionString,
caseSensitive,
locations != null ? locations[index] :
SparkPlanningUtil.NO_LOCATION_PREFERENCE,
- cacheDeleteFilesOnExecutors);
+ cacheDeleteFilesOnExecutors,
+ shouldUseMergingSortedReader(taskGroup));
}
return partitions;
}
+ /** Returns whether sort ordering was reported for this batch's scan. */
+ private boolean isOrderingEnabled() {
+ return orderingEnabled;
+ }
+
+ /**
+ * Returns true if this task group should use a k-way merging reader. This
requires ordering to be
+ * enabled at the table level (validated by {@link #isOrderingEnabled()}),
multiple files in the
+ * group, and all tasks being {@link FileScanTask}s.
+ */
+ private boolean shouldUseMergingSortedReader(ScanTaskGroup<?> taskGroup) {
Review Comment:
We can cache the result and don't need to check this every time.
##########
spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java:
##########
@@ -94,21 +97,40 @@ public InputPartition[] planInputPartitions() {
InputPartition[] partitions = new InputPartition[taskGroups.size()];
for (int index = 0; index < taskGroups.size(); index++) {
+ ScanTaskGroup<?> taskGroup = taskGroups.get(index);
+
partitions[index] =
new SparkInputPartition(
groupingKeyType,
- taskGroups.get(index),
+ taskGroup,
tableBroadcast,
fileIOBroadcast,
projectionString,
caseSensitive,
locations != null ? locations[index] :
SparkPlanningUtil.NO_LOCATION_PREFERENCE,
- cacheDeleteFilesOnExecutors);
+ cacheDeleteFilesOnExecutors,
+ shouldUseMergingSortedReader(taskGroup));
}
return partitions;
}
+ /** Returns whether sort ordering was reported for this batch's scan. */
+ private boolean isOrderingEnabled() {
+ return orderingEnabled;
+ }
+
+ /**
+ * Returns true if this task group should use a k-way merging reader. This
requires ordering to be
+ * enabled at the table level (validated by {@link #isOrderingEnabled()}),
multiple files in the
+ * group, and all tasks being {@link FileScanTask}s.
+ */
+ private boolean shouldUseMergingSortedReader(ScanTaskGroup<?> taskGroup) {
Review Comment:
We can cache the result instead of checking this every time.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]