Re: [PR] feat: extend table scan to support v2 deletes [iceberg-cpp]

via GitHub Mon, 05 Jan 2026 18:43:39 -0800


gty404 commented on code in PR #489:
URL: https://github.com/apache/iceberg-cpp/pull/489#discussion_r2663331027



##########
src/iceberg/table_scan.h:
##########
@@ -50,175 +63,293 @@ class ICEBERG_EXPORT FileScanTask : public ScanTask {
   ///
   /// \param data_file The data file to read.
   /// \param delete_files Delete files that apply to this data file.
-  /// \param residual_filter Optional residual filter to apply after reading.
+  /// \param filter Optional residual filter to apply after reading.
   explicit FileScanTask(std::shared_ptr<DataFile> data_file,
                         std::vector<std::shared_ptr<DataFile>> delete_files = 
{},
-                        std::shared_ptr<Expression> residual_filter = nullptr);
+                        std::shared_ptr<Expression> filter = nullptr);
 
   /// \brief The data file that should be read by this scan task.
-  const std::shared_ptr<DataFile>& data_file() const;
+  const std::shared_ptr<DataFile>& data_file() const { return data_file_; }
 
   /// \brief Delete files that apply to this data file.
-  const std::vector<std::shared_ptr<DataFile>>& delete_files() const;
+  const std::vector<std::shared_ptr<DataFile>>& delete_files() const {
+    return delete_files_;
+  }
 
   /// \brief Residual filter to apply after reading.
-  const std::shared_ptr<Expression>& residual_filter() const;
-
-  /// \brief Check if any deletes need to be applied.
-  bool has_deletes() const;
-
-  /// \brief Check if a residual filter needs to be applied.
-  bool has_residual_filter() const;
+  const std::shared_ptr<Expression>& residual_filter() const { return 
residual_filter_; }
 
+  Kind kind() const override { return Kind::kFileScanTask; }
   int64_t size_bytes() const override;
   int32_t files_count() const override;
   int64_t estimated_row_count() const override;
 
-  /**
-   * \brief Returns a C-ABI compatible ArrowArrayStream to read the data for 
this task.
-   *
-   * \param io The FileIO instance for accessing the file data.
-   * \param projected_schema The projected schema for reading the data.
-   * \param filter Optional filter expression to apply during reading.
-   * \return A Result containing an ArrowArrayStream, or an error on failure.
-   */
+  /// TODO(gangwu): move it to iceberg/data/task_scanner.h
+  ///
+  /// \brief Returns a C-ABI compatible ArrowArrayStream to read the data for 
this task.
+  ///
+  /// \param io The FileIO instance for accessing the file data.
+  /// \param projected_schema The projected schema for reading the data.
+  /// \return A Result containing an ArrowArrayStream, or an error on failure.
   Result<ArrowArrayStream> ToArrow(const std::shared_ptr<FileIO>& io,
-                                   const std::shared_ptr<Schema>& 
projected_schema,
-                                   const std::shared_ptr<Expression>& filter) 
const;
+                                   std::shared_ptr<Schema> projected_schema) 
const;
 
  private:
-  /// \brief Data file metadata.
   std::shared_ptr<DataFile> data_file_;
-  /// \brief Delete files that apply to this data file.
   std::vector<std::shared_ptr<DataFile>> delete_files_;
-  /// \brief Residual filter to apply after reading.
   std::shared_ptr<Expression> residual_filter_;
 };
 
-/// \brief Scan context holding snapshot and scan-specific metadata.
+namespace internal {
+
+// Internal table scan context used by different scan implementations.
 struct TableScanContext {
-  /// \brief Table metadata.
-  std::shared_ptr<TableMetadata> table_metadata;
-  /// \brief Snapshot to scan.
-  std::shared_ptr<Snapshot> snapshot;
-  /// \brief Projected schema.
-  std::shared_ptr<Schema> projected_schema;
-  /// \brief Filter expression to apply.
+  std::optional<int64_t> snapshot_id;
   std::shared_ptr<Expression> filter;
-  /// \brief Whether the scan is case-sensitive.
-  bool case_sensitive = false;
-  /// \brief Additional options for the scan.
+  bool ignore_residuals{false};
+  bool case_sensitive{true};
+  bool return_column_stats{false};
+  std::unordered_set<int32_t> columns_to_keep_stats;
+  std::vector<std::string> selected_columns;
+  std::shared_ptr<Schema> projected_schema;
   std::unordered_map<std::string, std::string> options;
-  /// \brief Optional limit on the number of rows to scan.
-  std::optional<int64_t> limit;
+  bool from_snapshot_id_inclusive{false};
+  std::optional<int64_t> from_snapshot_id;
+  std::optional<int64_t> to_snapshot_id;
+  std::string branch{};
+  std::optional<int64_t> min_rows_requested;
+
+  // Validate the context parameters to see if they have conflicts.
+  [[nodiscard]] Status Validate() const;
 };
 
+}  // namespace internal
+
 /// \brief Builder class for creating TableScan instances.
-class ICEBERG_EXPORT TableScanBuilder {
+class ICEBERG_EXPORT TableScanBuilder : public ErrorCollector {
  public:
   /// \brief Constructs a TableScanBuilder for the given table.
-  /// \param table_metadata The metadata of the table to scan.
-  /// \param file_io The FileIO instance for reading manifests and data files.
-  explicit TableScanBuilder(std::shared_ptr<TableMetadata> table_metadata,
-                            std::shared_ptr<FileIO> file_io);
-
-  /// \brief Sets the snapshot ID to scan.
-  /// \param snapshot_id The ID of the snapshot.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithSnapshotId(int64_t snapshot_id);
-
-  /// \brief Selects columns to include in the scan.
-  /// \param column_names A list of column names. If empty, all columns will 
be selected.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithColumnNames(std::vector<std::string> column_names);
-
-  /// \brief Sets the schema to use for the scan.
-  /// \param schema The schema to use.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithProjectedSchema(std::shared_ptr<Schema> schema);
-
-  /// \brief Applies a filter expression to the scan.
-  /// \param filter Filter expression to use.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithFilter(std::shared_ptr<Expression> filter);
-
-  /// \brief Sets whether the scan should be case-sensitive.
-  /// \param case_sensitive Whether the scan is case-sensitive.
-  /// /return Reference to the builder.
-  TableScanBuilder& WithCaseSensitive(bool case_sensitive);
-
-  /// \brief Sets an option for the scan.
-  /// \param property The name of the option.
-  /// \param value The value of the option.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithOption(std::string property, std::string value);
-
-  /// \brief Sets an optional limit on the number of rows to scan.
-  /// \param limit Optional limit on the number of rows.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithLimit(std::optional<int64_t> limit);
+  /// \param metadata Current table metadata.
+  /// \param io FileIO instance for reading manifests files.
+  static Result<std::unique_ptr<TableScanBuilder>> Make(
+      std::shared_ptr<TableMetadata> metadata, std::shared_ptr<FileIO> io);
+
+  /// \brief Update property that will override the table's behavior
+  /// based on the incoming pair. Unknown properties will be ignored.
+  /// \param key name of the table property to be overridden
+  /// \param value value to override with
+  TableScanBuilder& Option(std::string key, std::string value);
+
+  /// \brief Set the projected schema.
+  /// \param schema a projection schema
+  TableScanBuilder& Project(std::shared_ptr<Schema> schema);
+
+  /// \brief If data columns are selected via Select(), controls whether
+  /// the match to the schema will be done with case sensitivity. Default is 
true.
+  /// \param case_sensitive whether the scan is case-sensitive
+  TableScanBuilder& CaseSensitive(bool case_sensitive);
+
+  /// \brief Request this scan to load the column stats with each data file.
+  ///
+  /// Column stats include: value count, null value count, lower bounds, and 
upper bounds.
+  TableScanBuilder& IncludeColumnStats();
+
+  /// \brief Request this scan to load the column stats for the specific 
columns with each
+  /// data file.
+  ///
+  /// Column stats include: value count, null value count, lower bounds, and 
upper bounds.
+  ///
+  /// \param requested_columns column names for which to keep the stats.
+  TableScanBuilder& IncludeColumnStats(const std::vector<std::string>& 
requested_columns);
+
+  /// \brief Request this scan to read the given data columns.
+  ///
+  /// This produces an expected schema that includes all fields that are 
either selected
+  /// or used by this scan's filter expression.
+  ///
+  /// \param column_names column names from the table's schema
+  TableScanBuilder& Select(const std::vector<std::string>& column_names);
+
+  /// \brief Set the expression to filter data.
+  /// \param filter a filter expression
+  TableScanBuilder& Filter(std::shared_ptr<Expression> filter);
+
+  /// \brief Request data filtering to files but not to rows in those files.
+  TableScanBuilder& IgnoreResiduals();
+
+  /// \brief Request this can to return at least the given number of rows.

Review Comment:
   ```suggestion
     /// \brief Request this scan to return at least the given number of rows.
   ```



##########
src/iceberg/table_scan.cc:
##########
@@ -135,186 +153,346 @@ Result<ArrowArrayStream> 
MakeArrowArrayStream(std::unique_ptr<Reader> reader) {
 
 }  // namespace
 
+namespace internal {
+
+Status TableScanContext::Validate() const {
+  if (!columns_to_keep_stats.empty() && !return_column_stats) {
+    return InvalidArgument(
+        "Cannot select columns to keep stats when column stats are not 
returned");
+  }
+  if (projected_schema != nullptr && !selected_columns.empty()) {
+    return InvalidArgument(
+        "Cannot set projection schema and selected columns at the same time");
+  }
+  if (snapshot_id.has_value() &&
+      (from_snapshot_id.has_value() || to_snapshot_id.has_value())) {
+    return InvalidArgument("Cannot mix snapshot scan and incremental scan");
+  }
+  if (min_rows_requested.has_value() && min_rows_requested.value() < 0) {
+    return InvalidArgument("Min rows requested cannot be negative");
+  }
+  return {};
+}
+
+}  // namespace internal
+
+ScanTask::~ScanTask() = default;
+
 // FileScanTask implementation
 
 FileScanTask::FileScanTask(std::shared_ptr<DataFile> data_file,
                            std::vector<std::shared_ptr<DataFile>> delete_files,
                            std::shared_ptr<Expression> residual_filter)
     : data_file_(std::move(data_file)),
       delete_files_(std::move(delete_files)),
-      residual_filter_(std::move(residual_filter)) {}
-
-const std::shared_ptr<DataFile>& FileScanTask::data_file() const { return 
data_file_; }
-
-const std::vector<std::shared_ptr<DataFile>>& FileScanTask::delete_files() 
const {
-  return delete_files_;
-}
-
-const std::shared_ptr<Expression>& FileScanTask::residual_filter() const {
-  return residual_filter_;
+      residual_filter_(std::move(residual_filter)) {
+  ICEBERG_DCHECK(data_file_ != nullptr, "Data file cannot be null for 
FileScanTask");
 }
 
-bool FileScanTask::has_deletes() const { return !delete_files_.empty(); }
-
-bool FileScanTask::has_residual_filter() const { return residual_filter_ != 
nullptr; }
-
 int64_t FileScanTask::size_bytes() const { return 
data_file_->file_size_in_bytes; }
 
 int32_t FileScanTask::files_count() const { return 1; }
 
 int64_t FileScanTask::estimated_row_count() const { return 
data_file_->record_count; }
 
 Result<ArrowArrayStream> FileScanTask::ToArrow(
-    const std::shared_ptr<FileIO>& io, const std::shared_ptr<Schema>& 
projected_schema,
-    const std::shared_ptr<Expression>& filter) const {
-  if (has_deletes()) {
+    const std::shared_ptr<FileIO>& io, std::shared_ptr<Schema> 
projected_schema) const {
+  if (!delete_files_.empty()) {
     return NotSupported("Reading data files with delete files is not yet 
supported.");
   }
 
   const ReaderOptions options{.path = data_file_->file_path,
                               .length = data_file_->file_size_in_bytes,
                               .io = io,
-                              .projection = projected_schema,
-                              .filter = filter};
+                              .projection = std::move(projected_schema),
+                              .filter = residual_filter_};
 
   ICEBERG_ASSIGN_OR_RAISE(auto reader,
                           ReaderFactoryRegistry::Open(data_file_->file_format, 
options));
 
   return MakeArrowArrayStream(std::move(reader));
 }
 
+Result<std::unique_ptr<TableScanBuilder>> TableScanBuilder::Make(
+    std::shared_ptr<TableMetadata> metadata, std::shared_ptr<FileIO> io) {
+  ICEBERG_PRECHECK(metadata != nullptr, "Table metadata cannot be null");
+  ICEBERG_PRECHECK(io != nullptr, "FileIO cannot be null");
+  return std::unique_ptr<TableScanBuilder>(
+      new TableScanBuilder(std::move(metadata), std::move(io)));
+}
+
 TableScanBuilder::TableScanBuilder(std::shared_ptr<TableMetadata> 
table_metadata,
                                    std::shared_ptr<FileIO> file_io)
-    : file_io_(std::move(file_io)) {
-  context_.table_metadata = std::move(table_metadata);
-}
+    : metadata_(std::move(table_metadata)), io_(std::move(file_io)) {}
 
-TableScanBuilder& TableScanBuilder::WithColumnNames(
-    std::vector<std::string> column_names) {
-  column_names_ = std::move(column_names);
+TableScanBuilder& TableScanBuilder::Option(std::string key, std::string value) 
{
+  context_.options[std::move(key)] = std::move(value);
   return *this;
 }
 
-TableScanBuilder& 
TableScanBuilder::WithProjectedSchema(std::shared_ptr<Schema> schema) {
+TableScanBuilder& TableScanBuilder::Project(std::shared_ptr<Schema> schema) {
   context_.projected_schema = std::move(schema);
   return *this;
 }
 
-TableScanBuilder& TableScanBuilder::WithSnapshotId(int64_t snapshot_id) {
-  snapshot_id_ = snapshot_id;
+TableScanBuilder& TableScanBuilder::CaseSensitive(bool case_sensitive) {
+  context_.case_sensitive = case_sensitive;
+  return *this;
+}
+
+TableScanBuilder& TableScanBuilder::IncludeColumnStats() {
+  context_.return_column_stats = true;
+  return *this;
+}
+
+TableScanBuilder& TableScanBuilder::IncludeColumnStats(
+    const std::vector<std::string>& requested_columns) {
+  context_.return_column_stats = true;
+  context_.columns_to_keep_stats.clear();
+  context_.columns_to_keep_stats.reserve(requested_columns.size());
+
+  ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto schema_ref, ResolveSnapshotSchema());
+  const auto& schema = schema_ref.get();
+  for (const auto& column_name : requested_columns) {
+    ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto field, 
schema->FindFieldByName(column_name));
+    if (field.has_value()) {
+      context_.columns_to_keep_stats.insert(field.value().get().field_id());
+    }
+  }
+
+  return *this;
+}
+
+TableScanBuilder& TableScanBuilder::Select(const std::vector<std::string>& 
column_names) {
+  context_.selected_columns = column_names;
   return *this;
 }
 
-TableScanBuilder& TableScanBuilder::WithFilter(std::shared_ptr<Expression> 
filter) {
+TableScanBuilder& TableScanBuilder::Filter(std::shared_ptr<Expression> filter) 
{
   context_.filter = std::move(filter);
   return *this;
 }
 
-TableScanBuilder& TableScanBuilder::WithCaseSensitive(bool case_sensitive) {
-  context_.case_sensitive = case_sensitive;
+TableScanBuilder& TableScanBuilder::IgnoreResiduals() {
+  context_.ignore_residuals = true;
   return *this;
 }
 
-TableScanBuilder& TableScanBuilder::WithOption(std::string property, 
std::string value) {
-  context_.options[std::move(property)] = std::move(value);
+TableScanBuilder& TableScanBuilder::MinRowsRequested(int64_t num_rows) {
+  context_.min_rows_requested = num_rows;
   return *this;
 }
 
-TableScanBuilder& TableScanBuilder::WithLimit(std::optional<int64_t> limit) {
-  context_.limit = limit;
+TableScanBuilder& TableScanBuilder::UseSnapshot(int64_t snapshot_id) {
+  ICEBERG_BUILDER_CHECK(!context_.snapshot_id.has_value(),
+                        "Cannot override snapshot, already set snapshot id={}",
+                        context_.snapshot_id.value());
+  ICEBERG_BUILDER_ASSIGN_OR_RETURN(std::ignore, 
metadata_->SnapshotById(snapshot_id));
+  context_.snapshot_id = snapshot_id;
   return *this;
 }
 
-Result<std::unique_ptr<TableScan>> TableScanBuilder::Build() {
-  const auto& table_metadata = context_.table_metadata;
-  auto snapshot_id = snapshot_id_ ? snapshot_id_ : 
table_metadata->current_snapshot_id;
-  if (!snapshot_id) {
-    return InvalidArgument("No snapshot ID specified for table {}",
-                           table_metadata->table_uuid);
+TableScanBuilder& TableScanBuilder::UseRef(const std::string& ref) {
+  if (ref == SnapshotRef::kMainBranch) {
+    snapshot_schema_ = nullptr;
+    context_.snapshot_id.reset();
+    return *this;
+  }
+
+  ICEBERG_BUILDER_CHECK(!context_.snapshot_id.has_value(),
+                        "Cannot override ref, already set snapshot id={}",
+                        context_.snapshot_id.value());
+  auto iter = metadata_->refs.find(ref);
+  if (iter != metadata_->refs.end()) {
+    ICEBERG_BUILDER_CHECK(iter->second != nullptr, "Ref {} is null", ref);
+    int32_t snapshot_id = iter->second->snapshot_id;
+    ICEBERG_BUILDER_ASSIGN_OR_RETURN(std::ignore, 
metadata_->SnapshotById(snapshot_id));
+    context_.snapshot_id = snapshot_id;
+  } else {
+    return AddError(InvalidArgument("Cannot find ref {}", ref));
   }

Review Comment:
   ```suggestion
     ICEBERG_BUILDER_CHECK(iter != metadata_->refs.end(), "Cannot find ref {}", 
ref);
     ICEBERG_BUILDER_CHECK(iter->second != nullptr, "Ref {} is null", ref);
     int32_t snapshot_id = iter->second->snapshot_id;
     ICEBERG_BUILDER_ASSIGN_OR_RETURN(std::ignore, 
metadata_->SnapshotById(snapshot_id));
     context_.snapshot_id = snapshot_id;
   ```



##########
src/iceberg/table_scan.h:
##########
@@ -50,175 +63,293 @@ class ICEBERG_EXPORT FileScanTask : public ScanTask {
   ///
   /// \param data_file The data file to read.
   /// \param delete_files Delete files that apply to this data file.
-  /// \param residual_filter Optional residual filter to apply after reading.
+  /// \param filter Optional residual filter to apply after reading.
   explicit FileScanTask(std::shared_ptr<DataFile> data_file,
                         std::vector<std::shared_ptr<DataFile>> delete_files = 
{},
-                        std::shared_ptr<Expression> residual_filter = nullptr);
+                        std::shared_ptr<Expression> filter = nullptr);
 
   /// \brief The data file that should be read by this scan task.
-  const std::shared_ptr<DataFile>& data_file() const;
+  const std::shared_ptr<DataFile>& data_file() const { return data_file_; }
 
   /// \brief Delete files that apply to this data file.
-  const std::vector<std::shared_ptr<DataFile>>& delete_files() const;
+  const std::vector<std::shared_ptr<DataFile>>& delete_files() const {
+    return delete_files_;
+  }
 
   /// \brief Residual filter to apply after reading.
-  const std::shared_ptr<Expression>& residual_filter() const;
-
-  /// \brief Check if any deletes need to be applied.
-  bool has_deletes() const;
-
-  /// \brief Check if a residual filter needs to be applied.
-  bool has_residual_filter() const;
+  const std::shared_ptr<Expression>& residual_filter() const { return 
residual_filter_; }
 
+  Kind kind() const override { return Kind::kFileScanTask; }
   int64_t size_bytes() const override;
   int32_t files_count() const override;
   int64_t estimated_row_count() const override;
 
-  /**
-   * \brief Returns a C-ABI compatible ArrowArrayStream to read the data for 
this task.
-   *
-   * \param io The FileIO instance for accessing the file data.
-   * \param projected_schema The projected schema for reading the data.
-   * \param filter Optional filter expression to apply during reading.
-   * \return A Result containing an ArrowArrayStream, or an error on failure.
-   */
+  /// TODO(gangwu): move it to iceberg/data/task_scanner.h
+  ///
+  /// \brief Returns a C-ABI compatible ArrowArrayStream to read the data for 
this task.
+  ///
+  /// \param io The FileIO instance for accessing the file data.
+  /// \param projected_schema The projected schema for reading the data.
+  /// \return A Result containing an ArrowArrayStream, or an error on failure.
   Result<ArrowArrayStream> ToArrow(const std::shared_ptr<FileIO>& io,
-                                   const std::shared_ptr<Schema>& 
projected_schema,
-                                   const std::shared_ptr<Expression>& filter) 
const;
+                                   std::shared_ptr<Schema> projected_schema) 
const;
 
  private:
-  /// \brief Data file metadata.
   std::shared_ptr<DataFile> data_file_;
-  /// \brief Delete files that apply to this data file.
   std::vector<std::shared_ptr<DataFile>> delete_files_;
-  /// \brief Residual filter to apply after reading.
   std::shared_ptr<Expression> residual_filter_;
 };
 
-/// \brief Scan context holding snapshot and scan-specific metadata.
+namespace internal {
+
+// Internal table scan context used by different scan implementations.
 struct TableScanContext {
-  /// \brief Table metadata.
-  std::shared_ptr<TableMetadata> table_metadata;
-  /// \brief Snapshot to scan.
-  std::shared_ptr<Snapshot> snapshot;
-  /// \brief Projected schema.
-  std::shared_ptr<Schema> projected_schema;
-  /// \brief Filter expression to apply.
+  std::optional<int64_t> snapshot_id;
   std::shared_ptr<Expression> filter;
-  /// \brief Whether the scan is case-sensitive.
-  bool case_sensitive = false;
-  /// \brief Additional options for the scan.
+  bool ignore_residuals{false};
+  bool case_sensitive{true};
+  bool return_column_stats{false};
+  std::unordered_set<int32_t> columns_to_keep_stats;
+  std::vector<std::string> selected_columns;
+  std::shared_ptr<Schema> projected_schema;
   std::unordered_map<std::string, std::string> options;
-  /// \brief Optional limit on the number of rows to scan.
-  std::optional<int64_t> limit;
+  bool from_snapshot_id_inclusive{false};
+  std::optional<int64_t> from_snapshot_id;
+  std::optional<int64_t> to_snapshot_id;

Review Comment:
   How about defining a VersionRange or DataRange to express a specific version 
or version range?



##########
src/iceberg/table_scan.cc:
##########
@@ -135,186 +153,346 @@ Result<ArrowArrayStream> 
MakeArrowArrayStream(std::unique_ptr<Reader> reader) {
 
 }  // namespace
 
+namespace internal {
+
+Status TableScanContext::Validate() const {
+  if (!columns_to_keep_stats.empty() && !return_column_stats) {

Review Comment:
   Use ICEBERG_PRECHECK here?



##########
src/iceberg/table_scan.h:
##########
@@ -50,175 +63,293 @@ class ICEBERG_EXPORT FileScanTask : public ScanTask {
   ///
   /// \param data_file The data file to read.
   /// \param delete_files Delete files that apply to this data file.
-  /// \param residual_filter Optional residual filter to apply after reading.
+  /// \param filter Optional residual filter to apply after reading.
   explicit FileScanTask(std::shared_ptr<DataFile> data_file,
                         std::vector<std::shared_ptr<DataFile>> delete_files = 
{},
-                        std::shared_ptr<Expression> residual_filter = nullptr);
+                        std::shared_ptr<Expression> filter = nullptr);
 
   /// \brief The data file that should be read by this scan task.
-  const std::shared_ptr<DataFile>& data_file() const;
+  const std::shared_ptr<DataFile>& data_file() const { return data_file_; }
 
   /// \brief Delete files that apply to this data file.
-  const std::vector<std::shared_ptr<DataFile>>& delete_files() const;
+  const std::vector<std::shared_ptr<DataFile>>& delete_files() const {
+    return delete_files_;
+  }
 
   /// \brief Residual filter to apply after reading.
-  const std::shared_ptr<Expression>& residual_filter() const;
-
-  /// \brief Check if any deletes need to be applied.
-  bool has_deletes() const;
-
-  /// \brief Check if a residual filter needs to be applied.
-  bool has_residual_filter() const;
+  const std::shared_ptr<Expression>& residual_filter() const { return 
residual_filter_; }
 
+  Kind kind() const override { return Kind::kFileScanTask; }
   int64_t size_bytes() const override;
   int32_t files_count() const override;
   int64_t estimated_row_count() const override;
 
-  /**
-   * \brief Returns a C-ABI compatible ArrowArrayStream to read the data for 
this task.
-   *
-   * \param io The FileIO instance for accessing the file data.
-   * \param projected_schema The projected schema for reading the data.
-   * \param filter Optional filter expression to apply during reading.
-   * \return A Result containing an ArrowArrayStream, or an error on failure.
-   */
+  /// TODO(gangwu): move it to iceberg/data/task_scanner.h
+  ///
+  /// \brief Returns a C-ABI compatible ArrowArrayStream to read the data for 
this task.
+  ///
+  /// \param io The FileIO instance for accessing the file data.
+  /// \param projected_schema The projected schema for reading the data.
+  /// \return A Result containing an ArrowArrayStream, or an error on failure.
   Result<ArrowArrayStream> ToArrow(const std::shared_ptr<FileIO>& io,
-                                   const std::shared_ptr<Schema>& 
projected_schema,
-                                   const std::shared_ptr<Expression>& filter) 
const;
+                                   std::shared_ptr<Schema> projected_schema) 
const;
 
  private:
-  /// \brief Data file metadata.
   std::shared_ptr<DataFile> data_file_;
-  /// \brief Delete files that apply to this data file.
   std::vector<std::shared_ptr<DataFile>> delete_files_;
-  /// \brief Residual filter to apply after reading.
   std::shared_ptr<Expression> residual_filter_;
 };
 
-/// \brief Scan context holding snapshot and scan-specific metadata.
+namespace internal {
+
+// Internal table scan context used by different scan implementations.
 struct TableScanContext {
-  /// \brief Table metadata.
-  std::shared_ptr<TableMetadata> table_metadata;
-  /// \brief Snapshot to scan.
-  std::shared_ptr<Snapshot> snapshot;
-  /// \brief Projected schema.
-  std::shared_ptr<Schema> projected_schema;
-  /// \brief Filter expression to apply.
+  std::optional<int64_t> snapshot_id;
   std::shared_ptr<Expression> filter;
-  /// \brief Whether the scan is case-sensitive.
-  bool case_sensitive = false;
-  /// \brief Additional options for the scan.
+  bool ignore_residuals{false};
+  bool case_sensitive{true};
+  bool return_column_stats{false};
+  std::unordered_set<int32_t> columns_to_keep_stats;
+  std::vector<std::string> selected_columns;
+  std::shared_ptr<Schema> projected_schema;
   std::unordered_map<std::string, std::string> options;
-  /// \brief Optional limit on the number of rows to scan.
-  std::optional<int64_t> limit;
+  bool from_snapshot_id_inclusive{false};
+  std::optional<int64_t> from_snapshot_id;
+  std::optional<int64_t> to_snapshot_id;
+  std::string branch{};
+  std::optional<int64_t> min_rows_requested;
+
+  // Validate the context parameters to see if they have conflicts.
+  [[nodiscard]] Status Validate() const;
 };
 
+}  // namespace internal
+
 /// \brief Builder class for creating TableScan instances.
-class ICEBERG_EXPORT TableScanBuilder {
+class ICEBERG_EXPORT TableScanBuilder : public ErrorCollector {
  public:
   /// \brief Constructs a TableScanBuilder for the given table.
-  /// \param table_metadata The metadata of the table to scan.
-  /// \param file_io The FileIO instance for reading manifests and data files.
-  explicit TableScanBuilder(std::shared_ptr<TableMetadata> table_metadata,
-                            std::shared_ptr<FileIO> file_io);
-
-  /// \brief Sets the snapshot ID to scan.
-  /// \param snapshot_id The ID of the snapshot.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithSnapshotId(int64_t snapshot_id);
-
-  /// \brief Selects columns to include in the scan.
-  /// \param column_names A list of column names. If empty, all columns will 
be selected.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithColumnNames(std::vector<std::string> column_names);
-
-  /// \brief Sets the schema to use for the scan.
-  /// \param schema The schema to use.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithProjectedSchema(std::shared_ptr<Schema> schema);
-
-  /// \brief Applies a filter expression to the scan.
-  /// \param filter Filter expression to use.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithFilter(std::shared_ptr<Expression> filter);
-
-  /// \brief Sets whether the scan should be case-sensitive.
-  /// \param case_sensitive Whether the scan is case-sensitive.
-  /// /return Reference to the builder.
-  TableScanBuilder& WithCaseSensitive(bool case_sensitive);
-
-  /// \brief Sets an option for the scan.
-  /// \param property The name of the option.
-  /// \param value The value of the option.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithOption(std::string property, std::string value);
-
-  /// \brief Sets an optional limit on the number of rows to scan.
-  /// \param limit Optional limit on the number of rows.
-  /// \return Reference to the builder.
-  TableScanBuilder& WithLimit(std::optional<int64_t> limit);
+  /// \param metadata Current table metadata.
+  /// \param io FileIO instance for reading manifests files.
+  static Result<std::unique_ptr<TableScanBuilder>> Make(
+      std::shared_ptr<TableMetadata> metadata, std::shared_ptr<FileIO> io);
+
+  /// \brief Update property that will override the table's behavior
+  /// based on the incoming pair. Unknown properties will be ignored.
+  /// \param key name of the table property to be overridden
+  /// \param value value to override with
+  TableScanBuilder& Option(std::string key, std::string value);
+
+  /// \brief Set the projected schema.
+  /// \param schema a projection schema
+  TableScanBuilder& Project(std::shared_ptr<Schema> schema);
+
+  /// \brief If data columns are selected via Select(), controls whether
+  /// the match to the schema will be done with case sensitivity. Default is 
true.
+  /// \param case_sensitive whether the scan is case-sensitive
+  TableScanBuilder& CaseSensitive(bool case_sensitive);
+
+  /// \brief Request this scan to load the column stats with each data file.
+  ///
+  /// Column stats include: value count, null value count, lower bounds, and 
upper bounds.
+  TableScanBuilder& IncludeColumnStats();
+
+  /// \brief Request this scan to load the column stats for the specific 
columns with each
+  /// data file.
+  ///
+  /// Column stats include: value count, null value count, lower bounds, and 
upper bounds.
+  ///
+  /// \param requested_columns column names for which to keep the stats.
+  TableScanBuilder& IncludeColumnStats(const std::vector<std::string>& 
requested_columns);
+
+  /// \brief Request this scan to read the given data columns.
+  ///
+  /// This produces an expected schema that includes all fields that are 
either selected
+  /// or used by this scan's filter expression.
+  ///
+  /// \param column_names column names from the table's schema
+  TableScanBuilder& Select(const std::vector<std::string>& column_names);
+
+  /// \brief Set the expression to filter data.
+  /// \param filter a filter expression
+  TableScanBuilder& Filter(std::shared_ptr<Expression> filter);
+
+  /// \brief Request data filtering to files but not to rows in those files.
+  TableScanBuilder& IgnoreResiduals();
+
+  /// \brief Request this can to return at least the given number of rows.
+  ///
+  /// This is used as a hint and is entirely optional in order to not have to 
return more
+  /// rows than necessary. This may return fewer rows if the scan does not 
contain that
+  /// many, or it may return more than requested.
+  ///
+  /// \param num_rows The minimum number of rows requested
+  TableScanBuilder& MinRowsRequested(int64_t num_rows);
+
+  /// \brief Request this scan to use the given snapshot by ID.
+  /// \param snapshot_id a snapshot ID
+  /// \note InvalidArgument will be returned if the snapshot cannot be found
+  TableScanBuilder& UseSnapshot(int64_t snapshot_id);
+
+  /// \brief Request this scan to use the given reference.
+  /// \param ref reference
+  /// \note InvalidArgument will be returned if a reference with the given name
+  /// could not be found
+  TableScanBuilder& UseRef(const std::string& ref);
+
+  /// \brief Request this scan to use the most recent snapshot as of the given 
time
+  /// in milliseconds on the branch in the scan or main if no branch is set.
+  /// \param timestamp_millis a timestamp in milliseconds.
+  /// \note InvalidArgument will be returned if the snapshot cannot be found 
or time
+  /// travel is attempted on a tag
+  TableScanBuilder& AsOfTime(int64_t timestamp_millis);
+
+  /// \brief Instructs this scan to look for changes starting from a 
particular snapshot
+  /// (inclusive).
+  ///
+  /// If the start snapshot is not configured, it defaults to the oldest 
ancestor of the
+  /// end snapshot (inclusive).
+  ///
+  /// \param from_snapshot_id the start snapshot ID (inclusive)
+  /// \note InvalidArgument will be returned if the start snapshot is not an 
ancestor of
+  /// the end snapshot
+  TableScanBuilder& FromSnapshotInclusive(int64_t from_snapshot_id);

Review Comment:
   TableScanBuilder& FromSnapshot(int64_t from_snapshot_id, bool inclusive);
   TableScanBuilder& FromSnapshot(const std::string& ref,  bool inclusive);
   
   How about merging the FromSnapshot interfaces into the two overloads above?
Could `ref` also be changed to `tag`? A SnapshotRef might be a branch or a tag,
so using `ref` here would be a bit strange.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat: extend table scan to support v2 deletes [iceberg-cpp]

Reply via email to