wgtmac commented on code in PR #489:
URL: https://github.com/apache/iceberg-cpp/pull/489#discussion_r2664097868
##########
src/iceberg/table_scan.h:
##########
@@ -50,175 +63,293 @@ class ICEBERG_EXPORT FileScanTask : public ScanTask {
///
/// \param data_file The data file to read.
/// \param delete_files Delete files that apply to this data file.
- /// \param residual_filter Optional residual filter to apply after reading.
+ /// \param filter Optional residual filter to apply after reading.
explicit FileScanTask(std::shared_ptr<DataFile> data_file,
std::vector<std::shared_ptr<DataFile>> delete_files =
{},
- std::shared_ptr<Expression> residual_filter = nullptr);
+ std::shared_ptr<Expression> filter = nullptr);
/// \brief The data file that should be read by this scan task.
- const std::shared_ptr<DataFile>& data_file() const;
+ const std::shared_ptr<DataFile>& data_file() const { return data_file_; }
/// \brief Delete files that apply to this data file.
- const std::vector<std::shared_ptr<DataFile>>& delete_files() const;
+ const std::vector<std::shared_ptr<DataFile>>& delete_files() const {
+ return delete_files_;
+ }
/// \brief Residual filter to apply after reading.
- const std::shared_ptr<Expression>& residual_filter() const;
-
- /// \brief Check if any deletes need to be applied.
- bool has_deletes() const;
-
- /// \brief Check if a residual filter needs to be applied.
- bool has_residual_filter() const;
+ const std::shared_ptr<Expression>& residual_filter() const { return
residual_filter_; }
+ Kind kind() const override { return Kind::kFileScanTask; }
int64_t size_bytes() const override;
int32_t files_count() const override;
int64_t estimated_row_count() const override;
- /**
- * \brief Returns a C-ABI compatible ArrowArrayStream to read the data for
this task.
- *
- * \param io The FileIO instance for accessing the file data.
- * \param projected_schema The projected schema for reading the data.
- * \param filter Optional filter expression to apply during reading.
- * \return A Result containing an ArrowArrayStream, or an error on failure.
- */
+ /// TODO(gangwu): move it to iceberg/data/task_scanner.h
+ ///
+ /// \brief Returns a C-ABI compatible ArrowArrayStream to read the data for
this task.
+ ///
+ /// \param io The FileIO instance for accessing the file data.
+ /// \param projected_schema The projected schema for reading the data.
+ /// \return A Result containing an ArrowArrayStream, or an error on failure.
Result<ArrowArrayStream> ToArrow(const std::shared_ptr<FileIO>& io,
- const std::shared_ptr<Schema>&
projected_schema,
- const std::shared_ptr<Expression>& filter)
const;
+ std::shared_ptr<Schema> projected_schema)
const;
private:
- /// \brief Data file metadata.
std::shared_ptr<DataFile> data_file_;
- /// \brief Delete files that apply to this data file.
std::vector<std::shared_ptr<DataFile>> delete_files_;
- /// \brief Residual filter to apply after reading.
std::shared_ptr<Expression> residual_filter_;
};
-/// \brief Scan context holding snapshot and scan-specific metadata.
+namespace internal {
+
+// Internal table scan context used by different scan implementations.
struct TableScanContext {
- /// \brief Table metadata.
- std::shared_ptr<TableMetadata> table_metadata;
- /// \brief Snapshot to scan.
- std::shared_ptr<Snapshot> snapshot;
- /// \brief Projected schema.
- std::shared_ptr<Schema> projected_schema;
- /// \brief Filter expression to apply.
+ std::optional<int64_t> snapshot_id;
std::shared_ptr<Expression> filter;
- /// \brief Whether the scan is case-sensitive.
- bool case_sensitive = false;
- /// \brief Additional options for the scan.
+ bool ignore_residuals{false};
+ bool case_sensitive{true};
+ bool return_column_stats{false};
+ std::unordered_set<int32_t> columns_to_keep_stats;
+ std::vector<std::string> selected_columns;
+ std::shared_ptr<Schema> projected_schema;
std::unordered_map<std::string, std::string> options;
- /// \brief Optional limit on the number of rows to scan.
- std::optional<int64_t> limit;
+ bool from_snapshot_id_inclusive{false};
+ std::optional<int64_t> from_snapshot_id;
+ std::optional<int64_t> to_snapshot_id;
+ std::string branch{};
+ std::optional<int64_t> min_rows_requested;
+
+ // Validate the context parameters to see if they have conflicts.
+ [[nodiscard]] Status Validate() const;
};
+} // namespace internal
+
/// \brief Builder class for creating TableScan instances.
-class ICEBERG_EXPORT TableScanBuilder {
+class ICEBERG_EXPORT TableScanBuilder : public ErrorCollector {
public:
/// \brief Constructs a TableScanBuilder for the given table.
- /// \param table_metadata The metadata of the table to scan.
- /// \param file_io The FileIO instance for reading manifests and data files.
- explicit TableScanBuilder(std::shared_ptr<TableMetadata> table_metadata,
- std::shared_ptr<FileIO> file_io);
-
- /// \brief Sets the snapshot ID to scan.
- /// \param snapshot_id The ID of the snapshot.
- /// \return Reference to the builder.
- TableScanBuilder& WithSnapshotId(int64_t snapshot_id);
-
- /// \brief Selects columns to include in the scan.
- /// \param column_names A list of column names. If empty, all columns will
be selected.
- /// \return Reference to the builder.
- TableScanBuilder& WithColumnNames(std::vector<std::string> column_names);
-
- /// \brief Sets the schema to use for the scan.
- /// \param schema The schema to use.
- /// \return Reference to the builder.
- TableScanBuilder& WithProjectedSchema(std::shared_ptr<Schema> schema);
-
- /// \brief Applies a filter expression to the scan.
- /// \param filter Filter expression to use.
- /// \return Reference to the builder.
- TableScanBuilder& WithFilter(std::shared_ptr<Expression> filter);
-
- /// \brief Sets whether the scan should be case-sensitive.
- /// \param case_sensitive Whether the scan is case-sensitive.
- /// /return Reference to the builder.
- TableScanBuilder& WithCaseSensitive(bool case_sensitive);
-
- /// \brief Sets an option for the scan.
- /// \param property The name of the option.
- /// \param value The value of the option.
- /// \return Reference to the builder.
- TableScanBuilder& WithOption(std::string property, std::string value);
-
- /// \brief Sets an optional limit on the number of rows to scan.
- /// \param limit Optional limit on the number of rows.
- /// \return Reference to the builder.
- TableScanBuilder& WithLimit(std::optional<int64_t> limit);
+ /// \param metadata Current table metadata.
+ /// \param io FileIO instance for reading manifest files.
+ static Result<std::unique_ptr<TableScanBuilder>> Make(
+ std::shared_ptr<TableMetadata> metadata, std::shared_ptr<FileIO> io);
+
+ /// \brief Update property that will override the table's behavior
+ /// based on the incoming pair. Unknown properties will be ignored.
+ /// \param key name of the table property to be overridden
+ /// \param value value to override with
+ TableScanBuilder& Option(std::string key, std::string value);
+
+ /// \brief Set the projected schema.
+ /// \param schema a projection schema
+ TableScanBuilder& Project(std::shared_ptr<Schema> schema);
+
+ /// \brief If data columns are selected via Select(), controls whether
+ /// the match to the schema will be done with case sensitivity. Default is
true.
+ /// \param case_sensitive whether the scan is case-sensitive
+ TableScanBuilder& CaseSensitive(bool case_sensitive);
+
+ /// \brief Request this scan to load the column stats with each data file.
+ ///
+ /// Column stats include: value count, null value count, lower bounds, and
upper bounds.
+ TableScanBuilder& IncludeColumnStats();
+
+ /// \brief Request this scan to load the column stats for the specific
columns with each
+ /// data file.
+ ///
+ /// Column stats include: value count, null value count, lower bounds, and
upper bounds.
+ ///
+ /// \param requested_columns column names for which to keep the stats.
+ TableScanBuilder& IncludeColumnStats(const std::vector<std::string>&
requested_columns);
+
+ /// \brief Request this scan to read the given data columns.
+ ///
+ /// This produces an expected schema that includes all fields that are
either selected
+ /// or used by this scan's filter expression.
+ ///
+ /// \param column_names column names from the table's schema
+ TableScanBuilder& Select(const std::vector<std::string>& column_names);
+
+ /// \brief Set the expression to filter data.
+ /// \param filter a filter expression
+ TableScanBuilder& Filter(std::shared_ptr<Expression> filter);
+
+ /// \brief Request data filtering to files but not to rows in those files.
+ TableScanBuilder& IgnoreResiduals();
+
+ /// \brief Request this scan to return at least the given number of rows.
+ ///
+ /// This is used as a hint and is entirely optional in order to not have to
return more
+ /// rows than necessary. This may return fewer rows if the scan does not
contain that
+ /// many, or it may return more than requested.
+ ///
+ /// \param num_rows The minimum number of rows requested
+ TableScanBuilder& MinRowsRequested(int64_t num_rows);
+
+ /// \brief Request this scan to use the given snapshot by ID.
+ /// \param snapshot_id a snapshot ID
+ /// \note InvalidArgument will be returned if the snapshot cannot be found
+ TableScanBuilder& UseSnapshot(int64_t snapshot_id);
+
+ /// \brief Request this scan to use the given reference.
+ /// \param ref reference
+ /// \note InvalidArgument will be returned if a reference with the given name
+ /// could not be found
+ TableScanBuilder& UseRef(const std::string& ref);
+
+ /// \brief Request this scan to use the most recent snapshot as of the given
time
+ /// in milliseconds on the branch in the scan or main if no branch is set.
+ /// \param timestamp_millis a timestamp in milliseconds.
+ /// \note InvalidArgument will be returned if the snapshot cannot be found
or time
+ /// travel is attempted on a tag
+ TableScanBuilder& AsOfTime(int64_t timestamp_millis);
+
+ /// \brief Instructs this scan to look for changes starting from a
particular snapshot
+ /// (inclusive).
+ ///
+ /// If the start snapshot is not configured, it defaults to the oldest
ancestor of the
+ /// end snapshot (inclusive).
+ ///
+ /// \param from_snapshot_id the start snapshot ID (inclusive)
+ /// \note InvalidArgument will be returned if the start snapshot is not an
ancestor of
+ /// the end snapshot
+ TableScanBuilder& FromSnapshotInclusive(int64_t from_snapshot_id);
Review Comment:
It makes sense to combine them. I don't think the ref applies to tags.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]