liurenjie1024 commented on code in PR #129: URL: https://github.com/apache/iceberg-rust/pull/129#discussion_r1436018238
########## crates/iceberg/src/table.rs: ########## @@ -42,8 +50,624 @@ impl Table { &self.metadata } + /// Returns current metadata ref. + pub fn metadata_ref(&self) -> TableMetadataRef { + self.metadata.clone() + } + /// Returns current metadata location. pub fn metadata_location(&self) -> Option<&str> { self.metadata_location.as_deref() } + + /// Creates a table scan. + pub fn scan(&self) -> TableScanBuilder<'_> { + TableScanBuilder::new(self) + } +} + +/// Builder to create table scan. +pub struct TableScanBuilder<'a> { + table: &'a Table, + // Empty column names means to select all columns + column_names: Vec<String>, + limit: Option<usize>, + case_sensitive: bool, + snapshot_id: Option<i64>, +} + +impl<'a> TableScanBuilder<'a> { + fn new(table: &'a Table) -> Self { + Self { + table, + column_names: vec![], + case_sensitive: false, + snapshot_id: None, + limit: None, + } + } + + /// Select all columns. + pub fn select_all(mut self) -> Self { + self.column_names.clear(); + self + } + + /// Select some columns of the table. + pub fn select(mut self, column_names: impl IntoIterator<Item = impl ToString>) -> Self { + self.column_names = column_names + .into_iter() + .map(|item| item.to_string()) + .collect(); + self + } + + /// Limit the number of rows returned. + /// + /// If not set, all rows will be returned. + /// If set, the value must be greater than 0. + pub fn limit(mut self, limit: usize) -> Self { + self.limit = Some(limit); + self + } + + /// Set the snapshot to scan. When not set, it uses current snapshot. + pub fn snapshot_id(mut self, snapshot_id: i64) -> Self { + self.snapshot_id = Some(snapshot_id); + self + } + + /// Build the table scan. + pub fn build(self) -> Result<TableScan> { + let snapshot = match self.snapshot_id { + Some(snapshot_id) => self + .table + .metadata() + .snapshot_by_id(snapshot_id) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Snapshot with id {} not found", snapshot_id), + ) + })? 
+ .clone(), + None => self + .table + .metadata() + .current_snapshot() + .ok_or_else(|| { + Error::new( + ErrorKind::FeatureUnsupported, + "Can't scan table without snapshots", + ) + })? + .clone(), + }; + + let schema = snapshot.schema(self.table.metadata())?; + + // Check that all column names exist in the schema. + if !self.column_names.is_empty() { + for column_name in &self.column_names { + if schema.field_by_name(column_name).is_none() { + return Err(Error::new( + ErrorKind::DataInvalid, + format!("Column {} not found in table.", column_name), + )); + } + } + } + + Ok(TableScan { + column_names: self.column_names.clone(), + limit: None, Review Comment: Good catch! > Also curious how are you planning on enforcing the limit? I think this helps to push the limit down to concrete scan tasks, but we can't enforce it in a distributed environment. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org