houqp commented on a change in pull request #932: URL: https://github.com/apache/arrow-datafusion/pull/932#discussion_r696268779
########## File path: ballista/rust/core/src/serde/logical_plan/to_proto.rs ########## @@ -253,6 +256,44 @@ impl TryInto<DataType> for &protobuf::ArrowType { } } +#[allow(clippy::from_over_into)] +impl Into<protobuf::Statistics> for Statistics { + fn into(self) -> protobuf::Statistics { + let none_value = -1_i64; + protobuf::Statistics { + num_rows: self.num_rows.map(|n| n as i64).unwrap_or(none_value), + total_byte_size: self.total_byte_size.map(|n| n as i64).unwrap_or(none_value), + column_stats: vec![], Review comment: what's the reason for always setting column_stats to empty vector here? ########## File path: ballista/rust/scheduler/src/lib.rs ########## @@ -282,24 +282,18 @@ impl SchedulerGrpc for SchedulerServer { match file_type { FileType::Parquet => { - let parquet_exec = - ParquetExec::try_from_path(&path, None, None, 1024, 1, None) - .map_err(|e| { - let msg = format!("Error opening parquet files: {}", e); - error!("{}", msg); - tonic::Status::internal(msg) - })?; + let parquet_desc = ParquetTableDescriptor::new(&path).map_err(|e| { + let msg = format!("Error opening parquet files: {}", e); + error!("{}", msg); + tonic::Status::internal(msg) + })?; //TODO include statistics and any other info needed to reconstruct ParquetExec Ok(Response::new(GetFileMetadataResult { - schema: Some(parquet_exec.schema().as_ref().into()), - partitions: parquet_exec - .partitions() - .iter() - .map(|part| FilePartitionMetadata { - filename: part.filenames().to_vec(), - }) - .collect(), + schema: Some(parquet_desc.schema().as_ref().into()), + partitions: vec![FilePartitionMetadata { + filename: vec![path], Review comment: I remember we discussed this in the original PR. After taking a second look at the code, I am still not fully following the change here. 
The old behavior has `FilePartitionMetadata.filename` set to a vector of file paths returned from a directory list, while the new behavior here has the filename always set to a vector with a single entry whose value is the root path of the table. Shouldn't we use `parquet_desc.descriptor.descriptor` to build the filename vector here instead? ########## File path: ballista/rust/core/src/serde/logical_plan/from_proto.rs ########## @@ -301,6 +315,48 @@ impl TryInto<LogicalPlan> for &protobuf::LogicalPlanNode { } } +impl TryInto<TableDescriptor> for &protobuf::TableDescriptor { + type Error = BallistaError; + + fn try_into(self) -> Result<TableDescriptor, Self::Error> { + let partition_files = self + .partition_files + .iter() + .map(|f| f.try_into()) + .collect::<Result<Vec<PartitionedFile>, _>>()?; + let schema = convert_required!(self.schema)?; + Ok(TableDescriptor { + path: self.path.to_owned(), + partition_files, + schema: Arc::new(schema), + }) + } +} + +impl TryInto<PartitionedFile> for &protobuf::PartitionedFile { + type Error = BallistaError; + + fn try_into(self) -> Result<PartitionedFile, Self::Error> { + let statistics = convert_required!(self.statistics)?; + Ok(PartitionedFile { + file_path: self.path.clone(), + statistics, + }) + } +} + +impl TryInto<Statistics> for &protobuf::Statistics { + type Error = BallistaError; + + fn try_into(self) -> Result<Statistics, Self::Error> { + Ok(Statistics { + num_rows: Some(self.num_rows as usize), + total_byte_size: Some(self.total_byte_size as usize), + column_statistics: None, Review comment: what's the reason for always setting column_statistics to None here? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org