alamb commented on code in PR #7244:
URL: https://github.com/apache/arrow-datafusion/pull/7244#discussion_r1290137396
##########
datafusion/core/src/datasource/file_format/parquet.rs:
##########
@@ -543,6 +574,172 @@ async fn fetch_statistics(
     Ok(statistics)
 }
+/// Implements [`DataSink`] for writing to a parquet file.
+struct ParquetSink {
+    /// Config options for writing data
+    config: FileSinkConfig,
+}
+
+impl Debug for ParquetSink {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("ParquetSink").finish()
+    }
+}
+
+impl DisplayAs for ParquetSink {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(
+                    f,
+                    "ParquetSink(writer_mode={:?}, file_groups=",
+                    self.config.writer_mode
+                )?;
+                FileGroupDisplay(&self.config.file_groups).fmt_as(t, f)?;
+                write!(f, ")")
+            }
+        }
+    }
+}
+
+impl ParquetSink {
+    fn new(config: FileSinkConfig) -> Self {
+        Self { config }
+    }
+
+    /// Builds a parquet WriterProperties struct, setting options as appropriate from TaskContext options
+    fn parquet_writer_props_from_context(
+        &self,
+        context: &Arc<TaskContext>,
+    ) -> WriterProperties {
+        let parquet_context = &context.session_config().options().execution.parquet;
+        let mut builder = WriterProperties::builder()
+            .set_created_by(parquet_context.created_by.clone())
+            .set_data_page_row_count_limit(parquet_context.data_page_row_count_limit)
+            .set_data_page_size_limit(parquet_context.data_pagesize_limit);
Review Comment:
> Otherwise, it would seem that the setting is valid and only fail much later when a write is attempted.

I agree this is a non-ideal UX.

However, I think storing the config parameters as strings would be acceptable as an initial implementation, especially as we don't apply validation yet for other config settings:
```
❯ set datafusion.sql_parser.dialect = 'wowsa';
0 rows in set. Query took 0.000 seconds.

❯ select 'foo';
Error during planning: Unsupported SQL dialect: wowsa. Available dialects: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi
```
Mostly I am trying to make sure you don't feel you have to fix everything in one go (though of course, if you wanted to add a generic validation framework for config options, that would be amazing).
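Purely as illustration of the "generic validation framework" idea (this is not DataFusion's actual config API; `ConfigRegistry`, `register`, and the validator signature below are all made up for this sketch), here is one shape it could take, where a bad value fails at `SET` time instead of on the next query:
```
use std::collections::HashMap;

/// A validator returns an error message when a proposed value is invalid.
type Validator = fn(&str) -> Result<(), String>;

#[derive(Default)]
struct ConfigRegistry {
    validators: HashMap<&'static str, Validator>,
    values: HashMap<String, String>,
}

impl ConfigRegistry {
    /// Register a per-option validator (hypothetical API).
    fn register(&mut self, key: &'static str, validator: Validator) {
        self.validators.insert(key, validator);
    }

    /// Validate eagerly at `SET` time, then store the raw string value.
    fn set(&mut self, key: &str, value: &str) -> Result<(), String> {
        if let Some(validate) = self.validators.get(key) {
            validate(value).map_err(|e| format!("invalid value for {key}: {e}"))?;
        }
        self.values.insert(key.to_string(), value.to_string());
        Ok(())
    }
}

fn main() {
    let mut registry = ConfigRegistry::default();
    registry.register("datafusion.sql_parser.dialect", |v| {
        // Abbreviated dialect list, for illustration only.
        const DIALECTS: &[&str] = &["Generic", "MySQL", "PostgreSQL"];
        if DIALECTS.iter().any(|d| d.eq_ignore_ascii_case(v)) {
            Ok(())
        } else {
            Err(format!("unsupported dialect: {v}"))
        }
    });

    // The bad value is rejected at SET time, not when the next query is planned:
    assert!(registry.set("datafusion.sql_parser.dialect", "wowsa").is_err());
    assert!(registry.set("datafusion.sql_parser.dialect", "PostgreSQL").is_ok());
}
```
The point of the sketch is just that validation lives next to the option definition, so string-typed options could keep their current storage while gaining eager checks incrementally.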