This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 556c5ff819 Cleanup CSV WriterBuilder, Default to AutoSI Second
Precision (#4735) (#4909)
556c5ff819 is described below
commit 556c5ff8193665bc4cbba80505d517f4d5b8b601
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Oct 11 07:58:20 2023 +0100
Cleanup CSV WriterBuilder, Default to AutoSI Second Precision (#4735)
(#4909)
* Cleanup CSV WriterBuilder (#4735)
* Update test
* Review feedback
* Clippy
---
arrow-csv/src/writer.rs | 134 ++++++++++++++++++++++++++++--------------------
arrow/tests/csv.rs | 42 ---------------
2 files changed, 79 insertions(+), 97 deletions(-)
diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs
index 840e8e8a93..1ca956e2c7 100644
--- a/arrow-csv/src/writer.rs
+++ b/arrow-csv/src/writer.rs
@@ -70,11 +70,6 @@ use csv::ByteRecord;
use std::io::Write;
use crate::map_csv_error;
-
-const DEFAULT_DATE_FORMAT: &str = "%F";
-const DEFAULT_TIME_FORMAT: &str = "%T";
-const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f";
-const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z";
const DEFAULT_NULL_VALUE: &str = "";
/// A CSV writer
@@ -82,41 +77,29 @@ const DEFAULT_NULL_VALUE: &str = "";
pub struct Writer<W: Write> {
/// The object to write to
writer: csv::Writer<W>,
- /// Whether file should be written with headers. Defaults to `true`
+ /// Whether file should be written with headers, defaults to `true`
has_headers: bool,
- /// The date format for date arrays
+ /// The date format for date arrays, defaults to RFC3339
date_format: Option<String>,
- /// The datetime format for datetime arrays
+ /// The datetime format for datetime arrays, defaults to RFC3339
datetime_format: Option<String>,
- /// The timestamp format for timestamp arrays
+ /// The timestamp format for timestamp arrays, defaults to RFC3339
timestamp_format: Option<String>,
- /// The timestamp format for timestamp (with timezone) arrays
+ /// The timestamp format for timestamp (with timezone) arrays, defaults to
RFC3339
timestamp_tz_format: Option<String>,
- /// The time format for time arrays
+ /// The time format for time arrays, defaults to RFC3339
time_format: Option<String>,
/// Is the beginning-of-writer
beginning: bool,
- /// The value to represent null entries
- null_value: String,
+ /// The value to represent null entries, defaults to [`DEFAULT_NULL_VALUE`]
+ null_value: Option<String>,
}
impl<W: Write> Writer<W> {
/// Create a new CsvWriter from a writable object, with default options
pub fn new(writer: W) -> Self {
let delimiter = b',';
- let mut builder = csv::WriterBuilder::new();
- let writer = builder.delimiter(delimiter).from_writer(writer);
- Writer {
- writer,
- has_headers: true,
- date_format: Some(DEFAULT_DATE_FORMAT.to_string()),
- datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
- time_format: Some(DEFAULT_TIME_FORMAT.to_string()),
- timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
- timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()),
- beginning: true,
- null_value: DEFAULT_NULL_VALUE.to_string(),
- }
+ WriterBuilder::new().with_delimiter(delimiter).build(writer)
}
/// Write a vector of record batches to a writable object
@@ -138,7 +121,7 @@ impl<W: Write> Writer<W> {
}
let options = FormatOptions::default()
- .with_null(&self.null_value)
+
.with_null(self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE))
.with_date_format(self.date_format.as_deref())
.with_datetime_format(self.datetime_format.as_deref())
.with_timestamp_format(self.timestamp_format.as_deref())
@@ -207,9 +190,9 @@ impl<W: Write> RecordBatchWriter for Writer<W> {
#[derive(Clone, Debug)]
pub struct WriterBuilder {
/// Optional column delimiter. Defaults to `b','`
- delimiter: Option<u8>,
+ delimiter: u8,
/// Whether to write column names as file headers. Defaults to `true`
- has_headers: bool,
+ has_header: bool,
/// Optional date format for date arrays
date_format: Option<String>,
/// Optional datetime format for datetime arrays
@@ -227,14 +210,14 @@ pub struct WriterBuilder {
impl Default for WriterBuilder {
fn default() -> Self {
Self {
- has_headers: true,
- delimiter: None,
- date_format: Some(DEFAULT_DATE_FORMAT.to_string()),
- datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
- time_format: Some(DEFAULT_TIME_FORMAT.to_string()),
- timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
- timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()),
- null_value: Some(DEFAULT_NULL_VALUE.to_string()),
+ has_header: true,
+ delimiter: b',',
+ date_format: None,
+ datetime_format: None,
+ time_format: None,
+ timestamp_format: None,
+ timestamp_tz_format: None,
+ null_value: None,
}
}
}
@@ -254,7 +237,7 @@ impl WriterBuilder {
/// let file = File::create("target/out.csv").unwrap();
///
/// // create a builder that doesn't write headers
- /// let builder = WriterBuilder::new().has_headers(false);
+ /// let builder = WriterBuilder::new().with_header(false);
/// let writer = builder.build(file);
///
/// writer
@@ -265,48 +248,92 @@ impl WriterBuilder {
}
/// Set whether to write headers
+ #[deprecated(note = "Use Self::with_header")]
+ #[doc(hidden)]
pub fn has_headers(mut self, has_headers: bool) -> Self {
- self.has_headers = has_headers;
+ self.has_header = has_headers;
+ self
+ }
+
+ /// Set whether to write the CSV file with a header
+ pub fn with_header(mut self, header: bool) -> Self {
+ self.has_header = header;
self
}
+ /// Returns `true` if this writer is configured to write a header
+ pub fn header(&self) -> bool {
+ self.has_header
+ }
+
/// Set the CSV file's column delimiter as a byte character
pub fn with_delimiter(mut self, delimiter: u8) -> Self {
- self.delimiter = Some(delimiter);
+ self.delimiter = delimiter;
self
}
+ /// Get the CSV file's column delimiter as a byte character
+ pub fn delimiter(&self) -> u8 {
+ self.delimiter
+ }
+
/// Set the CSV file's date format
pub fn with_date_format(mut self, format: String) -> Self {
self.date_format = Some(format);
self
}
+ /// Get the CSV file's date format if set, defaults to RFC3339
+ pub fn date_format(&self) -> Option<&str> {
+ self.date_format.as_deref()
+ }
+
/// Set the CSV file's datetime format
pub fn with_datetime_format(mut self, format: String) -> Self {
self.datetime_format = Some(format);
self
}
+ /// Get the CSV file's datetime format if set, defaults to RFC3339
+ pub fn datetime_format(&self) -> Option<&str> {
+ self.datetime_format.as_deref()
+ }
+
/// Set the CSV file's time format
pub fn with_time_format(mut self, format: String) -> Self {
self.time_format = Some(format);
self
}
+ /// Get the CSV file's datetime time if set, defaults to RFC3339
+ pub fn time_format(&self) -> Option<&str> {
+ self.time_format.as_deref()
+ }
+
/// Set the CSV file's timestamp format
pub fn with_timestamp_format(mut self, format: String) -> Self {
self.timestamp_format = Some(format);
self
}
+ /// Get the CSV file's timestamp format if set, defaults to RFC3339
+ pub fn timestamp_format(&self) -> Option<&str> {
+ self.timestamp_format.as_deref()
+ }
+
/// Set the value to represent null in output
pub fn with_null(mut self, null_value: String) -> Self {
self.null_value = Some(null_value);
self
}
- /// Use RFC3339 format for date/time/timestamps
+ /// Get the value to represent null in output
+ pub fn null(&self) -> &str {
+ self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE)
+ }
+
+ /// Use RFC3339 format for date/time/timestamps (default)
+ #[deprecated(note = "Use WriterBuilder::default()")]
pub fn with_rfc3339(mut self) -> Self {
self.date_format = None;
self.datetime_format = None;
@@ -318,21 +345,18 @@ impl WriterBuilder {
/// Create a new `Writer`
pub fn build<W: Write>(self, writer: W) -> Writer<W> {
- let delimiter = self.delimiter.unwrap_or(b',');
let mut builder = csv::WriterBuilder::new();
- let writer = builder.delimiter(delimiter).from_writer(writer);
+ let writer = builder.delimiter(self.delimiter).from_writer(writer);
Writer {
writer,
- has_headers: self.has_headers,
+ beginning: true,
+ has_headers: self.has_header,
date_format: self.date_format,
datetime_format: self.datetime_format,
time_format: self.time_format,
timestamp_format: self.timestamp_format,
timestamp_tz_format: self.timestamp_tz_format,
- beginning: true,
- null_value: self
- .null_value
- .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()),
+ null_value: self.null_value,
}
}
}
@@ -411,11 +435,11 @@ mod tests {
let expected = r#"c1,c2,c3,c4,c5,c6,c7
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
-consectetur adipiscing
elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
-sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
+consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes
+sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
-consectetur adipiscing
elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
-sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
+consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes
+sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
"#;
assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap());
}
@@ -512,7 +536,7 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
let mut file = tempfile::tempfile().unwrap();
let builder = WriterBuilder::new()
- .has_headers(false)
+ .with_header(false)
.with_delimiter(b'|')
.with_null("NULL".to_string())
.with_time_format("%r".to_string());
@@ -560,7 +584,7 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
)
.unwrap();
- let builder = WriterBuilder::new().has_headers(false);
+ let builder = WriterBuilder::new().with_header(false);
let mut buf: Cursor<Vec<u8>> = Default::default();
// drop the writer early to release the borrow.
@@ -652,7 +676,7 @@ sed do eiusmod
tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
let mut file = tempfile::tempfile().unwrap();
- let builder = WriterBuilder::new().with_rfc3339();
+ let builder = WriterBuilder::new();
let mut writer = builder.build(&mut file);
let batches = vec![&batch];
for batch in batches {
diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs
index 3ee3191017..a79b6b44c2 100644
--- a/arrow/tests/csv.rs
+++ b/arrow/tests/csv.rs
@@ -53,48 +53,6 @@ fn test_export_csv_timestamps() {
}
drop(writer);
- let left = "c1,c2
-2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000
-2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n";
- let right = String::from_utf8(sw).unwrap();
- assert_eq!(left, right);
-}
-
-#[test]
-fn test_export_csv_timestamps_using_rfc3339() {
- let schema = Schema::new(vec![
- Field::new(
- "c1",
- DataType::Timestamp(TimeUnit::Millisecond,
Some("Australia/Sydney".into())),
- true,
- ),
- Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None),
true),
- ]);
-
- let c1 = TimestampMillisecondArray::from(
- // 1555584887 converts to 2019-04-18, 20:54:47 in time zone
Australia/Sydney (AEST).
- // The offset (difference to UTC) is +10:00.
- // 1635577147 converts to 2021-10-30 17:59:07 in time zone
Australia/Sydney (AEDT)
- // The offset (difference to UTC) is +11:00. Note that daylight
savings is in effect on 2021-10-30.
- //
- vec![Some(1555584887378), Some(1635577147000)],
- )
- .with_timezone("Australia/Sydney");
- let c2 =
- TimestampMillisecondArray::from(vec![Some(1555584887378),
Some(1635577147000)]);
- let batch =
- RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1),
Arc::new(c2)]).unwrap();
-
- let mut sw = Vec::new();
- let mut writer = arrow_csv::WriterBuilder::new()
- .with_rfc3339()
- .build(&mut sw);
- let batches = vec![&batch];
- for batch in batches {
- writer.write(batch).unwrap();
- }
- drop(writer);
-
let left = "c1,c2
2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378
2021-10-30T17:59:07+11:00,2021-10-30T06:59:07\n";