This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 4964d84431 Add `ReaderBuilder::with_header` for csv reader (#4949)
4964d84431 is described below

commit 4964d844313d5e62cf102616d26864dca6fe286e
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Oct 18 14:18:52 2023 +0100

    Add `ReaderBuilder::with_header` for csv reader (#4949)
    
    * Add ReaderBuilder::with_header
    
    * Update test
---
 arrow-csv/examples/csv_calculation.rs |  2 +-
 arrow-csv/src/reader/mod.rs           | 48 ++++++++++++++++++++---------------
 arrow/benches/csv_reader.rs           |  2 +-
 parquet/src/bin/parquet-fromcsv.rs    |  6 ++---
 4 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/arrow-csv/examples/csv_calculation.rs b/arrow-csv/examples/csv_calculation.rs
index 12aaadde44..6ce963e2b0 100644
--- a/arrow-csv/examples/csv_calculation.rs
+++ b/arrow-csv/examples/csv_calculation.rs
@@ -33,7 +33,7 @@ fn main() {
         Field::new("c4", DataType::Boolean, true),
     ]);
     let mut reader = ReaderBuilder::new(Arc::new(csv_schema))
-        .has_header(true)
+        .with_header(true)
         .build(file)
         .unwrap();
 
diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 1106b16bc4..a194b35ffa 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -225,7 +225,7 @@ impl InferredDataType {
 /// The format specification for the CSV file
 #[derive(Debug, Clone, Default)]
 pub struct Format {
-    has_header: bool,
+    header: bool,
     delimiter: Option<u8>,
     escape: Option<u8>,
     quote: Option<u8>,
@@ -235,7 +235,7 @@ pub struct Format {
 
 impl Format {
     pub fn with_header(mut self, has_header: bool) -> Self {
-        self.has_header = has_header;
+        self.header = has_header;
         self
     }
 
@@ -280,7 +280,7 @@ impl Format {
 
         // get or create header names
         // when has_header is false, creates default column names with column_ prefix
-        let headers: Vec<String> = if self.has_header {
+        let headers: Vec<String> = if self.header {
             let headers = &csv_reader.headers().map_err(map_csv_error)?.clone();
             headers.iter().map(|s| s.to_string()).collect()
         } else {
@@ -331,7 +331,7 @@ impl Format {
     /// Build a [`csv::Reader`] for this [`Format`]
     fn build_reader<R: Read>(&self, reader: R) -> csv::Reader<R> {
         let mut builder = csv::ReaderBuilder::new();
-        builder.has_headers(self.has_header);
+        builder.has_headers(self.header);
 
         if let Some(c) = self.delimiter {
             builder.delimiter(c);
@@ -403,7 +403,7 @@ pub fn infer_reader_schema<R: Read>(
 ) -> Result<(Schema, usize), ArrowError> {
     let format = Format {
         delimiter: Some(delimiter),
-        has_header,
+        header: has_header,
         ..Default::default()
     };
     format.infer_schema(reader, max_read_records)
@@ -425,7 +425,7 @@ pub fn infer_schema_from_files(
     let mut records_to_read = max_read_records.unwrap_or(usize::MAX);
     let format = Format {
         delimiter: Some(delimiter),
-        has_header,
+        header: has_header,
         ..Default::default()
     };
 
@@ -1095,8 +1095,16 @@ impl ReaderBuilder {
     }
 
     /// Set whether the CSV file has headers
+    #[deprecated(note = "Use with_header")]
+    #[doc(hidden)]
     pub fn has_header(mut self, has_header: bool) -> Self {
-        self.format.has_header = has_header;
+        self.format.header = has_header;
+        self
+    }
+
+    /// Set whether the CSV file has a header
+    pub fn with_header(mut self, has_header: bool) -> Self {
+        self.format.header = has_header;
         self
     }
 
@@ -1176,7 +1184,7 @@ impl ReaderBuilder {
         let delimiter = self.format.build_parser();
         let record_decoder = RecordDecoder::new(delimiter, self.schema.fields().len());
 
-        let header = self.format.has_header as usize;
+        let header = self.format.header as usize;
 
         let (start, end) = match self.bounds {
             Some((start, end)) => (start + header, end + header),
@@ -1317,7 +1325,7 @@ mod tests {
             .chain(Cursor::new("\n".to_string()))
             .chain(file_without_headers);
         let mut csv = ReaderBuilder::new(Arc::new(schema))
-            .has_header(true)
+            .with_header(true)
             .build(both_files)
             .unwrap();
         let batch = csv.next().unwrap().unwrap();
@@ -1335,7 +1343,7 @@ mod tests {
             .unwrap();
 
         file.rewind().unwrap();
-        let builder = ReaderBuilder::new(Arc::new(schema)).has_header(true);
+        let builder = ReaderBuilder::new(Arc::new(schema)).with_header(true);
 
         let mut csv = builder.build(file).unwrap();
         let expected_schema = Schema::new(vec![
@@ -1505,7 +1513,7 @@ mod tests {
         let file = File::open("test/data/null_test.csv").unwrap();
 
         let mut csv = ReaderBuilder::new(schema)
-            .has_header(true)
+            .with_header(true)
             .build(file)
             .unwrap();
 
@@ -1530,7 +1538,7 @@ mod tests {
         let file = File::open("test/data/init_null_test.csv").unwrap();
 
         let mut csv = ReaderBuilder::new(schema)
-            .has_header(true)
+            .with_header(true)
             .build(file)
             .unwrap();
 
@@ -1588,7 +1596,7 @@ mod tests {
         let null_regex = Regex::new("^nil$").unwrap();
 
         let mut csv = ReaderBuilder::new(schema)
-            .has_header(true)
+            .with_header(true)
             .with_null_regex(null_regex)
             .build(file)
             .unwrap();
@@ -1710,7 +1718,7 @@ mod tests {
         ]);
 
         let builder = ReaderBuilder::new(Arc::new(schema))
-            .has_header(true)
+            .with_header(true)
             .with_delimiter(b'|')
             .with_batch_size(512)
             .with_projection(vec![0, 1, 2, 3]);
@@ -2037,7 +2045,7 @@ mod tests {
             Field::new("text2", DataType::Utf8, false),
         ]);
         let builder = ReaderBuilder::new(Arc::new(schema))
-            .has_header(false)
+            .with_header(false)
             .with_quote(b'~'); // default is ", change to ~
 
         let mut csv_text = Vec::new();
@@ -2069,7 +2077,7 @@ mod tests {
             Field::new("text2", DataType::Utf8, false),
         ]);
         let builder = ReaderBuilder::new(Arc::new(schema))
-            .has_header(false)
+            .with_header(false)
             .with_escape(b'\\'); // default is None, change to \
 
         let mut csv_text = Vec::new();
@@ -2101,7 +2109,7 @@ mod tests {
             Field::new("text2", DataType::Utf8, false),
         ]);
         let builder = ReaderBuilder::new(Arc::new(schema))
-            .has_header(false)
+            .with_header(false)
             .with_terminator(b'\n'); // default is CRLF, change to LF
 
         let mut csv_text = Vec::new();
@@ -2143,7 +2151,7 @@ mod tests {
         ]));
 
         for (idx, (bounds, has_header, expected)) in tests.into_iter().enumerate() {
-            let mut reader = ReaderBuilder::new(schema.clone()).has_header(has_header);
+            let mut reader = ReaderBuilder::new(schema.clone()).with_header(has_header);
             if let Some((start, end)) = bounds {
                 reader = reader.with_bounds(start, end);
             }
@@ -2208,7 +2216,7 @@ mod tests {
                 for capacity in [1, 3, 7, 100] {
                     let reader = ReaderBuilder::new(schema.clone())
                         .with_batch_size(batch_size)
-                        .has_header(has_header)
+                        .with_header(has_header)
                         .build(File::open(path).unwrap())
                         .unwrap();
 
@@ -2226,7 +2234,7 @@ mod tests {
 
                     let reader = ReaderBuilder::new(schema.clone())
                         .with_batch_size(batch_size)
-                        .has_header(has_header)
+                        .with_header(has_header)
                         .build_buffered(buffered)
                         .unwrap();
 
diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs
index 4c3f663bf7..5a91dfe0a6 100644
--- a/arrow/benches/csv_reader.rs
+++ b/arrow/benches/csv_reader.rs
@@ -45,7 +45,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec<ArrayRef>) {
                 let cursor = Cursor::new(buf.as_slice());
                 let reader = csv::ReaderBuilder::new(batch.schema())
                     .with_batch_size(batch_size)
-                    .has_header(true)
+                    .with_header(true)
                     .build_buffered(cursor)
                     .unwrap();
 
diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs
index 548bbdbfb8..1f5d0a62bb 100644
--- a/parquet/src/bin/parquet-fromcsv.rs
+++ b/parquet/src/bin/parquet-fromcsv.rs
@@ -321,7 +321,7 @@ fn configure_reader_builder(args: &Args, arrow_schema: Arc<Schema>) -> ReaderBui
 
     let mut builder = ReaderBuilder::new(arrow_schema)
         .with_batch_size(args.batch_size)
-        .has_header(args.has_header)
+        .with_header(args.has_header)
         .with_delimiter(args.get_delimiter());
 
     builder = configure_reader(
@@ -606,7 +606,7 @@ mod tests {
 
         let reader_builder = configure_reader_builder(&args, arrow_schema);
         let builder_debug = format!("{reader_builder:?}");
-        assert_debug_text(&builder_debug, "has_header", "false");
+        assert_debug_text(&builder_debug, "header", "false");
         assert_debug_text(&builder_debug, "delimiter", "Some(44)");
         assert_debug_text(&builder_debug, "quote", "Some(34)");
         assert_debug_text(&builder_debug, "terminator", "None");
@@ -641,7 +641,7 @@ mod tests {
         ]));
         let reader_builder = configure_reader_builder(&args, arrow_schema);
         let builder_debug = format!("{reader_builder:?}");
-        assert_debug_text(&builder_debug, "has_header", "true");
+        assert_debug_text(&builder_debug, "header", "true");
         assert_debug_text(&builder_debug, "delimiter", "Some(9)");
         assert_debug_text(&builder_debug, "quote", "None");
         assert_debug_text(&builder_debug, "terminator", "Some(10)");

Reply via email to