alamb commented on a change in pull request #9534:
URL: https://github.com/apache/arrow/pull/9534#discussion_r579790243



##########
File path: rust/arrow/src/csv/reader.rs
##########
@@ -121,18 +141,12 @@ fn infer_file_schema<R: Read + Seek>(
             .collect()
     };
 
-    // save the csv reader position after reading headers
-    let position = csv_reader.position().clone();
-
     let header_length = headers.len();
     // keep track of inferred field types
     let mut column_types: Vec<HashSet<DataType>> = vec![HashSet::new(); 
header_length];
     // keep track of columns with nulls
     let mut nulls: Vec<bool> = vec![false; header_length];
 
-    // return csv reader position to after headers
-    csv_reader.seek(position)?;

Review comment:
       That is a good catch. 

##########
File path: rust/arrow/src/csv/reader.rs
##########
@@ -99,7 +99,27 @@ fn infer_field_schema(string: &str) -> DataType {
 /// If `max_read_records` is not set, the whole file is read to infer its 
schema.
 ///
 /// Return infered schema and number of records used for inference.
-fn infer_file_schema<R: Read + Seek>(
+pub fn infer_file_schema<R: Read + Seek>(
+    reader: &mut R,
+    delimiter: u8,
+    max_read_records: Option<usize>,
+    has_header: bool,
+) -> Result<(Schema, usize)> {
+    let (schema, records_count) =
+        infer_schema_from_reader(reader, delimiter, max_read_records, 
has_header)?;
+    // return the reader seek back to the start
+    reader.seek(SeekFrom::Start(0))?;
+
+    Ok((schema, records_count))
+}
+
+/// Infer schema of CSV records provided by struct that implements `Read` 
trait.
+///
+/// `max_read_records` controlling the maximum number of records to read. If 
`max_read_records` is
+/// not set, all records are read to infer the schema.
+///
+/// Return infered schema and number of records used for inference.
+pub fn infer_schema_from_reader<R: Read>(

Review comment:
       ```suggestion
   pub fn infer_reader_schema<R: Read>(
   ```
   
   To mirror the name of `infer_file_schema`? I don't feel strongly about this 
but wanted to suggest it

##########
File path: rust/arrow/src/csv/reader.rs
##########
@@ -99,7 +99,27 @@ fn infer_field_schema(string: &str) -> DataType {
 /// If `max_read_records` is not set, the whole file is read to infer its 
schema.
 ///
 /// Return infered schema and number of records used for inference.
-fn infer_file_schema<R: Read + Seek>(
+pub fn infer_file_schema<R: Read + Seek>(
+    reader: &mut R,
+    delimiter: u8,
+    max_read_records: Option<usize>,
+    has_header: bool,
+) -> Result<(Schema, usize)> {
+    let (schema, records_count) =
+        infer_schema_from_reader(reader, delimiter, max_read_records, 
has_header)?;
+    // return the reader seek back to the start

Review comment:
       for any other reviewers, this seek back to the start simply keeps the 
existing behavior. 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to