nevi-me commented on a change in pull request #7309:
URL: https://github.com/apache/arrow/pull/7309#discussion_r434689868



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -157,9 +156,26 @@ fn generate_schema(spec: HashMap<String, 
HashSet<DataType>>) -> Result<Arc<Schem
 /// `max_read_records` controlling the maximum number of records to read.
 ///
 /// If `max_read_records` is not set, the whole file is read to infer its 
field types.
-fn infer_json_schema(file: File, max_read_records: Option<usize>) -> 
Result<Arc<Schema>> {
+pub fn infer_json_schema_from_seekable<R: Read + Seek>(

Review comment:
       May you please add a note on why a user would want/need to use this vs 
the non-seekable one

##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -1037,62 +1065,73 @@ mod tests {
             .unwrap();
         let batch = reader.next().unwrap().unwrap();
 
-        assert_eq!(4, batch.num_columns());
-        assert_eq!(4, batch.num_rows());
-
-        let schema = batch.schema();
-
-        let a = schema.column_with_name("a").unwrap();
-        assert_eq!(&DataType::Int64, a.1.data_type());
-        let b = schema.column_with_name("b").unwrap();
-        assert_eq!(
-            &DataType::List(Box::new(DataType::Float64)),
-            b.1.data_type()
-        );
-        let c = schema.column_with_name("c").unwrap();
-        assert_eq!(
-            &DataType::List(Box::new(DataType::Boolean)),
-            c.1.data_type()
-        );
-        let d = schema.column_with_name("d").unwrap();
-        assert_eq!(&DataType::List(Box::new(DataType::Utf8)), d.1.data_type());
-
-        let bb = batch
-            .column(b.0)
-            .as_any()
-            .downcast_ref::<ListArray>()
-            .unwrap();
-        let bb = bb.values();
-        let bb = bb.as_any().downcast_ref::<Float64Array>().unwrap();
-        assert_eq!(10, bb.len());
-        assert_eq!(4.0, bb.value(9));
-
-        let cc = batch
-            .column(c.0)
-            .as_any()
-            .downcast_ref::<ListArray>()
-            .unwrap();
-        let cc = cc.values();
-        let cc = cc.as_any().downcast_ref::<BooleanArray>().unwrap();
-        assert_eq!(6, cc.len());
-        assert_eq!(false, cc.value(0));
-        assert_eq!(false, cc.value(3));
-        assert_eq!(false, cc.is_valid(2));
-        assert_eq!(false, cc.is_valid(4));
-
-        let dd = batch
-            .column(d.0)
-            .as_any()
-            .downcast_ref::<ListArray>()
-            .unwrap();
-        let dd = dd.values();
-        let dd = dd.as_any().downcast_ref::<StringArray>().unwrap();
-        assert_eq!(7, dd.len());
-        assert_eq!(false, dd.is_valid(1));
-        assert_eq!("text", dd.value(2));
-        assert_eq!("1", dd.value(3));
-        assert_eq!("false", dd.value(4));
-        assert_eq!("array", dd.value(5));
-        assert_eq!("2.4", dd.value(6));
+        let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap();

Review comment:
       Will this mean that we're then not supporting zipped files for now? I'm 
fine with that.

##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -364,6 +370,24 @@ impl<R: Read> Reader<R> {
         }
     }
 
+    /// Create a new JSON Reader from a `BufReader<R: Read>`
+    ///
+    /// If reading a `File`, you can customise the Reader, such as to enable 
schema

Review comment:
       This doesn't flow well. Perhaps:
   
   ```
   To customi{s|z}e the schema, such as to enable schema inference, use 
`ReaderBuilder`
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to