alamb commented on a change in pull request #9412:
URL: https://github.com/apache/arrow/pull/9412#discussion_r577531684
##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -330,136 +307,237 @@ pub fn infer_json_schema<R: Read>(
infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records))
}
+fn set_object_scalar_field_type(
+ field_types: &mut HashMap<String, InferredType>,
+ key: &str,
+ ftype: DataType,
+) -> Result<()> {
+ if !field_types.contains_key(key) {
+ field_types.insert(key.to_string(),
InferredType::Scalar(HashSet::new()));
+ }
+
+ match field_types.get_mut(key).unwrap() {
+ InferredType::Scalar(hs) => {
+ hs.insert(ftype);
+ Ok(())
+ }
+ // in case of column contains both scalar type and scalar array type,
we convert type of
+ // this column to scalar array.
+ scalar_array @ InferredType::Array(_) => {
+ let mut hs = HashSet::new();
+ hs.insert(ftype);
+ scalar_array.merge(InferredType::Scalar(hs))?;
+ Ok(())
+ }
+ t => Err(ArrowError::JsonError(format!(
+ "Expected scalar or scalar array JSON type, found: {:?}",
+ t,
+ ))),
+ }
+}
+
+fn infer_scalar_array_type(array: &[Value]) -> Result<InferredType> {
+ let mut hs = HashSet::new();
+
+ for v in array {
+ match v {
+ Value::Null => {}
+ Value::Number(n) => {
+ if n.is_i64() {
+ hs.insert(DataType::Int64);
+ } else {
+ hs.insert(DataType::Float64);
+ }
+ }
+ Value::Bool(_) => {
+ hs.insert(DataType::Boolean);
+ }
+ Value::String(_) => {
+ hs.insert(DataType::Utf8);
+ }
+ Value::Array(_) | Value::Object(_) => {
+ return Err(ArrowError::JsonError(format!(
+ "Expected scalar value for scalar array, got: {:?}",
+ v
+ )));
+ }
+ }
+ }
+
+ Ok(InferredType::Scalar(hs))
+}
+
+fn infer_nested_array_type(array: &[Value]) -> Result<InferredType> {
+ let mut inner_ele_type = InferredType::Any;
+
+ for v in array {
+ match v {
+ Value::Array(inner_array) => {
+ inner_ele_type.merge(infer_array_element_type(inner_array)?)?;
+ }
+ x => {
+ return Err(ArrowError::JsonError(format!(
+ "Got non array element in nested array: {:?}",
+ x
+ )));
+ }
+ }
+ }
+
+ Ok(InferredType::Array(Box::new(inner_ele_type)))
+}
+
+fn infer_struct_array_type(array: &[Value]) -> Result<InferredType> {
+ let mut field_types = HashMap::new();
+
+ for v in array {
+ match v {
+ Value::Object(map) => {
+ collect_field_types_from_object(&mut field_types, map)?;
+ }
+ _ => {
+ return Err(ArrowError::JsonError(format!(
+ "Expected struct value for struct array, got: {:?}",
+ v
+ )));
+ }
+ }
+ }
+
+ Ok(InferredType::Object(field_types))
+}
+
+fn infer_array_element_type(array: &[Value]) -> Result<InferredType> {
+ match array.iter().take(1).next() {
+ None => Ok(InferredType::Any), // empty array, return any type that
can be updated later
+ Some(a) => match a {
+ Value::Array(_) => infer_nested_array_type(array),
+ Value::Object(_) => infer_struct_array_type(array),
+ _ => infer_scalar_array_type(array),
+ },
+ }
+}
+
+fn collect_field_types_from_object(
+ field_types: &mut HashMap<String, InferredType>,
+ map: &JsonMap<String, Value>,
+) -> Result<()> {
+ for (k, v) in map {
+ match v {
+ Value::Array(array) => {
+ let ele_type = infer_array_element_type(array)?;
+
+ if !field_types.contains_key(k) {
+ match ele_type {
+ InferredType::Scalar(_) => {
+ field_types.insert(
+ k.to_string(),
+
InferredType::Array(Box::new(InferredType::Scalar(
+ HashSet::new(),
+ ))),
+ );
+ }
+ InferredType::Object(_) => {
+ field_types.insert(
+ k.to_string(),
+
InferredType::Array(Box::new(InferredType::Object(
+ HashMap::new(),
+ ))),
+ );
+ }
+ InferredType::Any | InferredType::Array(_) => {
+ // set inner type to any for nested array as well
+ // so it can be updated properly from subsequent
type merges
+ field_types.insert(
+ k.to_string(),
+
InferredType::Array(Box::new(InferredType::Any)),
+ );
+ }
+ }
+ }
+
+ match field_types.get_mut(k).unwrap() {
+ InferredType::Array(inner_type) => {
+ inner_type.merge(ele_type)?;
+ }
+ // in case of column contains both scalar type and scalar
array type, we
+ // convert type of this column to scalar array.
+ field_type @ InferredType::Scalar(_) => {
+ field_type.merge(ele_type)?;
+ *field_type =
InferredType::Array(Box::new(field_type.clone()));
+ }
+ t => {
+ return Err(ArrowError::JsonError(format!(
+ "Expected array json type, found: {:?}",
+ t,
+ )));
+ }
+ }
+ }
+ Value::Bool(_) => {
+ set_object_scalar_field_type(field_types, k,
DataType::Boolean)?;
+ }
+ Value::Null => {
+ // do nothing, we treat json as nullable by default when
+ // inferring
+ }
+ Value::Number(n) => {
+ if n.is_f64() {
+ set_object_scalar_field_type(field_types, k,
DataType::Float64)?;
+ } else {
+ // default to i64
+ set_object_scalar_field_type(field_types, k,
DataType::Int64)?;
+ }
+ }
+ Value::String(_) => {
+ set_object_scalar_field_type(field_types, k, DataType::Utf8)?;
+ }
+ Value::Object(inner_map) => {
+ if !field_types.contains_key(k) {
+ field_types
+ .insert(k.to_string(),
InferredType::Object(HashMap::new()));
+ }
+ match field_types.get_mut(k).unwrap() {
+ InferredType::Object(inner_field_types) => {
+ collect_field_types_from_object(inner_field_types,
inner_map)?;
+ }
+ t => {
+ return Err(ArrowError::JsonError(format!(
+ "Expected object json type, found: {:?}",
+ t,
+ )));
+ }
+ }
+ }
+ }
+ }
+
+ Ok(())
+}
+
/// Infer the fields of a JSON file by reading all items from the JSON Value
Iterator.
+///
+/// The following type coercion logic is implemented:
Review comment:
❤️
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org