jecsand838 commented on code in PR #8292:
URL: https://github.com/apache/arrow-rs/pull/8292#discussion_r2338008633
##########
arrow-avro/src/codec.rs:
##########
@@ -193,6 +200,224 @@ impl AvroDataType {
pub fn nullability(&self) -> Option<Nullability> {
self.nullability
}
+
+ #[inline]
+ fn parse_default_literal(&self, default_json: &Value) ->
Result<AvroLiteral, ArrowError> {
+ fn expect_string<'v>(
+ default_json: &'v Value,
+ data_type: &str,
+ ) -> Result<&'v str, ArrowError> {
+ match default_json {
+ Value::String(s) => Ok(s.as_str()),
+ _ => Err(ArrowError::SchemaError(format!(
+ "Default value must be a JSON string for {data_type}"
+ ))),
+ }
+ }
+
+ fn parse_bytes_default(
+ default_json: &Value,
+ expected_len: Option<usize>,
+ ) -> Result<Vec<u8>, ArrowError> {
+ let s = expect_string(default_json, "bytes/fixed logical types")?;
+ let mut out = Vec::with_capacity(s.len());
+ for ch in s.chars() {
+ let cp = ch as u32;
+ if cp > 0xFF {
+ return Err(ArrowError::SchemaError(format!(
+ "Invalid codepoint U+{cp:04X} in bytes/fixed default;
must be ≤ 0xFF"
+ )));
+ }
+ out.push(cp as u8);
+ }
+ if let Some(len) = expected_len {
+ if out.len() != len {
+ return Err(ArrowError::SchemaError(format!(
+ "Default length {} does not match expected fixed size
{len}",
+ out.len(),
+ )));
+ }
+ }
+ Ok(out)
+ }
+
+ fn parse_json_i64(default_json: &Value, data_type: &str) ->
Result<i64, ArrowError> {
+ match default_json {
+ Value::Number(n) => n.as_i64().ok_or_else(|| {
+ ArrowError::SchemaError(format!("Default {data_type} must
be an integer"))
+ }),
+ _ => Err(ArrowError::SchemaError(format!(
+ "Default {data_type} must be a JSON integer"
+ ))),
+ }
+ }
+
+ fn parse_json_f64(default_json: &Value, data_type: &str) ->
Result<f64, ArrowError> {
+ match default_json {
+ Value::Number(n) => n.as_f64().ok_or_else(|| {
+ ArrowError::SchemaError(format!("Default {data_type} must
be a number"))
+ }),
+ _ => Err(ArrowError::SchemaError(format!(
+ "Default {data_type} must be a JSON number"
+ ))),
+ }
+ }
+
+ // Handle JSON nulls per-spec: allowed only for `null` type or unions
with null FIRST
+ if default_json.is_null() {
+ return match self.codec() {
+ Codec::Null => Ok(AvroLiteral::Null),
+ _ if self.nullability() == Some(Nullability::NullFirst) =>
Ok(AvroLiteral::Null),
+ _ => Err(ArrowError::SchemaError(
+ "JSON null default is only valid for `null` type or for a
union whose first branch is `null`"
+ .to_string(),
+ )),
+ };
+ }
+ let lit = match self.codec() {
+ Codec::Null => {
+ return Err(ArrowError::SchemaError(
+ "Default for `null` type must be JSON null".to_string(),
+ ))
+ }
+ Codec::Boolean => match default_json {
+ Value::Bool(b) => AvroLiteral::Boolean(*b),
+ _ => {
+ return Err(ArrowError::SchemaError(
+ "Boolean default must be a JSON boolean".to_string(),
+ ))
+ }
+ },
+ Codec::Int32 | Codec::Date32 | Codec::TimeMillis => {
+ let i = parse_json_i64(default_json, "int")?;
+ if i < i32::MIN as i64 || i > i32::MAX as i64 {
+ return Err(ArrowError::SchemaError(format!(
+ "Default int {i} out of i32 range"
+ )));
+ }
+ AvroLiteral::Int(i as i32)
+ }
+ Codec::Int64
+ | Codec::TimeMicros
+ | Codec::TimestampMillis(_)
+ | Codec::TimestampMicros(_) =>
AvroLiteral::Long(parse_json_i64(default_json, "long")?),
+ Codec::Float32 => {
+ let f = parse_json_f64(default_json, "float")?;
+ if !f.is_finite() || f < f32::MIN as f64 || f > f32::MAX as
f64 {
+ return Err(ArrowError::SchemaError(format!(
+ "Default float {f} out of f32 range or not finite"
+ )));
+ }
+ AvroLiteral::Float(f as f32)
+ }
+ Codec::Float64 => AvroLiteral::Double(parse_json_f64(default_json,
"double")?),
+ Codec::Utf8 | Codec::Utf8View | Codec::Uuid => {
+ AvroLiteral::String(expect_string(default_json,
"string/uuid")?.to_string())
+ }
+ Codec::Binary =>
AvroLiteral::Bytes(parse_bytes_default(default_json, None)?),
+ Codec::Fixed(sz) => {
+ AvroLiteral::Bytes(parse_bytes_default(default_json, Some(*sz
as usize))?)
+ }
+ Codec::Decimal(_, _, fixed_size) => {
+ AvroLiteral::Bytes(parse_bytes_default(default_json,
*fixed_size)?)
+ }
+ Codec::Enum(symbols) => {
+ let s = expect_string(default_json, "enum")?;
+ if symbols.iter().any(|sym| sym == s) {
+ AvroLiteral::Enum(s.to_string())
+ } else {
+ return Err(ArrowError::SchemaError(format!(
+ "Default enum symbol {s:?} not found in reader enum
symbols"
+ )));
+ }
+ }
+ Codec::Interval =>
AvroLiteral::Bytes(parse_bytes_default(default_json, Some(12))?),
+ Codec::List(item_dt) => match default_json {
+ Value::Array(items) => AvroLiteral::Array(
+ items
+ .iter()
+ .map(|v| item_dt.parse_default_literal(v))
+ .collect::<Result<_, _>>()?,
+ ),
+ _ => {
+ return Err(ArrowError::SchemaError(
+ "Default value must be a JSON array for Avro array
type".to_string(),
+ ))
+ }
+ },
+ Codec::Map(val_dt) => match default_json {
+ Value::Object(map) => {
+ let mut out = IndexMap::with_capacity(map.len());
+ for (k, v) in map {
+ out.insert(k.clone(),
val_dt.parse_default_literal(v)?);
+ }
+ AvroLiteral::Map(out)
+ }
+ _ => {
+ return Err(ArrowError::SchemaError(
+ "Default value must be a JSON object for Avro map
type".to_string(),
+ ))
+ }
+ },
+ Codec::Struct(fields) => match default_json {
+ Value::Object(obj) => {
+ let mut out: IndexMap<String, AvroLiteral> =
+ IndexMap::with_capacity(fields.len());
+ for f in fields.as_ref() {
+ let name = f.name().to_string();
+ if let Some(sub) = obj.get(&name) {
+ // Explicit value provided in the record default
object
+ let lit =
f.data_type().parse_default_literal(sub)?;
+ out.insert(name, lit);
+ } else if let Some(default_json) =
+
f.data_type().metadata.get(AVRO_FIELD_DEFAULT_METADATA_KEY)
+ {
+ // Use the subfield's own stored default (validate
and parse)
+ let v: Value =
serde_json::from_str(default_json).map_err(|e| {
+ ArrowError::SchemaError(format!(
+ "Failed to parse stored subfield default
JSON for '{}': {e}",
+ f.name(),
+ ))
+ })?;
+ let lit = f.data_type().parse_default_literal(&v)?;
+ out.insert(name, lit);
+ } else if f.data_type().nullability() ==
Some(Nullability::default()) {
Review Comment:
@nathaniel-d-ef Good callout. I see what you're getting at. I went ahead and
cleaned this code up a bit more.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]