jecsand838 commented on code in PR #8274: URL: https://github.com/apache/arrow-rs/pull/8274#discussion_r2323709400
########## arrow-avro/src/writer/encoder.rs: ########## @@ -84,48 +85,211 @@ fn write_bool<W: Write + ?Sized>(writer: &mut W, v: bool) -> Result<(), ArrowErr /// - Null-first (default): null => 0, value => 1 /// - Null-second (Impala): value => 0, null => 1 #[inline] -fn write_optional_branch<W: Write + ?Sized>( +fn write_optional_index<W: Write + ?Sized>( writer: &mut W, is_null: bool, - impala_mode: bool, + order: Nullability, ) -> Result<(), ArrowError> { - let branch = if impala_mode == is_null { 1 } else { 0 }; - write_int(writer, branch) + // For NullFirst: null => 0x00, value => 0x02 + // For NullSecond: value => 0x00, null => 0x02 + let byte = match order { + Nullability::NullFirst => { + if is_null { + 0x00 + } else { + 0x02 + } + } + Nullability::NullSecond => { + if is_null { + 0x02 + } else { + 0x00 + } + } + }; + writer + .write_all(&[byte]) + .map_err(|e| ArrowError::IoError(format!("write union branch: {e}"), e)) } -/// Encode a `RecordBatch` in Avro binary format using **default options**. -pub fn encode_record_batch<W: Write>(batch: &RecordBatch, out: &mut W) -> Result<(), ArrowError> { - encode_record_batch_with_options(batch, out, &EncoderOptions::default()) +/// Per‑site encoder plan for a field. This mirrors Avro structure so nested +/// optional branch order can be honored exactly as declared by the schema. +#[derive(Debug, Clone)] +enum FieldPlan { + /// Non-nested scalar/logical type + Scalar, + /// Record/Struct with Avro‑ordered children + Struct { encoders: Vec<FieldBinding> }, + /// Array with item‑site nullability and nested plan + List { + items_nullability: Option<Nullability>, + item_plan: Box<FieldPlan>, + }, } -/// Encode a `RecordBatch` with explicit `EncoderOptions`. -pub fn encode_record_batch_with_options<W: Write>( - batch: &RecordBatch, - out: &mut W, - opts: &EncoderOptions, -) -> Result<(), ArrowError> { - let mut encoders = batch - .schema() - .fields() - .iter() - .zip(batch.columns()) - .map(|(field, array)| Ok((field.is_nullable(), make_encoder(array.as_ref())?))) - .collect::<Result<Vec<_>, ArrowError>>()?; - (0..batch.num_rows()).try_for_each(|row| { - encoders.iter_mut().try_for_each(|(is_nullable, enc)| { - if *is_nullable { - let is_null = enc.is_null(row); - write_optional_branch(out, is_null, opts.impala_mode)?; - if is_null { - return Ok(()); +/// Unified binding used for both top‑level columns and struct children. +/// +/// This replaces the previous duplication between `StructChildPlan` and `ColumnPlan`. +#[derive(Debug, Clone)] +struct FieldBinding { + /// Index of the Arrow field/column associated with this Avro field site + arrow_index: usize, + /// Nullability/order for this site (None if not optional) Review Comment: > Does avro use `optional` and `required` as complementary terms, the way parquet does? Avro does seem to follow that convention. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org