jecsand838 commented on code in PR #8402:
URL: https://github.com/apache/arrow-rs/pull/8402#discussion_r2370381775


##########
arrow-avro/src/writer/mod.rs:
##########
@@ -19,19 +19,115 @@
 //!
 //! # Overview
 //!
-//! *   Use **`AvroWriter`** (Object Container File) when you want a
-//!     self‑contained Avro file with header, schema JSON, optional 
compression,
-//!     blocks, and sync markers.
-//! *   Use **`AvroStreamWriter`** (raw binary stream) when you already know 
the
-//!     schema out‑of‑band (i.e., via a schema registry) and need a stream
-//!     of Avro‑encoded records with minimal framing.
+//! Use this module to serialize Arrow `RecordBatch` values into Avro. Two 
output
+//! formats are supported:
 //!
-
-/// Encodes `RecordBatch` into the Avro binary format.
-pub mod encoder;
-/// Logic for different Avro container file formats.
-pub mod format;
-
+//! * **`AvroWriter`** — writes an **Object Container File (OCF)**: a 
self‑describing
+//!   file with header (schema JSON + metadata), optional compression, data 
blocks, and
+//!   sync markers. See Avro 1.11.1 “Object Container Files.”
+//!   
<https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
+//! * **`AvroStreamWriter`** — writes a **raw Avro binary stream** (“datum” 
bytes) without
+//!   any container framing. This is useful when the schema is known 
out‑of‑band (e.g.,
+//!   via a registry) and you want minimal overhead.
+//!
+//! ## Which format should I use?
+//!
+//! * Use **OCF** when you need a portable, self‑contained file. The schema 
travels with
+//!   the data, making it easy to read elsewhere.
+//! * Use the **raw stream** when your surrounding protocol supplies schema 
information
+//!   (e.g., a schema registry). If you need **single‑object encoding (SOE)** 
or Confluent
+//!   **Schema Registry** framing, you must add the appropriate prefix 
*outside* this writer:
+//!   - **SOE**: `0xC3 0x01` + 8‑byte little‑endian CRC‑64‑AVRO fingerprint + 
Avro body
+//!     (see Avro 1.11.1 “Single object encoding”).
+//!     
<https://avro.apache.org/docs/1.11.1/specification/#single-object-encoding>
+//!   - **Confluent wire format**: magic `0x00` + **big‑endian** 4‑byte schema 
ID and Avro body.
+//!     
<https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
+//!
+//! ## Quickstart: Write an OCF in memory (runnable)

Review Comment:
   I checked the docs before pushing and agree with you. 100% good callout!



##########
arrow-avro/src/lib.rs:
##########
@@ -15,9 +15,165 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro]
+//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro].
 //!
-//! [Apache Arrow]: https://arrow.apache.org
+//! This crate provides:
+//! - a [`reader`] that decodes Avro (Object Container Files, Avro 
Single‑Object encoding,
+//!   and Confluent Schema Registry wire format) into Arrow `RecordBatch`es,
+//! - and a [`writer`] that encodes Arrow `RecordBatch`es into Avro (OCF or 
raw Avro binary).
+//!
+//! If you’re new to Arrow or Avro, see:
+//! - Arrow project site: <https://arrow.apache.org/>
+//! - Avro 1.11.1 specification: 
<https://avro.apache.org/docs/1.11.1/specification/>
+//!
+//! ## Quickstart: OCF (Object Container File) round‑trip *(runnable)*
+//!
+//! The example below creates an Arrow table, writes an **Avro OCF** fully in 
memory,
+//! and then reads it back. OCF is a self‑describing file format that embeds 
the Avro
+//! schema in a header with optional compression and block sync markers.
+//! Spec: 
<https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
+//!
+//! ```
+//! use std::io::Cursor;
+//! use std::sync::Arc;
+//! use arrow_array::{ArrayRef, Int32Array, RecordBatch};
+//! use arrow_schema::{DataType, Field, Schema};
+//! use arrow_avro::writer::AvroWriter;
+//! use arrow_avro::reader::ReaderBuilder;
+//!
+//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
+//! // Build a tiny Arrow batch
+//! let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+//! let batch = RecordBatch::try_new(
+//!     Arc::new(schema.clone()),
+//!     vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
+//! )?;
+//!
+//! // Write an Avro **Object Container File** (OCF) to a Vec<u8>
+//! let sink: Vec<u8> = Vec::new();
+//! let mut w = AvroWriter::new(sink, schema.clone())?;
+//! w.write(&batch)?;
+//! w.finish()?;
+//! let bytes = w.into_inner();
+//! assert!(!bytes.is_empty());
+//!
+//! // Read it back
+//! let mut r = ReaderBuilder::new().build(Cursor::new(bytes))?;
+//! let out = r.next().unwrap()?;
+//! assert_eq!(out.num_rows(), 3);
+//! # Ok(()) }
+//! ```
+//!
+//! ## Quickstart: Confluent wire‑format round‑trip *(runnable)*
+//!
+//! The **Confluent Schema Registry wire format** prefixes each Avro message 
with a
+//! 1‑byte magic `0x00` and a **4‑byte big‑endian** schema ID, followed by the 
Avro body.
+//! See: 
<https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
+//!
+//! In this round‑trip, we:
+//! 1) Use `AvroStreamWriter` to create a **raw Avro body** for a single‑row 
batch,
+//! 2) Wrap it with the Confluent prefix (magic and schema ID),
+//! 3) Decode it back to Arrow using a `Decoder` configured with a 
`SchemaStore` that
+//!    maps the schema ID to the Avro schema used by the writer.
+//!
+//! ```ignore
+//! use arrow_avro::reader::ReaderBuilder;
+//! use arrow_avro::schema::{AvroSchema, SchemaStore, Fingerprint, 
FingerprintAlgorithm, CONFLUENT_MAGIC};
+//!
+//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
+//! // Writer schema registered under Schema Registry ID 1
+//! let avro_json = r#"{
+//!   "type":"record","name":"User",
+//!   "fields":[{"name":"id","type":"long"},{"name":"name","type":"string"}]
+//! }"#;
+//!
+//! let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::None);
+//! let id: u32 = 1;
+//! store.set(Fingerprint::Id(id), AvroSchema::new(avro_json.to_string()))?;
+//!
+//! // Minimal Avro body encoder for {id: long, name: string}
+//! fn enc_long(v: i64, out: &mut Vec<u8>) {
+//!   let mut n = ((v << 1) ^ (v >> 63)) as u64;
+//!   while (n & !0x7F) != 0 { out.push(((n as u8) & 0x7F) | 0x80); n >>= 7; }
+//!   out.push(n as u8);
+//! }
+//! fn enc_len(l: usize, out: &mut Vec<u8>) { enc_long(l as i64, out); }

Review Comment:
   Good callout! I cleaned this up.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to