jecsand838 commented on code in PR #9171:
URL: https://github.com/apache/arrow-rs/pull/9171#discussion_r2723836867
##########
arrow-avro/src/writer/mod.rs:
##########
@@ -79,11 +162,194 @@ mod encoder;
/// Logic for different Avro container file formats.
pub mod format;
+/// A contiguous set of encoded rows.
+///
+/// `EncodedRows` stores:
+/// - a single backing byte buffer (`bytes::Bytes`)
+/// - a `Vec<u64>` of row boundary offsets (length = `rows + 1`)
+///
+/// This lets callers get per-row payloads as zero-copy `Bytes` slices.
+///
+/// For compatibility with APIs that require owned `Vec<u8>`, use [`EncodedRows::to_vecs`].
+#[derive(Debug, Clone)]
+pub struct EncodedRows {
+    data: Bytes,
+    offsets: Vec<u64>,
+}
+
+impl EncodedRows {
+    /// Create a new `EncodedRows` from a backing buffer and row boundary offsets.
+    ///
+    /// `offsets` must have length `rows + 1`, and be monotonically non-decreasing.
+    /// The last offset should equal `data.len()`.
+    pub fn new(data: Bytes, offsets: Vec<u64>) -> Self {
+        Self { data, offsets }
+    }
+
+    /// Number of rows in this buffer.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.offsets.len().saturating_sub(1)
+    }
+
+    /// Returns `true` if there are no rows.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns the backing buffer.
+    ///
+    /// Note: individual rows should typically be accessed via [`Self::row`] or [`Self::rows`].
+    #[inline]
+    pub fn bytes(&self) -> &Bytes {
+        &self.data
+    }
+
+    /// Returns the row boundary offsets (length = `len() + 1`).
+    #[inline]
+    pub fn offsets(&self) -> &[u64] {
+        &self.offsets
+    }
+
+    /// Return the `i`th row as a zero-copy `Bytes` slice.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the row offsets are invalid (e.g. exceed `usize::MAX`).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::sync::Arc;
+    /// use arrow_array::{ArrayRef, Int32Array, RecordBatch};
+    /// use arrow_schema::{DataType, Field, Schema};
+    /// use arrow_avro::writer::WriterBuilder;
+    /// use arrow_avro::writer::format::AvroSoeFormat;
+    ///
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// let schema = Schema::new(vec![Field::new("x", DataType::Int32, false)]);
+    /// let batch = RecordBatch::try_new(
+    ///     Arc::new(schema.clone()),
+    ///     vec![Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef],
+    /// )?;
+    ///
+    /// let mut encoder = WriterBuilder::new(schema).build_encoder::<AvroSoeFormat>()?;
+    /// encoder.encode(&batch)?;
+    /// let rows = encoder.flush();
+    ///
+    /// // Access the first row (index 0)
+    /// let row0 = rows.row(0)?;
+    /// assert!(!row0.is_empty());
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn row(&self, i: usize) -> Result<Bytes, ArrowError> {
+        if i >= self.len() {
+            return Err(ArrowError::AvroError(format!(
+                "Row index {i} out of bounds for len {}",
+                self.len()
+            )));
+        }
+        // SAFETY:
+        // self.len() is defined as self.offsets.len().saturating_sub(1).
+        // The check `i >= self.len()` above ensures that `i < self.offsets.len() - 1`.
+        // Therefore, both `i` and `i + 1` are strictly within the bounds of `self.offsets`.
+        let (start_u64, end_u64) = unsafe {
Review Comment:
Surprisingly, I did see a difference.
In the screenshot below, I ran the benchmarks first with the unsafe code, then changed the production code to the safe version and re-ran them. There seemed to be a significant performance impact.
> NOTE: For the safe test I used `let (start_u64, end_u64) = (self.offsets[i], self.offsets[i + 1]);`.
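
Since the diff above is truncated at the `unsafe` block, here is a standalone sketch of the two access strategies being compared. The function names are hypothetical and the unchecked variant is assumed to use `get_unchecked`; this is not the PR's code, just a reference for the comparison:

```rust
// Illustrative only: standalone versions of the two offset-access strategies.
// Both assume the caller has already verified `i + 1 < offsets.len()`, as the
// `i >= self.len()` check in `row()` does.
fn row_bounds_safe(offsets: &[u64], i: usize) -> (u64, u64) {
    // Bounds-checked indexing; panics if the caller's invariant is violated.
    (offsets[i], offsets[i + 1])
}

fn row_bounds_unchecked(offsets: &[u64], i: usize) -> (u64, u64) {
    // SAFETY: caller guarantees `i + 1 < offsets.len()`.
    unsafe { (*offsets.get_unchecked(i), *offsets.get_unchecked(i + 1)) }
}

fn main() {
    let offsets = vec![0u64, 4, 9, 15];
    assert_eq!(row_bounds_safe(&offsets, 1), (4, 9));
    assert_eq!(row_bounds_unchecked(&offsets, 1), (4, 9));
}
```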
I made sure to push up the benches I used for this in a new `benches/encoder.rs` file, which can be expanded on in future PRs.
<img width="736" height="925" alt="Screenshot 2026-01-23 at 10 11 29 PM" src="https://github.com/user-attachments/assets/697bb812-e53d-490b-bd9a-9abbfe3dff42" />
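
For anyone reproducing this, a minimal Criterion-style harness along these lines could drive the comparison. This is an illustrative sketch only, not the actual `benches/encoder.rs` in this PR; the benchmark name and synthetic data are made up:

```rust
use criterion::{criterion_group, criterion_main, Criterion};
use std::hint::black_box;

fn bench_row_access(c: &mut Criterion) {
    // Synthetic "encoded rows": one backing buffer plus row-boundary offsets
    // (length = rows + 1), mirroring the layout of `EncodedRows`.
    let data: Vec<u8> = (0..10_000u32).flat_map(|v| v.to_le_bytes()).collect();
    let offsets: Vec<u64> = (0..=10_000u64).map(|i| i * 4).collect();

    c.bench_function("row_access_safe_indexing", |b| {
        b.iter(|| {
            let mut total = 0usize;
            for i in 0..offsets.len() - 1 {
                // Safe, bounds-checked variant from the NOTE above.
                let (start, end) = (offsets[i] as usize, offsets[i + 1] as usize);
                total += black_box(&data[start..end]).len();
            }
            total
        })
    });
}

criterion_group!(benches, bench_row_access);
criterion_main!(benches);
```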