This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 7bb96c5ec9 Improve `arrow-ipc` documentation (#6983)
7bb96c5ec9 is described below
commit 7bb96c5ec9f268412d863c22a3f09b28bc1ba7d6
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jan 22 18:17:44 2025 -0500
Improve `arrow-ipc` documentation (#6983)
* Improve `arrow-ipc` documentation
* Improve, reduce emphasis on Read/Write/Seek
* Apply suggestions from code review
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
---------
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
---
arrow-ipc/src/lib.rs | 19 +++++++++++
arrow-ipc/src/reader.rs | 87 +++++++++++++++++++++++++++++++++++++++++++++----
arrow-ipc/src/writer.rs | 59 +++++++++++++++++++++++++++++----
3 files changed, 153 insertions(+), 12 deletions(-)
diff --git a/arrow-ipc/src/lib.rs b/arrow-ipc/src/lib.rs
index a76083b939..4638abdb4e 100644
--- a/arrow-ipc/src/lib.rs
+++ b/arrow-ipc/src/lib.rs
@@ -17,7 +17,26 @@
//! Support for the [Arrow IPC Format]
//!
+//! The Arrow IPC format defines how to read and write [`RecordBatch`]es
to/from
+//! a file or stream of bytes. This format can be used to serialize and
deserialize
+//! data to files and over the network.
+//!
+//! There are two variants of the IPC format:
+//! 1. [IPC Streaming Format]: Supports streaming data sources, implemented by
+//! [StreamReader] and [StreamWriter]
+//!
+//! 2. [IPC File Format]: Supports random access, implemented by [FileReader]
and
+//! [FileWriter].
+//!
+//! See the [`reader`] and [`writer`] modules for more information.
+//!
//! [Arrow IPC Format]:
https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc
+//! [IPC Streaming Format]:
https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
+//! [StreamReader]: reader::StreamReader
+//! [StreamWriter]: writer::StreamWriter
+//! [IPC File Format]:
https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
+//! [FileReader]: reader::FileReader
+//! [FileWriter]: writer::FileWriter
#![warn(missing_docs)]
pub mod convert;
diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs
index 4dcd56156e..b72785651b 100644
--- a/arrow-ipc/src/reader.rs
+++ b/arrow-ipc/src/reader.rs
@@ -17,8 +17,12 @@
//! Arrow IPC File and Stream Readers
//!
-//! The `FileReader` and `StreamReader` have similar interfaces,
-//! however the `FileReader` expects a reader that supports `Seek`ing
+//! # Notes
+//!
+//! The [`FileReader`] and [`StreamReader`] have similar interfaces,
+//! however the [`FileReader`] expects a reader that supports [`Seek`]ing
+//!
+//! [`Seek`]: std::io::Seek
mod stream;
@@ -997,10 +1001,49 @@ impl FileReaderBuilder {
}
}
-/// Arrow File reader
+/// Arrow File Reader
+///
+/// Reads Arrow [`RecordBatch`]es from bytes in the [IPC File Format],
+/// providing random access to the record batches.
+///
+/// # See Also
+///
+/// * [`Self::set_index`] for random access
+/// * [`StreamReader`] for reading streaming data
+///
+/// # Example: Reading from a `File`
+/// ```
+/// # use std::io::Cursor;
+/// use arrow_array::record_batch;
+/// # use arrow_ipc::reader::FileReader;
+/// # use arrow_ipc::writer::FileWriter;
+/// # let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
+/// # let mut file = vec![]; // mimic a file for the example
+/// # {
+/// # let mut writer = FileWriter::try_new(&mut file,
&batch.schema()).unwrap();
+/// # writer.write(&batch).unwrap();
+/// # writer.write(&batch).unwrap();
+/// # writer.finish().unwrap();
+/// # }
+/// # let mut file = Cursor::new(&file);
+/// let projection = None; // read all columns
+/// let mut reader = FileReader::try_new(&mut file, projection).unwrap();
+/// // Position the reader to the second batch
+/// reader.set_index(1).unwrap();
+/// // read batches from the reader using the Iterator trait
+/// let mut num_rows = 0;
+/// for batch in reader {
+/// let batch = batch.unwrap();
+/// num_rows += batch.num_rows();
+/// }
+/// assert_eq!(num_rows, 3);
+/// ```
+/// # Example: Reading from `mmap`ed file
///
-/// For an example creating Arrays with memory mapped (`mmap`) files see the
[`zero_copy_ipc`] example.
+/// For an example creating Arrays without copying using memory mapped
(`mmap`)
+/// files see the [`zero_copy_ipc`] example.
///
+/// [IPC File Format]:
https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
/// [`zero_copy_ipc`]:
https://github.com/apache/arrow-rs/blob/main/arrow/examples/zero_copy_ipc.rs
pub struct FileReader<R> {
/// File reader that supports reading and seeking
@@ -1078,7 +1121,7 @@ impl<R: Read + Seek> FileReader<R> {
self.decoder.schema.clone()
}
- /// Read a specific record batch
+ /// Seek to a specific [`RecordBatch`]
///
/// Sets the current block to the index, allowing random reads
pub fn set_index(&mut self, index: usize) -> Result<(), ArrowError> {
@@ -1136,7 +1179,39 @@ impl<R: Read + Seek> RecordBatchReader for FileReader<R>
{
}
}
-/// Arrow Stream reader
+/// Arrow Stream Reader
+///
+/// Reads Arrow [`RecordBatch`]es from bytes in the [IPC Streaming Format].
+///
+/// # See Also
+///
+/// * [`FileReader`] for random access.
+///
+/// # Example
+/// ```
+/// # use arrow_array::record_batch;
+/// # use arrow_ipc::reader::StreamReader;
+/// # use arrow_ipc::writer::StreamWriter;
+/// # let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
+/// # let mut stream = vec![]; // mimic a stream for the example
+/// # {
+/// # let mut writer = StreamWriter::try_new(&mut stream,
&batch.schema()).unwrap();
+/// # writer.write(&batch).unwrap();
+/// # writer.finish().unwrap();
+/// # }
+/// # let stream = stream.as_slice();
+/// let projection = None; // read all columns
+/// let mut reader = StreamReader::try_new(stream, projection).unwrap();
+/// // read batches from the reader using the Iterator trait
+/// let mut num_rows = 0;
+/// for batch in reader {
+/// let batch = batch.unwrap();
+/// num_rows += batch.num_rows();
+/// }
+/// assert_eq!(num_rows, 3);
+/// ```
+///
+/// [IPC Streaming Format]:
https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
pub struct StreamReader<R> {
/// Stream reader
reader: R,
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index ee5b9a54cc..1581df56de 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -17,8 +17,12 @@
//! Arrow IPC File and Stream Writers
//!
-//! The `FileWriter` and `StreamWriter` have similar interfaces,
-//! however the `FileWriter` expects a reader that supports `Seek`ing
+//! # Notes
+//!
+//! [`FileWriter`] and [`StreamWriter`] have similar interfaces,
+//! however the [`FileWriter`] expects a reader that supports [`Seek`]ing
+//!
+//! [`Seek`]: std::io::Seek
use std::cmp::min;
use std::collections::HashMap;
@@ -188,7 +192,7 @@ impl Default for IpcWriteOptions {
/// Handles low level details of encoding [`Array`] and [`Schema`] into the
/// [Arrow IPC Format].
///
-/// # Example:
+/// # Example
/// ```
/// # fn run() {
/// # use std::sync::Arc;
@@ -905,7 +909,28 @@ impl DictionaryTracker {
}
}
-/// Writer for an IPC file
+/// Arrow File Writer
+///
+/// Writes Arrow [`RecordBatch`]es in the [IPC File Format].
+///
+/// # See Also
+///
+/// * [`StreamWriter`] for writing IPC Streams
+///
+/// # Example
+/// ```
+/// # use arrow_array::record_batch;
+/// # use arrow_ipc::writer::FileWriter;
+/// # let mut file = vec![]; // mimic a file for the example
+/// let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
+/// // create a new writer, the schema must be known in advance
+/// let mut writer = FileWriter::try_new(&mut file,
&batch.schema()).unwrap();
+/// // write each batch to the underlying writer
+/// writer.write(&batch).unwrap();
+/// // When all batches are written, call finish to flush all buffers
+/// writer.finish().unwrap();
+/// ```
+/// [IPC File Format]:
https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
pub struct FileWriter<W> {
/// The object to write to
writer: W,
@@ -1108,7 +1133,7 @@ impl<W: Write> FileWriter<W> {
Ok(())
}
- /// Unwraps the the underlying writer.
+ /// Unwraps the underlying writer.
///
/// The writer is flushed and the FileWriter is finished before returning.
///
@@ -1135,7 +1160,29 @@ impl<W: Write> RecordBatchWriter for FileWriter<W> {
}
}
-/// Writer for an IPC stream
+/// Arrow Stream Writer
+///
+/// Writes Arrow [`RecordBatch`]es to bytes using the [IPC Streaming Format].
+///
+/// # See Also
+///
+/// * [`FileWriter`] for writing IPC Files
+///
+/// # Example
+/// ```
+/// # use arrow_array::record_batch;
+/// # use arrow_ipc::writer::StreamWriter;
+/// # let mut stream = vec![]; // mimic a stream for the example
+/// let batch = record_batch!(("a", Int32, [1, 2, 3])).unwrap();
+/// // create a new writer, the schema must be known in advance
+/// let mut writer = StreamWriter::try_new(&mut stream,
&batch.schema()).unwrap();
+/// // write each batch to the underlying stream
+/// writer.write(&batch).unwrap();
+/// // When all batches are written, call finish to flush all buffers
+/// writer.finish().unwrap();
+/// ```
+///
+/// [IPC Streaming Format]:
https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
pub struct StreamWriter<W> {
/// The object to write to
writer: W,