etseidl commented on code in PR #9868:
URL: https://github.com/apache/arrow-rs/pull/9868#discussion_r3175605685


##########
parquet/src/parquet_thrift.rs:
##########
@@ -1106,4 +1154,55 @@ pub(crate) mod tests {
         assert_eq!(header.size, 0);
         assert_eq!(header.element_type, ElementType::Byte);
     }
+
+    /// Regression test for a Thrift list header that claims a multi-billion
+    /// element size on a few bytes of input.
+    ///
+    /// Prior to capping `Vec::with_capacity` against the remaining input,
+    /// this caused `alloc::raw_vec::handle_error` to panic with
+    /// "capacity overflow" before any element was actually decoded.
+    /// After the fix the capacity is clamped to the remaining buffer and
+    /// the read fails cleanly with EOF when the (still claimed-but-large)
+    /// element loop runs out of input.
+    #[test]
+    fn test_read_thrift_vec_huge_size_does_not_panic() {

Review Comment:
   I tried this test with a recent main branch and it passed (returns EOF 
error).



##########
parquet/src/parquet_thrift.rs:
##########
@@ -1106,4 +1154,55 @@ pub(crate) mod tests {
         assert_eq!(header.size, 0);
         assert_eq!(header.element_type, ElementType::Byte);
     }
+
+    /// Regression test for a Thrift list header that claims a multi-billion
+    /// element size on a few bytes of input.
+    ///
+    /// Prior to capping `Vec::with_capacity` against the remaining input,
+    /// this caused `alloc::raw_vec::handle_error` to panic with
+    /// "capacity overflow" before any element was actually decoded.
+    /// After the fix the capacity is clamped to the remaining buffer and
+    /// the read fails cleanly with EOF when the (still claimed-but-large)
+    /// element loop runs out of input.
+    #[test]
+    fn test_read_thrift_vec_huge_size_does_not_panic() {
+        // Construct a list header that takes the "follow-up varint" branch
+        // (high nibble = 0xF) and supplies a varint that decodes to a
+        // multi-billion `size`. Pre-fix this caused
+        // `Vec::with_capacity(size)` to panic with "capacity overflow"
+        // before any element was decoded.
+        let mut data: Vec<u8> = vec![0xF8]; // element_type=8 (Binary), 
0xF=read varint
+        // Pack a varint that decodes to roughly i32::MAX as the size.
+        data.extend_from_slice(&[0xff, 0xff, 0xff, 0xff, 0x07]);
+        let mut prot = ThriftSliceInputProtocol::new(&data);
+        // We don't care whether this returns Ok or Err — only that it does
+        // not panic.
+        let _ = read_thrift_vec::<&[u8], _>(&mut prot);
+    }
+
+    /// As [`test_read_thrift_vec_huge_size_does_not_panic`] but exercising
+    /// the public Parquet metadata entrypoint that the fuzzer reached.
+    #[test]
+    fn test_decode_metadata_huge_thrift_list_does_not_panic() {

Review Comment:
   Confirmed that this test and the following one do fail without the fix



##########
parquet/src/parquet_thrift.rs:
##########
@@ -678,16 +697,45 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> 
ReadThrift<'a, R> for &'a [u8] {
     }
 }
 
+/// Fallback upper bound for [`Vec::with_capacity`] when the protocol does not
+/// expose a remaining-byte count. Picked conservatively: even the smallest
+/// Thrift element is at least one byte on the wire, and Parquet metadata
+/// payloads larger than this in any single list are well outside the spec.
+const READ_THRIFT_VEC_FALLBACK_CAP: usize = 1 << 20;
+
 /// Read a Thrift encoded [list] from the input protocol object.
 ///
+/// The list size is taken from the wire as an `i32`. Because that value is
+/// untrusted, the up-front [`Vec::with_capacity`] allocation is bounded by
+/// the number of bytes actually remaining in the input (when the protocol
+/// can report it) — every element on the wire costs at least one byte, so
+/// a list whose `size` exceeds the remaining input is malformed by
+/// definition. Streaming protocols that cannot report a remaining count
+/// fall back to [`READ_THRIFT_VEC_FALLBACK_CAP`].
+///
+/// Negative sizes are rejected explicitly. Although the Thrift spec uses an
+/// unsigned varint, a buggy or hostile encoder can produce a value whose
+/// `i32` interpretation is negative.
+///
 /// [list]: 
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
 pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result<Vec<T>>
 where
     R: ThriftCompactInputProtocol<'a>,
     T: ReadThrift<'a, R>,
 {
     let list_ident = prot.read_list_begin()?;
-    let mut res = Vec::with_capacity(list_ident.size as usize);
+    if list_ident.size < 0 {
+        return Err(general_err!(
+            "Thrift list has negative size {}",
+            list_ident.size
+        ));
+    }
+    let claimed = list_ident.size as usize;
+    let cap = match prot.remaining_bytes() {
+        Some(remaining) => claimed.min(remaining),
+        None => claimed.min(READ_THRIFT_VEC_FALLBACK_CAP),
+    };
+    let mut res = Vec::with_capacity(cap);

Review Comment:
   Rather than all this, we can use `Vec::try_reserve_exact` to catch 
allocation errors.
   ```rust
       let len = list_ident.size as usize;
       let mut res = Vec::with_capacity(0);
       if let Err(_) = res.try_reserve_exact(len) {
           return Err(general_err!("cannot allocate vector"));
       }
   ```
   Then we don't need the new `remaining_bytes` function.



##########
parquet/src/parquet_thrift.rs:
##########
@@ -678,16 +697,45 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> 
ReadThrift<'a, R> for &'a [u8] {
     }
 }
 
+/// Fallback upper bound for [`Vec::with_capacity`] when the protocol does not
+/// expose a remaining-byte count. Picked conservatively: even the smallest
+/// Thrift element is at least one byte on the wire, and Parquet metadata
+/// payloads larger than this in any single list are well outside the spec.
+const READ_THRIFT_VEC_FALLBACK_CAP: usize = 1 << 20;
+
 /// Read a Thrift encoded [list] from the input protocol object.
 ///
+/// The list size is taken from the wire as an `i32`. Because that value is
+/// untrusted, the up-front [`Vec::with_capacity`] allocation is bounded by
+/// the number of bytes actually remaining in the input (when the protocol
+/// can report it) — every element on the wire costs at least one byte, so
+/// a list whose `size` exceeds the remaining input is malformed by
+/// definition. Streaming protocols that cannot report a remaining count
+/// fall back to [`READ_THRIFT_VEC_FALLBACK_CAP`].
+///
+/// Negative sizes are rejected explicitly. Although the Thrift spec uses an
+/// unsigned varint, a buggy or hostile encoder can produce a value whose
+/// `i32` interpretation is negative.
+///
 /// [list]: 
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
 pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result<Vec<T>>
 where
     R: ThriftCompactInputProtocol<'a>,
     T: ReadThrift<'a, R>,
 {
     let list_ident = prot.read_list_begin()?;
-    let mut res = Vec::with_capacity(list_ident.size as usize);
+    if list_ident.size < 0 {

Review Comment:
   As I've said, I think this should move to `read_list_header`. With a few 
mods to `ThriftProtocolError`, we can replace 
   ```rust
     self.read_vlq()? as _
   ```
   with
   ```rust
     i32::try_from(self.read_vlq()?)?
   ```
   We'd just need conversion from `TryFromIntError` to `ThriftProtocolError`.
   
   I take back what I said about `i32` being wrong. A closer read makes me 
think a positive integer `<= i32::MAX` is what we want.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to