[GitHub] [arrow] alamb commented on a change in pull request #8698: ARROW-10636: [Rust][Parquet] Switch to Rust Stable by removing specialization in parquet

GitBox Tue, 08 Dec 2020 11:58:55 -0800


alamb commented on a change in pull request #8698:
URL: https://github.com/apache/arrow/pull/8698#discussion_r538756445




##########
File path: rust/parquet/src/data_type.rs
##########
@@ -327,7 +422,12 @@ pub trait AsBytes {
 pub trait SliceAsBytes: Sized {
     /// Returns slice of bytes for a slice of this data type.
     fn slice_as_bytes(self_: &[Self]) -> &[u8];
-    fn slice_as_bytes_mut(self_: &mut [Self]) -> &mut [u8];
+    /// Return the internal representation as a mutable slice
+    ///
+    /// # Safety
+    /// If modified you are _required_ to ensure the internal representation
+    /// is valid and correct for the actual raw data
+    unsafe fn slice_as_bytes_mut(self_: &mut [Self]) -> &mut [u8];

Review comment:
       Notes to other reviewers -- this isn't any new unsafe that was added -- 
I think it was just hoised into a trait (previously the code inside the impl 
was marked as unsafe)

##########
File path: rust/parquet/src/encodings/decoding.rs
##########
@@ -384,36 +278,18 @@ impl<T: DataType> Decoder<T> for DictDecoder<T> {
 /// See [`RleValueEncoder`](crate::encoding::RleValueEncoder) for more 
information.
 pub struct RleValueDecoder<T: DataType> {
     values_left: usize,
-    decoder: Option<RleDecoder>,
+    decoder: RleDecoder,

Review comment:
       My results show a significant improvement in rle encoder and decoder, 
FWIW.

##########
File path: rust/parquet/src/data_type.rs
##########
@@ -424,16 +552,396 @@ impl AsBytes for str {
     }
 }
 
-/// Contains the Parquet physical type information as well as the Rust 
primitive type
-/// presentation.
-pub trait DataType: 'static {
-    type T: std::cmp::PartialEq
+pub(crate) mod private {
+    use crate::encodings::decoding::PlainDecoderDetails;
+    use crate::util::bit_util::{BitWriter, BitReader};
+    use crate::util::memory::ByteBufferPtr;
+
+    use byteorder::ByteOrder;
+    use std::convert::TryInto;
+
+    use super::{Result, ParquetError, SliceAsBytes};
+
+    pub type BitIndex = u64;
+
+    /// Sealed trait to start to remove specialisation from implementations
+    ///
+    /// This is done to force the associated value type to be unimplementable 
outside of this
+    /// crate, and thus hint to the type system (and end user) traits are 
public for the contract
+    /// and not for extension.
+    pub trait ParquetValueType : std::cmp::PartialEq
         + std::fmt::Debug
+        + std::fmt::Display
         + std::default::Default
         + std::clone::Clone
-        + AsBytes
-        + FromBytes
-        + PartialOrd;
+        + super::AsBytes
+        + super::FromBytes
+        + super::SliceAsBytes
+        + PartialOrd
+    {
+        /// Encode the value directly from a higher level encoder
+        fn encode<W: std::io::Write>(values: &[Self], writer: &mut W, 
bit_writer: &mut BitWriter) -> Result<()>;
+
+        /// Establish the data that will be decoded in a buffer
+        fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, 
num_values: usize);
+
+        /// Decode the value from a given buffer for a higher level decoder
+        fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> 
Result<usize>;
+
+        /// Return the encoded size for a type
+        fn dict_encoding_size(&self) -> (usize, usize) {
+            (std::mem::size_of::<Self>(), 1)
+        }
+
+        /// Return the value as i64 if possible
+        ///
+        /// This is essentially the same as `std::convert::TryInto<i64>` but 
can
+        /// implemented for `f32` and `f64`, types that would fail orphan rules
+        fn as_i64(&self) -> Result<i64> {
+            Err(general_err!("Type cannot be converted to i64"))
+        }
+
+        /// Return the value as u64 if possible
+        ///
+        /// This is essentially the same as `std::convert::TryInto<u64>` but 
can
+        /// implemented for `f32` and `f64`, types that would fail orphan rules
+        fn as_u64(&self) -> Result<u64> {
+            self.as_i64()
+                .map_err(|_| general_err!("Type cannot be converted to u64"))
+                .map(|x| x as u64)
+        }
+
+        /// Return the value as an Any to allow for downcasts without 
transmutation
+        fn as_any(&self) -> &dyn std::any::Any;
+
+        /// Return the value as an mutable Any to allow for downcasts without 
transmutation
+        fn as_mut_any(&mut self) -> &mut dyn std::any::Any;
+    }
+
+    impl ParquetValueType for bool {
+        #[inline]
+        fn encode<W: std::io::Write>(values: &[Self], _: &mut W, bit_writer: 
&mut BitWriter) -> Result<()> {
+            for value in values {
+                bit_writer.put_value(*value as u64, 1);
+            }
+            Ok(())
+        }
+
+        #[inline]
+        fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, 
num_values: usize) {
+            decoder.bit_reader.replace(BitReader::new(data));
+            decoder.num_values = num_values;
+        }
+
+        #[inline]
+        fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> 
Result<usize> {
+            let bit_reader = decoder.bit_reader.as_mut().unwrap();
+            let num_values = std::cmp::min(buffer.len(), decoder.num_values);
+            let values_read = bit_reader.get_batch(&mut buffer[..num_values], 
1);
+            decoder.num_values -= values_read;
+            Ok(values_read)
+        }
+
+        #[inline]
+        fn as_i64(&self) -> Result<i64> {
+            Ok(*self as i64)
+        }
+
+        #[inline]
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        #[inline]
+        fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
+            self
+        }
+    }
+
+    /// Hopelessly unsafe function that emulates `num::as_ne_bytes`
+    ///
+    /// It is not recommended to use this outside of this private module as, 
while it
+    /// _should_ work for primitive values, it is little better than a 
transmutation
+    /// and can act as a backdoor into mis-interpreting types as arbitary byte 
slices
+    #[inline]
+    fn as_raw<'a, T>(value: *const T) -> &'a [u8] {
+        unsafe {
+            let value = value as *const u8;
+            std::slice::from_raw_parts(value, std::mem::size_of::<T>())
+        }
+    }
+
+    macro_rules! impl_from_raw {
+        ($ty: ty, $self: ident => $as_i64: block) => {
+            impl ParquetValueType for $ty {
+                #[inline]
+                fn encode<W: std::io::Write>(values: &[Self], writer: &mut W, 
_: &mut BitWriter) -> Result<()> {
+                    let raw = unsafe {
+                        std::slice::from_raw_parts(
+                            values.as_ptr() as *const u8,
+                            std::mem::size_of::<$ty>() * values.len(),
+                        )
+                    };
+                    writer.write_all(raw)?;
+
+                    Ok(())
+                }
+
+                #[inline]
+                fn set_data(decoder: &mut PlainDecoderDetails, data: 
ByteBufferPtr, num_values: usize) {
+                    decoder.data.replace(data);
+                    decoder.start = 0;
+                    decoder.num_values = num_values;
+                }
+
+                #[inline]
+                fn decode(buffer: &mut [Self], decoder: &mut 
PlainDecoderDetails) -> Result<usize> {
+                    let data = decoder.data.as_mut().unwrap();
+                    let num_values = std::cmp::min(buffer.len(), 
decoder.num_values);
+                    let bytes_left = data.len() - decoder.start;
+                    let bytes_to_decode = std::mem::size_of::<Self>() * 
num_values;
+
+                    if bytes_left < bytes_to_decode {
+                        return Err(eof_err!("Not enough bytes to decode"));
+                    }
+
+                    // SAFETY: Raw types should be as per the standard rust 
bit-vectors
+                    unsafe {
+                        let raw_buffer = &mut 
Self::slice_as_bytes_mut(buffer)[..bytes_to_decode];
+                        raw_buffer.copy_from_slice(data.range(decoder.start, 
bytes_to_decode).as_ref());
+                    };
+                    decoder.start += bytes_to_decode;
+                    decoder.num_values -= num_values;
+
+                    Ok(num_values)
+                }
+
+                #[inline]
+                fn as_i64(&$self) -> Result<i64> {
+                    $as_i64
+                }
+
+                #[inline]
+                fn as_any(&self) -> &dyn std::any::Any {
+                    self
+                }
+
+                #[inline]
+                fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
+                    self
+                }
+            }
+        }
+    }
+
+    impl_from_raw!(i32, self => { Ok(*self as i64) });
+    impl_from_raw!(i64, self => { Ok(*self) });
+    impl_from_raw!(f32, self => { Err(general_err!("Type cannot be converted 
to i64")) });
+    impl_from_raw!(f64, self => { Err(general_err!("Type cannot be converted 
to i64")) });
+
+    impl ParquetValueType for super::Int96 {
+        #[inline]
+        fn encode<W: std::io::Write>(values: &[Self], writer: &mut W, _: &mut 
BitWriter) -> Result<()> {
+            for value in values {
+                let raw = unsafe {
+                    std::slice::from_raw_parts(value.data() as *const [u32] as 
*const u8, 12)
+                };
+                writer.write_all(raw)?;
+            }
+            Ok(())
+        }
+
+        #[inline]
+        fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, 
num_values: usize) {
+            decoder.data.replace(data);
+            decoder.start = 0;
+            decoder.num_values = num_values;
+        }
+
+        #[inline]
+        fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> 
Result<usize> {
+            // TODO - Remove the duplication between this and the general 
slice method
+            let data = decoder.data.as_ref().expect("set_data should have been 
called");
+            let num_values = std::cmp::min(buffer.len(), decoder.num_values);
+            let bytes_left = data.len() - decoder.start;
+            let bytes_to_decode = 12 * num_values;
+
+            if bytes_left < bytes_to_decode {
+                return Err(eof_err!("Not enough bytes to decode"));
+            }
+
+            let data_range = data.range(decoder.start, bytes_to_decode);
+            let bytes: &[u8] = data_range.data();
+            decoder.start += bytes_to_decode;
+
+            let mut pos = 0; // position in byte array
+            for i in 0..num_values {
+                let elem0 = byteorder::LittleEndian::read_u32(&bytes[pos..pos 
+ 4]);
+                let elem1 = byteorder::LittleEndian::read_u32(&bytes[pos + 
4..pos + 8]);
+                let elem2 = byteorder::LittleEndian::read_u32(&bytes[pos + 
8..pos + 12]);
+
+                buffer[i]
+                    .as_mut_any()
+                    .downcast_mut::<Self>()
+                    .unwrap()
+                    .set_data(elem0, elem1, elem2);
+
+                pos += 12;
+            }
+            decoder.num_values -= num_values;
+
+            Ok(num_values)
+        }
+
+        #[inline]
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        #[inline]
+        fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
+            self
+        }
+    }
+
+    // TODO - Why does macro importing fail?

Review comment:
       I don't understand this comment

##########
File path: rust/parquet/src/data_type.rs
##########
@@ -229,6 +235,82 @@ impl fmt::Display for ByteArray {
     }
 }
 
+/// Wrapper type for performance reasons, this represents 
`FIXED_LEN_BYTE_ARRAY` but in all other
+/// considerations behaves the same as `ByteArray`
+#[repr(transparent)]
+#[derive(Clone, Debug, Default)]
+pub struct FixedLenByteArray(ByteArray);

Review comment:
       I suggest you add this context in the comments of this file, for the 
benefit of future readers (who will likely look at the code and wonder why 
there is a wrapper type that doesn't seem to add any additional logic)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] alamb commented on a change in pull request #8698: ARROW-10636: [Rust][Parquet] Switch to Rust Stable by removing specialization in parquet

Reply via email to