alamb commented on code in PR #2947: URL: https://github.com/apache/arrow-rs/pull/2947#discussion_r1007171469
########## arrow-array/src/array/byte_array.rs: ########## @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::{empty_offsets, print_long_array}; +use crate::iterator::ArrayIter; +use crate::raw_pointer::RawPtrBox; +use crate::types::bytes::ByteArrayNativeType; +use crate::types::ByteArrayType; +use crate::{Array, ArrayAccessor, OffsetSizeTrait}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use std::any::Any; + +/// Generic struct for variable-size byte arrays +/// +/// See [`StringArray`] and [`LargeStringArray`] for storing string data Review Comment: ```suggestion /// See [`StringArray`] and [`LargeStringArray`] for storing utf8 encoded string data ``` ########## arrow-array/src/types.rs: ########## @@ -464,7 +466,7 @@ impl Date64Type { } } -mod private { +mod decimal { Review Comment: ```suggestion // Crate private / sealed types for Decimal // not intended to be used outside this crate mod decimal { ``` ########## arrow-array/src/array/string_array.rs: ########## @@ -15,67 +15,27 @@ // specific language governing permissions and limitations // under the License. 
-use crate::iterator::GenericStringIter; -use crate::raw_pointer::RawPtrBox; +use crate::types::GenericStringType; use crate::{ - empty_offsets, print_long_array, Array, ArrayAccessor, GenericBinaryArray, - GenericListArray, OffsetSizeTrait, + Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, }; -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; -use std::any::Any; /// Generic struct for \[Large\]StringArray /// /// See [`StringArray`] and [`LargeStringArray`] for storing /// specific string data. -pub struct GenericStringArray<OffsetSize: OffsetSizeTrait> { - data: ArrayData, - value_offsets: RawPtrBox<OffsetSize>, - value_data: RawPtrBox<u8>, -} +pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>; Review Comment: This is a very nice cleanup to remove duplication 🏆 ########## arrow-array/src/types.rs: ########## @@ -574,6 +576,83 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: usize) -> String } } +pub(crate) mod bytes { Review Comment: ```suggestion // Crate private / sealed types for Byte arrays // not intended to be used outside this crate pub(crate) mod bytes { ``` ########## arrow-array/src/array/byte_array.rs: ########## @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one Review Comment: 👍 ########## arrow-array/src/types.rs: ########## @@ -574,6 +576,83 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: usize) -> String } } +pub(crate) mod bytes { + use super::*; + + pub trait ByteArrayTypeSealed {} + impl<O: OffsetSizeTrait> ByteArrayTypeSealed for GenericStringType<O> {} + impl<O: OffsetSizeTrait> ByteArrayTypeSealed for GenericBinaryType<O> {} + + pub trait ByteArrayNativeType: std::fmt::Debug + Send + Sync { + /// # Safety + /// + /// `b` must be a valid byte sequence for `Self` + unsafe fn from_bytes_unchecked(b: &[u8]) 
-> &Self; + } + + impl ByteArrayNativeType for [u8] { + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { + b + } + } + + impl ByteArrayNativeType for str { + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { + std::str::from_utf8_unchecked(b) + } + } +} + +/// A trait over the variable-size byte array types +/// +/// See [Variable Size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) +pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { + type Offset: OffsetSizeTrait; + type Native: bytes::ByteArrayNativeType + AsRef<[u8]> + ?Sized; + const PREFIX: &'static str; Review Comment: ```suggestion /// "Binary" or "String", for use in error messages const PREFIX: &'static str; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org
