prtkgaur commented on code in PR #48345: URL: https://github.com/apache/arrow/pull/48345#discussion_r3196662887
########## cpp/cmake_modules/SetupCxxFlags.cmake: ########## @@ -127,7 +127,7 @@ elseif(ARROW_CPU_FLAG STREQUAL "ppc") endif() elseif(ARROW_CPU_FLAG STREQUAL "aarch64") # Arm64 compiler flags, gcc/clang only - set(ARROW_ARMV8_MARCH "armv8-a") + set(ARROW_ARMV8_MARCH "native") Review Comment: I was trying to get an idea of the performance while iterating. Reverted. ########## cpp/src/arrow/util/alp/alp_constants.h: ########## @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Constants and type traits for ALP compression + +#pragma once + +#include <cstdint> + +#include "arrow/util/logging.h" + +namespace arrow { +namespace util { +namespace alp { + +// ---------------------------------------------------------------------- +// AlpConstants + +/// \brief Constants used throughout ALP compression +class AlpConstants { + public: + /// Number of elements compressed together as a unit. Fixed for compatibility. + static constexpr uint64_t kAlpVectorSize = 1024; Review Comment: Are you suggesting it should be present somewhere in the config? 
########## cpp/cmake_modules/SetupCxxFlags.cmake: ########## @@ -621,11 +621,11 @@ endif() if(NOT MSVC) set(C_RELEASE_FLAGS "") if(CMAKE_C_FLAGS_RELEASE MATCHES "-O3") - string(APPEND C_RELEASE_FLAGS " -O2") Review Comment: Reverted. I might want to add this to the README at some point. ########## cpp/src/arrow/util/alp/alp.cc: ########## @@ -0,0 +1,1039 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/alp/alp.h" + +#include <cmath> +#include <cstring> +#include <functional> +#include <iostream> +#include <map> + +#include "arrow/util/alp/alp_constants.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" +#include "arrow/util/span.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace util { +namespace alp { + +// ALP serialization uses memcpy for multi-byte integers (frame_of_reference, +// num_exceptions, offsets) and assumes little-endian byte order on disk. 
+static_assert(ARROW_LITTLE_ENDIAN, + "ALP serialization assumes little-endian byte order"); + +// ---------------------------------------------------------------------- +// AlpEncodedVectorInfo implementation (non-templated, 4 bytes) + +void AlpEncodedVectorInfo::Store(arrow::util::span<char> output_buffer) const { + ARROW_CHECK(output_buffer.size() >= GetStoredSize()) Review Comment: added \pre output_buffer.size() >= kStoredSize to the Store declarations in alp.h. ########## cpp/src/arrow/util/alp/alp.cc: ########## @@ -0,0 +1,1039 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/alp/alp.h" + +#include <cmath> +#include <cstring> +#include <functional> +#include <iostream> +#include <map> + +#include "arrow/util/alp/alp_constants.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" +#include "arrow/util/span.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace util { +namespace alp { + +// ALP serialization uses memcpy for multi-byte integers (frame_of_reference, +// num_exceptions, offsets) and assumes little-endian byte order on disk. +static_assert(ARROW_LITTLE_ENDIAN, + "ALP serialization assumes little-endian byte order"); + +// ---------------------------------------------------------------------- +// AlpEncodedVectorInfo implementation (non-templated, 4 bytes) + +void AlpEncodedVectorInfo::Store(arrow::util::span<char> output_buffer) const { + ARROW_CHECK(output_buffer.size() >= GetStoredSize()) + << "alp_vector_info_output_too_small: " << output_buffer.size() << " vs " + << GetStoredSize(); + + char* ptr = output_buffer.data(); + + // exponent, factor: 1 byte each + *ptr++ = static_cast<char>(exponent); + *ptr++ = static_cast<char>(factor); + + // num_exceptions: 2 bytes + std::memcpy(ptr, &num_exceptions, sizeof(num_exceptions)); +} + +AlpEncodedVectorInfo AlpEncodedVectorInfo::Load( + arrow::util::span<const char> input_buffer) { + ARROW_CHECK(input_buffer.size() >= GetStoredSize()) Review Comment: Done — the header now documents \return the loaded metadata, or Status::Invalid if the buffer is too small. The validation is part of the return contract rather than a hidden ARROW_CHECK. ########## cpp/src/arrow/util/alp/alp.cc: ########## @@ -0,0 +1,1039 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/alp/alp.h" + +#include <cmath> +#include <cstring> +#include <functional> +#include <iostream> +#include <map> + +#include "arrow/util/alp/alp_constants.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" +#include "arrow/util/span.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace util { +namespace alp { + +// ALP serialization uses memcpy for multi-byte integers (frame_of_reference, +// num_exceptions, offsets) and assumes little-endian byte order on disk. 
+static_assert(ARROW_LITTLE_ENDIAN, + "ALP serialization assumes little-endian byte order"); + +// ---------------------------------------------------------------------- +// AlpEncodedVectorInfo implementation (non-templated, 4 bytes) + +void AlpEncodedVectorInfo::Store(arrow::util::span<char> output_buffer) const { + ARROW_CHECK(output_buffer.size() >= GetStoredSize()) + << "alp_vector_info_output_too_small: " << output_buffer.size() << " vs " + << GetStoredSize(); + + char* ptr = output_buffer.data(); + + // exponent, factor: 1 byte each + *ptr++ = static_cast<char>(exponent); + *ptr++ = static_cast<char>(factor); + + // num_exceptions: 2 bytes + std::memcpy(ptr, &num_exceptions, sizeof(num_exceptions)); +} + +AlpEncodedVectorInfo AlpEncodedVectorInfo::Load( + arrow::util::span<const char> input_buffer) { + ARROW_CHECK(input_buffer.size() >= GetStoredSize()) + << "alp_vector_info_input_too_small: " << input_buffer.size() << " vs " + << GetStoredSize(); + + AlpEncodedVectorInfo result{}; + const char* ptr = input_buffer.data(); + + // exponent, factor: 1 byte each + result.exponent = static_cast<uint8_t>(*ptr++); + result.factor = static_cast<uint8_t>(*ptr++); + + // num_exceptions: 2 bytes + std::memcpy(&result.num_exceptions, ptr, sizeof(result.num_exceptions)); Review Comment: Sorry, can you be more specific? We're already using std::memcpy for multi-byte stores, which is the safe store pattern: there is no alignment UB, and endianness is covered by the static_assert(ARROW_LITTLE_ENDIAN) at the top of the file. ########## cpp/src/arrow/util/alp/alp.cc: ########## @@ -0,0 +1,1039 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/alp/alp.h" + +#include <cmath> +#include <cstring> +#include <functional> +#include <iostream> +#include <map> + +#include "arrow/util/alp/alp_constants.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" +#include "arrow/util/span.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace util { +namespace alp { + +// ALP serialization uses memcpy for multi-byte integers (frame_of_reference, +// num_exceptions, offsets) and assumes little-endian byte order on disk. 
+static_assert(ARROW_LITTLE_ENDIAN, + "ALP serialization assumes little-endian byte order"); + +// ---------------------------------------------------------------------- +// AlpEncodedVectorInfo implementation (non-templated, 4 bytes) + +void AlpEncodedVectorInfo::Store(arrow::util::span<char> output_buffer) const { + ARROW_CHECK(output_buffer.size() >= GetStoredSize()) + << "alp_vector_info_output_too_small: " << output_buffer.size() << " vs " + << GetStoredSize(); + + char* ptr = output_buffer.data(); + + // exponent, factor: 1 byte each + *ptr++ = static_cast<char>(exponent); + *ptr++ = static_cast<char>(factor); + + // num_exceptions: 2 bytes + std::memcpy(ptr, &num_exceptions, sizeof(num_exceptions)); Review Comment: AlpEncodedVectorInfo::Load now returns Result<AlpEncodedVectorInfo> and validates buffer size before reading, returning Status::Invalid on malformed input instead of crashing. ########## cpp/src/arrow/util/alp/alp.cc: ########## @@ -0,0 +1,1039 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/alp/alp.h" + +#include <cmath> +#include <cstring> +#include <functional> +#include <iostream> +#include <map> + +#include "arrow/util/alp/alp_constants.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" +#include "arrow/util/span.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace util { +namespace alp { + +// ALP serialization uses memcpy for multi-byte integers (frame_of_reference, +// num_exceptions, offsets) and assumes little-endian byte order on disk. +static_assert(ARROW_LITTLE_ENDIAN, + "ALP serialization assumes little-endian byte order"); + +// ---------------------------------------------------------------------- +// AlpEncodedVectorInfo implementation (non-templated, 4 bytes) + +void AlpEncodedVectorInfo::Store(arrow::util::span<char> output_buffer) const { + ARROW_CHECK(output_buffer.size() >= GetStoredSize()) + << "alp_vector_info_output_too_small: " << output_buffer.size() << " vs " + << GetStoredSize(); + + char* ptr = output_buffer.data(); + + // exponent, factor: 1 byte each + *ptr++ = static_cast<char>(exponent); + *ptr++ = static_cast<char>(factor); + + // num_exceptions: 2 bytes + std::memcpy(ptr, &num_exceptions, sizeof(num_exceptions)); +} + +AlpEncodedVectorInfo AlpEncodedVectorInfo::Load( + arrow::util::span<const char> input_buffer) { + ARROW_CHECK(input_buffer.size() >= GetStoredSize()) + << "alp_vector_info_input_too_small: " << input_buffer.size() << " vs " + << GetStoredSize(); + + AlpEncodedVectorInfo result{}; + const char* ptr = input_buffer.data(); + + // exponent, factor: 1 byte each + result.exponent = static_cast<uint8_t>(*ptr++); + result.factor = static_cast<uint8_t>(*ptr++); + + // num_exceptions: 2 bytes + std::memcpy(&result.num_exceptions, ptr, sizeof(result.num_exceptions)); + + return result; +} + +// 
---------------------------------------------------------------------- +// AlpEncodedForVectorInfo implementation (templated, 5/9 bytes) + +template <typename T> +void AlpEncodedForVectorInfo<T>::Store(arrow::util::span<char> output_buffer) const { + ARROW_CHECK(output_buffer.size() >= GetStoredSize()) + << "alp_for_vector_info_output_too_small: " << output_buffer.size() << " vs " + << GetStoredSize(); + + char* ptr = output_buffer.data(); + + // frame_of_reference: 4 bytes for float, 8 bytes for double + std::memcpy(ptr, &frame_of_reference, sizeof(frame_of_reference)); Review Comment: AlpEncodedForVectorInfo::Load now returns Result<AlpEncodedForVectorInfo> with buffer size validation, same pattern as AlpEncodedVectorInfo::Load. Is that what you meant? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
