Re: [PR] GH-48701: [C++][Parquet] Add ALPpd encoding [arrow]

via GitHub Wed, 24 Jun 2026 23:52:17 -0700


emkornfield commented on code in PR #48345:
URL: https://github.com/apache/arrow/pull/48345#discussion_r3472435587



##########
cpp/src/arrow/util/alp/alp.h:
##########
@@ -0,0 +1,795 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Adaptive Lossless floating-Point (ALP) compression implementation
+
+#pragma once
+
+#include <optional>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/alp/alp_constants.h"
+#include "arrow/util/span.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+// ----------------------------------------------------------------------
+// ALP Overview
+//
+// IMPORTANT: For abstract interfaces or examples how to use ALP, consult
+// alp_codec.h.
+// This is our implementation of the adaptive lossless floating-point
+// compression for decimals (ALP) (https://dl.acm.org/doi/10.1145/3626717).
+// It works by converting a float into a decimal (if possible). The exponent
+// and factor are chosen per vector. Each float is converted using
+// c(f) = int64(f * 10^exponent * 10^-factor). The converted floats are then
+// encoded via a delta frame of reference and bitpacked. Every exception,
+// where the conversion/reconversion changes the value of the float, is stored
+// separately and has to be patched into the decompressed vector afterwards.
+//
+// ==========================================================================
+//                    ALP COMPRESSION/DECOMPRESSION PIPELINE
+// ==========================================================================
+//
+// COMPRESSION FLOW:
+// -----------------
+//
+//   Input: float/double array
+//        |
+//        v
+//   +------------------------------------------------------------------+
+//   | 1. SAMPLING & PRESET GENERATION                                  |
+//   |    * Sample vectors from dataset                                 |
+//   |    * Try all exponent/factor combinations (e, f)                 |
+//   |    * Select best k combinations for preset                       |
+//   +------------------------------------+-----------------------------+
+//                                        | preset.combinations
+//                                        v
+//   +------------------------------------------------------------------+
+//   | 2. PER-VECTOR COMPRESSION                                        |
+//   |    a) Find best (e,f) from preset for this vector                |
+//   |    b) Encode: encoded[i] = int64(value[i] * 10^e * 10^-f)        |
+//   |    c) Verify: if decode(encoded[i]) != value[i] -> exception     |
+//   |    d) Replace exceptions with placeholder value                  |
+//   +------------------------------------+-----------------------------+
+//                                        | encoded integers + exceptions
+//                                        v
+//   +------------------------------------------------------------------+
+//   | 3. FRAME OF REFERENCE (FOR)                                      |
+//   |    * Find min value in encoded integers                          |
+//   |    * Subtract min from all values: delta[i] = encoded[i] - min   |
+//   +------------------------------------+-----------------------------+
+//                                        | delta values (smaller range)
+//                                        v
+//   +------------------------------------------------------------------+
+//   | 4. BIT PACKING                                                   |
+//   |    * Calculate bit_width = log2(max_delta)                       |
+//   |    * Pack each value into bit_width bits                         |
+//   |    * Result: tightly packed binary data                          |
+//   +------------------------------------+-----------------------------+
+//                                        | packed bytes
+//                                        v
+//   +------------------------------------------------------------------+
+//   | 5. SERIALIZATION (offset-based interleaved layout)              |
+//   |    [Header][Offsets...][Vector₀][Vector₁]...                    |
+//   |    where each Vector = [AlpInfo|ForInfo|Data]                   |
+//   +------------------------------------------------------------------+
+//
+//
+// DECOMPRESSION FLOW:
+// -------------------
+//
+//   Serialized bytes -> AlpEncodedVector::Load()
+//        |
+//        v
+//   +------------------------------------------------------------------+
+//   | 1. BIT UNPACKING                                                 |
+//   |    * Extract bit_width from metadata                             |
+//   |    * Unpack each value from bit_width bits -> delta values       |
+//   +------------------------------------+-----------------------------+
+//                                        | delta values
+//                                        v
+//   +------------------------------------------------------------------+
+//   | 2. REVERSE FRAME OF REFERENCE (unFOR)                            |
+//   |    * Add back min: encoded[i] = delta[i] + frame_of_reference    |
+//   +------------------------------------+-----------------------------+
+//                                        | encoded integers
+//                                        v
+//   +------------------------------------------------------------------+
+//   | 3. DECODE                                                        |
+//   |    * Apply inverse formula: value[i] = encoded[i] * 10^-e * 10^f |
+//   +------------------------------------+-----------------------------+
+//                                        | decoded floats (with placeholders)
+//                                        v
+//   +------------------------------------------------------------------+
+//   | 4. PATCH EXCEPTIONS                                              |
+//   |    * Replace values at exception_positions[] with exceptions[]   |
+//   +------------------------------------+-----------------------------+
+//                                        |
+//                                        v
+//   Output: Original float/double array (lossless!)
+//
+// ==========================================================================
+
+// ----------------------------------------------------------------------
+// AlpMode
+
+/// \brief ALP compression mode
+///
+/// Currently only ALP (decimal compression) is implemented.
+enum class AlpMode { kAlp = 0 };
+
+// ----------------------------------------------------------------------
+// AlpExponentAndFactor
+
+/// \brief Helper struct to encapsulate the exponent and factor
+struct AlpExponentAndFactor {
+  uint8_t exponent{0};
+  uint8_t factor{0};
+
+  bool operator==(const AlpExponentAndFactor& other) const {
+    return exponent == other.exponent && factor == other.factor;
+  }
+
+  /// \brief Comparison operator for deterministic std::map ordering
+  bool operator<(const AlpExponentAndFactor& other) const {
+    if (exponent != other.exponent) return exponent < other.exponent;
+    return factor < other.factor;
+  }
+};
+
+// ----------------------------------------------------------------------
+// AlpEncodedVectorInfo (non-templated, ALP core metadata)
+
+/// \brief ALP-specific metadata for an encoded vector (non-templated)
+///
+/// Contains the metadata specific to ALP's float-to-integer conversion:
+///   - exponent/factor: parameters for decimal encoding
+///   - num_exceptions: count of values that couldn't be losslessly encoded
+///
+/// This struct is the same size regardless of the floating-point type 
(float/double).
+/// It is separate from the integer encoding metadata (e.g., FOR) to allow
+/// different integer encodings to be used in the future.
+///
+/// Serialization format (4 bytes):
+///
+///   +------------------------------------------+
+///   |  AlpEncodedVectorInfo (4 bytes)          |
+///   +------------------------------------------+
+///   |  Offset |  Field              |  Size    |
+///   +---------+---------------------+----------+
+///   |    0    |  exponent (uint8_t) |  1 byte  |
+///   |    1    |  factor (uint8_t)   |  1 byte  |
+///   |    2    |  num_exceptions     |  2 bytes |
+///   +------------------------------------------+
+class AlpEncodedVectorInfo {
+ public:
+  AlpEncodedVectorInfo() = default;
+  AlpEncodedVectorInfo(uint8_t exponent, uint8_t factor, int16_t 
num_exceptions)
+      : exponent_(exponent), factor_(factor), num_exceptions_(num_exceptions) 
{}
+
+  uint8_t exponent() const { return exponent_; }
+  uint8_t factor() const { return factor_; }
+  int16_t num_exceptions() const { return num_exceptions_; }
+
+  void set_exponent(uint8_t exponent) { exponent_ = exponent; }
+  void set_factor(uint8_t factor) { factor_ = factor; }
+  void set_num_exceptions(int16_t num_exceptions) { num_exceptions_ = 
num_exceptions; }
+
+  /// Size of the serialized portion (4 bytes, fixed)
+  static constexpr int64_t kStoredSize =
+      sizeof(uint8_t) + sizeof(uint8_t) + sizeof(int16_t);
+  static_assert(kStoredSize == 4, "AlpEncodedVectorInfo stored size must be 4 
bytes");
+
+  /// \brief Store the ALP metadata into an output buffer
+  ///
+  /// \pre output_buffer.size() >= kStoredSize
+  void Store(arrow::util::span<uint8_t> output_buffer) const;
+
+  /// \brief Load ALP metadata from an input buffer
+  ///
+  /// \return the loaded metadata, or Status::Invalid if the buffer is too 
small
+  static Result<AlpEncodedVectorInfo> Load(arrow::util::span<const uint8_t> 
input_buffer);
+
+  /// \brief Get serialized size of the ALP metadata
+  static int64_t GetStoredSize() { return kStoredSize; }
+
+  /// \brief Get exponent and factor as a combined struct
+  AlpExponentAndFactor GetExponentAndFactor() const {
+    return AlpExponentAndFactor{exponent_, factor_};
+  }
+
+  bool operator==(const AlpEncodedVectorInfo& other) const {
+    return exponent_ == other.exponent_ && factor_ == other.factor_ &&
+           num_exceptions_ == other.num_exceptions_;
+  }
+
+  bool operator!=(const AlpEncodedVectorInfo& other) const { return !(*this == 
other); }
+
+ private:
+  uint8_t exponent_ = 0;
+  uint8_t factor_ = 0;
+  int16_t num_exceptions_ = 0;
+};
+
+// ----------------------------------------------------------------------
+// AlpEncodedForVectorInfo (templated, FOR integer encoding metadata)
+
+/// \brief FOR (Frame of Reference) encoding metadata for an encoded vector
+///
+/// Contains the metadata specific to FOR bit-packing integer encoding:
+///   - frame_of_reference: minimum value subtracted from all encoded integers
+///   - bit_width: number of bits used to pack each delta value
+///
+/// This struct is templated because frame_of_reference size depends on T:
+///   - float:  uint32_t frame_of_reference (4 bytes)
+///   - double: uint64_t frame_of_reference (8 bytes)
+///
+/// Serialization format for float (5 bytes):
+///
+///   +------------------------------------------+
+///   |  AlpEncodedForVectorInfo<float> (5B)     |
+///   +------------------------------------------+
+///   |  Offset |  Field              |  Size    |
+///   +---------+---------------------+----------+
+///   |    0    |  frame_of_reference |  4 bytes |
+///   |    4    |  bit_width (uint8_t)|  1 byte  |
+///   +------------------------------------------+
+///
+/// Serialization format for double (9 bytes):
+///
+///   +------------------------------------------+
+///   |  AlpEncodedForVectorInfo<double> (9B)    |
+///   +------------------------------------------+
+///   |  Offset |  Field              |  Size    |
+///   +---------+---------------------+----------+
+///   |    0    |  frame_of_reference |  8 bytes |
+///   |    8    |  bit_width (uint8_t)|  1 byte  |
+///   +------------------------------------------+
+///
+/// \tparam T the floating point type (float or double)
+template <typename T>
+class AlpEncodedForVectorInfo {
+  static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                "AlpEncodedForVectorInfo only supports float and double");
+
+ public:
+  /// Use uint32_t for float, uint64_t for double (matches encoded integer 
size)
+  using ExactType = typename AlpTypedConstants<T>::FloatingToExact;
+
+  AlpEncodedForVectorInfo() = default;
+  AlpEncodedForVectorInfo(ExactType frame_of_reference, uint8_t bit_width)
+      : frame_of_reference_(frame_of_reference), bit_width_(bit_width) {}
+
+  ExactType frame_of_reference() const { return frame_of_reference_; }
+  uint8_t bit_width() const { return bit_width_; }
+
+  void set_frame_of_reference(ExactType frame_of_reference) {
+    frame_of_reference_ = frame_of_reference;
+  }
+  void set_bit_width(uint8_t bit_width) { bit_width_ = bit_width; }
+
+  /// Size of the serialized portion (5 bytes for float, 9 for double)
+  static constexpr int64_t kStoredSize = sizeof(ExactType) + 1;
+
+  /// \brief Compute the bitpacked size in bytes from num_elements and 
bit_width
+  ///
+  /// \param[in] num_elements number of elements in this vector
+  /// \param[in] bw bits per element
+  /// \return the size in bytes of the bitpacked data
+  static int64_t GetBitPackedSize(int32_t num_elements, uint8_t bw) {

Review Comment:
   MIght have already commented but this probably belongs in a bitutil class.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-48701: [C++][Parquet] Add ALPpd encoding [arrow]

Reply via email to