alamb commented on code in PR #7875:
URL: https://github.com/apache/arrow-rs/pull/7875#discussion_r2188160176
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -605,29 +605,85 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// key("bar\0") = 0x0000000000000000000062617200000004
/// ⇒ key("bar") < key("bar\0")
/// ```
+ /// # Inlining and Endianness
+ ///
+ /// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because
Rust’s native in‑memory
+ /// representation is little‑endian on x86/ARM.
+ /// - We extract the low 32 bits numerically (`raw as u32`)—this step is
endianness‑free.
+ /// - We copy the 12 bytes of inline data (original order) into
`buf[0..12]`.
+ /// - We serialize `length` as big‑endian into `buf[12..16]`.
+ /// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most
significant byte
+ /// and `buf[15]` as the least significant, producing a `u128` whose
integer value
+ /// directly encodes “inline data then length” in big‑endian form.
+ ///
+ /// This ensures that a simple `u128` comparison is equivalent to the
desired
+ /// lexicographical comparison of the inline bytes followed by length.
#[inline(always)]
pub fn inline_key_fast(raw: u128) -> u128 {
- // Convert the raw u128 (little-endian) into bytes for manipulation
+ // 1. Decompose `raw` into little‑endian bytes:
+ // - raw_bytes[0..4] = length in LE
+ // - raw_bytes[4..16] = inline string data
let raw_bytes = raw.to_le_bytes();
- // Extract the length (first 4 bytes), convert to big-endian u32, and
promote to u128
- let len_le = &raw_bytes[0..4];
- let len_be = u32::from_le_bytes(len_le.try_into().unwrap()).to_be() as
u128;
-
- // Extract the inline string bytes (next 12 bytes), place them into
the lower 12 bytes of a 16-byte array,
- // padding the upper 4 bytes with zero to form a little-endian u128
value
- let mut inline_bytes = [0u8; 16];
- inline_bytes[4..16].copy_from_slice(&raw_bytes[4..16]);
-
- // Convert to big-endian to ensure correct lexical ordering
- let inline_u128 = u128::from_le_bytes(inline_bytes).to_be();
-
- // Shift right by 32 bits to discard the zero padding (upper 4 bytes),
- // so that the inline string occupies the high 96 bits
- let inline_part = inline_u128 >> 32;
-
- // Combine the inline string part (high 96 bits) and length (low 32
bits) into the final key
- (inline_part << 32) | len_be
+ // 2. Numerically truncate to get the low 32‑bit length
(endianness‑free).
+ let length = raw as u32;
+
+ // 3. Build a 16‑byte buffer in big‑endian order:
+ // - buf[0..12] = inline string bytes (in original order)
+ // - buf[12..16] = length.to_be_bytes() (BE)
+ let mut buf = [0u8; 16];
+ buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data
+
+ // Why convert length to big-endian for comparison?
+ //
+ // Rust (on most platforms) stores integers in little-endian format,
+ // meaning the least significant byte is at the lowest memory address.
+ // For example, an u32 value like 0x22345677 is stored in memory as:
+ //
+ // [0x77, 0x56, 0x34, 0x22] // little-endian layout
+ // ^ ^ ^ ^
+ // LSB ↑↑↑ MSB
Review Comment:
this is a very nice comment
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]