asuresh8 commented on code in PR #9520:
URL: https://github.com/apache/arrow-rs/pull/9520#discussion_r2933855741
##########
parquet/src/arrow/arrow_writer/byte_array.rs:
##########
@@ -356,6 +416,51 @@ impl DictEncoder {
}
}
+ /// Fast path for DictionaryArray input with a lazy remap table.
+ ///
+ /// Instead of interning each row's value individually (O(N) hash operations),
+ /// this method builds a lazy remap table of size O(D) where D is the number
+ /// of unique dictionary values actually referenced, then maps each row's key
+ /// through the remap table using a simple array index lookup.
+ ///
+ /// The `row_to_key` closure extracts the dictionary key (as usize) for a given
+ /// row index. This avoids allocating a separate `Vec<usize>` for the keys.
+ ///
+ /// The remap table uses `Vec<Option<u64>>` with lazy population: values are
+ /// interned on first encounter and cached for subsequent rows. This ensures
+ /// only referenced dictionary values are interned, producing byte-identical
+ /// output to the per-row path.
+ fn encode_with_remap<T, F>(
+ &mut self,
+ values: T,
+ indices: &[usize],
+ dict_len: usize,
+ row_to_key: F,
+ ) where
+ T: ArrayAccessor + Copy,
+ T::Item: AsRef<[u8]>,
+ F: Fn(usize) -> usize,
+ {
+ let mut remap: Vec<Option<u64>> = vec![None; dict_len];
+
+ self.indices.reserve(indices.len());
+ for &idx in indices {
+ let key = row_to_key(idx);
+ let interned = match remap[key] {
+ Some(cached) => cached,
+ None => {
+ let value = values.value(idx);
+ let fresh = self.interner.intern(value.as_ref());
+ remap[key] = Some(fresh);
+ fresh
+ }
+ };
+ self.indices.push(interned);
Review Comment:
done
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]