alamb commented on code in PR #5707:
URL: https://github.com/apache/arrow-rs/pull/5707#discussion_r1589972225
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -482,6 +592,59 @@ impl From<Vec<Option<String>>> for StringViewArray {
}
}
+/// A helper struct that used to check if the array is compact view
+///
+/// The checker is lazy and will not check the array until `finish` is called.
+///
+/// This is based on the assumption that the array will most likely to be not
compact,
+/// so it is likely to scan the entire array.
+///
+/// Then it is better to do the check at once, rather than doing it for each
accumulate operation.
+struct CompactChecker {
+ length: usize,
+ intervals: BTreeMap<usize, usize>,
+}
Review Comment:
I recommend we pull the "compact checking" algorithm into a new PR to
discuss it -- I am not sure about the assumption that StringViewArrays will
mostly often be compacted (I would actually expect the opposite)
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -645,4 +816,175 @@ mod tests {
StringViewArray::new(views, buffers, None);
}
+
+ #[test]
+ #[should_panic(expected = "Invalid interval: offset 0 length 13 is out of
bound of length 12")]
+ fn test_compact_checker() {
+ use super::CompactChecker;
+
+ // single coverage, full
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 10);
+ assert!(checker.finish());
+
+ // single coverage, partial
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 5);
+ assert!(!checker.finish());
+
+ // multiple coverage, no overlapping, partial
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 5);
+ checker.accumulate(5, 4);
+ assert!(!checker.finish());
+
+ //multiple coverage, no overlapping, full
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 5);
+ checker.accumulate(5, 5);
+ assert!(checker.finish());
+
+ //multiple coverage, overlapping, partial
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 5);
+ checker.accumulate(4, 5);
+ assert!(!checker.finish());
+
+ //multiple coverage, overlapping, full
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 5);
+ checker.accumulate(4, 6);
+ assert!(checker.finish());
+
+ //mutiple coverage, no overlapping, full, out of order
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(4, 6);
+ checker.accumulate(0, 4);
+ assert!(checker.finish());
+
+ // multiple coverage, overlapping, full, out of order
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(4, 6);
+ checker.accumulate(0, 4);
+ assert!(checker.finish());
+
+ // multiple coverage, overlapping, full, containing null
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 5);
+ checker.accumulate(5, 0);
+ checker.accumulate(5, 5);
+ assert!(checker.finish());
+
+ // multiple coverage, overlapping, full, containing null
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 5);
+ checker.accumulate(5, 0);
+ checker.accumulate(4, 6);
+ checker.accumulate(5, 5);
+ assert!(checker.finish());
+
+ // multiple coverage, overlapping, full, containing null
+ //
+ // this case is for attacking those implementation that only check
+ // the lower-bound of the interval
+ let mut checker = CompactChecker::new(10);
+ checker.accumulate(0, 5);
+ checker.accumulate(5, 0);
+ checker.accumulate(1, 9);
+ checker.accumulate(2, 3);
+ checker.accumulate(3, 1);
+ checker.accumulate(9, 1);
+ assert!(checker.finish());
+
+ // panic case, out of bound
+ let mut checker = CompactChecker::new(12);
+ checker.accumulate(0, 13);
+ checker.finish();
+ }
+
+ #[test]
+ fn test_gc() {
+ //
---------------------------------------------------------------------
+ // test compact on compacted data
+
+ let array = {
+ let mut builder = StringViewBuilder::new();
+ builder.append_value("I look at you all");
+ builder.append_option(Some("see the love there that's sleeping"));
+ builder.finish()
+ };
+ let compacted = array.gc();
+ // verify it is a shallow copy
+ assert_eq!(array.buffers[0].as_ptr(), compacted.buffers[0].as_ptr());
+
+ //
---------------------------------------------------------------------
+ // test compact on non-compacted data
+
+ let mut array = {
+ let mut builder = StringViewBuilder::new();
+ builder.append_value("while my guitar gently weeps");
+ builder.finish()
+ };
+ // shrink the view
+ let mut view = ByteView::from(array.views[0]);
+ view.length = 15;
+ let new_views = ScalarBuffer::from(vec![view.into()]);
+ array.views = new_views;
+ let compacted = array.gc();
+ // verify it is a deep copy
+ assert_ne!(array.buffers[0].as_ptr(), compacted.buffers[0].as_ptr());
+ // verify content
+ assert_eq!(array.value(0), compacted.value(0));
+ // verify compacted
+ assert!(compacted.compact_check().iter().all(|x| *x));
+
+ //
---------------------------------------------------------------------
+ // test compact on array containing null
+
+ let mut array = {
+ let mut builder = StringViewBuilder::new();
+ builder.append_null();
+ builder.append_option(Some("I don't know why nobody told you"));
+ builder.finish()
+ };
+
+ let mut view = ByteView::from(array.views[1]);
+ view.length = 15;
+ let new_views = ScalarBuffer::from(vec![array.views[0], view.into()]);
Review Comment:
I dn't undertand what this is doing
I think you can more easily create a stringview with multiple buffers like
this:
https://github.com/apache/arrow-rs/blob/905c46be1a8b1605c7bbd44392fea2bf4182eb01/arrow-cast/src/pretty.rs#L334-L341
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]