This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push: new 85edafc fix: Fix a bug in how filter indices are calculated (#1185) 85edafc is described below commit 85edafcda90051b25cbad13ce6d6321006f65282 Author: Helgi Kristvin Sigurbjarnarson <he...@lacework.net> AuthorDate: Mon Jan 17 07:49:14 2022 -0800 fix: Fix a bug in how filter indices are calculated (#1185) * fix: Fix a bug in how filter indices are calculated Using the definition level and the nullability of the column only produces the correct indices if max_definition - 1 is the list level. For deeper nesting (struct in a list) this produces incorrect indices, silently causing incorrect data to be written. This fix uses the array offsets to compute the indices instead. * add assertions --- parquet/src/arrow/levels.rs | 75 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/parquet/src/arrow/levels.rs b/parquet/src/arrow/levels.rs index 0746088..ea1f212 100644 --- a/parquet/src/arrow/levels.rs +++ b/parquet/src/arrow/levels.rs @@ -758,14 +758,14 @@ impl LevelInfo { /// Given a level's information, calculate the offsets required to index an array correctly. pub(crate) fn filter_array_indices(&self) -> Vec<usize> { - // happy path if not dealing with lists - let is_nullable = match self.level_type { - LevelType::Primitive(is_nullable) => is_nullable, - _ => panic!( + if !matches!(self.level_type, LevelType::Primitive(_)) { + panic!( "Cannot filter indices on a non-primitive array, found {:?}", self.level_type - ), - }; + ); + } + + // happy path if not dealing with lists if self.repetition.is_none() { return self .definition @@ -780,17 +780,26 @@ impl LevelInfo { }) .collect(); } + let mut filtered = vec![]; - // remove slots that are false from definition_mask + let mut definition_levels = self.definition.iter(); let mut index = 0; - self.definition.iter().for_each(|def| { - if *def == self.max_definition { - filtered.push(index); - } - if *def >= self.max_definition - is_nullable as i16 { - index += 1; + + for len in self.array_offsets.windows(2).map(|s| s[1] - s[0]) { + if len == 0 { + // Skip this definition level--the iterator should not be empty, and the definition + // level be less than max_definition, i.e., a null value) + assert!(*definition_levels.next().unwrap() < self.max_definition); + } else { + for (_, def) in (0..len).zip(&mut definition_levels) { + if *def == self.max_definition { + filtered.push(index); + } + index += 1; + } } - }); + } + filtered } } @@ -1773,4 +1782,42 @@ mod tests { assert_eq!(list_level, &expected_level); } + + #[test] + fn test_nested_indices() { + // Given a buffer like + // [0, null, null, 1, 2] + // + // The two level infos below might represent the two structures + // 1: [{a: 0}], [], null, [null, null], [{a: 1}], [{a: 2}] + // 2: [0], [], null, [null, null], [1], [2] + // + // (That is, their only difference is that the leaf values are nested one level deeper in a + // struct). + + let level1 = LevelInfo { + definition: vec![4, 1, 0, 2, 2, 4, 4], + repetition: Some(vec![0, 0, 0, 0, 1, 0, 0]), + array_offsets: vec![0, 1, 1, 1, 3, 4, 5], + array_mask: vec![true, true, false, false, false, false, true], + max_definition: 4, + level_type: LevelType::Primitive(true), + offset: 0, + length: 5, + }; + + let level2 = LevelInfo { + definition: vec![3, 1, 0, 2, 2, 3, 3], + repetition: Some(vec![0, 0, 0, 0, 1, 0, 0]), + array_offsets: vec![0, 1, 1, 1, 3, 4, 5], + array_mask: vec![true, true, false, false, false, false, true], + max_definition: 3, + level_type: LevelType::Primitive(true), + offset: 0, + length: 5, + }; + + // filter_array_indices should return the same indices in this case. + assert_eq!(level1.filter_array_indices(), level2.filter_array_indices()); + } }