This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c6bd492030 Add tests for bad parquet files (#6262)
c6bd492030 is described below
commit c6bd492030c1061a5a41260d7522bc4dab348a77
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Aug 17 06:43:27 2024 -0400
Add tests for bad parquet files (#6262)
* Add tests for bad parquet files
* Reenable test
* Add test for very subtly different file
---
parquet-testing | 2 +-
parquet/tests/arrow_reader/bad_data.rs | 136 +++++++++++++++++++++++++++++++++
parquet/tests/arrow_reader/mod.rs | 1 +
3 files changed, 138 insertions(+), 1 deletion(-)
diff --git a/parquet-testing b/parquet-testing
index 9b48ff4f94..cb7a967414 160000
--- a/parquet-testing
+++ b/parquet-testing
@@ -1 +1 @@
-Subproject commit 9b48ff4f94dc5e89592d46a119884dbb88100884
+Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199
diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs
new file mode 100644
index 0000000000..6e325f1197
--- /dev/null
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests that reading invalid parquet files returns an error
+
+use arrow::util::test_util::parquet_test_data;
+use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+use parquet::errors::ParquetError;
+use std::collections::HashSet;
+use std::path::PathBuf;
+
+static KNOWN_FILES: &[&str] = &[
+ "PARQUET-1481.parquet",
+ "ARROW-GH-41317.parquet",
+ "ARROW-GH-41321.parquet",
+ "ARROW-RS-GH-6229-DICTHEADER.parquet",
+ "ARROW-RS-GH-6229-LEVELS.parquet",
+ "README.md",
+];
+
+/// Returns the path to 'parquet-testing/bad_data'
+fn bad_data_dir() -> PathBuf {
+ // points to parquet-testing/data
+ let parquet_testing_data = parquet_test_data();
+ PathBuf::from(parquet_testing_data)
+ .parent()
+ .expect("was in parquet-testing/data")
+ .join("bad_data")
+}
+
+#[test]
+// Ensure that if we add a new test the files are added to the tests.
+fn test_invalid_files() {
+ let known_files: HashSet<_> = KNOWN_FILES.iter().cloned().collect();
+ let mut seen_files = HashSet::new();
+
+ let files = std::fs::read_dir(bad_data_dir()).unwrap();
+
+ for file in files {
+ let file_name = file
+ .unwrap()
+ .path()
+ .file_name()
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string();
+
+ // If you see this error, please add a test for the new file following the model below
+ assert!(
+ known_files.contains(file_name.as_str()),
+ "Found new file in bad_data, please add test: {file_name}"
+ );
+ seen_files.insert(file_name);
+ }
+ for expected_file in known_files {
+ assert!(
+ seen_files.contains(expected_file),
+ "Expected file not found in bad_data directory: {expected_file}"
+ );
+ }
+}
+
+#[test]
+fn test_parquet_1481() {
+ let err = read_file("PARQUET-1481.parquet").unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Parquet error: unexpected parquet type: -7"
+ );
+}
+
+#[test]
+#[should_panic(expected = "assertion failed: self.current_value.is_some()")]
+fn test_arrow_gh_41321() {
+ let err = read_file("ARROW-GH-41321.parquet").unwrap_err();
+ assert_eq!(err.to_string(), "TBD (currently panics)");
+}
+
+#[test]
+fn test_arrow_gh_41317() {
+ let err = read_file("ARROW-GH-41317.parquet").unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "External: Parquet argument error: External: bad data"
+ );
+}
+
+#[test]
+fn test_arrow_rs_gh_6229_dict_header() {
+ let err = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet").unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "External: Parquet argument error: EOF: eof decoding byte array"
+ );
+}
+
+#[test]
+fn test_arrow_rs_gh_6229_dict_levels() {
+ let err = read_file("ARROW-RS-GH-6229-LEVELS.parquet").unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "External: Parquet argument error: Parquet error: Insufficient repetition levels read from column"
+ );
+}
+
+/// Reads the file and tries to return the total row count
+/// Returns an error if the file is invalid
+fn read_file(name: &str) -> Result<usize, ParquetError> {
+ let path = bad_data_dir().join(name);
+ println!("Reading file: {:?}", path);
+
+ let file = std::fs::File::open(&path).unwrap();
+ let reader = ArrowReaderBuilder::try_new(file)?.build()?;
+
+ let mut num_rows = 0;
+ for batch in reader {
+ let batch = batch?;
+ num_rows += batch.num_rows();
+ }
+ Ok(num_rows)
+}
diff --git a/parquet/tests/arrow_reader/mod.rs b/parquet/tests/arrow_reader/mod.rs
index 7e979dcf3e..cc4c8f3c97 100644
--- a/parquet/tests/arrow_reader/mod.rs
+++ b/parquet/tests/arrow_reader/mod.rs
@@ -35,6 +35,7 @@ use parquet::file::properties::{EnabledStatistics, WriterProperties};
use std::sync::Arc;
use tempfile::NamedTempFile;
+mod bad_data;
mod statistics;
// returns a struct array with columns "int32_col", "float32_col" and "float64_col" with the specified values