This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new c6bd492030 Add tests for bad parquet files (#6262)
c6bd492030 is described below

commit c6bd492030c1061a5a41260d7522bc4dab348a77
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Aug 17 06:43:27 2024 -0400

    Add tests for bad parquet files (#6262)
    
    * Add tests for bad parquet files
    
    * Reenable test
    
    * Add test for very subtly different file
---
 parquet-testing                        |   2 +-
 parquet/tests/arrow_reader/bad_data.rs | 136 +++++++++++++++++++++++++++++++++
 parquet/tests/arrow_reader/mod.rs      |   1 +
 3 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/parquet-testing b/parquet-testing
index 9b48ff4f94..cb7a967414 160000
--- a/parquet-testing
+++ b/parquet-testing
@@ -1 +1 @@
-Subproject commit 9b48ff4f94dc5e89592d46a119884dbb88100884
+Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199
diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs
new file mode 100644
index 0000000000..6e325f1197
--- /dev/null
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests that reading invalid parquet files returns an error
+
+use arrow::util::test_util::parquet_test_data;
+use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+use parquet::errors::ParquetError;
+use std::collections::HashSet;
+use std::path::PathBuf;
+
+static KNOWN_FILES: &[&str] = &[
+    "PARQUET-1481.parquet",
+    "ARROW-GH-41317.parquet",
+    "ARROW-GH-41321.parquet",
+    "ARROW-RS-GH-6229-DICTHEADER.parquet",
+    "ARROW-RS-GH-6229-LEVELS.parquet",
+    "README.md",
+];
+
+/// Returns the path to 'parquet-testing/bad_data'
+fn bad_data_dir() -> PathBuf {
+    // points to parquet-testing/data
+    let parquet_testing_data = parquet_test_data();
+    PathBuf::from(parquet_testing_data)
+        .parent()
+        .expect("was in parquet-testing/data")
+        .join("bad_data")
+}
+
+#[test]
+// Ensure that if we add a new test the files are added to the tests.
+fn test_invalid_files() {
+    let known_files: HashSet<_> = KNOWN_FILES.iter().cloned().collect();
+    let mut seen_files = HashSet::new();
+
+    let files = std::fs::read_dir(bad_data_dir()).unwrap();
+
+    for file in files {
+        let file_name = file
+            .unwrap()
+            .path()
+            .file_name()
+            .unwrap()
+            .to_str()
+            .unwrap()
+            .to_string();
+
+        // If you see this error, please add a test for the new file following the model below
+        assert!(
+            known_files.contains(file_name.as_str()),
+            "Found new file in bad_data, please add test: {file_name}"
+        );
+        seen_files.insert(file_name);
+    }
+    for expected_file in known_files {
+        assert!(
+            seen_files.contains(expected_file),
+            "Expected file not found in bad_data directory: {expected_file}"
+        );
+    }
+}
+
+#[test]
+fn test_parquet_1481() {
+    let err = read_file("PARQUET-1481.parquet").unwrap_err();
+    assert_eq!(
+        err.to_string(),
+        "Parquet error: unexpected parquet type: -7"
+    );
+}
+
+#[test]
+#[should_panic(expected = "assertion failed: self.current_value.is_some()")]
+fn test_arrow_gh_41321() {
+    let err = read_file("ARROW-GH-41321.parquet").unwrap_err();
+    assert_eq!(err.to_string(), "TBD (currently panics)");
+}
+
+#[test]
+fn test_arrow_gh_41317() {
+    let err = read_file("ARROW-GH-41317.parquet").unwrap_err();
+    assert_eq!(
+        err.to_string(),
+        "External: Parquet argument error: External: bad data"
+    );
+}
+
+#[test]
+fn test_arrow_rs_gh_6229_dict_header() {
+    let err = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet").unwrap_err();
+    assert_eq!(
+        err.to_string(),
+        "External: Parquet argument error: EOF: eof decoding byte array"
+    );
+}
+
+#[test]
+fn test_arrow_rs_gh_6229_dict_levels() {
+    let err = read_file("ARROW-RS-GH-6229-LEVELS.parquet").unwrap_err();
+    assert_eq!(
+        err.to_string(),
+        "External: Parquet argument error: Parquet error: Insufficient repetition levels read from column"
+    );
+}
+
+/// Reads the file and tries to return the total row count
+/// Returns an error if the file is invalid
+fn read_file(name: &str) -> Result<usize, ParquetError> {
+    let path = bad_data_dir().join(name);
+    println!("Reading file: {:?}", path);
+
+    let file = std::fs::File::open(&path).unwrap();
+    let reader = ArrowReaderBuilder::try_new(file)?.build()?;
+
+    let mut num_rows = 0;
+    for batch in reader {
+        let batch = batch?;
+        num_rows += batch.num_rows();
+    }
+    Ok(num_rows)
+}
diff --git a/parquet/tests/arrow_reader/mod.rs b/parquet/tests/arrow_reader/mod.rs
index 7e979dcf3e..cc4c8f3c97 100644
--- a/parquet/tests/arrow_reader/mod.rs
+++ b/parquet/tests/arrow_reader/mod.rs
@@ -35,6 +35,7 @@ use parquet::file::properties::{EnabledStatistics, WriterProperties};
 use std::sync::Arc;
 use tempfile::NamedTempFile;
 
+mod bad_data;
 mod statistics;
 
 // returns a struct array with columns "int32_col", "float32_col" and "float64_col" with the specified values

Reply via email to