This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new bfce5f208e GH-47756: [C++][CI] Fuzz CSV reader (#47757)
bfce5f208e is described below
commit bfce5f208e2470648fb9b1a47d0e6521a278efaf
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Oct 13 15:55:41 2025 +0200
GH-47756: [C++][CI] Fuzz CSV reader (#47757)
### Rationale for this change
We are already fuzzing the IPC and Parquet readers. The CSV reader is
another important user-facing component that is worth fuzzing.
### What changes are included in this PR?
1. Add fuzz target for the CSV reader (currently only fuzzing the
non-streaming table reader)
2. Generate rudimentary seed corpus using CSV test files from the
`arrow-testing` repo and the Pandas test suite
### Are these changes tested?
Yes, by the OSS-Fuzz CI build.
### Are there any user-facing changes?
No.
* GitHub Issue: #47756
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
ci/scripts/cpp_test.sh | 1 +
cpp/build-support/fuzzing/generate_corpuses.sh | 19 +++++++++
cpp/build-support/fuzzing/pack_corpus.py | 3 +-
cpp/src/arrow/csv/CMakeLists.txt | 2 +
cpp/src/arrow/csv/fuzz.cc | 59 ++++++++++++++++++++++++++
docs/source/developers/cpp/fuzzing.rst | 1 +
6 files changed, 84 insertions(+), 1 deletion(-)
diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh
index 4243e78bca..cc1b81a19b 100755
--- a/ci/scripts/cpp_test.sh
+++ b/ci/scripts/cpp_test.sh
@@ -184,6 +184,7 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
if [ "${ARROW_PARQUET}" == "ON" ]; then
"${binary_output_dir}/parquet-arrow-fuzz" "${ARROW_TEST_DATA}"/parquet/fuzzing/*-testcase-*
fi
+ # TODO run CSV fuzz regression tests once we have any
fi
popd
diff --git a/cpp/build-support/fuzzing/generate_corpuses.sh b/cpp/build-support/fuzzing/generate_corpuses.sh
index ffd5c54e44..233c9be0eb 100755
--- a/cpp/build-support/fuzzing/generate_corpuses.sh
+++ b/cpp/build-support/fuzzing/generate_corpuses.sh
@@ -27,6 +27,8 @@ fi
set -ex
CORPUS_DIR=/tmp/corpus
+PANDAS_DIR=/tmp/pandas
+
ARROW_ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd)
ARROW_CPP=$ARROW_ROOT/cpp
OUT=$1
@@ -35,6 +37,8 @@ OUT=$1
# where "<FUZZ TARGET>" is the exact name of the fuzz target executable the
# seed corpus is generated for.
+# Arrow IPC
+
IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
rm -rf ${CORPUS_DIR}
@@ -52,9 +56,24 @@ rm -rf ${CORPUS_DIR}
${OUT}/arrow-ipc-generate-tensor-fuzz-corpus -stream ${CORPUS_DIR}
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-tensor-stream-fuzz_seed_corpus.zip
+# Parquet
+
rm -rf ${CORPUS_DIR}
${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR}
# Add Parquet testing examples
cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR}
cp ${ARROW_CPP}/submodules/parquet-testing/bad_data/*.parquet ${CORPUS_DIR}
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-arrow-fuzz_seed_corpus.zip
+
+# CSV
+
+rm -rf ${PANDAS_DIR}
+git clone --depth=1 https://github.com/pandas-dev/pandas ${PANDAS_DIR}
+
+rm -rf ${CORPUS_DIR}
+mkdir -p ${CORPUS_DIR}
+# Add examples from arrow-testing repo
+cp ${ARROW_ROOT}/testing/data/csv/*.csv ${CORPUS_DIR}
+# Add examples from Pandas test suite
+find ${PANDAS_DIR}/ -name "*.csv" -exec cp --backup=numbered '{}' ${CORPUS_DIR} \;
+${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-csv-fuzz_seed_corpus.zip
diff --git a/cpp/build-support/fuzzing/pack_corpus.py b/cpp/build-support/fuzzing/pack_corpus.py
index 94d9a88b38..4480ec514b 100755
--- a/cpp/build-support/fuzzing/pack_corpus.py
+++ b/cpp/build-support/fuzzing/pack_corpus.py
@@ -29,7 +29,7 @@ import zipfile
def process_dir(corpus_dir, zip_output):
seen_hashes = {}
- for child in corpus_dir.iterdir():
+ for child in sorted(corpus_dir.iterdir()):
if not child.is_file():
raise IOError(f"Not a file: {child}")
with child.open('rb') as f:
@@ -39,6 +39,7 @@ def process_dir(corpus_dir, zip_output):
raise ValueError(
f"Duplicate hash: {arcname} (in file {child}), "
f"already seen in file {seen_hashes[arcname]}")
+ print(f" {child} -> {arcname}")
zip_output.writestr(str(arcname), data)
seen_hashes[arcname] = child
diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt
index a112ca423e..55047ca204 100644
--- a/cpp/src/arrow/csv/CMakeLists.txt
+++ b/cpp/src/arrow/csv/CMakeLists.txt
@@ -30,6 +30,8 @@ add_arrow_benchmark(converter_benchmark PREFIX "arrow-csv")
add_arrow_benchmark(parser_benchmark PREFIX "arrow-csv")
add_arrow_benchmark(writer_benchmark PREFIX "arrow-csv")
+add_arrow_fuzz_target(fuzz PREFIX "arrow-csv")
+
arrow_install_all_headers("arrow/csv")
# pkg-config support
diff --git a/cpp/src/arrow/csv/fuzz.cc b/cpp/src/arrow/csv/fuzz.cc
new file mode 100644
index 0000000000..9e500e5281
--- /dev/null
+++ b/cpp/src/arrow/csv/fuzz.cc
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/csv/reader.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/util/macros.h"
+
+namespace arrow::csv {
+
+Status FuzzCsvReader(const uint8_t* data, int64_t size) {
+ auto io_context = arrow::io::default_io_context();
+
+ auto read_options = ReadOptions::Defaults();
+ // Make chunking more likely
+ read_options.block_size = 4096;
+ auto parse_options = ParseOptions::Defaults();
+ auto convert_options = ConvertOptions::Defaults();
+ convert_options.auto_dict_encode = true;
+
+ auto input_stream =
+ std::make_shared<::arrow::io::BufferReader>(std::make_shared<Buffer>(data, size));
+
+ // TODO test other reader types
+ ARROW_ASSIGN_OR_RAISE(auto table_reader,
+ TableReader::Make(io_context, input_stream, read_options,
+ parse_options, convert_options));
+ ARROW_ASSIGN_OR_RAISE(auto table, table_reader->Read());
+ RETURN_NOT_OK(table->ValidateFull());
+ return Status::OK();
+}
+
+} // namespace arrow::csv
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ auto status = arrow::csv::FuzzCsvReader(data, static_cast<int64_t>(size));
+ ARROW_UNUSED(status);
+ return 0;
+}
diff --git a/docs/source/developers/cpp/fuzzing.rst b/docs/source/developers/cpp/fuzzing.rst
index 851d58fb56..7c8b346074 100644
--- a/docs/source/developers/cpp/fuzzing.rst
+++ b/docs/source/developers/cpp/fuzzing.rst
@@ -29,6 +29,7 @@ fuzz testing on several parts of the Arrow C++ feature set, currently:
* the IPC stream format
* the IPC file format
* the Parquet file format
+* the CSV file format
We welcome any contribution to expand the scope of fuzz testing and cover
areas ingesting potentially invalid or malicious data.