(arrow) branch main updated: GH-47756: [C++][CI] Fuzz CSV reader (#47757)

apitrou Fri, 17 Oct 2025 20:56:27 -0700

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new bfce5f208e GH-47756: [C++][CI] Fuzz CSV reader (#47757)
bfce5f208e is described below

commit bfce5f208e2470648fb9b1a47d0e6521a278efaf
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Oct 13 15:55:41 2025 +0200

    GH-47756: [C++][CI] Fuzz CSV reader (#47757)
    
    ### Rationale for this change
    
    We are already fuzzing the IPC and Parquet readers. The CSV reader is another important user-facing component that is worth fuzzing.
    
    ### What changes are included in this PR?
    
    1. Add a fuzz target for the CSV reader (currently only fuzzing the non-streaming table reader)
    2. Generate a rudimentary seed corpus using CSV test files from the `arrow-testing` repo and the Pandas test suite
    
    ### Are these changes tested?
    
    Yes, by the OSS-Fuzz CI build.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #47756
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 ci/scripts/cpp_test.sh                         |  1 +
 cpp/build-support/fuzzing/generate_corpuses.sh | 19 +++++++++
 cpp/build-support/fuzzing/pack_corpus.py       |  3 +-
 cpp/src/arrow/csv/CMakeLists.txt               |  2 +
 cpp/src/arrow/csv/fuzz.cc                      | 59 ++++++++++++++++++++++++++
 docs/source/developers/cpp/fuzzing.rst         |  1 +
 6 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh
index 4243e78bca..cc1b81a19b 100755
--- a/ci/scripts/cpp_test.sh
+++ b/ci/scripts/cpp_test.sh
@@ -184,6 +184,7 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
     if [ "${ARROW_PARQUET}" == "ON" ]; then
       "${binary_output_dir}/parquet-arrow-fuzz" 
"${ARROW_TEST_DATA}"/parquet/fuzzing/*-testcase-*
     fi
+    # TODO run CSV fuzz regression tests once we have any
 fi
 
 popd
diff --git a/cpp/build-support/fuzzing/generate_corpuses.sh b/cpp/build-support/fuzzing/generate_corpuses.sh
index ffd5c54e44..233c9be0eb 100755
--- a/cpp/build-support/fuzzing/generate_corpuses.sh
+++ b/cpp/build-support/fuzzing/generate_corpuses.sh
@@ -27,6 +27,8 @@ fi
 set -ex
 
 CORPUS_DIR=/tmp/corpus
+PANDAS_DIR=/tmp/pandas
+
 ARROW_ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd)
 ARROW_CPP=$ARROW_ROOT/cpp
 OUT=$1
@@ -35,6 +37,8 @@ OUT=$1
 # where "<FUZZ TARGET>" is the exact name of the fuzz target executable the
 # seed corpus is generated for.
 
+# Arrow IPC
+
 IPC_INTEGRATION_FILES=$(find 
${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
 
 rm -rf ${CORPUS_DIR}
@@ -52,9 +56,24 @@ rm -rf ${CORPUS_DIR}
 ${OUT}/arrow-ipc-generate-tensor-fuzz-corpus -stream ${CORPUS_DIR}
 ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} 
${OUT}/arrow-ipc-tensor-stream-fuzz_seed_corpus.zip
 
+# Parquet
+
 rm -rf ${CORPUS_DIR}
 ${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR}
 # Add Parquet testing examples
 cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR}
 cp ${ARROW_CPP}/submodules/parquet-testing/bad_data/*.parquet ${CORPUS_DIR}
 ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} 
${OUT}/parquet-arrow-fuzz_seed_corpus.zip
+
+# CSV
+
+rm -rf ${PANDAS_DIR}
+git clone --depth=1 https://github.com/pandas-dev/pandas ${PANDAS_DIR}
+
+rm -rf ${CORPUS_DIR}
+mkdir -p ${CORPUS_DIR}
+# Add examples from arrow-testing repo
+cp ${ARROW_ROOT}/testing/data/csv/*.csv ${CORPUS_DIR}
+# Add examples from Pandas test suite
+find ${PANDAS_DIR}/ -name "*.csv" -exec cp --backup=numbered '{}' 
${CORPUS_DIR} \;
+${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} 
${OUT}/arrow-csv-fuzz_seed_corpus.zip
diff --git a/cpp/build-support/fuzzing/pack_corpus.py b/cpp/build-support/fuzzing/pack_corpus.py
index 94d9a88b38..4480ec514b 100755
--- a/cpp/build-support/fuzzing/pack_corpus.py
+++ b/cpp/build-support/fuzzing/pack_corpus.py
@@ -29,7 +29,7 @@ import zipfile
 def process_dir(corpus_dir, zip_output):
     seen_hashes = {}
 
-    for child in corpus_dir.iterdir():
+    for child in sorted(corpus_dir.iterdir()):
         if not child.is_file():
             raise IOError(f"Not a file: {child}")
         with child.open('rb') as f:
@@ -39,6 +39,7 @@ def process_dir(corpus_dir, zip_output):
             raise ValueError(
                 f"Duplicate hash: {arcname} (in file {child}), "
                 f"already seen in file {seen_hashes[arcname]}")
+        print(f"  {child} -> {arcname}")
         zip_output.writestr(str(arcname), data)
         seen_hashes[arcname] = child
 
diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt
index a112ca423e..55047ca204 100644
--- a/cpp/src/arrow/csv/CMakeLists.txt
+++ b/cpp/src/arrow/csv/CMakeLists.txt
@@ -30,6 +30,8 @@ add_arrow_benchmark(converter_benchmark PREFIX "arrow-csv")
 add_arrow_benchmark(parser_benchmark PREFIX "arrow-csv")
 add_arrow_benchmark(writer_benchmark PREFIX "arrow-csv")
 
+add_arrow_fuzz_target(fuzz PREFIX "arrow-csv")
+
 arrow_install_all_headers("arrow/csv")
 
 # pkg-config support
diff --git a/cpp/src/arrow/csv/fuzz.cc b/cpp/src/arrow/csv/fuzz.cc
new file mode 100644
index 0000000000..9e500e5281
--- /dev/null
+++ b/cpp/src/arrow/csv/fuzz.cc
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/csv/reader.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/util/macros.h"
+
+namespace arrow::csv {
+
+Status FuzzCsvReader(const uint8_t* data, int64_t size) {
+  auto io_context = arrow::io::default_io_context();
+
+  auto read_options = ReadOptions::Defaults();
+  // Make chunking more likely
+  read_options.block_size = 4096;
+  auto parse_options = ParseOptions::Defaults();
+  auto convert_options = ConvertOptions::Defaults();
+  convert_options.auto_dict_encode = true;
+
+  auto input_stream =
+      
std::make_shared<::arrow::io::BufferReader>(std::make_shared<Buffer>(data, 
size));
+
+  // TODO test other reader types
+  ARROW_ASSIGN_OR_RAISE(auto table_reader,
+                        TableReader::Make(io_context, input_stream, 
read_options,
+                                          parse_options, convert_options));
+  ARROW_ASSIGN_OR_RAISE(auto table, table_reader->Read());
+  RETURN_NOT_OK(table->ValidateFull());
+  return Status::OK();
+}
+
+}  // namespace arrow::csv
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  auto status = arrow::csv::FuzzCsvReader(data, static_cast<int64_t>(size));
+  ARROW_UNUSED(status);
+  return 0;
+}
diff --git a/docs/source/developers/cpp/fuzzing.rst b/docs/source/developers/cpp/fuzzing.rst
index 851d58fb56..7c8b346074 100644
--- a/docs/source/developers/cpp/fuzzing.rst
+++ b/docs/source/developers/cpp/fuzzing.rst
@@ -29,6 +29,7 @@ fuzz testing on several parts of the Arrow C++ feature set, currently:
 * the IPC stream format
 * the IPC file format
 * the Parquet file format
+* the CSV file format
 
 We welcome any contribution to expand the scope of fuzz testing and cover
 areas ingesting potentially invalid or malicious data.

(arrow) branch main updated: GH-47756: [C++][CI] Fuzz CSV reader (#47757)

Reply via email to