This is an automated email from the ASF dual-hosted git repository.
AlenkaF pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 3d6e138aef GH-22232: [C++][Python] Introduce optional
default_column_type parameter (#47663)
3d6e138aef is described below
commit 3d6e138aef27741c0915fc8311872be44e9a1feb
Author: Vlad Borovtsov <[email protected]>
AuthorDate: Wed May 13 10:29:32 2026 +0200
GH-22232: [C++][Python] Introduce optional default_column_type parameter
(#47663)
### Rationale for this change
Add an optional default_column_type parameter to the CSV reading API (C++
and Python) to provide a fallback type when per-column types aren’t specified,
improving schema consistency and complementing the existing column_types logic.
### What changes are included in this PR?
- c++: new convert option "default_column_type" to augment logic around
column_types parameter
- 3 reader tests (DefaultColumnTypePartialDefault,
DefaultColumnTypeAllStringsWithHeader, DefaultColumnTypeAllStringsNoHeader).
The last two tests are inspired by
https://github.com/pandas-dev/pandas/pull/62242 and
https://github.com/pandas-dev/pandas/issues/57666
- python: corresponding changes to make cpp change consumable from python
- python: extended the test_convert_options test to include default_column_type
- python: added a new test "test_default_column_type" which tests how the
field impacts the schema; the test also implicitly verifies leading-zero preservation
- relevant documentation update for the Python component
### Are these changes tested?
Yes. Existing and new tests are passing.
C++:
> [==========] Running 3 tests from 1 test suite.
> [----------] Global test environment set-up.
> [----------] 3 tests from ReaderTests
> [ RUN ] ReaderTests.DefaultColumnTypePartialDefault
> [ OK ] ReaderTests.DefaultColumnTypePartialDefault (3 ms)
> [ RUN ] ReaderTests.DefaultColumnTypeAllStringsWithHeader
> [ OK ] ReaderTests.DefaultColumnTypeAllStringsWithHeader (0 ms)
> [ RUN ] ReaderTests.DefaultColumnTypeAllStringsNoHeader
> [ OK ] ReaderTests.DefaultColumnTypeAllStringsNoHeader (0 ms)
> [----------] 3 tests from ReaderTests (4 ms total)
>
> [----------] Global test environment tear-down
> [==========] 3 tests from 1 test suite ran. (4 ms total)
> [ PASSED ] 3 tests.
All:
> [==========] 264 tests from 46 test suites ran. (452 ms total)
> [ PASSED ] 264 tests.
pyarrow:
New tests are passing.
### Are there any user-facing changes?
I believe this change is backward compatible. The parameter is optional and its
default value doesn't change the existing behavior. All the existing tests are
passing.
Maybe relevant: https://github.com/apache/arrow/issues/22232
Relates to https://github.com/apache/arrow/issues/47502
* GitHub Issue: #47502
* GitHub Issue: #22232
Lead-authored-by: Vlad Borovtsov <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
---
cpp/src/arrow/csv/column_builder_test.cc | 19 +++++++
cpp/src/arrow/csv/options.h | 4 ++
cpp/src/arrow/csv/reader.cc | 10 +++-
cpp/src/arrow/csv/reader_test.cc | 87 ++++++++++++++++++++++++++++++++
docs/source/python/csv.rst | 1 +
python/pyarrow/_csv.pyx | 82 ++++++++++++++++++++++++++----
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/tests/test_csv.py | 72 +++++++++++++++++++++++++-
8 files changed, 263 insertions(+), 13 deletions(-)
diff --git a/cpp/src/arrow/csv/column_builder_test.cc
b/cpp/src/arrow/csv/column_builder_test.cc
index dddfb922e7..94de8c92d4 100644
--- a/cpp/src/arrow/csv/column_builder_test.cc
+++ b/cpp/src/arrow/csv/column_builder_test.cc
@@ -346,6 +346,25 @@ TEST_F(InferringColumnBuilderTest, SingleChunkInteger) {
{ArrayFromJSON(int64(), "[null, 123, 456]")});
}
+TEST_F(InferringColumnBuilderTest,
SingleChunkDefaultColumnTypeDoesNotOverrideInference) {
+ auto options = ConvertOptions::Defaults();
+ options.default_column_type = utf8();
+ auto tg = TaskGroup::MakeSerial();
+
+ CheckInferred(tg, {{"0000404", "0000505", "0000606"}}, options,
+ {ArrayFromJSON(int64(), "[404, 505, 606]")});
+}
+
+TEST_F(InferringColumnBuilderTest,
+ MultipleChunkDefaultColumnTypeDoesNotOverrideInference) {
+ auto options = ConvertOptions::Defaults();
+ options.default_column_type = utf8();
+ auto tg = TaskGroup::MakeSerial();
+
+ CheckInferred(tg, {{"0000404"}, {"0000505", "0000606"}}, options,
+ {ArrayFromJSON(int64(), "[404]"), ArrayFromJSON(int64(),
"[505, 606]")});
+}
+
TEST_F(InferringColumnBuilderTest, MultipleChunkInteger) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 10e55bf838..f0b923d0f3 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -76,6 +76,10 @@ struct ARROW_EXPORT ConvertOptions {
bool check_utf8 = true;
/// Optional per-column types (disabling type inference on those columns)
std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
+ /// Default type to use for columns not in `column_types`
+ ///
+ /// If set, this disables type inference on all columns.
+ std::shared_ptr<DataType> default_column_type;
/// Recognized spellings for null values
std::vector<std::string> null_values;
/// Recognized spellings for boolean true values
diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
index 8720331965..09e8290ba2 100644
--- a/cpp/src/arrow/csv/reader.cc
+++ b/cpp/src/arrow/csv/reader.cc
@@ -674,8 +674,14 @@ class ReaderMixin {
// Does the named column have a fixed type?
auto it = convert_options_.column_types.find(col_name);
if (it == convert_options_.column_types.end()) {
- conversion_schema_.columns.push_back(
- ConversionSchema::InferredColumn(std::move(col_name), col_index));
+ // If not explicitly typed, respect default_column_type when provided
+ if (convert_options_.default_column_type != nullptr) {
+ conversion_schema_.columns.push_back(ConversionSchema::TypedColumn(
+ std::move(col_name), col_index,
convert_options_.default_column_type));
+ } else {
+ conversion_schema_.columns.push_back(
+ ConversionSchema::InferredColumn(std::move(col_name),
col_index));
+ }
} else {
conversion_schema_.columns.push_back(
ConversionSchema::TypedColumn(std::move(col_name), col_index,
it->second));
diff --git a/cpp/src/arrow/csv/reader_test.cc b/cpp/src/arrow/csv/reader_test.cc
index 23206717a1..9e0f4804b0 100644
--- a/cpp/src/arrow/csv/reader_test.cc
+++ b/cpp/src/arrow/csv/reader_test.cc
@@ -531,5 +531,92 @@ TEST(CountRowsAsync, Errors) {
internal::GetCpuThreadPool(), read_options,
parse_options));
}
+TEST(ReaderTests, DefaultColumnTypePartialDefault) {
+ auto table_buffer = std::make_shared<Buffer>(
+ "id,name,value,date\n"
+ "0000101,apple,0003.1400,2024-01-15\n"
+ "00102,banana,001.6180,2024-02-20\n"
+ "0003,cherry,02.71800,2024-03-25\n");
+
+ auto input = std::make_shared<io::BufferReader>(table_buffer);
+ auto read_options = ReadOptions::Defaults();
+ auto parse_options = ParseOptions::Defaults();
+ auto convert_options = ConvertOptions::Defaults();
+ convert_options.column_types["id"] = int64();
+ convert_options.default_column_type = utf8();
+
+ ASSERT_OK_AND_ASSIGN(auto reader,
+ TableReader::Make(io::default_io_context(), input,
read_options,
+ parse_options, convert_options));
+ ASSERT_OK_AND_ASSIGN(auto table, reader->Read());
+
+ auto expected_schema = schema({field("id", int64()), field("name", utf8()),
+ field("value", utf8()), field("date",
utf8())});
+ AssertSchemaEqual(expected_schema, table->schema());
+
+ auto expected_table = TableFromJSON(
+ expected_schema,
+ {R"([{"id":101, "name":"apple", "value":"0003.1400",
"date":"2024-01-15"},
+ {"id":102, "name":"banana", "value":"001.6180",
"date":"2024-02-20"},
+ {"id":3, "name":"cherry", "value":"02.71800",
"date":"2024-03-25"}])"});
+ ASSERT_TRUE(table->Equals(*expected_table));
+}
+
+TEST(ReaderTests, DefaultColumnTypeForcesTypedColumns) {
+ auto table_buffer = std::make_shared<Buffer>(
+ "id,amount,code\n"
+ "0000404,000045.6700,001\n"
+ "0000505,000000.10,010\n");
+
+ auto input = std::make_shared<io::BufferReader>(table_buffer);
+ auto read_options = ReadOptions::Defaults();
+ auto parse_options = ParseOptions::Defaults();
+ auto convert_options = ConvertOptions::Defaults();
+ convert_options.default_column_type = utf8();
+
+ ASSERT_OK_AND_ASSIGN(auto reader,
+ TableReader::Make(io::default_io_context(), input,
read_options,
+ parse_options, convert_options));
+ ASSERT_OK_AND_ASSIGN(auto table, reader->Read());
+
+ auto expected_schema =
+ schema({field("id", utf8()), field("amount", utf8()), field("code",
utf8())});
+ AssertSchemaEqual(expected_schema, table->schema());
+
+ auto expected_table = TableFromJSON(
+ expected_schema, {R"([{"id":"0000404", "amount":"000045.6700",
"code":"001"},
+ {"id":"0000505", "amount":"000000.10", "code":"010"}])"});
+ ASSERT_TRUE(table->Equals(*expected_table));
+}
+
+TEST(ReaderTests, DefaultColumnTypeAllStringsNoHeader) {
+ // Input without header; autogenerate column names and default all to strings
+ auto table_buffer = std::make_shared<Buffer>("AB|000388907|000045.6700\n");
+
+ auto input = std::make_shared<io::BufferReader>(table_buffer);
+ auto read_options = ReadOptions::Defaults();
+ read_options.autogenerate_column_names = true; // treat first row as data
+ auto parse_options = ParseOptions::Defaults();
+ parse_options.delimiter = '|';
+ auto convert_options = ConvertOptions::Defaults();
+ convert_options.default_column_type = utf8();
+
+ ASSERT_OK_AND_ASSIGN(auto reader,
+ TableReader::Make(io::default_io_context(), input,
read_options,
+ parse_options, convert_options));
+ ASSERT_OK_AND_ASSIGN(auto table, reader->Read());
+
+ auto expected_schema =
+ schema({field("f0", utf8()), field("f1", utf8()), field("f2", utf8())});
+ AssertSchemaEqual(expected_schema, table->schema());
+
+ auto expected_table = TableFromJSON(expected_schema, {R"([{
+ "f0":"AB",
+ "f1":"000388907",
+ "f2":"000045.6700"
+ }])"});
+ ASSERT_TRUE(table->Equals(*expected_table));
+}
+
} // namespace csv
} // namespace arrow
diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst
index 2bc2ccabc9..28946d1599 100644
--- a/docs/source/python/csv.rst
+++ b/docs/source/python/csv.rst
@@ -153,6 +153,7 @@ Available convert options are:
~ConvertOptions.check_utf8
~ConvertOptions.column_types
+ ~ConvertOptions.default_column_type
~ConvertOptions.null_values
~ConvertOptions.true_values
~ConvertOptions.false_values
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 79985530af..f2cefb8ff3 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -646,6 +646,9 @@ cdef class ConvertOptions(_Weakrefable):
column_types : pyarrow.Schema or dict, optional
Explicitly map column names to column types. Passing this argument
disables type inference on the defined columns.
+ default_column_type : pyarrow.DataType, optional
+ Explicitly map columns not specified in column_types to a default type.
+ Passing this argument disables type inference on all columns.
null_values : list, optional
A sequence of strings that denote nulls in the data
(defaults are appropriate in most cases). Note that by default,
@@ -840,6 +843,40 @@ cdef class ConvertOptions(_Weakrefable):
fast: bool
----
fast: [[true,true,false,false,null]]
+
+ Set a default column type for all columns (disables type inference):
+
+ >>> convert_options = csv.ConvertOptions(default_column_type=pa.string())
+ >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options)
+ pyarrow.Table
+ animals: string
+ n_legs: string
+ entry: string
+ fast: string
+ ----
+ animals: [["Flamingo","Horse","Brittle stars","Centipede",""]]
+ n_legs: [["2","4","5","100","6"]]
+ entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]]
+ fast: [["Yes","Yes","No","No",""]]
+
+ Combine default_column_type with column_types (specific column types
override default):
+
+ >>> convert_options = csv.ConvertOptions(
+ ... column_types={"n_legs": pa.int64(), "fast":
pa.bool_()},
+ ... default_column_type=pa.string(),
+ ... true_values=["Yes"],
+ ... false_values=["No"])
+ >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options)
+ pyarrow.Table
+ animals: string
+ n_legs: int64
+ entry: string
+ fast: bool
+ ----
+ animals: [["Flamingo","Horse","Brittle stars","Centipede",""]]
+ n_legs: [[2,4,5,100,6]]
+ entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]]
+ fast: [[true,true,false,false,null]]
"""
# Avoid mistakingly creating attributes
@@ -849,7 +886,7 @@ cdef class ConvertOptions(_Weakrefable):
self.options.reset(
new CCSVConvertOptions(CCSVConvertOptions.Defaults()))
- def __init__(self, *, check_utf8=None, column_types=None, null_values=None,
+ def __init__(self, *, check_utf8=None, column_types=None,
default_column_type=None, null_values=None,
true_values=None, false_values=None, decimal_point=None,
strings_can_be_null=None, quoted_strings_can_be_null=None,
include_columns=None, include_missing_columns=None,
@@ -859,6 +896,8 @@ cdef class ConvertOptions(_Weakrefable):
self.check_utf8 = check_utf8
if column_types is not None:
self.column_types = column_types
+ if default_column_type is not None:
+ self.default_column_type = default_column_type
if null_values is not None:
self.null_values = null_values
if true_values is not None:
@@ -943,6 +982,27 @@ cdef class ConvertOptions(_Weakrefable):
assert typ != NULL
deref(self.options).column_types[tobytes(k)] = typ
+ @property
+ def default_column_type(self):
+ """
+ Explicitly map columns not specified in column_types to a default type.
+ """
+ if deref(self.options).default_column_type != NULL:
+ return
pyarrow_wrap_data_type(deref(self.options).default_column_type)
+ else:
+ return None
+
+ @default_column_type.setter
+ def default_column_type(self, value):
+ cdef:
+ shared_ptr[CDataType] typ
+ if value is not None:
+ typ = pyarrow_unwrap_data_type(ensure_type(value))
+ assert typ != NULL
+ deref(self.options).default_column_type = typ
+ else:
+ deref(self.options).default_column_type.reset()
+
@property
def null_values(self):
"""
@@ -1104,6 +1164,7 @@ cdef class ConvertOptions(_Weakrefable):
return (
self.check_utf8 == other.check_utf8 and
self.column_types == other.column_types and
+ self.default_column_type == other.default_column_type and
self.null_values == other.null_values and
self.true_values == other.true_values and
self.false_values == other.false_values and
@@ -1120,17 +1181,17 @@ cdef class ConvertOptions(_Weakrefable):
)
def __getstate__(self):
- return (self.check_utf8, self.column_types, self.null_values,
- self.true_values, self.false_values, self.decimal_point,
- self.timestamp_parsers, self.strings_can_be_null,
- self.quoted_strings_can_be_null, self.auto_dict_encode,
- self.auto_dict_max_cardinality, self.include_columns,
- self.include_missing_columns)
+ return (self.check_utf8, self.column_types, self.default_column_type,
+ self.null_values, self.true_values, self.false_values,
+ self.decimal_point, self.timestamp_parsers,
+ self.strings_can_be_null, self.quoted_strings_can_be_null,
+ self.auto_dict_encode, self.auto_dict_max_cardinality,
+ self.include_columns, self.include_missing_columns)
def __setstate__(self, state):
- (self.check_utf8, self.column_types, self.null_values,
- self.true_values, self.false_values, self.decimal_point,
- self.timestamp_parsers, self.strings_can_be_null,
+ (self.check_utf8, self.column_types, self.default_column_type,
+ self.null_values, self.true_values, self.false_values,
+ self.decimal_point, self.timestamp_parsers, self.strings_can_be_null,
self.quoted_strings_can_be_null, self.auto_dict_encode,
self.auto_dict_max_cardinality, self.include_columns,
self.include_missing_columns) = state
@@ -1145,6 +1206,7 @@ cdef class ConvertOptions(_Weakrefable):
return (f"""
check_utf8={self.check_utf8},
column_types={self.column_types},
+ default_column_type={self.default_column_type!r},
null_values={self.null_values},
true_values={self.true_values},
false_values={self.false_values},
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 8ee7784461..79522c1247 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2113,6 +2113,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv"
nogil:
cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions":
c_bool check_utf8
unordered_map[c_string, shared_ptr[CDataType]] column_types
+ shared_ptr[CDataType] default_column_type
vector[c_string] null_values
vector[c_string] true_values
vector[c_string] false_values
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index d608d2bee5..ac9012ebdf 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -321,7 +321,8 @@ def test_convert_options(pickle_module):
include_columns=['def', 'abc'],
include_missing_columns=False,
auto_dict_encode=True,
- timestamp_parsers=[ISO8601, '%y-%m'])
+ timestamp_parsers=[ISO8601, '%y-%m'],
+ default_column_type=pa.int16())
with pytest.raises(ValueError):
opts.decimal_point = '..'
@@ -349,6 +350,17 @@ def test_convert_options(pickle_module):
with pytest.raises(TypeError):
opts.column_types = 0
+ assert opts.default_column_type is None
+ opts.default_column_type = pa.string()
+ assert opts.default_column_type == pa.string()
+ opts.default_column_type = 'int32'
+ assert opts.default_column_type == pa.int32()
+ opts.default_column_type = None
+ assert opts.default_column_type is None
+
+ with pytest.raises(TypeError, match='DataType expected'):
+ opts.default_column_type = 123
+
assert isinstance(opts.null_values, list)
assert '' in opts.null_values
assert 'N/A' in opts.null_values
@@ -368,10 +380,12 @@ def test_convert_options(pickle_module):
assert opts.timestamp_parsers == [ISO8601]
opts = cls(column_types={'a': pa.null()},
+ default_column_type=pa.int16(),
null_values=['N', 'nn'], true_values=['T', 'tt'],
false_values=['F', 'ff'], auto_dict_max_cardinality=999,
timestamp_parsers=[ISO8601, '%Y-%m-%d'])
assert opts.column_types == {'a': pa.null()}
+ assert opts.default_column_type == pa.int16()
assert opts.null_values == ['N', 'nn']
assert opts.false_values == ['F', 'ff']
assert opts.true_values == ['T', 'tt']
@@ -381,6 +395,7 @@ def test_convert_options(pickle_module):
expected_repr_inner = ("""
check_utf8=True,
column_types={'a': DataType(null)},
+ default_column_type=DataType(int16),
null_values=['N', 'nn'],
true_values=['T', 'tt'],
false_values=['F', 'ff'],
@@ -1381,6 +1396,61 @@ class BaseCSVTableRead(BaseTestCSV):
'y': ['b', 'd', 'f'],
}
+ def test_default_column_type(self):
+ rows = b"a,b,c,d\n001,2.5,hello,true\n4,3.14,world,false\n"
+
+ # Test with default_column_type only -
+ # all columns should use the specified type.
+ opts = ConvertOptions(default_column_type=pa.string())
+ table = self.read_bytes(rows, convert_options=opts)
+ schema = pa.schema([('a', pa.string()),
+ ('b', pa.string()),
+ ('c', pa.string()),
+ ('d', pa.string())])
+ assert table.schema == schema
+ assert table.to_pydict() == {
+ 'a': ["001", "4"],
+ 'b': ["2.5", "3.14"],
+ 'c': ["hello", "world"],
+ 'd': ["true", "false"],
+ }
+
+ # Test with both column_types and default_column_type
+ # Columns specified in column_types should override default_column_type
+ opts = ConvertOptions(
+ column_types={'b': pa.float64(), 'd': pa.bool_()},
+ default_column_type=pa.string()
+ )
+ table = self.read_bytes(rows, convert_options=opts)
+ schema = pa.schema([('a', pa.string()),
+ ('b', pa.float64()),
+ ('c', pa.string()),
+ ('d', pa.bool_())])
+ assert table.schema == schema
+ assert table.to_pydict() == {
+ 'a': ["001", "4"],
+ 'b': [2.5, 3.14],
+ 'c': ["hello", "world"],
+ 'd': [True, False],
+ }
+
+ # Test that default_column_type disables type inference
+ opts_no_default = ConvertOptions(column_types={'b': pa.float64()})
+ table_no_default = self.read_bytes(rows,
convert_options=opts_no_default)
+
+ opts_with_default = ConvertOptions(
+ column_types={'b': pa.float64()},
+ default_column_type=pa.string()
+ )
+ table_with_default = self.read_bytes(rows,
convert_options=opts_with_default)
+
+ # Column 'a' should be int64 without default, string with default
+ assert table_no_default.schema.field('a').type == pa.int64()
+ assert table_with_default.schema.field('a').type == pa.string()
+ # Column 'b' should always be float64 since explicitly typed
+ assert table_no_default.schema.field('b').type == pa.float64()
+ assert table_with_default.schema.field('b').type == pa.float64()
+
def test_no_ending_newline(self):
# No \n after last line
rows = b"a,b,c\n1,2,3\n4,5,6"