Re: [PR] Implement all CSV reader options [datafusion-python]

via GitHub Tue, 03 Feb 2026 14:13:29 -0800


nuno-faria commented on code in PR #1361:
URL: 
https://github.com/apache/datafusion-python/pull/1361#discussion_r2761205127



##########
python/datafusion/options.py:
##########
@@ -0,0 +1,273 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Options for reading various file formats."""
+
+from __future__ import annotations
+
+import warnings
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+
+if TYPE_CHECKING:
+    from datafusion.expr import SortExpr
+
+from ._internal import options
+
+__all__ = ["CsvReadOptions"]
+
+DEFAULT_MAX_INFER_SCHEMA = 1000
+
+
+class CsvReadOptions:
+    """Options for reading CSV files.
+
+    This class provides a builder pattern for configuring CSV reading options.
+    All methods starting with ``with_`` return ``self`` to allow method chaining.
+    """
+
+    def __init__(
+        self,
+        *,
+        has_header: bool = True,
+        delimiter: str = ",",
+        quote: str = '"',
+        terminator: str | None = None,
+        escape: str | None = None,
+        comment: str | None = None,
+        newlines_in_values: bool = False,
+        schema: pa.Schema | None = None,
+        schema_infer_max_records: int = DEFAULT_MAX_INFER_SCHEMA,
+        file_extension: str = ".csv",
+        table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
+        file_compression_type: str = "",
+        file_sort_order: list[list[SortExpr]] | None = None,
+        null_regex: str | None = None,
+        truncated_rows: bool = False,
+    ) -> None:
+        """Initialize CsvReadOptions.
+
+        Args:
+            has_header: Does the CSV file have a header row? If schema inference
+                is run on a file with no headers, default column names are created.
+            delimiter: Column delimiter character. Must be a single ASCII character.
+            quote: Quote character for fields containing delimiters or newlines.
+                Must be a single ASCII character.
+            terminator: Optional line terminator character. If ``None``, uses CRLF.
+                Must be a single ASCII character.

Review Comment:
   I'm surprised the default is `CRLF`.



##########
docs/source/user-guide/io/csv.rst:
##########
@@ -36,3 +36,22 @@ An alternative is to use :py:func:`~datafusion.context.SessionContext.register_c
 
     ctx.register_csv("file", "file.csv")
     df = ctx.table("file")
+
+If you require additional control over how to read the CSV file, you can use
+:py:class:`~datafusion.options.CsvReadOptions` to set a variety of options.
+
+.. code-block:: python
+

Review Comment:
   I think it would be nice to have a link to docs.rs or similar that points to 
all available options for `CsvReadOptions`.



##########
python/tests/test_context.py:
##########
@@ -710,3 +710,68 @@ def test_create_dataframe_with_global_ctx(batch):
     result = df.collect()[0].column(0)
 
     assert result == pa.array([4, 5, 6])
+
+
+def test_csv_read_options_builder_pattern():
+    """Test CsvReadOptions builder pattern."""
+    from datafusion import CsvReadOptions
+
+    options = (
+        CsvReadOptions()
+        .with_has_header(False)  # noqa: FBT003
+        .with_delimiter("|")
+        .with_quote("'")
+        .with_schema_infer_max_records(2000)
+        .with_truncated_rows(True)  # noqa: FBT003
+        .with_newlines_in_values(True)  # noqa: FBT003
+        .with_file_extension(".tsv")
+    )
+    assert options.has_header is False
+    assert options.delimiter == "|"
+    assert options.quote == "'"
+    assert options.schema_infer_max_records == 2000
+    assert options.truncated_rows is True
+    assert options.newlines_in_values is True
+    assert options.file_extension == ".tsv"
+
+
+@pytest.mark.parametrize(
+    ("as_read", "global_ctx"),
+    [
+        (True, True),
+        (True, False),
+        (False, False),
+    ],
+)
+def test_read_csv_with_options(tmp_path, as_read, global_ctx):
+    """Test reading CSV with CsvReadOptions."""
+    from datafusion import CsvReadOptions, SessionContext
+
+    # Create a test CSV file
+    csv_path = tmp_path / "test.csv"
+    csv_content = "name;age;city\nAlice;30;New York\nBob;25\n#Charlie;35;Paris"
+    csv_path.write_text(csv_content)
+
+    ctx = SessionContext()
+
+    # Test with CsvReadOptions
+    options = CsvReadOptions(
+        has_header=True, delimiter=";", comment="#", truncated_rows=True
+    )

Review Comment:
   Should more parameters be tested? Like "quote", "truncated_rows", 
"compression", "null_regex", ...



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Implement all CSV reader options [datafusion-python]

Reply via email to