jorisvandenbossche commented on a change in pull request #11008:
URL: https://github.com/apache/arrow/pull/11008#discussion_r699226689
##########
File path: python/pyarrow/dataset.py
##########
@@ -714,9 +729,11 @@ def write_dataset(data, base_dir, basename_template=None, format=None,
         and `format` is not specified, it defaults to the same format as the
         specified FileSystemDataset. When writing a Table or RecordBatch,
         this keyword is required.
-    partitioning : Partitioning, optional
+    partitioning : Partitioning or list[str], optional
         The partitioning scheme specified with the ``partitioning()``
-        function.
+        function or as a list of field names.
+    partitioning_flavor : One of the partitioning flavors supported by
+        ``pyarrow.dataset.partitioning``.

Review comment:
```suggestion
    partitioning_flavor : str, optional
        One of the partitioning flavors supported by
        ``pyarrow.dataset.partitioning``.
```

##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -1577,6 +1577,38 @@ def test_dictionary_partitioning_outer_nulls_raises(tempdir):
     ds.write_dataset(table, tempdir, format='parquet', partitioning=part)


+def test_write_dataset_with_field_names(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    load_back = ds.dataset(tempdir, partitioning=["b"])
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in load_back.files
+    }
+    assert partitioning_dirs == {"x", "y", "z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.to_pydict() == table.to_pydict()

Review comment:
```suggestion
    assert load_back_table.equals(table)
```

##########
File path: python/pyarrow/dataset.py
##########
@@ -678,17 +678,32 @@ def dataset(source, schema=None, format=None, filesystem=None,
     )


-def _ensure_write_partitioning(scheme):
-    if scheme is None:
-        scheme = partitioning(pa.schema([]))
-    if not isinstance(scheme, Partitioning):
-        # TODO support passing field names, and get types from schema
-        raise ValueError("partitioning needs to be actual Partitioning object")
-    return scheme
+def _ensure_write_partitioning(part, schema, flavor):
+    if isinstance(part, Partitioning) and flavor:
+        raise ValueError(
+            "Providing a partitioning_flavor with "
+            "a Partitioning object is not supported"
+        )
+    elif isinstance(part, (tuple, list)):
+        # Name of fields were provided instead of a partitioning object.
+        # Create a partitioning factory with those field names.
+        part = partitioning(
+            schema=pa.schema([schema.field_by_name(f) for f in part]),
+            flavor=flavor
+        )
+    elif part is None:
+        part = partitioning(pa.schema([]), flavor=flavor)
+
+    if not isinstance(part, Partitioning):
+        raise ValueError(
+            "partitioning must be a Partitioning object with a schema"

Review comment:
```suggestion
            "partitioning must be a Partitioning object constructed with a schema"
```

##########
File path: python/pyarrow/dataset.py
##########
@@ -788,7 +805,8 @@ def file_visitor(written_file):
     if max_partitions is None:
         max_partitions = 1024

-    partitioning = _ensure_write_partitioning(partitioning)
+    partitioning = _ensure_write_partitioning(partitioning, schema=schema,

Review comment:
       It seems that schema can be None at this point, in which case `_ensure_write_partitioning` might error. It seems that when passing a Scanner `schema` must be None (or when constructing a Scanner from an iterable, the `schema` is set to None after constructing the scanner, see L781 above). But in those cases, we can get the `schema` from the scanner to pass here.
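       A minimal sketch of the fallback described above, assuming the surrounding `write_dataset()` code keeps the Scanner in a local `scanner` variable and forwards the flavor keyword as in this PR (names are illustrative, not the final implementation):

```python
# Hypothetical sketch, not the PR's code: when write_dataset() was given a
# Scanner, `schema` is None at this point, so fall back to the schema the
# scanner will produce before building the write partitioning.
if schema is None:
    schema = scanner.projected_schema  # pyarrow.dataset.Scanner.projected_schema

partitioning = _ensure_write_partitioning(partitioning, schema=schema,
                                           flavor=partitioning_flavor)
```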
##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -1577,6 +1577,38 @@ def test_dictionary_partitioning_outer_nulls_raises(tempdir):
     ds.write_dataset(table, tempdir, format='parquet', partitioning=part)


+def test_write_dataset_with_field_names(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"])
+
+    load_back = ds.dataset(tempdir, partitioning=["b"])
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in load_back.files
+    }
+    assert partitioning_dirs == {"x", "y", "z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.to_pydict() == table.to_pydict()
+
+
+def test_write_dataset_with_field_names_hive(tempdir):
+    table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})
+
+    ds.write_dataset(table, tempdir, format='parquet',
+                     partitioning=["b"], partitioning_flavor="hive")
+
+    load_back = ds.dataset(tempdir, partitioning="hive")
+    partitioning_dirs = {
+        str(pathlib.Path(f).relative_to(tempdir).parent) for f in load_back.files
+    }
+    assert partitioning_dirs == {"b=x", "b=y", "b=z"}
+
+    load_back_table = load_back.to_table()
+    assert load_back_table.to_pydict() == table.to_pydict()

Review comment:
       Same here (use `load_back_table.equals(table)`).

##########
File path: python/pyarrow/tests/test_dataset.py
##########
@@ -1577,6 +1577,38 @@ def test_dictionary_partitioning_outer_nulls_raises(tempdir):
     ds.write_dataset(table, tempdir, format='parquet', partitioning=part)


+def test_write_dataset_with_field_names(tempdir):

Review comment:
       I know the file is not super well structured (and there is a test just above that uses write_dataset as well), but _most_ write_dataset tests are below (after line 3000). So I would put this test, for example, right after `test_write_dataset_partitioned_dict(tempdir)`.
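       For context, a minimal usage sketch of the interface under review, matching what the two tests above exercise (output paths are illustrative):

```python
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']})

# Directory partitioning (default): plain value directories, e.g. <base>/x/...
ds.write_dataset(table, "/tmp/by_b", format="parquet", partitioning=["b"])

# Hive flavor: key=value directories, e.g. <base>/b=x/...
ds.write_dataset(table, "/tmp/by_b_hive", format="parquet",
                 partitioning=["b"], partitioning_flavor="hive")

# Reading back: pass the same field names, or "hive" for the hive layout.
ds.dataset("/tmp/by_b", partitioning=["b"]).to_table()
ds.dataset("/tmp/by_b_hive", partitioning="hive").to_table()
```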