jorisvandenbossche commented on a change in pull request #9702:
URL: https://github.com/apache/arrow/pull/9702#discussion_r763025401
##########
File path: python/pyarrow/orc.py
##########
@@ -54,9 +54,25 @@ def nrows(self):
return self.reader.nrows()
@property
- def nstripes(self):
- """The number of stripes in the file"""
- return self.reader.nstripes()
+ def file_version(self):
+ """Format version of the ORC file, must be 0.11 or 0.12"""
+ return self.reader.file_version()
+
+ @property
+ def compression(self):
+ """Compression codec of the file"""
+ return self.reader.compression()
+
+ @property
+ def compression_size(self):
+ """Number of bytes to buffer for the compression codec in the file"""
+ return self.reader.compression_size()
Review comment:
Does this map to the option "compression_block_size" ? If so, maybe
using the same name would help (I know the C++ liborc API is consistent though)
##########
File path: python/pyarrow/orc.py
##########
@@ -143,17 +230,54 @@ def write(self, table):
schema : pyarrow.lib.Table
The table to be written into the ORC file
"""
+ assert self.is_open
self.writer.write(table)
def close(self):
"""
Close the ORC file
"""
- self.writer.close()
+ if self.is_open:
+ self.writer.close()
+ self.is_open = False
-def write_table(table, where):
- """
+def write_table(table, where, file_version='0.12',
+ batch_size=1024,
+ stripe_size=67108864,
+ compression='zlib',
+ compression_block_size=65536,
+ compression_strategy='speed',
+ row_index_stride=10000,
+ padding_tolerance=0.0,
+ dictionary_key_size_threshold=0.0,
+ bloom_filter_columns=None,
+ bloom_filter_fpp=0.05):
+ if isinstance(where, Table):
+ warnings.warn(
+ "The order of the arguments has changed. Pass as "
+ "'write_table(table, where)' instead. The old order will raise "
+ "an error in the future.", FutureWarning, stacklevel=2
+ )
+ table, where = where, table
+ with ORCWriter(
+ where,
+ file_version=file_version,
+ batch_size=batch_size,
+ stripe_size=stripe_size,
+ compression=compression,
+ compression_block_size=compression_block_size,
+ compression_strategy=compression_strategy,
+ row_index_stride=row_index_stride,
+ padding_tolerance=padding_tolerance,
+ dictionary_key_size_threshold=dictionary_key_size_threshold,
+ bloom_filter_columns=bloom_filter_columns,
+ bloom_filter_fpp=bloom_filter_fpp
+ ) as writer:
+ writer.write(table)
+
+
+write_table.__doc__ = """
Write a table into an ORC file
Review comment:
If storing separately here, you need to remove the one level of
indentation I think.
If you now check the docstring interactively, you get:
```
In [16]: orc.write_table?
Signature:
...
Docstring:
Write a table into an ORC file
Parameters
----------
table : pyarrow.lib.Table
The table to be written into the ORC file
where : str or pyarrow.io.NativeFile
Writable target. For passing Python file objects or byte buffers,
see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream
or pyarrow.io.FixedSizeBufferWriter.
file_version : {"0.11", "0.12"}, default "0.12"
Determine which ORC file version to use. Hive 0.11 / ORC v0 is the older
version as defined `here <https://orc.apache.org/specification/ORCv0/>`
while Hive 0.12 / ORC v1 is the newer one as defined
`here <https://orc.apache.org/specification/ORCv1/>`.
batch_size : int, default 1024
Number of rows the ORC writer writes at a time.
stripe_size : int, default 64 * 1024 * 1024
Size of each ORC stripe.
...
```
##########
File path: python/pyarrow/_orc.pyx
##########
@@ -38,6 +38,283 @@ from pyarrow.lib cimport (check_status, _Weakrefable,
get_writer)
from pyarrow.lib import tobytes
+cdef compression_kind_from_enum(CompressionKind compression_kind_):
+ return {
+ _CompressionKind_NONE: 'NONE',
+ _CompressionKind_ZLIB: 'ZLIB',
+ _CompressionKind_SNAPPY: 'SNAPPY',
+ _CompressionKind_LZO: 'LZO',
+ _CompressionKind_LZ4: 'LZ4',
+ _CompressionKind_ZSTD: 'ZSTD',
+ }.get(compression_kind_, 'UNKNOWN')
+
+cdef CompressionKind compression_kind_from_name(name):
+ name = name.upper()
+ if name == 'ZLIB':
+ return _CompressionKind_ZLIB
+ elif name == 'SNAPPY':
+ return _CompressionKind_SNAPPY
+ elif name == 'LZO':
+ return _CompressionKind_LZO
+ elif name == 'LZ4':
+ return _CompressionKind_LZ4
+ elif name == 'ZSTD':
+ return _CompressionKind_ZSTD
+ else:
+ return _CompressionKind_NONE
Review comment:
Because this is a cdef function, you will probably need to add `except
*` to the signature (see suggestion above) to ensure the error gets bubbled up
to python and not ignored. Can you also add a test for this to ensure it
actually raises an error?
##########
File path: python/pyarrow/orc.py
##########
@@ -54,9 +54,25 @@ def nrows(self):
return self.reader.nrows()
@property
- def nstripes(self):
- """The number of stripes in the file"""
- return self.reader.nstripes()
+ def file_version(self):
+ """Format version of the ORC file, must be 0.11 or 0.12"""
+ return self.reader.file_version()
+
+ @property
+ def compression(self):
+ """Compression codec of the file"""
+ return self.reader.compression()
+
+ @property
+ def compression_size(self):
+ """Number of bytes to buffer for the compression codec in the file"""
+ return self.reader.compression_size()
+
+ @property
+ def row_index_stride(self):
+ """Number of rows per an entry in the row index or 0
+ if there is no row index"""
Review comment:
```suggestion
if there is no row index"""
```
##########
File path: python/pyarrow/_orc.pyx
##########
@@ -39,6 +39,206 @@ from pyarrow.lib cimport (check_status, _Weakrefable,
from pyarrow.lib import tobytes
+cdef compression_kind_from_enum(CompressionKind compression_kind_):
+ return {
+ _CompressionKind_NONE: 'UNCOMPRESSED',
+ _CompressionKind_ZLIB: 'ZLIB',
+ _CompressionKind_SNAPPY: 'SNAPPY',
+ _CompressionKind_LZO: 'LZO',
+ _CompressionKind_LZ4: 'LZ4',
+ _CompressionKind_ZSTD: 'ZSTD',
+ }.get(compression_kind_, 'UNKNOWN')
+
+
+cdef CompressionKind compression_kind_from_name(name):
+ name = name.upper()
+ if name == 'ZLIB':
+ return _CompressionKind_ZLIB
+ elif name == 'SNAPPY':
+ return _CompressionKind_SNAPPY
+ elif name == 'LZO':
+ return _CompressionKind_LZO
+ elif name == 'LZ4':
+ return _CompressionKind_LZ4
+ elif name == 'ZSTD':
+ return _CompressionKind_ZSTD
+ elif name == 'UNCOMPRESSED':
+ return _CompressionKind_NONE
+ raise ValueError('Unknown CompressionKind: {0}'.format(name))
+
+
+cdef compression_strategy_from_enum(CompressionStrategy compression_strategy_):
+ return {
+ _CompressionStrategy_SPEED: 'SPEED',
+ _CompressionStrategy_COMPRESSION: 'COMPRESSION',
+ }.get(compression_strategy_, 'UNKNOWN')
+
+
+cdef CompressionStrategy compression_strategy_from_name(name):
+ name = name.upper()
+ # SPEED is the default value in the ORC C++ implementaton
+ if name == 'COMPRESSION':
+ return _CompressionStrategy_COMPRESSION
+ elif name == 'SPEED':
+ return _CompressionStrategy_SPEED
+ raise ValueError('Unknown CompressionStrategy: {0}'.format(name))
+
+
+cdef rle_version_from_enum(RleVersion rle_version_):
+ return {
+ _RleVersion_1: '1',
+ _RleVersion_2: '2',
+ }.get(rle_version_, 'UNKNOWN')
+
+
+cdef bloom_filter_version_from_enum(BloomFilterVersion bloom_filter_version_):
+ return {
+ _BloomFilterVersion_ORIGINAL: 'ORIGINAL',
+ _BloomFilterVersion_UTF8: 'UTF8',
+ _BloomFilterVersion_FUTURE: 'FUTURE',
+ }.get(bloom_filter_version_, 'UNKNOWN')
+
+
+cdef file_version_from_class(FileVersion file_version_):
+ cdef object file_version = file_version_.ToString()
+ return file_version
Review comment:
Can you convert the return value to a string? (now it is bytes) `return
frombytes(file_version)` should normally do it
##########
File path: python/pyarrow/_orc.pyx
##########
@@ -38,6 +38,283 @@ from pyarrow.lib cimport (check_status, _Weakrefable,
get_writer)
from pyarrow.lib import tobytes
+cdef compression_kind_from_enum(CompressionKind compression_kind_):
+ return {
+ _CompressionKind_NONE: 'NONE',
+ _CompressionKind_ZLIB: 'ZLIB',
+ _CompressionKind_SNAPPY: 'SNAPPY',
+ _CompressionKind_LZO: 'LZO',
+ _CompressionKind_LZ4: 'LZ4',
+ _CompressionKind_ZSTD: 'ZSTD',
+ }.get(compression_kind_, 'UNKNOWN')
+
+cdef CompressionKind compression_kind_from_name(name):
Review comment:
```suggestion
cdef CompressionKind compression_kind_from_name(name) except *:
```
(see below)
##########
File path: python/pyarrow/orc.py
##########
@@ -54,9 +54,25 @@ def nrows(self):
return self.reader.nrows()
@property
- def nstripes(self):
- """The number of stripes in the file"""
- return self.reader.nstripes()
Review comment:
I think you accidentally removed nstripes here
##########
File path: python/pyarrow/orc.py
##########
@@ -143,17 +230,54 @@ def write(self, table):
schema : pyarrow.lib.Table
The table to be written into the ORC file
"""
+ assert self.is_open
self.writer.write(table)
def close(self):
"""
Close the ORC file
"""
- self.writer.close()
+ if self.is_open:
+ self.writer.close()
+ self.is_open = False
-def write_table(table, where):
- """
+def write_table(table, where, file_version='0.12',
+ batch_size=1024,
+ stripe_size=67108864,
+ compression='zlib',
+ compression_block_size=65536,
+ compression_strategy='speed',
+ row_index_stride=10000,
+ padding_tolerance=0.0,
+ dictionary_key_size_threshold=0.0,
+ bloom_filter_columns=None,
+ bloom_filter_fpp=0.05):
+ if isinstance(where, Table):
+ warnings.warn(
+ "The order of the arguments has changed. Pass as "
+ "'write_table(table, where)' instead. The old order will raise "
+ "an error in the future.", FutureWarning, stacklevel=2
+ )
+ table, where = where, table
+ with ORCWriter(
+ where,
+ file_version=file_version,
+ batch_size=batch_size,
+ stripe_size=stripe_size,
+ compression=compression,
+ compression_block_size=compression_block_size,
+ compression_strategy=compression_strategy,
+ row_index_stride=row_index_stride,
+ padding_tolerance=padding_tolerance,
+ dictionary_key_size_threshold=dictionary_key_size_threshold,
+ bloom_filter_columns=bloom_filter_columns,
+ bloom_filter_fpp=bloom_filter_fpp
+ ) as writer:
+ writer.write(table)
+
+
+write_table.__doc__ = """
Write a table into an ORC file
Review comment:
Also best check if the indentation in `ORCWriter` class docstring is OK
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]