[GitHub] [arrow] jorisvandenbossche commented on a change in pull request #9702: ARROW-11297: [C++][Python] Add ORC writer options

GitBox Mon, 06 Dec 2021 06:00:28 -0800


jorisvandenbossche commented on a change in pull request #9702:
URL: https://github.com/apache/arrow/pull/9702#discussion_r763025401




##########
File path: python/pyarrow/orc.py
##########
@@ -54,9 +54,25 @@ def nrows(self):
         return self.reader.nrows()
 
     @property
-    def nstripes(self):
-        """The number of stripes in the file"""
-        return self.reader.nstripes()
+    def file_version(self):
+        """Format version of the ORC file, must be 0.11 or 0.12"""
+        return self.reader.file_version()
+
+    @property
+    def compression(self):
+        """Compression codec of the file"""
+        return self.reader.compression()
+
+    @property
+    def compression_size(self):
+        """Number of bytes to buffer for the compression codec in the file"""
+        return self.reader.compression_size()

Review comment:
       Does this map to the option "compression_block_size"? If so, maybe 
using the same name would help (I know the C++ liborc API is consistent, though).

##########
File path: python/pyarrow/orc.py
##########
@@ -143,17 +230,54 @@ def write(self, table):
         schema : pyarrow.lib.Table
             The table to be written into the ORC file
         """
+        assert self.is_open
         self.writer.write(table)
 
     def close(self):
         """
         Close the ORC file
         """
-        self.writer.close()
+        if self.is_open:
+            self.writer.close()
+            self.is_open = False
 
 
-def write_table(table, where):
-    """
+def write_table(table, where, file_version='0.12',
+                batch_size=1024,
+                stripe_size=67108864,
+                compression='zlib',
+                compression_block_size=65536,
+                compression_strategy='speed',
+                row_index_stride=10000,
+                padding_tolerance=0.0,
+                dictionary_key_size_threshold=0.0,
+                bloom_filter_columns=None,
+                bloom_filter_fpp=0.05):
+    if isinstance(where, Table):
+        warnings.warn(
+            "The order of the arguments has changed. Pass as "
+            "'write_table(table, where)' instead. The old order will raise "
+            "an error in the future.", FutureWarning, stacklevel=2
+        )
+        table, where = where, table
+    with ORCWriter(
+        where,
+        file_version=file_version,
+        batch_size=batch_size,
+        stripe_size=stripe_size,
+        compression=compression,
+        compression_block_size=compression_block_size,
+        compression_strategy=compression_strategy,
+        row_index_stride=row_index_stride,
+        padding_tolerance=padding_tolerance,
+        dictionary_key_size_threshold=dictionary_key_size_threshold,
+        bloom_filter_columns=bloom_filter_columns,
+        bloom_filter_fpp=bloom_filter_fpp
+    ) as writer:
+        writer.write(table)
+
+
+write_table.__doc__ = """
     Write a table into an ORC file

Review comment:
       If you store the docstring separately here, I think you need to remove 
one level of indentation.
   
   If you now check the docstring interactively, you get:
   
   ```
   In [16]: orc.write_table?
   Signature:
   ...
   Docstring:
       Write a table into an ORC file
   
       Parameters
       ----------
       table : pyarrow.lib.Table
           The table to be written into the ORC file
       where : str or pyarrow.io.NativeFile
           Writable target. For passing Python file objects or byte buffers,
           see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream
           or pyarrow.io.FixedSizeBufferWriter.
       file_version : {"0.11", "0.12"}, default "0.12"
       Determine which ORC file version to use. Hive 0.11 / ORC v0 is the older
       version as defined `here <https://orc.apache.org/specification/ORCv0/>`
       while Hive 0.12 / ORC v1 is the newer one as defined
       `here <https://orc.apache.org/specification/ORCv1/>`.
   batch_size : int, default 1024
       Number of rows the ORC writer writes at a time.
   stripe_size : int, default 64 * 1024 * 1024
       Size of each ORC stripe.
   ...
   ```

##########
File path: python/pyarrow/_orc.pyx
##########
@@ -38,6 +38,283 @@ from pyarrow.lib cimport (check_status, _Weakrefable,
                           get_writer)
 from pyarrow.lib import tobytes
 
+cdef compression_kind_from_enum(CompressionKind compression_kind_):
+    return {
+        _CompressionKind_NONE: 'NONE',
+        _CompressionKind_ZLIB: 'ZLIB',
+        _CompressionKind_SNAPPY: 'SNAPPY',
+        _CompressionKind_LZO: 'LZO',
+        _CompressionKind_LZ4: 'LZ4',
+        _CompressionKind_ZSTD: 'ZSTD',
+    }.get(compression_kind_, 'UNKNOWN')
+
+cdef CompressionKind compression_kind_from_name(name):
+    name = name.upper()
+    if name == 'ZLIB':
+        return _CompressionKind_ZLIB
+    elif name == 'SNAPPY':
+        return _CompressionKind_SNAPPY
+    elif name == 'LZO':
+        return _CompressionKind_LZO
+    elif name == 'LZ4':
+        return _CompressionKind_LZ4
+    elif name == 'ZSTD':
+        return _CompressionKind_ZSTD
+    else:
+        return _CompressionKind_NONE

Review comment:
       Because this is a cdef function, you will probably need to add `except 
*` to the signature (see the suggestion on the function's signature line) to 
ensure the error gets bubbled up to Python and is not ignored. Can you also add 
a test for this to ensure it actually raises an error? 

##########
File path: python/pyarrow/orc.py
##########
@@ -54,9 +54,25 @@ def nrows(self):
         return self.reader.nrows()
 
     @property
-    def nstripes(self):
-        """The number of stripes in the file"""
-        return self.reader.nstripes()
+    def file_version(self):
+        """Format version of the ORC file, must be 0.11 or 0.12"""
+        return self.reader.file_version()
+
+    @property
+    def compression(self):
+        """Compression codec of the file"""
+        return self.reader.compression()
+
+    @property
+    def compression_size(self):
+        """Number of bytes to buffer for the compression codec in the file"""
+        return self.reader.compression_size()
+
+    @property
+    def row_index_stride(self):
+        """Number of rows per an entry in the row index or 0
+           if there is no row index"""

Review comment:
       ```suggestion
           if there is no row index"""
   ```

##########
File path: python/pyarrow/_orc.pyx
##########
@@ -39,6 +39,206 @@ from pyarrow.lib cimport (check_status, _Weakrefable,
 from pyarrow.lib import tobytes
 
 
+cdef compression_kind_from_enum(CompressionKind compression_kind_):
+    return {
+        _CompressionKind_NONE: 'UNCOMPRESSED',
+        _CompressionKind_ZLIB: 'ZLIB',
+        _CompressionKind_SNAPPY: 'SNAPPY',
+        _CompressionKind_LZO: 'LZO',
+        _CompressionKind_LZ4: 'LZ4',
+        _CompressionKind_ZSTD: 'ZSTD',
+    }.get(compression_kind_, 'UNKNOWN')
+
+
+cdef CompressionKind compression_kind_from_name(name):
+    name = name.upper()
+    if name == 'ZLIB':
+        return _CompressionKind_ZLIB
+    elif name == 'SNAPPY':
+        return _CompressionKind_SNAPPY
+    elif name == 'LZO':
+        return _CompressionKind_LZO
+    elif name == 'LZ4':
+        return _CompressionKind_LZ4
+    elif name == 'ZSTD':
+        return _CompressionKind_ZSTD
+    elif name == 'UNCOMPRESSED':
+        return _CompressionKind_NONE
+    raise ValueError('Unknown CompressionKind: {0}'.format(name))
+
+
+cdef compression_strategy_from_enum(CompressionStrategy compression_strategy_):
+    return {
+        _CompressionStrategy_SPEED: 'SPEED',
+        _CompressionStrategy_COMPRESSION: 'COMPRESSION',
+    }.get(compression_strategy_, 'UNKNOWN')
+
+
+cdef CompressionStrategy compression_strategy_from_name(name):
+    name = name.upper()
+    # SPEED is the default value in the ORC C++ implementation
+    if name == 'COMPRESSION':
+        return _CompressionStrategy_COMPRESSION
+    elif name == 'SPEED':
+        return _CompressionStrategy_SPEED
+    raise ValueError('Unknown CompressionStrategy: {0}'.format(name))
+
+
+cdef rle_version_from_enum(RleVersion rle_version_):
+    return {
+        _RleVersion_1: '1',
+        _RleVersion_2: '2',
+    }.get(rle_version_, 'UNKNOWN')
+
+
+cdef bloom_filter_version_from_enum(BloomFilterVersion bloom_filter_version_):
+    return {
+        _BloomFilterVersion_ORIGINAL: 'ORIGINAL',
+        _BloomFilterVersion_UTF8: 'UTF8',
+        _BloomFilterVersion_FUTURE: 'FUTURE',
+    }.get(bloom_filter_version_, 'UNKNOWN')
+
+
+cdef file_version_from_class(FileVersion file_version_):
+    cdef object file_version = file_version_.ToString()
+    return file_version

Review comment:
       Can you convert the return value to a string (it is currently bytes)? 
`return frombytes(file_version)` should normally do it.

##########
File path: python/pyarrow/_orc.pyx
##########
@@ -38,6 +38,283 @@ from pyarrow.lib cimport (check_status, _Weakrefable,
                           get_writer)
 from pyarrow.lib import tobytes
 
+cdef compression_kind_from_enum(CompressionKind compression_kind_):
+    return {
+        _CompressionKind_NONE: 'NONE',
+        _CompressionKind_ZLIB: 'ZLIB',
+        _CompressionKind_SNAPPY: 'SNAPPY',
+        _CompressionKind_LZO: 'LZO',
+        _CompressionKind_LZ4: 'LZ4',
+        _CompressionKind_ZSTD: 'ZSTD',
+    }.get(compression_kind_, 'UNKNOWN')
+
+cdef CompressionKind compression_kind_from_name(name):

Review comment:
       ```suggestion
   cdef CompressionKind compression_kind_from_name(name) except *:
   ```
   
   (see the review comment on the body of this function for the explanation)

##########
File path: python/pyarrow/orc.py
##########
@@ -54,9 +54,25 @@ def nrows(self):
         return self.reader.nrows()
 
     @property
-    def nstripes(self):
-        """The number of stripes in the file"""
-        return self.reader.nstripes()

Review comment:
       I think you accidentally removed nstripes here

##########
File path: python/pyarrow/orc.py
##########
@@ -143,17 +230,54 @@ def write(self, table):
         schema : pyarrow.lib.Table
             The table to be written into the ORC file
         """
+        assert self.is_open
         self.writer.write(table)
 
     def close(self):
         """
         Close the ORC file
         """
-        self.writer.close()
+        if self.is_open:
+            self.writer.close()
+            self.is_open = False
 
 
-def write_table(table, where):
-    """
+def write_table(table, where, file_version='0.12',
+                batch_size=1024,
+                stripe_size=67108864,
+                compression='zlib',
+                compression_block_size=65536,
+                compression_strategy='speed',
+                row_index_stride=10000,
+                padding_tolerance=0.0,
+                dictionary_key_size_threshold=0.0,
+                bloom_filter_columns=None,
+                bloom_filter_fpp=0.05):
+    if isinstance(where, Table):
+        warnings.warn(
+            "The order of the arguments has changed. Pass as "
+            "'write_table(table, where)' instead. The old order will raise "
+            "an error in the future.", FutureWarning, stacklevel=2
+        )
+        table, where = where, table
+    with ORCWriter(
+        where,
+        file_version=file_version,
+        batch_size=batch_size,
+        stripe_size=stripe_size,
+        compression=compression,
+        compression_block_size=compression_block_size,
+        compression_strategy=compression_strategy,
+        row_index_stride=row_index_stride,
+        padding_tolerance=padding_tolerance,
+        dictionary_key_size_threshold=dictionary_key_size_threshold,
+        bloom_filter_columns=bloom_filter_columns,
+        bloom_filter_fpp=bloom_filter_fpp
+    ) as writer:
+        writer.write(table)
+
+
+write_table.__doc__ = """
     Write a table into an ORC file

Review comment:
       It would also be best to check whether the indentation in the `ORCWriter` class docstring is OK.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] jorisvandenbossche commented on a change in pull request #9702: ARROW-11297: [C++][Python] Add ORC writer options

Reply via email to