iajoiner commented on a change in pull request #9702:
URL: https://github.com/apache/arrow/pull/9702#discussion_r787091946



##########
File path: python/pyarrow/_orc.pyx
##########
@@ -36,7 +36,233 @@ from pyarrow.lib cimport (check_status, _Weakrefable,
                           pyarrow_unwrap_table,
                           get_reader,
                           get_writer)
-from pyarrow.lib import tobytes
+from pyarrow.lib import frombytes, tobytes
+
+
+cdef compression_type_from_enum(CCompressionType compression_type_):
+    """Translate a compression enum value into its name string.
+
+    Note that the GZIP enum value is reported as 'ZLIB'. Any value that is
+    not listed in the table yields 'UNKNOWN'.
+    """
+    names = {
+        CCompressionType_UNCOMPRESSED: 'UNCOMPRESSED',
+        CCompressionType_GZIP: 'ZLIB',
+        CCompressionType_SNAPPY: 'SNAPPY',
+        CCompressionType_LZ4: 'LZ4',
+        CCompressionType_ZSTD: 'ZSTD',
+    }
+    return names.get(compression_type_, 'UNKNOWN')
+
+
+cdef CCompressionType compression_type_from_name(name) except *:
+    """Translate a compression name string into its enum value.
+
+    The lookup is case-insensitive; 'ZLIB' maps to the GZIP enum value.
+
+    Raises
+    ------
+    TypeError
+        If ``name`` is not a ``str``.
+    ValueError
+        If the upper-cased name is not one of the recognized names.
+    """
+    if not isinstance(name, str):
+        raise TypeError('compression must be a string')
+    # Compare against the upper-cased form so any letter case is accepted.
+    kind = name.upper()
+    if kind == 'ZLIB':
+        return CCompressionType_GZIP
+    if kind == 'SNAPPY':
+        return CCompressionType_SNAPPY
+    if kind == 'LZ4':
+        return CCompressionType_LZ4
+    if kind == 'ZSTD':
+        return CCompressionType_ZSTD
+    if kind == 'UNCOMPRESSED':
+        return CCompressionType_UNCOMPRESSED
+    # The message reports the upper-cased form, not the caller's spelling.
+    raise ValueError('Unknown CompressionKind: {0}'.format(kind))
+
+
+cdef compression_strategy_from_enum(CompressionStrategy compression_strategy_):
+    """Translate a compression strategy enum value into its name string.
+
+    Any value that is not listed in the table yields 'UNKNOWN'.
+    """
+    names = {
+        _CompressionStrategy_SPEED: 'SPEED',
+        _CompressionStrategy_COMPRESSION: 'COMPRESSION',
+    }
+    return names.get(compression_strategy_, 'UNKNOWN')
+
+
+cdef CompressionStrategy compression_strategy_from_name(name) except *:
+    """Translate a compression strategy name string into its enum value.
+
+    The lookup is case-insensitive.
+
+    Raises
+    ------
+    TypeError
+        If ``name`` is not a ``str``.
+    ValueError
+        If the upper-cased name is neither 'COMPRESSION' nor 'SPEED'.
+    """
+    if not isinstance(name, str):
+        raise TypeError('compression strategy must be a string')
+    # Compare against the upper-cased form so any letter case is accepted.
+    strategy = name.upper()
+    # SPEED is the default value in the ORC C++ implementation
+    if strategy == 'COMPRESSION':
+        return _CompressionStrategy_COMPRESSION
+    if strategy == 'SPEED':
+        return _CompressionStrategy_SPEED
+    # The message reports the upper-cased form, not the caller's spelling.
+    raise ValueError('Unknown CompressionStrategy: {0}'.format(strategy))
+
+
+cdef rle_version_from_enum(RleVersion rle_version_):
+    """Translate an RLE version enum value into the string '1' or '2'.
+
+    Any value that is not listed in the table yields 'UNKNOWN'.
+    """
+    names = {
+        _RleVersion_1: '1',
+        _RleVersion_2: '2',
+    }
+    return names.get(rle_version_, 'UNKNOWN')
+
+
+cdef bloom_filter_version_from_enum(BloomFilterVersion bloom_filter_version_):
+    """Translate a bloom filter version enum value into its name string.
+
+    Any value that is not listed in the table yields 'UNKNOWN'.
+    """
+    names = {
+        _BloomFilterVersion_ORIGINAL: 'ORIGINAL',
+        _BloomFilterVersion_UTF8: 'UTF8',
+        _BloomFilterVersion_FUTURE: 'FUTURE',
+    }
+    return names.get(bloom_filter_version_, 'UNKNOWN')
+
+
+cdef file_version_from_class(FileVersion file_version_):
+    """Return the result of ``file_version_.ToString()`` passed through
+    ``frombytes``."""
+    return frombytes(file_version_.ToString())
+
+
+cdef writer_id_from_enum(WriterId writer_id_):
+    """Translate a writer id enum value into its name string.
+
+    Any value that is not listed in the table yields 'UNKNOWN'.
+    """
+    names = {
+        _WriterId_ORC_JAVA_WRITER: 'ORC_JAVA',
+        _WriterId_ORC_CPP_WRITER: 'ORC_CPP',
+        _WriterId_PRESTO_WRITER: 'PRESTO',
+        _WriterId_SCRITCHLEY_GO: 'SCRITCHLEY_GO',
+        _WriterId_TRINO_WRITER: 'TRINO',
+    }
+    return names.get(writer_id_, 'UNKNOWN')
+
+
+cdef writer_version_from_enum(WriterVersion writer_version_):
+    """Translate a writer version enum value into its name string.
+
+    Any value that is not listed in the table yields 'UNKNOWN'.
+    """
+    names = {
+        _WriterVersion_ORIGINAL: 'ORIGINAL',
+        _WriterVersion_HIVE_8732: 'HIVE_8732',
+        _WriterVersion_HIVE_4243: 'HIVE_4243',
+        _WriterVersion_HIVE_12055: 'HIVE_12055',
+        _WriterVersion_HIVE_13083: 'HIVE_13083',
+        _WriterVersion_ORC_101: 'ORC_101',
+        _WriterVersion_ORC_135: 'ORC_135',
+        _WriterVersion_ORC_517: 'ORC_517',
+        _WriterVersion_ORC_203: 'ORC_203',
+        _WriterVersion_ORC_14: 'ORC_14',
+    }
+    return names.get(writer_version_, 'UNKNOWN')
+
+
+cdef shared_ptr[WriteOptions] _create_write_options(
+    file_version=None,
+    batch_size=None,
+    stripe_size=None,
+    compression=None,
+    compression_block_size=None,
+    compression_strategy=None,
+    row_index_stride=None,
+    padding_tolerance=None,
+    dictionary_key_size_threshold=None,
+    bloom_filter_columns=None,
+    bloom_filter_fpp=None
+) except *:
+    """General writer options"""
+    cdef:
+        shared_ptr[WriteOptions] options
+
+    options = make_shared[WriteOptions]()
+
+    # batch_size
+
+    if batch_size is not None:
+        if isinstance(batch_size, int) and batch_size > 0:
+            deref(options).batch_size = batch_size
+        else:
+            raise ValueError("Invalid ORC writer batch size: {0}"
+                             .format(batch_size))
+
+    # file_version
+
+    if file_version is not None:
+        if str(file_version) == "0.12":
+            deref(options).file_version = FileVersion(0, 12)
+        elif str(file_version) == "0.11":
+            deref(options).file_version = FileVersion(0, 11)
+        else:
+            raise ValueError("Unsupported ORC file version: {0}"
+                             .format(file_version))
+
+    # stripe_size
+
+    if stripe_size is not None:
+        if isinstance(stripe_size, int) and stripe_size > 0:
+            deref(options).stripe_size = stripe_size
+        else:
+            raise ValueError("Invalid ORC stripe size: {0}"
+                             .format(stripe_size))
+
+    # compression
+
+    if compression is not None:
+        if isinstance(compression, basestring):
+            deref(options).compression = compression_type_from_name(
+                compression)
+        else:
+            raise TypeError("Unsupported ORC compression type: {0}"
+                            .format(compression))
+
+    # compression_block_size
+
+    if compression_block_size is not None:
+        if (isinstance(compression_block_size, int) and
+                compression_block_size > 0):
+            deref(options).compression_block_size = compression_block_size
+        else:
+            raise ValueError("Invalid ORC compression block size: {0}"
+                             .format(compression_block_size))
+
+    # compression_strategy
+
+    if compression_strategy is not None:
+        if isinstance(compression, basestring):
+            deref(options).compression_strategy = \
+                compression_strategy_from_name(compression_strategy)
+        else:
+            raise TypeError("Unsupported ORC compression strategy: {0}"
+                            .format(compression_strategy))
+
+    # row_index_stride
+
+    if row_index_stride is not None:
+        if isinstance(row_index_stride, int) and row_index_stride > 0:
+            deref(options).row_index_stride = row_index_stride
+        else:
+            raise ValueError("Invalid ORC row index stride: {0}"
+                             .format(row_index_stride))
+
+    # padding_tolerance
+
+    if padding_tolerance is not None:
+        try:
+            padding_tolerance = float(padding_tolerance)
+            deref(options).padding_tolerance = padding_tolerance
+        except Exception:
+            raise ValueError("Invalid ORC padding tolerance: {0}"
+                             .format(padding_tolerance))
+
+    # dictionary_key_size_threshold
+
+    if dictionary_key_size_threshold is not None:
+        try:
+            dictionary_key_size_threshold = float(
+                dictionary_key_size_threshold)
+            deref(options).dictionary_key_size_threshold = \
+                dictionary_key_size_threshold

Review comment:
       I'm almost certain that it should be in [0, 1]. Yes, I have just added it.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to