This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 787afa1594 GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652)
787afa1594 is described below

commit 787afa1594586d2d556d21471647f9cd2c55b18f
Author: Joris Van den Bossche <jorisvandenboss...@gmail.com>
AuthorDate: Tue Jan 30 12:54:19 2024 +0100

    GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652)
    
    ### Rationale for this change
    
    First step for https://github.com/apache/arrow/issues/39633: exposing the Array, DataType and Scalar classes for BinaryView and StringView, so that those types can already be represented in pyarrow.
    
    (I also exposed a StringViewBuilder, a variant of StringBuilder, for now only to be able to create test data.)
    
    * Closes: #39651
    
    Authored-by: Joris Van den Bossche <jorisvandenboss...@gmail.com>
    Signed-off-by: Joris Van den Bossche <jorisvandenboss...@gmail.com>
---
 docs/source/python/api/arrays.rst          |  4 ++
 docs/source/python/api/datatypes.rst       |  4 ++
 python/pyarrow/__init__.py                 |  7 ++--
 python/pyarrow/array.pxi                   | 14 +++++++
 python/pyarrow/builder.pxi                 | 66 ++++++++++++++++++++++++++++++
 python/pyarrow/includes/libarrow.pxd       |  9 ++++
 python/pyarrow/lib.pxd                     |  8 ++++
 python/pyarrow/lib.pyx                     |  2 +
 python/pyarrow/scalar.pxi                  | 10 +++++
 python/pyarrow/src/arrow/python/helpers.cc |  2 +
 python/pyarrow/tests/test_builder.py       | 21 +++++++++-
 python/pyarrow/tests/test_misc.py          |  4 ++
 python/pyarrow/tests/test_scalars.py       | 28 ++++++++++++-
 python/pyarrow/tests/test_types.py         |  8 ++++
 python/pyarrow/types.pxi                   | 32 +++++++++++++++
 python/pyarrow/types.py                    | 10 +++++
 16 files changed, 223 insertions(+), 6 deletions(-)

diff --git a/docs/source/python/api/arrays.rst 
b/docs/source/python/api/arrays.rst
index 73b5e063ff..b858862dcf 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -63,6 +63,8 @@ may expose data type-specific methods or properties.
    FixedSizeBinaryArray
    LargeBinaryArray
    LargeStringArray
+   BinaryViewArray
+   StringViewArray
    Time32Array
    Time64Array
    Date32Array
@@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties.
    FixedSizeBinaryScalar
    LargeBinaryScalar
    LargeStringScalar
+   BinaryViewScalar
+   StringViewScalar
    Time32Scalar
    Time64Scalar
    Date32Scalar
diff --git a/docs/source/python/api/datatypes.rst 
b/docs/source/python/api/datatypes.rst
index 4066ef3142..642c243b21 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas.
    large_binary
    large_string
    large_utf8
+   binary_view
+   string_view
    decimal128
    list_
    large_list
@@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general 
category
    is_large_binary
    is_large_unicode
    is_large_string
+   is_binary_view
+   is_string_view
    is_fixed_size_binary
    is_map
    is_dictionary
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 9da94885ec..4dbd1258d3 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -163,7 +163,7 @@ from pyarrow.lib import (null, bool_,
                          time32, time64, timestamp, date32, date64, duration,
                          month_day_nano_interval,
                          float16, float32, float64,
-                         binary, string, utf8,
+                         binary, string, utf8, binary_view, string_view,
                          large_binary, large_string, large_utf8,
                          decimal128, decimal256,
                          list_, large_list, map_, struct,
@@ -205,6 +205,7 @@ from pyarrow.lib import (null, bool_,
                          FixedSizeListArray, UnionArray,
                          BinaryArray, StringArray,
                          LargeBinaryArray, LargeStringArray,
+                         BinaryViewArray, StringViewArray,
                          FixedSizeBinaryArray,
                          DictionaryArray,
                          Date32Array, Date64Array, TimestampArray,
@@ -223,8 +224,8 @@ from pyarrow.lib import (null, bool_,
                          Time32Scalar, Time64Scalar,
                          TimestampScalar, DurationScalar,
                          MonthDayNanoIntervalScalar,
-                         BinaryScalar, LargeBinaryScalar,
-                         StringScalar, LargeStringScalar,
+                         BinaryScalar, LargeBinaryScalar, BinaryViewScalar,
+                         StringScalar, LargeStringScalar, StringViewScalar,
                          FixedSizeBinaryScalar, DictionaryScalar,
                          MapScalar, StructScalar, UnionScalar,
                          RunEndEncodedScalar, ExtensionScalar)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 1416f5f434..1029f3a629 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -2942,6 +2942,12 @@ cdef class LargeStringArray(Array):
                                   null_count, offset)
 
 
+cdef class StringViewArray(Array):
+    """
+    Concrete class for Arrow arrays of string (or utf8) view data type.
+    """
+
+
 cdef class BinaryArray(Array):
     """
     Concrete class for Arrow arrays of variable-sized binary data type.
@@ -2968,6 +2974,12 @@ cdef class LargeBinaryArray(Array):
         return (<CLargeBinaryArray*> self.ap).total_values_length()
 
 
+cdef class BinaryViewArray(Array):
+    """
+    Concrete class for Arrow arrays of variable-sized binary view data type.
+    """
+
+
 cdef class DictionaryArray(Array):
     """
     Concrete class for dictionary-encoded Arrow arrays.
@@ -3669,6 +3681,8 @@ cdef dict _array_classes = {
     _Type_STRING: StringArray,
     _Type_LARGE_BINARY: LargeBinaryArray,
     _Type_LARGE_STRING: LargeStringArray,
+    _Type_BINARY_VIEW: BinaryViewArray,
+    _Type_STRING_VIEW: StringViewArray,
     _Type_DICTIONARY: DictionaryArray,
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
     _Type_DECIMAL128: Decimal128Array,
diff --git a/python/pyarrow/builder.pxi b/python/pyarrow/builder.pxi
index a34ea5412e..2af39e2c58 100644
--- a/python/pyarrow/builder.pxi
+++ b/python/pyarrow/builder.pxi
@@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable):
 
     def __len__(self):
         return self.builder.get().length()
+
+
+cdef class StringViewBuilder(_Weakrefable):
+    """
+    Builder class for UTF8 string views.
+
+    This class exposes facilities for incrementally adding string values and
+    building the null bitmap for a pyarrow.Array (type='string_view').
+    """
+    cdef:
+        unique_ptr[CStringViewBuilder] builder
+
+    def __cinit__(self, MemoryPool memory_pool=None):
+        cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+        self.builder.reset(new CStringViewBuilder(pool))
+
+    def append(self, value):
+        """
+        Append a single value to the builder.
+
+        The value can either be a string/bytes object or a null value
+        (np.nan or None).
+
+        Parameters
+        ----------
+        value : string/bytes or np.nan/None
+            The value to append to the string array builder.
+        """
+        if value is None or value is np.nan:
+            self.builder.get().AppendNull()
+        elif isinstance(value, (bytes, str)):
+            self.builder.get().Append(tobytes(value))
+        else:
+            raise TypeError('StringViewBuilder only accepts string objects')
+
+    def append_values(self, values):
+        """
+        Append all the values from an iterable.
+
+        Parameters
+        ----------
+        values : iterable of string/bytes or np.nan/None values
+            The values to append to the string array builder.
+        """
+        for value in values:
+            self.append(value)
+
+    def finish(self):
+        """
+        Return result of builder as an Array object; also resets the builder.
+
+        Returns
+        -------
+        array : pyarrow.Array
+        """
+        cdef shared_ptr[CArray] out
+        with nogil:
+            self.builder.get().Finish(&out)
+        return pyarrow_wrap_array(out)
+
+    @property
+    def null_count(self):
+        return self.builder.get().null_count()
+
+    def __len__(self):
+        return self.builder.get().length()
diff --git a/python/pyarrow/includes/libarrow.pxd 
b/python/pyarrow/includes/libarrow.pxd
index 74e92594b0..d92f09da77 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -126,6 +126,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         _Type_LARGE_BINARY" arrow::Type::LARGE_BINARY"
         _Type_LARGE_STRING" arrow::Type::LARGE_STRING"
         _Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
+        _Type_BINARY_VIEW" arrow::Type::BINARY_VIEW"
+        _Type_STRING_VIEW" arrow::Type::STRING_VIEW"
 
         _Type_LIST" arrow::Type::LIST"
         _Type_LARGE_LIST" arrow::Type::LARGE_LIST"
@@ -1295,7 +1297,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" 
nogil:
 
     cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder):
         CStringBuilder(CMemoryPool* pool)
+        CStatus Append(const c_string& value)
+
+    cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder):
+        CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool)
+        CStatus Append(const char* value, int32_t length)
 
+    cdef cppclass CStringViewBuilder" arrow::StringViewBuilder"(CBinaryViewBuilder):
+        CStringViewBuilder(CMemoryPool* pool)
         CStatus Append(const c_string& value)
 
     cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder):
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 58ec34addb..c110486406 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -445,6 +445,14 @@ cdef class BinaryArray(Array):
     pass
 
 
+cdef class StringViewArray(Array):
+    pass
+
+
+cdef class BinaryViewArray(Array):
+    pass
+
+
 cdef class DictionaryArray(Array):
     cdef:
         object _indices, _dictionary
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 29a0bed559..b0368b67f7 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -106,6 +106,8 @@ Type_STRING = _Type_STRING
 Type_LARGE_BINARY = _Type_LARGE_BINARY
 Type_LARGE_STRING = _Type_LARGE_STRING
 Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
+Type_BINARY_VIEW = _Type_BINARY_VIEW
+Type_STRING_VIEW = _Type_STRING_VIEW
 Type_LIST = _Type_LIST
 Type_LARGE_LIST = _Type_LARGE_LIST
 Type_MAP = _Type_MAP
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 9a66dc8122..2772acf818 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar):
     pass
 
 
+cdef class BinaryViewScalar(BinaryScalar):
+    pass
+
+
+cdef class StringViewScalar(StringScalar):
+    pass
+
+
 cdef class ListScalar(Scalar):
     """
     Concrete class for list-like scalars.
@@ -1051,8 +1059,10 @@ cdef dict _scalar_classes = {
     _Type_BINARY: BinaryScalar,
     _Type_LARGE_BINARY: LargeBinaryScalar,
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar,
+    _Type_BINARY_VIEW: BinaryViewScalar,
     _Type_STRING: StringScalar,
     _Type_LARGE_STRING: LargeStringScalar,
+    _Type_STRING_VIEW: StringViewScalar,
     _Type_LIST: ListScalar,
     _Type_LARGE_LIST: LargeListScalar,
     _Type_FIXED_SIZE_LIST: FixedSizeListScalar,
diff --git a/python/pyarrow/src/arrow/python/helpers.cc 
b/python/pyarrow/src/arrow/python/helpers.cc
index c266abc169..2c86c86a91 100644
--- a/python/pyarrow/src/arrow/python/helpers.cc
+++ b/python/pyarrow/src/arrow/python/helpers.cc
@@ -63,6 +63,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
       GET_PRIMITIVE_TYPE(STRING, utf8);
       GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
       GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
+      GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view);
+      GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view);
       GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
     default:
       return nullptr;
diff --git a/python/pyarrow/tests/test_builder.py 
b/python/pyarrow/tests/test_builder.py
index 50d801026b..abc8a0013d 100644
--- a/python/pyarrow/tests/test_builder.py
+++ b/python/pyarrow/tests/test_builder.py
@@ -20,7 +20,7 @@ import weakref
 import numpy as np
 
 import pyarrow as pa
-from pyarrow.lib import StringBuilder
+from pyarrow.lib import StringBuilder, StringViewBuilder
 
 
 def test_weakref():
@@ -65,3 +65,22 @@ def test_string_builder_append_after_finish():
     sbuilder.append("No effect")
     expected = [None, None, "text", None, "other text"]
     assert arr.to_pylist() == expected
+
+
+def test_string_view_builder():
+    builder = StringViewBuilder()
+    builder.append(b"a byte string")
+    builder.append("a string")
+    builder.append("a longer not-inlined string")
+    builder.append(np.nan)
+    builder.append_values([None, "text"])
+    assert len(builder) == 6
+    assert builder.null_count == 2
+    arr = builder.finish()
+    assert isinstance(arr, pa.Array)
+    assert arr.null_count == 2
+    assert arr.type == 'string_view'
+    expected = [
+        "a byte string", "a string", "a longer not-inlined string", None, None, "text"
+    ]
+    assert arr.to_pylist() == expected
diff --git a/python/pyarrow/tests/test_misc.py 
b/python/pyarrow/tests/test_misc.py
index 8b8c50882b..8cec878328 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -185,6 +185,8 @@ def test_set_timezone_db_path_non_windows():
     pa.UnionArray,
     pa.BinaryArray,
     pa.StringArray,
+    pa.BinaryViewArray,
+    pa.StringViewArray,
     pa.FixedSizeBinaryArray,
     pa.DictionaryArray,
     pa.Date32Array,
@@ -221,6 +223,8 @@ def test_set_timezone_db_path_non_windows():
     pa.StringScalar,
     pa.BinaryScalar,
     pa.FixedSizeBinaryScalar,
+    pa.BinaryViewScalar,
+    pa.StringViewScalar,
     pa.ListScalar,
     pa.LargeListScalar,
     pa.MapScalar,
diff --git a/python/pyarrow/tests/test_scalars.py 
b/python/pyarrow/tests/test_scalars.py
index 74dee59558..4a239b23d5 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -51,6 +51,9 @@ from pyarrow.tests import util
     (b"bytes", None, pa.BinaryScalar),
     ("largestring", pa.large_string(), pa.LargeStringScalar),
     (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar),
+    # TODO(GH-39633) pa.scalar(..) requires python->arrow conversion to be implemented
+    # ("string_view", pa.string_view(), pa.StringViewScalar),
+    # (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
     (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar),
     ([1, 2, 3], None, pa.ListScalar),
     ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar),
@@ -488,7 +491,8 @@ def test_month_day_nano_interval():
 @pytest.mark.parametrize('value', ['foo', 'mañana'])
 @pytest.mark.parametrize(('ty', 'scalar_typ'), [
     (pa.string(), pa.StringScalar),
-    (pa.large_string(), pa.LargeStringScalar)
+    (pa.large_string(), pa.LargeStringScalar),
+    # (pa.string_view(), pa.StringViewScalar),
 ])
 def test_string(value, ty, scalar_typ):
     s = pa.scalar(value, type=ty)
@@ -503,10 +507,30 @@ def test_string(value, ty, scalar_typ):
     assert buf.to_pybytes() == value.encode()
 
 
+@pytest.mark.parametrize('value', ['foo', 'mañana'])
+def test_string_view(value):
+    # TODO: replace with normal scalar construction
+    builder = pa.lib.StringViewBuilder()
+    builder.append(value)
+    arr = builder.finish()
+
+    s = arr[0]
+    assert isinstance(s, pa.StringViewScalar)
+    assert s.as_py() == value
+    assert s.as_py() != 'something'
+    assert repr(value) in repr(s)
+    assert str(s) == str(value)
+
+    buf = s.as_buffer()
+    assert isinstance(buf, pa.Buffer)
+    assert buf.to_pybytes() == value.encode()
+
+
 @pytest.mark.parametrize('value', [b'foo', b'bar'])
 @pytest.mark.parametrize(('ty', 'scalar_typ'), [
     (pa.binary(), pa.BinaryScalar),
-    (pa.large_binary(), pa.LargeBinaryScalar)
+    (pa.large_binary(), pa.LargeBinaryScalar),
+    # (pa.binary_view(), pa.BinaryViewScalar),
 ])
 def test_binary(value, ty, scalar_typ):
     s = pa.scalar(value, type=ty)
diff --git a/python/pyarrow/tests/test_types.py 
b/python/pyarrow/tests/test_types.py
index c8a52c6b62..a5ab3128dc 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -61,6 +61,8 @@ def get_many_types():
         pa.binary(10),
         pa.large_string(),
         pa.large_binary(),
+        pa.string_view(),
+        pa.binary_view(),
         pa.list_(pa.int32()),
         pa.list_(pa.int32(), 2),
         pa.large_list(pa.uint16()),
@@ -244,6 +246,12 @@ def test_is_binary_string():
     assert types.is_fixed_size_binary(pa.binary(5))
     assert not types.is_fixed_size_binary(pa.binary())
 
+    assert types.is_string_view(pa.string_view())
+    assert not types.is_string_view(pa.string())
+    assert types.is_binary_view(pa.binary_view())
+    assert not types.is_binary_view(pa.binary())
+    assert not types.is_binary_view(pa.string_view())
+
 
 def test_is_temporal_date_time_timestamp():
     date_types = [pa.date32(), pa.date64()]
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index b6dc53d633..ce3736b5af 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -4375,6 +4375,36 @@ def large_utf8():
     return large_string()
 
 
+def binary_view():
+    """
+    Create a variable-length binary view type.
+
+    Examples
+    --------
+    Create an instance of a binary view type:
+
+    >>> import pyarrow as pa
+    >>> pa.binary_view()
+    DataType(binary_view)
+    """
+    return primitive_type(_Type_BINARY_VIEW)
+
+
+def string_view():
+    """
+    Create UTF8 variable-length string view type.
+
+    Examples
+    --------
+    Create an instance of a string view type:
+
+    >>> import pyarrow as pa
+    >>> pa.string_view()
+    DataType(string_view)
+    """
+    return primitive_type(_Type_STRING_VIEW)
+
+
 def list_(value_type, int list_size=-1):
     """
     Create ListType instance from child data type or field.
@@ -4991,6 +5021,8 @@ cdef dict _type_aliases = {
     'large_str': large_string,
     'large_utf8': large_string,
     'large_binary': large_binary,
+    'binary_view': binary_view,
+    'string_view': string_view,
     'date32': date32,
     'date64': date64,
     'date32[day]': date32,
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index 5d7dbe4b45..32398dac9c 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -243,6 +243,16 @@ def is_fixed_size_binary(t):
     return t.id == lib.Type_FIXED_SIZE_BINARY
 
 
+@doc(is_null, datatype="variable-length binary view")
+def is_binary_view(t):
+    return t.id == lib.Type_BINARY_VIEW
+
+
+@doc(is_null, datatype="variable-length string (utf-8) view")
+def is_string_view(t):
+    return t.id == lib.Type_STRING_VIEW
+
+
 @doc(is_null, datatype="date")
 def is_date(t):
     return t.id in _DATE_TYPES

Reply via email to