(fury) branch main updated: feat(python): support latin1/utf16 string encoding in python (#1997)

pandalee Tue, 07 Jan 2025 08:32:19 -0800

This is an automated email from the ASF dual-hosted git repository.

pandalee pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git



The following commit(s) were added to refs/heads/main by this push:
     new 880c6e50 feat(python): support latin1/utf16 string encoding in python 
(#1997)
880c6e50 is described below

commit 880c6e50e93889971eb9426515c19a68f78e9324
Author: Shawn Yang <[email protected]>
AuthorDate: Wed Jan 8 00:32:10 2025 +0800

    feat(python): support latin1/utf16 string encoding in python (#1997)
    
    ## What does this PR do?
    
    Support latin1/utf16 string encoding in Python. For utf16, since
    Python doesn't use surrogate pairs, this PR also adds a vectorized
    surrogate-pair check function.
    
    Note:
    - Python UCS-2 strings don't contain surrogate pairs, so we must check
    the utf16 data for surrogates before constructing a string from the
    binary. This is different from Java/Node.js.
    
    ## Related issues
    
    Closes #1967
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/fury/issues/new/choose) describing the
    need to do so and update the document if necessary.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    
    <!--
    When the PR has an impact on performance (if you don't know whether the
    PR will have an impact on performance, you can submit the PR first, and
    if it will have impact on performance, the code reviewer will explain
    it), be sure to attach a benchmark data here.
    -->
---
 cpp/fury/util/string_util.cc                       |  2 +-
 cpp/fury/util/string_util.h                        | 66 ++++++++++++++-
 cpp/fury/util/string_util_test.cc                  | 21 ++++-
 .../apache/fury/serializer/ArraySerializers.java   |  4 +-
 .../org/apache/fury/serializer/Serializers.java    |  6 +-
 .../apache/fury/serializer/StringSerializer.java   | 97 +++++++---------------
 python/README.md                                   |  2 +-
 python/pyfury/_fury.py                             | 15 +---
 python/pyfury/_serialization.pyx                   |  9 +-
 python/pyfury/_util.pyx                            | 56 +++++++++++--
 python/pyfury/includes/libutil.pxd                 |  4 +
 python/pyfury/tests/test_serializer.py             | 13 +++
 12 files changed, 191 insertions(+), 104 deletions(-)

diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc
index 3c0543c9..3c28ac79 100644
--- a/cpp/fury/util/string_util.cc
+++ b/cpp/fury/util/string_util.cc
@@ -623,4 +623,4 @@ std::u16string utf8ToUtf16(const std::string &utf8, bool 
is_little_endian) {
 
 #endif
 
-} // namespace fury
\ No newline at end of file
+} // namespace fury
diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h
index d5bdde54..35254eab 100644
--- a/cpp/fury/util/string_util.h
+++ b/cpp/fury/util/string_util.h
@@ -19,14 +19,78 @@
 
 #pragma once
 
+#include <cstdint>
 #include <string>
+// AVX not included here since some older intel cpu doesn't support avx2
+// but the built wheel for avx2 is same as sse2.
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define USE_NEON_SIMD
+#elif defined(__SSE2__)
+#include <emmintrin.h>
+#define USE_SSE2_SIMD
+#endif
 
 namespace fury {
 
 bool isLatin(const std::string &str);
 
+static inline bool hasSurrogatePairFallback(const uint16_t *data, size_t size) 
{
+  for (size_t i = 0; i < size; ++i) {
+    auto c = data[i];
+    if (c >= 0xD800 && c <= 0xDFFF) {
+      return true;
+    }
+  }
+  return false;
+}
+
+#if defined(USE_NEON_SIMD)
+inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
+  size_t i = 0;
+  uint16x8_t lower_bound = vdupq_n_u16(0xD800);
+  uint16x8_t higher_bound = vdupq_n_u16(0xDFFF);
+  for (; i + 7 < length; i += 8) {
+    uint16x8_t chunk = vld1q_u16(data + i);
+    uint16x8_t mask1 = vcgeq_u16(chunk, lower_bound);
+    uint16x8_t mask2 = vcleq_u16(chunk, higher_bound);
+    if (vmaxvq_u16(mask1 & mask2)) {
+      return true; // Detected a high surrogate
+    }
+  }
+  return hasSurrogatePairFallback(data + i, length - i);
+}
+#elif defined(USE_SSE2_SIMD)
+inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
+  size_t i = 0;
+  __m128i lower_bound = _mm_set1_epi16(0xd7ff);
+  __m128i higher_bound = _mm_set1_epi16(0xe000);
+  for (; i + 7 < length; i += 8) {
+    __m128i chunk =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + i));
+    __m128i cmp1 = _mm_cmpgt_epi16(chunk, lower_bound);
+    __m128i cmp2 = _mm_cmpgt_epi16(higher_bound, chunk);
+    if (_mm_movemask_epi8(_mm_and_si128(cmp1, cmp2)) != 0) {
+      return true; // Detected a surrogate
+    }
+  }
+  return hasSurrogatePairFallback(data + i, length - i);
+}
+#else
+inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
+  return hasSurrogatePairFallback(data, length);
+}
+#endif
+
+inline bool utf16HasSurrogatePairs(const std::u16string &str) {
+  // Get the data pointer
+  const std::uint16_t *data =
+      reinterpret_cast<const std::uint16_t *>(str.data());
+  return utf16HasSurrogatePairs(data, str.size());
+}
+
 std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian);
 
 std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian);
 
-} // namespace fury
\ No newline at end of file
+} // namespace fury
diff --git a/cpp/fury/util/string_util_test.cc 
b/cpp/fury/util/string_util_test.cc
index 60c72c80..5fe800dd 100644
--- a/cpp/fury/util/string_util_test.cc
+++ b/cpp/fury/util/string_util_test.cc
@@ -122,7 +122,24 @@ std::u16string generateRandomUTF16String(size_t length) {
   return str;
 }
 
-// Basic implementation
+TEST(StringUtilTest, TestUtf16HasSurrogatePairs) {
+  EXPECT_FALSE(utf16HasSurrogatePairs(std::u16string({0x99, 0x100})));
+  std::u16string utf16 = {0xD83D, 0xDE00}; // 😀 emoji
+  EXPECT_TRUE(utf16HasSurrogatePairs(utf16));
+  EXPECT_TRUE(utf16HasSurrogatePairs(generateRandomUTF16String(3) + u"性能好"));
+  EXPECT_TRUE(
+      utf16HasSurrogatePairs(generateRandomUTF16String(10) + u"性能好"));
+  EXPECT_TRUE(
+      utf16HasSurrogatePairs(generateRandomUTF16String(30) + u"性能好"));
+  EXPECT_TRUE(
+      utf16HasSurrogatePairs(generateRandomUTF16String(60) + u"性能好"));
+  EXPECT_TRUE(
+      utf16HasSurrogatePairs(generateRandomUTF16String(120) + u"性能好"));
+  EXPECT_TRUE(
+      utf16HasSurrogatePairs(generateRandomUTF16String(200) + u"性能好"));
+  EXPECT_TRUE(
+      utf16HasSurrogatePairs(generateRandomUTF16String(300) + u"性能好"));
+}
 
 // Swap bytes to convert from big endian to little endian
 inline uint16_t swapBytes(uint16_t value) {
@@ -542,4 +559,4 @@ TEST(UTF8ToUTF16Test, PerformanceTest) {
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
-}
\ No newline at end of file
+}
diff --git 
a/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java 
b/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java
index 1d091894..d21de509 100644
--- 
a/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java
+++ 
b/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java
@@ -682,7 +682,7 @@ public class ArraySerializers {
       for (String elem : value) {
         if (elem != null) {
           buffer.writeByte(Fury.NOT_NULL_VALUE_FLAG);
-          stringSerializer.writeUTF8String(buffer, elem);
+          stringSerializer.writeString(buffer, elem);
         } else {
           buffer.writeByte(Fury.NULL_FLAG);
         }
@@ -695,7 +695,7 @@ public class ArraySerializers {
       String[] value = new String[numElements];
       for (int i = 0; i < numElements; i++) {
         if (buffer.readByte() >= Fury.NOT_NULL_VALUE_FLAG) {
-          value[i] = stringSerializer.readUTF8String(buffer);
+          value[i] = stringSerializer.readString(buffer);
         } else {
           value[i] = null;
         }
diff --git 
a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java 
b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java
index e5b01225..a57a0725 100644
--- a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java
+++ b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java
@@ -228,7 +228,7 @@ public class Serializers {
 
     @Override
     public void xwrite(MemoryBuffer buffer, T value) {
-      stringSerializer.writeUTF8String(buffer, value.toString());
+      stringSerializer.writeString(buffer, value.toString());
     }
 
     @Override
@@ -276,7 +276,7 @@ public class Serializers {
 
     @Override
     public StringBuilder xread(MemoryBuffer buffer) {
-      return new StringBuilder(stringSerializer.readUTF8String(buffer));
+      return new StringBuilder(stringSerializer.readString(buffer));
     }
   }
 
@@ -299,7 +299,7 @@ public class Serializers {
 
     @Override
     public StringBuffer xread(MemoryBuffer buffer) {
-      return new StringBuffer(stringSerializer.readUTF8String(buffer));
+      return new StringBuffer(stringSerializer.readString(buffer));
     }
   }
 
diff --git 
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java 
b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
index fd84c7ca..22f3eeca 100644
--- 
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
+++ 
b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
@@ -121,7 +121,7 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
 
   @Override
   public void xwrite(MemoryBuffer buffer, String value) {
-    writeUTF8String(buffer, value);
+    writeJavaString(buffer, value);
   }
 
   @Override
@@ -131,68 +131,52 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
 
   @Override
   public String xread(MemoryBuffer buffer) {
-    return readUTF8String(buffer);
+    return readJavaString(buffer);
   }
 
   public void writeString(MemoryBuffer buffer, String value) {
-    if (isJava) {
-      writeJavaString(buffer, value);
-    } else {
-      writeUTF8String(buffer, value);
-    }
+    writeJavaString(buffer, value);
   }
 
   public Expression writeStringExpr(Expression strSerializer, Expression 
buffer, Expression str) {
-    if (isJava) {
-      if (STRING_VALUE_FIELD_IS_BYTES) {
-        if (compressString) {
-          return new Invoke(strSerializer, "writeCompressedBytesString", 
buffer, str);
-        } else {
-          return new StaticInvoke(StringSerializer.class, "writeBytesString", 
buffer, str);
-        }
+    if (STRING_VALUE_FIELD_IS_BYTES) {
+      if (compressString) {
+        return new Invoke(strSerializer, "writeCompressedBytesString", buffer, 
str);
       } else {
-        if (!STRING_VALUE_FIELD_IS_CHARS) {
-          throw new UnsupportedOperationException();
-        }
-        if (compressString) {
-          return new Invoke(strSerializer, "writeCompressedCharsString", 
buffer, str);
-        } else {
-          return new Invoke(strSerializer, "writeCharsString", buffer, str);
-        }
+        return new StaticInvoke(StringSerializer.class, "writeBytesString", 
buffer, str);
       }
     } else {
-      return new Invoke(strSerializer, "writeUTF8String", buffer, str);
+      if (!STRING_VALUE_FIELD_IS_CHARS) {
+        throw new UnsupportedOperationException();
+      }
+      if (compressString) {
+        return new Invoke(strSerializer, "writeCompressedCharsString", buffer, 
str);
+      } else {
+        return new Invoke(strSerializer, "writeCharsString", buffer, str);
+      }
     }
   }
 
   public String readString(MemoryBuffer buffer) {
-    if (isJava) {
-      return readJavaString(buffer);
-    } else {
-      return readUTF8String(buffer);
-    }
+    return readJavaString(buffer);
   }
 
   public Expression readStringExpr(Expression strSerializer, Expression 
buffer) {
-    if (isJava) {
-      if (STRING_VALUE_FIELD_IS_BYTES) {
-        if (compressString) {
-          return new Invoke(strSerializer, "readCompressedBytesString", 
STRING_TYPE, buffer);
-        } else {
-          return new Invoke(strSerializer, "readBytesString", STRING_TYPE, 
buffer);
-        }
+    if (STRING_VALUE_FIELD_IS_BYTES) {
+      if (compressString) {
+        return new Invoke(strSerializer, "readCompressedBytesString", 
STRING_TYPE, buffer);
       } else {
-        if (!STRING_VALUE_FIELD_IS_CHARS) {
-          throw new UnsupportedOperationException();
-        }
-        if (compressString) {
-          return new Invoke(strSerializer, "readCompressedCharsString", 
STRING_TYPE, buffer);
-        } else {
-          return new Invoke(strSerializer, "readCharsString", STRING_TYPE, 
buffer);
-        }
+        return new Invoke(strSerializer, "readBytesString", STRING_TYPE, 
buffer);
       }
     } else {
-      return new Invoke(strSerializer, "readUTF8String", STRING_TYPE, buffer);
+      if (!STRING_VALUE_FIELD_IS_CHARS) {
+        throw new UnsupportedOperationException();
+      }
+      if (compressString) {
+        return new Invoke(strSerializer, "readCompressedCharsString", 
STRING_TYPE, buffer);
+      } else {
+        return new Invoke(strSerializer, "readCharsString", STRING_TYPE, 
buffer);
+      }
     }
   }
 
@@ -275,13 +259,6 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     }
   }
 
-  @CodegenInvoke
-  public void writeUTF8String(MemoryBuffer buffer, String value) {
-    byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
-    buffer.writeVarUint32(bytes.length);
-    buffer.writeBytes(bytes);
-  }
-
   // Invoked by fury JIT
   public String readJavaString(MemoryBuffer buffer) {
     if (STRING_VALUE_FIELD_IS_BYTES) {
@@ -367,24 +344,6 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     }
   }
 
-  @CodegenInvoke
-  public String readUTF8String(MemoryBuffer buffer) {
-    int numBytes = buffer.readVarUint32Small14();
-    buffer.checkReadableBytes(numBytes);
-    final byte[] targetArray = buffer.getHeapMemory();
-    if (targetArray != null) {
-      String str =
-          new String(
-              targetArray, buffer._unsafeHeapReaderIndex(), numBytes, 
StandardCharsets.UTF_8);
-      buffer.increaseReaderIndex(numBytes);
-      return str;
-    } else {
-      final byte[] tmpArray = getByteArray(numBytes);
-      buffer.readBytes(tmpArray, 0, numBytes);
-      return new String(tmpArray, 0, numBytes, StandardCharsets.UTF_8);
-    }
-  }
-
   public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) {
     buffer.checkReadableBytes(numBytes);
     byte[] srcArray = buffer.getHeapMemory();
diff --git a/python/README.md b/python/README.md
index d06a9a8c..4f3859d8 100644
--- a/python/README.md
+++ b/python/README.md
@@ -12,7 +12,7 @@ pip install -v -e .
 
 ### Environment Requirements
 
-- python 3.6+
+- python 3.8+
 
 ## Testing
 
diff --git a/python/pyfury/_fury.py b/python/pyfury/_fury.py
index 21ca687b..d97c6230 100644
--- a/python/pyfury/_fury.py
+++ b/python/pyfury/_fury.py
@@ -156,9 +156,10 @@ class Fury:
                 stacklevel=2,
             )
             self.pickler = Pickler(self.buffer)
+            self.unpickler = Unpickler(self.buffer)
         else:
-            self.pickler = _PicklerStub(self.buffer)
-        self.unpickler = None
+            self.pickler = _PicklerStub()
+            self.unpickler = _UnpicklerStub()
         self._buffer_callback = None
         self._buffers = None
         self._unsupported_callback = None
@@ -334,10 +335,6 @@ class Fury:
     ):
         if type(buffer) == bytes:
             buffer = Buffer(buffer)
-        if self.require_class_registration:
-            self.unpickler = _UnpicklerStub(buffer)
-        else:
-            self.unpickler = Unpickler(buffer)
         if unsupported_objects is not None:
             self._unsupported_objects = iter(unsupported_objects)
         if self.language == Language.XLANG:
@@ -527,9 +524,6 @@ _ENABLE_CLASS_REGISTRATION_FORCIBLY = os.getenv(
 
 
 class _PicklerStub:
-    def __init__(self, buf):
-        self.buf = buf
-
     def dump(self, o):
         raise ValueError(
             f"Class {type(o)} is not registered, "
@@ -542,9 +536,6 @@ class _PicklerStub:
 
 
 class _UnpicklerStub:
-    def __init__(self, buf):
-        self.buf = buf
-
     def load(self):
         raise ValueError(
             "pickle is not allowed when class registration enabled, Please 
register"
diff --git a/python/pyfury/_serialization.pyx b/python/pyfury/_serialization.pyx
index 86abdf02..0da4ee9a 100644
--- a/python/pyfury/_serialization.pyx
+++ b/python/pyfury/_serialization.pyx
@@ -643,7 +643,8 @@ cdef class Fury:
             )
             self.pickler = Pickler(self.buffer)
         else:
-            self.pickler = _PicklerStub(self.buffer)
+            self.pickler = _PicklerStub()
+            self.unpickler = _UnpicklerStub()
         self.unpickler = None
         self._buffer_callback = None
         self._buffers = None
@@ -815,9 +816,7 @@ cdef class Fury:
 
     cpdef inline _deserialize(
             self, Buffer buffer, buffers=None, unsupported_objects=None):
-        if self.require_class_registration:
-            self.unpickler = _UnpicklerStub(buffer)
-        else:
+        if not self.require_class_registration:
             self.unpickler = Unpickler(buffer)
         if unsupported_objects is not None:
             self._unsupported_objects = iter(unsupported_objects)
@@ -955,6 +954,8 @@ cdef class Fury:
     cpdef inline handle_unsupported_read(self, Buffer buffer):
         cdef c_bool in_band = buffer.read_bool()
         if in_band:
+            if self.unpickler is None:
+                self.unpickler.buffer = Unpickler(buffer)
             return self.unpickler.load()
         else:
             assert self._unsupported_objects is not None
diff --git a/python/pyfury/_util.pyx b/python/pyfury/_util.pyx
index 9baaca5c..ca87d81e 100644
--- a/python/pyfury/_util.pyx
+++ b/python/pyfury/_util.pyx
@@ -22,14 +22,16 @@
 
 cimport cython
 from cpython cimport *
+from cpython.unicode cimport *
 from libcpp.memory cimport shared_ptr, make_shared
 from libc.stdint cimport *
 from libcpp cimport bool as c_bool
 from pyfury.includes.libutil cimport(
-    CBuffer, AllocateBuffer, GetBit, SetBit, ClearBit, SetBitTo, CStatus, 
StatusCode
+    CBuffer, AllocateBuffer, GetBit, SetBit, ClearBit, SetBitTo, CStatus, 
StatusCode, utf16HasSurrogatePairs
 )
 
 cdef int32_t max_buffer_size = 2 ** 31 - 1
+cdef int UTF16_LE = -1
 
 
 @cython.final
@@ -149,7 +151,6 @@ cdef class Buffer:
 
     cpdef inline check_bound(self, int32_t offset, int32_t length):
         cdef int32_t size_ = self.c_buffer.get().size()
-        # if offset + length > size_:
         if offset | length | (offset + length) | (size_- (offset + length)) < 
0:
             raise ValueError(f"Address range {offset, offset + length} "
                              f"out of bound {0, size_}")
@@ -543,15 +544,52 @@ cdef class Buffer:
         return length
 
     cpdef inline write_string(self, str value):
-        cdef Py_ssize_t length
-        cdef const char * buf = PyUnicode_AsUTF8AndSize(value, &length)
-        self.write_c_buffer(<const uint8_t *>buf, length)
+        cdef Py_ssize_t length = PyUnicode_GET_LENGTH(value)
+        cdef int32_t kind = PyUnicode_KIND(value)
+        # Note: buffer will be native endian for PyUnicode_2BYTE_KIND
+        cdef void* buffer = PyUnicode_DATA(value)
+        cdef uint64_t header = 0
+        cdef int32_t buffer_size
+        if kind == PyUnicode_1BYTE_KIND:
+            buffer_size = length
+            header = (length << 2) | 0
+        elif kind == PyUnicode_2BYTE_KIND:
+            buffer_size = length << 1
+            header = (length << 3) | 1
+        else:
+            buffer = <void *>(PyUnicode_AsUTF8AndSize(value, &length))
+            buffer_size = length
+            header = (buffer_size << 2) | 2
+        self.write_varuint64(header)
+        if buffer_size == 0:  # access an emtpy buffer may raise out-of-bound 
exception.
+            return
+        self.grow(buffer_size)
+        self.check_bound(self.writer_index, buffer_size)
+        self.c_buffer.get().CopyFrom(self.writer_index, <const uint8_t 
*>buffer, 0, buffer_size)
+        self.writer_index += buffer_size
 
     cpdef inline str read_string(self):
-        cdef uint8_t* buf
-        cdef int32_t length = self.read_c_buffer(&buf)
-        str_obj = PyUnicode_DecodeUTF8(<const char *>buf, length, "strict")
-        return str_obj
+        cdef uint64_t header = self.read_varuint64()
+        cdef uint32_t size = header >> 2
+        self.check_bound(self.reader_index, size)
+        cdef const char * buf = <const char *>(self.c_buffer.get().data() + 
self.reader_index)
+        self.reader_index += size
+        cdef uint32_t encoding = header & <uint32_t>0b11
+        if encoding == 0:
+            # PyUnicode_FromASCII
+            return PyUnicode_DecodeLatin1(buf, size, "strict")
+        elif encoding == 1:
+            if utf16HasSurrogatePairs(<const uint16_t *>buf, size >> 1):
+                return PyUnicode_DecodeUTF16(
+                    buf,
+                    size,  # len of string in bytes
+                    NULL,  # special error handling options, we don't need any
+                    &UTF16_LE,  # fury use little-endian
+                )
+            else:
+                return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, buf, 
size >> 1)
+        else:
+            return PyUnicode_DecodeUTF8(buf, size, "strict")
 
     def __len__(self):
         return self._c_size
diff --git a/python/pyfury/includes/libutil.pxd 
b/python/pyfury/includes/libutil.pxd
index 49f4c715..72a64003 100644
--- a/python/pyfury/includes/libutil.pxd
+++ b/python/pyfury/includes/libutil.pxd
@@ -107,3 +107,7 @@ cdef extern from "fury/util/bit_util.h" namespace 
"fury::util" nogil:
     void SetBitTo(uint8_t *bits, int64_t i, c_bool bit_is_set)
 
     c_string hex(uint8_t *data, int32_t length)
+
+
+cdef extern from "fury/util/string_util.h" namespace "fury" nogil:
+    c_bool utf16HasSurrogatePairs(uint16_t* data, size_t size)
diff --git a/python/pyfury/tests/test_serializer.py 
b/python/pyfury/tests/test_serializer.py
index 361639a6..ec6d399d 100644
--- a/python/pyfury/tests/test_serializer.py
+++ b/python/pyfury/tests/test_serializer.py
@@ -61,6 +61,15 @@ def test_tuple():
     assert ser_de(fury, (-1.0, 2)) == (-1.0, 2)
 
 
+def test_string():
+    fury = Fury(language=Language.PYTHON, ref_tracking=True)
+    assert ser_de(fury, "hello") == "hello"
+    assert ser_de(fury, "hello，世界") == "hello，世界"
+    assert ser_de(fury, "hello，世界" * 10) == "hello，世界" * 10
+    assert ser_de(fury, "hello，😀") == "hello，😀"
+    assert ser_de(fury, "hello，😀" * 10) == "hello，😀" * 10
+
+
 def test_dict():
     fury = Fury(language=Language.PYTHON, ref_tracking=True)
     assert ser_de(fury, {1: 2}) == {1: 2}
@@ -544,3 +553,7 @@ def test_function():
     df = pd.DataFrame({"a": list(range(10))})
     df_sum = fury.deserialize(fury.serialize(df.sum))
     assert df_sum().equals(df.sum())
+
+
+if __name__ == "__main__":
+    test_string()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(fury) branch main updated: feat(python): support latin1/utf16 string encoding in python (#1997)

Reply via email to