This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 89b5932  Add metadata builder functions (#12)
89b5932 is described below

commit 89b59322fe29dc26e4792039a219252622c3a95c
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Aug 9 13:25:31 2022 -0300

    Add metadata builder functions (#12)
    
    * add metadata builder functions
    
    * don't copy existing metadata unless needed
    
    * better comments in tests
    
    * fix buffer change
    
    * test builder from existing string, error for null key/value input
    
    * All metadata values are StringViews
    
    * less annoying getvalue
    
    * less annoying string view from const char
    
    * move the string view + helper to the inlined typedefs
    
    * everything is a stringview
    
    * don't use NULL in header
    
    * more consistent inline function definition for ArrowCharView()
---
 src/nanoarrow/metadata.c        | 178 +++++++++++++++++++++++++++++++++++-----
 src/nanoarrow/metadata_test.cc  |  89 ++++++++++++++++++--
 src/nanoarrow/nanoarrow.h       |  46 +++++++----
 src/nanoarrow/schema_view.c     |   6 +-
 src/nanoarrow/typedefs_inline.h |  13 +++
 src/nanoarrow/utils_inline.h    |  46 +++++++++++
 6 files changed, 334 insertions(+), 44 deletions(-)

diff --git a/src/nanoarrow/metadata.c b/src/nanoarrow/metadata.c
index 123a8d8..2f24cbc 100644
--- a/src/nanoarrow/metadata.c
+++ b/src/nanoarrow/metadata.c
@@ -84,29 +84,22 @@ int64_t ArrowMetadataSizeOf(const char* metadata) {
   return size;
 }
 
-ArrowErrorCode ArrowMetadataGetValue(const char* metadata, const char* key,
-                                     const char* default_value,
-                                     struct ArrowStringView* value_out) {
-  struct ArrowStringView target_key_view = {key, strlen(key)};
-  value_out->data = default_value;
-  if (default_value != NULL) {
-    value_out->n_bytes = strlen(default_value);
-  } else {
-    value_out->n_bytes = 0;
-  }
-
+static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata,
+                                                    struct ArrowStringView* key,
+                                                    struct ArrowStringView* value_out) {
   struct ArrowMetadataReader reader;
-  struct ArrowStringView key_view;
-  struct ArrowStringView value;
+  struct ArrowStringView existing_key;
+  struct ArrowStringView existing_value;
   ArrowMetadataReaderInit(&reader, metadata);
 
   int64_t size = sizeof(int32_t);
-  while (ArrowMetadataReaderRead(&reader, &key_view, &value) == NANOARROW_OK) {
-    int key_equal = target_key_view.n_bytes == key_view.n_bytes &&
-                    strncmp(target_key_view.data, key_view.data, key_view.n_bytes) == 0;
+  while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) ==
+         NANOARROW_OK) {
+    int key_equal = key->n_bytes == existing_key.n_bytes &&
+                    strncmp(key->data, existing_key.data, existing_key.n_bytes) == 0;
     if (key_equal) {
-      value_out->data = value.data;
-      value_out->n_bytes = value.n_bytes;
+      value_out->data = existing_value.data;
+      value_out->n_bytes = existing_value.n_bytes;
       break;
     }
   }
@@ -114,8 +107,151 @@ ArrowErrorCode ArrowMetadataGetValue(const char* metadata, const char* key,
   return NANOARROW_OK;
 }
 
-char ArrowMetadataHasKey(const char* metadata, const char* key) {
-  struct ArrowStringView value;
-  ArrowMetadataGetValue(metadata, key, NULL, &value);
+ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key,
+                                     struct ArrowStringView* value_out) {
+  if (value_out == NULL) {
+    return EINVAL;
+  }
+
+  return ArrowMetadataGetValueInternal(metadata, &key, value_out);
+}
+
+char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) {
+  struct ArrowStringView value = ArrowCharView(NULL);
+  ArrowMetadataGetValue(metadata, key, &value);
   return value.data != NULL;
 }
+
+ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer,
+                                        const char* metadata) {
+  ArrowBufferInit(buffer);
+  int result = ArrowBufferAppend(buffer, metadata, ArrowMetadataSizeOf(metadata));
+  if (result != NANOARROW_OK) {
+    return result;
+  }
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer,
+                                                         struct ArrowStringView* key,
+                                                         struct ArrowStringView* value) {
+  if (value == NULL) {
+    return NANOARROW_OK;
+  }
+
+  int result;
+
+  if (buffer->capacity_bytes == 0) {
+    int32_t zero = 0;
+    result = ArrowBufferAppend(buffer, &zero, sizeof(int32_t));
+    if (result != NANOARROW_OK) {
+      return result;
+    }
+  }
+
+  if (buffer->capacity_bytes < sizeof(int32_t)) {
+    return EINVAL;
+  }
+
+  int32_t n_keys;
+  memcpy(&n_keys, buffer->data, sizeof(int32_t));
+
+  int32_t key_size = key->n_bytes;
+  int32_t value_size = value->n_bytes;
+  result = ArrowBufferReserve(buffer,
+                              sizeof(int32_t) + key_size + sizeof(int32_t) + value_size);
+  if (result != NANOARROW_OK) {
+    return result;
+  }
+
+  ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t));
+  ArrowBufferAppendUnsafe(buffer, key->data, key_size);
+  ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t));
+  ArrowBufferAppendUnsafe(buffer, value->data, value_size);
+
+  n_keys++;
+  memcpy(buffer->data, &n_keys, sizeof(int32_t));
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer,
+                                                      struct ArrowStringView* key,
+                                                      struct ArrowStringView* value) {
+  // Inspect the current value to see if we can avoid copying the buffer
+  struct ArrowStringView current_value = ArrowCharView(NULL);
+  int result =
+      ArrowMetadataGetValueInternal((const char*)buffer->data, key, &current_value);
+  if (result != NANOARROW_OK) {
+    return result;
+  }
+
+  // The key should be removed but no key exists
+  if (value == NULL && current_value.data == NULL) {
+    return NANOARROW_OK;
+  }
+
+  // The key/value can be appended because no key exists
+  if (value != NULL && current_value.data == NULL) {
+    return ArrowMetadataBuilderAppendInternal(buffer, key, value);
+  }
+
+  struct ArrowMetadataReader reader;
+  struct ArrowStringView existing_key;
+  struct ArrowStringView existing_value;
+  result = ArrowMetadataReaderInit(&reader, (const char*)buffer->data);
+  if (result != NANOARROW_OK) {
+    return result;
+  }
+
+  struct ArrowBuffer new_buffer;
+  result = ArrowMetadataBuilderInit(&new_buffer, NULL);
+  if (result != NANOARROW_OK) {
+    return result;
+  }
+
+  while (reader.remaining_keys > 0) {
+    result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value);
+    if (result != NANOARROW_OK) {
+      ArrowBufferReset(&new_buffer);
+      return result;
+    }
+
+    if (key->n_bytes == existing_key.n_bytes &&
+        strncmp((const char*)key->data, (const char*)existing_key.data,
+                existing_key.n_bytes) == 0) {
+      result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value);
+      value = NULL;
+    } else {
+      result =
+          ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key, &existing_value);
+    }
+
+    if (result != NANOARROW_OK) {
+      ArrowBufferReset(&new_buffer);
+      return result;
+    }
+  }
+
+  ArrowBufferReset(buffer);
+  ArrowBufferMove(&new_buffer, buffer);
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer,
+                                          struct ArrowStringView key,
+                                          struct ArrowStringView value) {
+  return ArrowMetadataBuilderAppendInternal(buffer, &key, &value);
+}
+
+ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer,
+                                       struct ArrowStringView key,
+                                       struct ArrowStringView value) {
+  return ArrowMetadataBuilderSetInternal(buffer, &key, &value);
+}
+
+ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer,
+                                          struct ArrowStringView key) {
+  return ArrowMetadataBuilderSetInternal(buffer, &key, NULL);
+}
diff --git a/src/nanoarrow/metadata_test.cc b/src/nanoarrow/metadata_test.cc
index 5ac959c..c6b47f0 100644
--- a/src/nanoarrow/metadata_test.cc
+++ b/src/nanoarrow/metadata_test.cc
@@ -25,7 +25,7 @@
 
 using namespace arrow;
 
-TEST(SchemaTest, Metadata) {
+TEST(MetadataTest, Metadata) {
   // (test will only work on little endian)
   char simple_metadata[] = {'\1', '\0', '\0', '\0', '\3', '\0', '\0', '\0', 'k', 'e',
                             'y',  '\5', '\0', '\0', '\0', 'v',  'a',  'l',  'u', 'e'};
@@ -33,14 +33,91 @@ TEST(SchemaTest, Metadata) {
   EXPECT_EQ(ArrowMetadataSizeOf(nullptr), 0);
   EXPECT_EQ(ArrowMetadataSizeOf(simple_metadata), sizeof(simple_metadata));
 
-  EXPECT_EQ(ArrowMetadataHasKey(simple_metadata, "key"), 1);
-  EXPECT_EQ(ArrowMetadataHasKey(simple_metadata, "not_a_key"), 0);
+  EXPECT_EQ(ArrowMetadataHasKey(simple_metadata, ArrowCharView("key")), 1);
+  EXPECT_EQ(ArrowMetadataHasKey(simple_metadata, ArrowCharView("not_a_key")), 0);
 
-  struct ArrowStringView value;
-  EXPECT_EQ(ArrowMetadataGetValue(simple_metadata, "key", "default_val", &value),
+  struct ArrowStringView value = ArrowCharView("default_val");
+  EXPECT_EQ(ArrowMetadataGetValue(simple_metadata, ArrowCharView("key"), &value),
             NANOARROW_OK);
   EXPECT_EQ(std::string(value.data, value.n_bytes), "value");
-  EXPECT_EQ(ArrowMetadataGetValue(simple_metadata, "not_a_key", "default_val", &value),
+
+  value = ArrowCharView("default_val");
+  EXPECT_EQ(ArrowMetadataGetValue(simple_metadata, ArrowCharView("not_a_key"), &value),
             NANOARROW_OK);
   EXPECT_EQ(std::string(value.data, value.n_bytes), "default_val");
 }
+
+TEST(MetadataTest, MetadataBuild) {
+  // (test will only work on little endian)
+  char simple_metadata[] = {'\1', '\0', '\0', '\0', '\3', '\0', '\0', '\0', 'k', 'e',
+                            'y',  '\5', '\0', '\0', '\0', 'v',  'a',  'l',  'u', 'e'};
+
+  // Metadata builder from copy
+  struct ArrowBuffer metadata_builder;
+  ASSERT_EQ(ArrowMetadataBuilderInit(&metadata_builder, simple_metadata), NANOARROW_OK);
+  EXPECT_EQ(metadata_builder.size_bytes, sizeof(simple_metadata));
+  EXPECT_EQ(memcmp(metadata_builder.data, simple_metadata, metadata_builder.size_bytes),
+            0);
+  ArrowBufferReset(&metadata_builder);
+
+  // Empty metadata
+  ASSERT_EQ(ArrowMetadataBuilderInit(&metadata_builder, nullptr), NANOARROW_OK);
+  EXPECT_EQ(metadata_builder.size_bytes, 0);
+  EXPECT_EQ(metadata_builder.data, nullptr);
+
+  // Recreate simple_metadata
+  ASSERT_EQ(ArrowMetadataBuilderAppend(&metadata_builder, ArrowCharView("key"),
+                                       ArrowCharView("value")),
+            NANOARROW_OK);
+  ASSERT_EQ(metadata_builder.size_bytes, ArrowMetadataSizeOf(simple_metadata));
+  EXPECT_EQ(memcmp(metadata_builder.data, simple_metadata, metadata_builder.size_bytes),
+            0);
+
+  // Remove a key that doesn't exist
+  ASSERT_EQ(ArrowMetadataBuilderRemove(&metadata_builder, ArrowCharView("key2")),
+            NANOARROW_OK);
+  ASSERT_EQ(metadata_builder.size_bytes, ArrowMetadataSizeOf(simple_metadata));
+  EXPECT_EQ(memcmp(metadata_builder.data, simple_metadata, metadata_builder.size_bytes),
+            0);
+
+  // Add a new key
+  ASSERT_EQ(ArrowMetadataBuilderSet(&metadata_builder, ArrowCharView("key2"),
+                                    ArrowCharView("value2")),
+            NANOARROW_OK);
+  EXPECT_EQ(metadata_builder.size_bytes, ArrowMetadataSizeOf(simple_metadata) +
+                                             sizeof(int32_t) + 4 + sizeof(int32_t) + 6);
+
+  struct ArrowStringView value = ArrowCharView(nullptr);
+  ASSERT_EQ(ArrowMetadataGetValue((const char*)metadata_builder.data,
+                                  ArrowCharView("key2"), &value),
+            NANOARROW_OK);
+  EXPECT_EQ(std::string(value.data, value.n_bytes), "value2");
+
+  // Set an existing key
+  ASSERT_EQ(ArrowMetadataBuilderSet(&metadata_builder, ArrowCharView("key"),
+                                    ArrowCharView("value3")),
+            NANOARROW_OK);
+  value = ArrowCharView(nullptr);
+  ASSERT_EQ(ArrowMetadataGetValue((const char*)metadata_builder.data,
+                                  ArrowCharView("key"), &value),
+            NANOARROW_OK);
+  EXPECT_EQ(std::string(value.data, value.n_bytes), "value3");
+  value = ArrowCharView(nullptr);
+  ASSERT_EQ(ArrowMetadataGetValue((const char*)metadata_builder.data,
+                                  ArrowCharView("key2"), &value),
+            NANOARROW_OK);
+  EXPECT_EQ(std::string(value.data, value.n_bytes), "value2");
+
+  // Remove a key that does exist
+  ASSERT_EQ(ArrowMetadataBuilderRemove(&metadata_builder, ArrowCharView("key")),
+            NANOARROW_OK);
+  EXPECT_EQ(ArrowMetadataHasKey((const char*)metadata_builder.data, ArrowCharView("key")),
+            false);
+  value = ArrowCharView(nullptr);
+  ASSERT_EQ(ArrowMetadataGetValue((const char*)metadata_builder.data,
+                                  ArrowCharView("key2"), &value),
+            NANOARROW_OK);
+  EXPECT_EQ(std::string(value.data, value.n_bytes), "value2");
+
+  ArrowBufferReset(&metadata_builder);
+}
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 8958951..a6dbe89 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -87,18 +87,8 @@ const char* ArrowErrorMessage(struct ArrowError* error);
 
 /// \defgroup nanoarrow-utils Utility data structures
 
-/// \brief An non-owning view of a string
-struct ArrowStringView {
-  /// \brief A pointer to the start of the string
-  ///
-  /// If n_bytes is 0, this value may be NULL.
-  const char* data;
-
-  /// \brief The size of the string in bytes,
-  ///
-  /// (Not including the null terminator.)
-  int64_t n_bytes;
-};
+/// \brief Create a string view from a null-terminated string
+static inline struct ArrowStringView ArrowCharView(const char* value);
 
 /// \brief Arrow time unit enumerator
 ///
@@ -207,13 +197,38 @@ ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader,
 int64_t ArrowMetadataSizeOf(const char* metadata);
 
 /// \brief Check for a key in schema metadata
-char ArrowMetadataHasKey(const char* metadata, const char* key);
+char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key);
 
 /// \brief Extract a value from schema metadata
-ArrowErrorCode ArrowMetadataGetValue(const char* metadata, const char* key,
-                                     const char* default_value,
+///
+/// If key does not exist in metadata, value_out is unmodified
+ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key,
                                      struct ArrowStringView* value_out);
 
+/// \brief Initialize a builder for schema metadata from key/value pairs
+///
+/// metadata can be an existing metadata string or NULL to initialize
+/// an empty metadata string.
+ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata);
+
+/// \brief Append a key/value pair to a buffer containing serialized metadata
+ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer,
+                                          struct ArrowStringView key,
+                                          struct ArrowStringView value);
+
+/// \brief Set a key/value pair to a buffer containing serialized metadata
+///
+/// Ensures that the only entry for key in the metadata is set to value.
+/// This function maintains the existing position of (the first instance of)
+/// key if present in the data.
+ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer,
+                                       struct ArrowStringView key,
+                                       struct ArrowStringView value);
+
+/// \brief Remove a key from a buffer containing serialized metadata
+ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer,
+                                          struct ArrowStringView key);
+
 /// }@
 
 /// \defgroup nanoarrow-schema-view Schema consumer helpers
@@ -498,6 +513,7 @@ ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i,
 // Inline function definitions
 #include "bitmap_inline.h"
 #include "buffer_inline.h"
+#include "utils_inline.h"
 
 #ifdef __cplusplus
 }
diff --git a/src/nanoarrow/schema_view.c b/src/nanoarrow/schema_view.c
index 54d586a..7a3ca93 100644
--- a/src/nanoarrow/schema_view.c
+++ b/src/nanoarrow/schema_view.c
@@ -668,9 +668,11 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view,
     }
   }
 
-  ArrowMetadataGetValue(schema->metadata, "ARROW:extension:name", NULL,
+  schema_view->extension_name = ArrowCharView(NULL);
+  schema_view->extension_metadata = ArrowCharView(NULL);
+  ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:name"),
                         &schema_view->extension_name);
-  ArrowMetadataGetValue(schema->metadata, "ARROW:extension:metadata", NULL,
+  ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:metadata"),
                         &schema_view->extension_metadata);
 
   return NANOARROW_OK;
diff --git a/src/nanoarrow/typedefs_inline.h b/src/nanoarrow/typedefs_inline.h
index c04f909..5aca1ec 100644
--- a/src/nanoarrow/typedefs_inline.h
+++ b/src/nanoarrow/typedefs_inline.h
@@ -166,6 +166,19 @@ enum ArrowType {
   NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
 };
 
+/// \brief A non-owning view of a string
+struct ArrowStringView {
+  /// \brief A pointer to the start of the string
+  ///
+  /// If n_bytes is 0, this value may be NULL.
+  const char* data;
+
+  /// \brief The size of the string in bytes,
+  ///
+  /// (Not including the null terminator.)
+  int64_t n_bytes;
+};
+
 /// \brief Array buffer allocation and deallocation
 ///
 /// Container for allocate, reallocate, and free methods that can be used
diff --git a/src/nanoarrow/utils_inline.h b/src/nanoarrow/utils_inline.h
new file mode 100644
index 0000000..4c61555
--- /dev/null
+++ b/src/nanoarrow/utils_inline.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_UTILS_INLINE_H_INCLUDED
+#define NANOARROW_UTILS_INLINE_H_INCLUDED
+
+#include <string.h>
+
+#include "typedefs_inline.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline struct ArrowStringView ArrowCharView(const char* value) {
+  struct ArrowStringView out;
+
+  out.data = value;
+  if (value) {
+    out.n_bytes = (int64_t)strlen(value);
+  } else {
+    out.n_bytes = 0;
+  }
+
+  return out;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

Reply via email to