bkietz commented on code in PR #578:
URL: https://github.com/apache/arrow-nanoarrow/pull/578#discussion_r1710106120


##########
src/nanoarrow/common/array.c:
##########
@@ -1335,3 +1335,177 @@ ArrowErrorCode ArrowArrayViewValidate(struct 
ArrowArrayView* array_view,
   ArrowErrorSet(error, "validation_level not recognized");
   return EINVAL;
 }
+
+struct ArrowComparisonInternalState {
+  enum ArrowCompareLevel level;
+  int is_equal;
+  struct ArrowBuffer path;
+  struct ArrowError reason;
+};
+
+#define RETURN_NOT_EQUAL_IMPL(state_, path_size_, cond_, reason_) \
+  do {                                                            \
+    if (cond_) {                                                  \
+      ArrowErrorSet(&state_->reason, "%s", reason_);              \
+      state_->path.size_bytes = (path_size_);                     \
+      state_->is_equal = 0;                                       \
+      return NANOARROW_OK;                                        \
+    }                                                             \
+  } while (0)
+
+#define RETURN_NOT_EQUAL(state_, path_size_, condition_) \
+  RETURN_NOT_EQUAL_IMPL(state_, path_size_, condition_, #condition_)
+
+static ArrowErrorCode ArrowArrayViewCompareStructure(
+    const struct ArrowArrayView* actual, const struct ArrowArrayView* expected,
+    struct ArrowComparisonInternalState* state) {
+  int64_t path_size = state->path.size_bytes;
+
+  RETURN_NOT_EQUAL(state, path_size, actual->storage_type != 
expected->storage_type);
+  RETURN_NOT_EQUAL(state, path_size, actual->n_children != 
expected->n_children);
+  RETURN_NOT_EQUAL(state, path_size,
+                   actual->dictionary == NULL && expected->dictionary != NULL);
+  RETURN_NOT_EQUAL(state, path_size,
+                   actual->dictionary != NULL && expected->dictionary == NULL);
+
+  char child_id[128];
+  for (int64_t i = 0; i < actual->n_children; i++) {
+    snprintf(child_id, sizeof(child_id), ".children[%" PRId64 "]", i);
+    state->path.size_bytes = path_size;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowBufferAppendStringView(&state->path, ArrowCharView(child_id)));
+    NANOARROW_RETURN_NOT_OK(ArrowArrayViewCompareStructure(actual->children[i],
+                                                           
expected->children[i], state));
+    if (!state->is_equal) {
+      return NANOARROW_OK;
+    }
+  }
+
+  if (actual->dictionary != NULL) {
+    state->path.size_bytes = path_size;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowBufferAppendStringView(&state->path, 
ArrowCharView(".dictionary")));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewCompareStructure(actual->dictionary, 
expected->dictionary, state));
+    if (!state->is_equal) {
+      return NANOARROW_OK;
+    }
+  }
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowArrayViewCompareIdentical(
+    const struct ArrowArrayView* actual, const struct ArrowArrayView* expected,
+    struct ArrowComparisonInternalState* state) {
+  int64_t path_size = state->path.size_bytes;
+
+  RETURN_NOT_EQUAL(state, path_size, actual->length != expected->length);
+  RETURN_NOT_EQUAL(state, path_size, actual->offset != expected->offset);
+  RETURN_NOT_EQUAL(state, path_size, actual->null_count != 
expected->null_count);
+
+  char child_id[128];
+  for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+    snprintf(child_id, sizeof(child_id), ".buffers[%d]", i);
+    state->path.size_bytes = path_size;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowBufferAppendStringView(&state->path, ArrowCharView(child_id)));
+
+    RETURN_NOT_EQUAL(
+        state, state->path.size_bytes,
+        actual->buffer_views[i].size_bytes != 
expected->buffer_views[i].size_bytes);
+    int64_t buffer_size = actual->buffer_views[i].size_bytes;
+    if (buffer_size == 0) {
+      continue;
+    }
+
+    RETURN_NOT_EQUAL(state, state->path.size_bytes,
+                     memcmp(actual->buffer_views[i].data.data,
+                            expected->buffer_views[i].data.data, buffer_size) 
!= 0);

Review Comment:
   This should be fine for now, but there are some cases where this will not be 
sufficient. For example:
   
   - if you have two int32 arrays both of which are `[1, null, 3]` they might 
still memcmp unequal if the values "under" the null bit are not identical
   - if you have two utf8 arrays both of which are `["hello", "world"]` the 
might still memcmp unequal if their first offsets aren't identical
   



##########
src/nanoarrow/nanoarrow.h:
##########
@@ -1064,6 +1065,19 @@ ArrowErrorCode ArrowArrayViewValidate(struct 
ArrowArrayView* array_view,
                                       enum ArrowValidationLevel 
validation_level,
                                       struct ArrowError* error);
 
+/// \brief Compare two ArrowArrayView objects for equality
+///
+/// Given two ArrowArrayView instances, place either 0 (not equal) and
+/// 1 (equal) at the address pointed to by out. If the comparison determines
+/// that actual and expected are not equal, a reason will be communicated via
+/// error if error is non-NULL.
+///
+/// Returns NANOARROW_OK if the comparison completed successfully.
+ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual,
+                                     const struct ArrowArrayView* expected,
+                                     enum ArrowCompareLevel level, int* out,
+                                     struct ArrowError* error);

Review Comment:
   This should probably not be an `ArrowError` since it isn't 
really/necessarily communicating an error. For another thing, it might be 
desirable to add `NANOARROW_COMPARE_DIFF` later (diffing has been very useful 
in arrow C++) or other comparison levels where we'd like more than 1024 bytes 
of explanation.
   ```suggestion
                                        char** explanation);
   ```
   ```suggestion
                                        struct ArrowBuffer* explanation);
   ```



##########
src/nanoarrow/nanoarrow.h:
##########
@@ -1064,6 +1065,19 @@ ArrowErrorCode ArrowArrayViewValidate(struct 
ArrowArrayView* array_view,
                                       enum ArrowValidationLevel 
validation_level,
                                       struct ArrowError* error);
 
+/// \brief Compare two ArrowArrayView objects for equality
+///
+/// Given two ArrowArrayView instances, place either 0 (not equal) and
+/// 1 (equal) at the address pointed to by out. If the comparison determines
+/// that actual and expected are not equal, a reason will be communicated via
+/// error if error is non-NULL.
+///
+/// Returns NANOARROW_OK if the comparison completed successfully.
+ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual,
+                                     const struct ArrowArrayView* expected,
+                                     enum ArrowCompareLevel level, int* out,
+                                     struct ArrowError* error);

Review Comment:
   (especially in the context of testing but also in general) I think array 
comparison shouldn't include validation and none of the allocations would be 
huge; what other sources of actual errors apply?



##########
src/nanoarrow/common/array.c:
##########
@@ -1335,3 +1335,177 @@ ArrowErrorCode ArrowArrayViewValidate(struct 
ArrowArrayView* array_view,
   ArrowErrorSet(error, "validation_level not recognized");
   return EINVAL;
 }
+
+struct ArrowComparisonInternalState {
+  enum ArrowCompareLevel level;
+  int is_equal;
+  struct ArrowBuffer path;
+  struct ArrowError reason;
+};
+
+#define RETURN_NOT_EQUAL_IMPL(state_, path_size_, cond_, reason_) \
+  do {                                                            \
+    if (cond_) {                                                  \
+      ArrowErrorSet(&state_->reason, "%s", reason_);              \
+      state_->path.size_bytes = (path_size_);                     \
+      state_->is_equal = 0;                                       \
+      return NANOARROW_OK;                                        \
+    }                                                             \
+  } while (0)
+
+#define RETURN_NOT_EQUAL(state_, path_size_, condition_) \
+  RETURN_NOT_EQUAL_IMPL(state_, path_size_, condition_, #condition_)
+
+static ArrowErrorCode ArrowArrayViewCompareStructure(
+    const struct ArrowArrayView* actual, const struct ArrowArrayView* expected,
+    struct ArrowComparisonInternalState* state) {
+  int64_t path_size = state->path.size_bytes;
+
+  RETURN_NOT_EQUAL(state, path_size, actual->storage_type != 
expected->storage_type);
+  RETURN_NOT_EQUAL(state, path_size, actual->n_children != 
expected->n_children);
+  RETURN_NOT_EQUAL(state, path_size,
+                   actual->dictionary == NULL && expected->dictionary != NULL);
+  RETURN_NOT_EQUAL(state, path_size,
+                   actual->dictionary != NULL && expected->dictionary == NULL);
+

Review Comment:
   Should we compare layouts or union type ids here?



##########
src/nanoarrow/common/array.c:
##########
@@ -1335,3 +1335,177 @@ ArrowErrorCode ArrowArrayViewValidate(struct 
ArrowArrayView* array_view,
   ArrowErrorSet(error, "validation_level not recognized");
   return EINVAL;
 }
+
+struct ArrowComparisonInternalState {
+  enum ArrowCompareLevel level;
+  int is_equal;
+  struct ArrowBuffer path;
+  struct ArrowError reason;
+};
+
+#define RETURN_NOT_EQUAL_IMPL(state_, path_size_, cond_, reason_) \
+  do {                                                            \
+    if (cond_) {                                                  \
+      ArrowErrorSet(&state_->reason, "%s", reason_);              \
+      state_->path.size_bytes = (path_size_);                     \
+      state_->is_equal = 0;                                       \
+      return NANOARROW_OK;                                        \
+    }                                                             \
+  } while (0)
+
+#define RETURN_NOT_EQUAL(state_, path_size_, condition_) \
+  RETURN_NOT_EQUAL_IMPL(state_, path_size_, condition_, #condition_)
+
+static ArrowErrorCode ArrowArrayViewCompareStructure(
+    const struct ArrowArrayView* actual, const struct ArrowArrayView* expected,
+    struct ArrowComparisonInternalState* state) {
+  int64_t path_size = state->path.size_bytes;
+
+  RETURN_NOT_EQUAL(state, path_size, actual->storage_type != 
expected->storage_type);

Review Comment:
   Nits:
   - could this be named `RETURN_UNLESS` instead? the condition isn't 
necessarily equality comparison
   - could the condition be the first argument? I think that'd be more readable
   - IIUC we could eliminate the separate path_size argument



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to