nealrichardson commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672252916



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && 
array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       I thought arrays were immutable? when/how likely is this to be true?

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with 
nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Should this also assert altrep-ness, maybe something like this?
   
   ```suggestion
       alt <- Array$create(x)$as_vector()
       expect_true(is_altrep(alt))
       result <- fn(alt)
       # Calling fn() didn't cause the vector to lose its altrep-ness
       expect_true(is_altrep(alt))
       expect_identical(result, fn(x, ...), ...))
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", 
R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", 
R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double 
inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, 
options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 
scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const 
Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       `INT32_MIN` is `NA_integer_` right? So it needs to be out of bounds too. 
Or does this constant come from R and already have that factored in?
   
   ```suggestion
         if (value <= INT32_MIN || value > INT32_MAX) {
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       I would have expected that, instead of altering the Arrow data, we would 
have `_ELT` and `_GET_REGION` methods that would insert the sentinels in the R 
vector when the data is accessed. What are the tradeoffs of that way vs. this 
way?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to