romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672304332



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && 
array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       specifically here when mutating does not logically change anything. it's 
my understanding that what's there is undefined and that it is inconsequential 
to change it. 
   
   it may be false however when the buffer comes from a source that prevents 
mutation, I don't have a specific example but I guess reading from disk or 
something. 

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with 
nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Thanks. I'll update to add more things to `expect_altrep_rountrip()`

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", 
R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", 
R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double 
inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, 
options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, 
options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 
scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const 
Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       Thanks. R indeed converts to numeric in that case: 
   
   ``` r
   str(sum(as.integer(c(-2^31 + 1L))))
   #>  int -2147483647
   str(sum(as.integer(c(-2^31 + 1L, -1L))))
   #>  num -2.15e+09
   ```
   
   <sup>Created on 2021-07-19 by the [reprex 
package](https://reprex.tidyverse.org) (v2.0.0)</sup>

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Oh interesting. I guess the advantage of this approach is that, for the 
price of this initial setting of the sentinels, we can then have a `DATAPTR()` 
so access all the data of the array randomly. 
   
   OTOH, we could leave the data untouched and have an `_Elt` that would check 
the null bitmap, and an `GetRegion()` that would also locally use the null 
bitmap. This `GetRegion()` would indeed copy the region data though, so 
potentially be more expensive, even when the region has no nulls. 

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Chances are we might need to implement something like this for arrays 
that are backed by an immutable buffer, so maybe I can do that, and then we can 
compare/measure and evaluate how risky this is. 
   
   Then we have 3 cases: 
    - Array with no nulls: best altrep with DATAPTR(), no need for setting 
sentinels (what we had prior to this pull request). 
    - Array with some nulls, immutable: using GetRegion, I guess with a 
materialized R vector in the data2 altrep slot, so that DATAPTR() materializes 
and returns this. 
    
    - Array with some nulls, "mutable": hacky proposal of this pr. 
   
   Perhaps we only need the first 2. 
   
   
   
   
   
   

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Also, I believe the second dot will have some code shared with altrep 
for ChunkedArray. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to