romainfrancois commented on a change in pull request #8650:
URL: https://github.com/apache/arrow/pull/8650#discussion_r538351779



##########
File path: r/src/r_to_arrow.cpp
##########
@@ -0,0 +1,814 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+#include "./arrow_vctrs.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/array/builder_base.h>
+#include <arrow/array/builder_binary.h>
+#include <arrow/array/builder_decimal.h>
+#include <arrow/array/builder_dict.h>
+#include <arrow/array/builder_nested.h>
+#include <arrow/array/builder_primitive.h>
+#include <arrow/type_traits.h>
+#include <arrow/util/checked_cast.h>
+#include <arrow/util/converter.h>
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+using internal::Converter;
+using internal::DictionaryConverter;
+using internal::ListConverter;
+using internal::PrimitiveConverter;
+using internal::StructConverter;
+
+using internal::MakeChunker;
+using internal::MakeConverter;
+
+namespace r {
+
+struct RConversionOptions {
+  RConversionOptions() = default;
+
+  std::shared_ptr<arrow::DataType> type;
+  bool strict;
+  int64_t size;
+};
+
+enum RVectorType {
+  BOOLEAN,
+  UINT8,
+  INT32,
+  FLOAT64,
+  INT64,
+  COMPLEX,
+  STRING,
+  DATAFRAME,
+  DATE,
+  TIME,
+  TIMESTAMP,
+  BINARY,
+  LIST,
+  FACTOR,
+
+  OTHER
+};
+
+RVectorType GetVectorType(SEXP x) {
+  switch (TYPEOF(x)) {
+    case LGLSXP:
+      return BOOLEAN;
+    case RAWSXP:
+      return UINT8;
+    case INTSXP:
+      if (Rf_inherits(x, "factor")) {
+        return FACTOR;
+      }
+      return INT32;
+    case STRSXP:
+      return STRING;
+    case CPLXSXP:
+      return COMPLEX;
+    case REALSXP: {
+      if (Rf_inherits(x, "Date")) {
+        return DATE;
+      } else if (Rf_inherits(x, "integer64")) {
+        return INT64;
+      } else if (Rf_inherits(x, "POSIXct")) {
+        return TIMESTAMP;
+      } else if (Rf_inherits(x, "difftime")) {
+        return TIME;
+      } else {
+        return FLOAT64;
+      }
+    }
+    case VECSXP: {
+      if (Rf_inherits(x, "data.frame")) {
+        return DATAFRAME;
+      }
+
+      if (Rf_inherits(x, "arrow_binary")) {
+        return BINARY;
+      }
+
+      return LIST;
+    }
+    default:
+      break;
+  }
+  return OTHER;
+}
+
+struct RScalar {
+  RVectorType rtype;
+  void* data;
+  bool null;
+};
+
+struct RBytesView {
+  const char* bytes;
+  R_xlen_t size;
+  bool is_utf8;
+
+  Status ParseString(RScalar* value) {
+    SEXP s = *reinterpret_cast<SEXP*>(value->data);
+    bytes = CHAR(s);
+    size = XLENGTH(s);
+
+    // TODO: test it
+    is_utf8 = true;
+
+    return Status::OK();
+  }
+
+  Status ParseRaw(RScalar* value) {
+    SEXP raw;
+
+    if (value->rtype == LIST || value->rtype == BINARY) {
+      raw = *reinterpret_cast<SEXP*>(value->data);
+      if (TYPEOF(raw) != RAWSXP) {
+        return Status::Invalid("can only handle RAW vectors");
+      }
+    } else {
+      return Status::NotImplemented("cannot parse binary with 
RBytesView::ParseRaw()");
+    }
+
+    bytes = reinterpret_cast<const char*>(RAW_RO(raw));
+    size = XLENGTH(raw);
+    is_utf8 = false;
+
+    return Status::OK();
+  }
+};
+
+template <typename Int>
+Result<float> IntegerScalarToFloat32Safe(int64_t value) {
+  constexpr int64_t kFloatMax = 1LL << 24;
+  constexpr int64_t kFloatMin = -(1LL << 24);
+
+  if (value < kFloatMin || value > kFloatMax) {
+    return Status::Invalid("Integer value ", value, " is outside of the range 
exactly",
+                           " representable by a IEEE 754 single precision 
value");
+  }
+  return static_cast<float>(value);
+}
+
+template <typename Int>
+Result<double> IntegerScalarToDoubleSafe(int64_t value) {
+  constexpr int64_t kDoubleMax = 1LL << 53;
+  constexpr int64_t kDoubleMin = -(1LL << 53);
+
+  if (value < kDoubleMin || value > kDoubleMax) {
+    return Status::Invalid("Integer value ", value, " is outside of the range 
exactly",
+                           " representable by a IEEE 754 double precision 
value");
+  }
+  return static_cast<double>(value);
+}
+
+class RValue {
+ public:
+  static bool IsNull(RScalar* obj) { return obj->null; }
+
+  static Result<bool> Convert(const BooleanType*, const RConversionOptions&,
+                              RScalar* value) {
+    if (value->rtype == BOOLEAN) {
+      return *reinterpret_cast<bool*>(value->data);
+    }
+
+    return Status::Invalid("invalid conversion to bool, expecting a logical 
vector");
+  }
+
+  static Result<uint16_t> Convert(const HalfFloatType*, const 
RConversionOptions&,
+                                  RScalar* value) {
+    return Status::NotImplemented("conversion to half float from R not 
implemented");
+  }
+
+  static Result<float> Convert(const FloatType*, const RConversionOptions&,
+                               RScalar* value) {
+    switch (value->rtype) {
+      case FLOAT64:
+        return static_cast<float>(*reinterpret_cast<double*>(value->data));
+      case INT32:
+        return 
IntegerScalarToFloat32Safe<int>(*reinterpret_cast<int*>(value->data));
+      case UINT8:
+        return IntegerScalarToFloat32Safe<uint8_t>(
+            *reinterpret_cast<unsigned char*>(value->data));
+      case INT64:
+        return IntegerScalarToFloat32Safe<int64_t>(
+            *reinterpret_cast<int64_t*>(value->data));
+      default:
+        break;
+    }
+    return Status::Invalid("invalid conversion to float");
+  }
+
+  static Result<double> Convert(const DoubleType*, const RConversionOptions&,
+                                RScalar* value) {
+    switch (value->rtype) {
+      case FLOAT64:
+        return static_cast<float>(*reinterpret_cast<double*>(value->data));
+      case INT32:
+        return 
IntegerScalarToDoubleSafe<int>(*reinterpret_cast<int*>(value->data));
+      case UINT8:
+        return IntegerScalarToDoubleSafe<uint8_t>(
+            *reinterpret_cast<unsigned char*>(value->data));
+      case INT64:
+        return IntegerScalarToDoubleSafe<int64_t>(
+            *reinterpret_cast<int64_t*>(value->data));
+      default:
+        break;
+    }
+
+    return Status::Invalid("invalid conversion to double");
+  }
+
+  static Result<uint8_t> Convert(const UInt8Type*, const RConversionOptions&,
+                                 RScalar* value) {
+    // TODO: handle conversion from other types
+    if (value->rtype == UINT8) {
+      return *reinterpret_cast<uint8_t*>(value->data);
+    }
+
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to uint8");
+  }
+
+  static Result<int8_t> Convert(const Int8Type*, const RConversionOptions&,
+                                RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to int8");
+  }
+
+  static Result<int16_t> Convert(const Int16Type*, const RConversionOptions&,
+                                 RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to int16");
+  }
+
+  static Result<uint16_t> Convert(const UInt16Type*, const RConversionOptions&,
+                                  RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to uint16");
+  }
+
+  static Result<int32_t> Convert(const Int32Type*, const RConversionOptions&,
+                                 RScalar* value) {
+    // TODO: handle conversion from other types
+    if (value->rtype == INT32) {
+      return *reinterpret_cast<int32_t*>(value->data);
+    }
+
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to int32");
+  }
+
+  static Result<uint32_t> Convert(const UInt32Type*, const RConversionOptions&,
+                                  RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to uint32");
+  }
+
+  static Result<int64_t> Convert(const Int64Type*, const RConversionOptions&,
+                                 RScalar* value) {
+    // TODO: handle conversion from other types
+    if (value->rtype == INT64) {
+      return *reinterpret_cast<int64_t*>(value->data);
+    }
+
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to int64");
+  }
+
+  static Result<uint64_t> Convert(const UInt64Type*, const RConversionOptions&,
+                                  RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to uint64");
+  }
+
+  static Result<int32_t> Convert(const Date32Type*, const RConversionOptions&,
+                                 RScalar* value) {
+    if (value->rtype == DATE) {
+      return static_cast<int32_t>(*reinterpret_cast<double*>(value->data));
+    }
+
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to date32");
+  }
+
+  static Result<int64_t> Convert(const Date64Type*, const RConversionOptions&,
+                                 RScalar* value) {
+    constexpr static int64_t kMillisecondsPerDay = 86400000;
+
+    if (value->rtype == DATE) {
+      return static_cast<int64_t>(*reinterpret_cast<double*>(value->data) *
+                                  kMillisecondsPerDay);
+    }
+
+    return Status::Invalid("invalid conversion to date64");
+  }
+
+  static Result<int32_t> Convert(const Time32Type*, const RConversionOptions&,
+                                 RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to time32");
+  }
+
+  static Result<int64_t> Convert(const Time64Type*, const RConversionOptions&,
+                                 RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to time64");
+  }
+
+  static Result<Decimal128> Convert(const Decimal128Type*, const 
RConversionOptions&,
+                                    RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to decimal128");
+  }
+
+  static Result<Decimal256> Convert(const Decimal256Type*, const 
RConversionOptions&,
+                                    RScalar* value) {
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to decimal256");
+  }
+
+  template <typename T>
+  static enable_if_string<T, Status> Convert(const T*, const 
RConversionOptions&,
+                                             RScalar* value, RBytesView& view) 
{
+    switch (value->rtype) {
+      case STRING:
+      case FACTOR:
+        return view.ParseString(value);
+      default:
+        break;
+    }
+
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to string");
+  }
+
+  static Status Convert(const BaseBinaryType*, const RConversionOptions&, 
RScalar* value,
+                        RBytesView& view) {
+    switch (value->rtype) {
+      case BINARY:
+      case LIST:
+        return view.ParseRaw(value);
+
+      case STRING:
+        return Status::NotImplemented("conversion string -> binary");
+
+      default:
+        break;
+    }
+
+    // TODO: improve error
+    return Status::Invalid("invalid conversion to binary");
+  }
+
+  static Status Convert(const FixedSizeBinaryType* type, const 
RConversionOptions&,
+                        RScalar* value, RBytesView& view) {
+    ARROW_RETURN_NOT_OK(view.ParseRaw(value));
+    if (view.size != type->byte_width()) {
+      return Status::Invalid("invalid size");
+    }
+    return Status::OK();
+  }
+};
+
+template <typename T>
+bool is_NA(T value);
+
+template <>
+bool is_NA<int>(int value) {
+  return value == NA_INTEGER;
+}
+
+template <>
+bool is_NA<double>(double value) {
+  return ISNA(value);
+}
+
+template <>
+bool is_NA<uint8_t>(uint8_t value) {
+  return false;
+}
+
+template <>
+bool is_NA<cpp11::r_bool>(cpp11::r_bool value) {
+  return value == NA_LOGICAL;
+}
+
+template <>
+bool is_NA<cpp11::r_string>(cpp11::r_string value) {
+  return value == NA_STRING;
+}
+
+template <>
+bool is_NA<SEXP>(SEXP value) {
+  return Rf_isNull(value);
+}
+
+template <>
+bool is_NA<int64_t>(int64_t value) {
+  return value == NA_INT64;
+}
+
+template <RVectorType rtype, typename T, class VisitorFunc>
+inline Status VisitRPrimitiveVector(SEXP x, R_xlen_t size, VisitorFunc&& func) 
{
+  RScalar obj{rtype, nullptr, false};
+  cpp11::r_vector<T> values(x);
+  for (T value : values) {
+    obj.data = reinterpret_cast<void*>(&value);
+    obj.null = is_NA<T>(value);
+    RETURN_NOT_OK(func(&obj));
+  }
+  return Status::OK();
+}
+
+template <class VisitorFunc>
+inline Status VisitInt64Vector(SEXP x, R_xlen_t size, VisitorFunc&& func) {
+  RScalar obj{INT64, nullptr, false};
+  cpp11::doubles values(x);
+  for (double value : values) {
+    obj.data = reinterpret_cast<void*>(&value);
+    obj.null = is_NA<int64_t>(*reinterpret_cast<int64_t*>(&value));
+    RETURN_NOT_OK(func(&obj));
+  }
+  return Status::OK();
+}
+
+template <class VisitorFunc>
+inline Status VisitFactor(SEXP x, R_xlen_t size, VisitorFunc&& func) {
+  cpp11::strings levels(Rf_getAttrib(x, R_LevelsSymbol));
+  SEXP* levels_ptr = const_cast<SEXP*>(STRING_PTR_RO(levels));
+
+  RScalar obj{FACTOR, nullptr, false};
+  cpp11::r_vector<int> values(x);
+
+  for (int value : values) {
+    if (is_NA<int>(value)) {
+      obj.null = true;
+    } else {
+      obj.null = false;
+      obj.data = reinterpret_cast<void*>(&levels_ptr[value - 1]);
+    }
+    RETURN_NOT_OK(func(&obj));
+  }
+  return Status::OK();
+}
+
+template <typename T>
+inline Status VisitDataFrame(SEXP x, R_xlen_t size, T* converter);
+
+template <typename T>
+inline Status VisitVector(SEXP x, R_xlen_t size, T* converter) {
+  if (converter->type()->id() == Type::STRUCT) {
+    return VisitDataFrame(x, size, converter);
+  }
+
+  RVectorType rtype = GetVectorType(x);
+  auto func = [&converter](RScalar* obj) { return converter->Append(obj); };
+  using VisitorFunc = decltype(func);
+
+  switch (rtype) {
+    case BOOLEAN:
+      return VisitRPrimitiveVector<BOOLEAN, cpp11::r_bool, VisitorFunc>(
+          x, size, std::forward<VisitorFunc>(func));
+    case UINT8:
+      return VisitRPrimitiveVector<UINT8, uint8_t, VisitorFunc>(
+          x, size, std::forward<VisitorFunc>(func));
+    case INT32:
+      return VisitRPrimitiveVector<INT32, int, VisitorFunc>(
+          x, size, std::forward<VisitorFunc>(func));
+    case FLOAT64:
+      return VisitRPrimitiveVector<FLOAT64, double, VisitorFunc>(
+          x, size, std::forward<VisitorFunc>(func));
+    case DATE:
+      return VisitRPrimitiveVector<DATE, double, VisitorFunc>(
+          x, size, std::forward<VisitorFunc>(func));
+
+    case STRING:
+      return VisitRPrimitiveVector<STRING, cpp11::r_string, VisitorFunc>(
+          x, size, std::forward<VisitorFunc>(func));
+
+    case INT64:
+      return VisitInt64Vector<VisitorFunc>(x, size, 
std::forward<VisitorFunc>(func));
+
+    case BINARY:
+      return VisitRPrimitiveVector<BINARY, SEXP, VisitorFunc>(
+          x, size, std::forward<VisitorFunc>(func));
+
+    case LIST:
+      return VisitRPrimitiveVector<LIST, SEXP, VisitorFunc>(
+          x, size, std::forward<VisitorFunc>(func));
+
+    case FACTOR:
+      return VisitFactor<VisitorFunc>(x, size, 
std::forward<VisitorFunc>(func));
+
+    default:
+      break;
+  }
+
+  return Status::Invalid("No visitor for R type ", rtype);
+}
+
+template <typename T>
+Status Extend(T* converter, SEXP x, R_xlen_t size) {
+  RETURN_NOT_OK(converter->Reserve(size));
+  return VisitVector(x, size, converter);
+}
+
+using RConverter = Converter<RScalar*, RConversionOptions>;

Review comment:
       I'll have a shot, thanks :-)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to