paleolimbot commented on a change in pull request #12030:
URL: https://github.com/apache/arrow/pull/12030#discussion_r777665555



##########
File path: r/src/io.cpp
##########
@@ -178,4 +180,134 @@ void io___BufferOutputStream__Write(
   StopIfNotOk(stream->Write(RAW(bytes), bytes.size()));
 }
 
+// TransformInputStream::TransformFunc wrapper
+
+class RIconvWrapper {  // Holds the conversion handle obtained from Riconv_open(to, from).
+ public:
+  RIconvWrapper(std::string to, std::string from)  // opens a handle for converting `from` -> `to`
+      : handle_(Riconv_open(to.c_str(), from.c_str())) {
+    if (handle_ == ((void*)-1)) {  // (void*)-1 is the value treated here as "open failed"
+      cpp11::stop("Can't convert encoding from '%s' to '%s'", from.c_str(), 
to.c_str());
+    }
+  }
+
+  size_t iconv(const char** inbuf, size_t* inbytesleft, char** outbuf,
+               size_t* outbytesleft) {  // passes all four arguments unchanged to Riconv() on handle_
+    return Riconv(handle_, inbuf, inbytesleft, outbuf, outbytesleft);  // result is returned as-is; callers must interpret it
+  }
+
+  ~RIconvWrapper() {  // closes the handle via Riconv_close()
+    if (handle_ != ((void*)-1)) {  // skip the close when handle_ holds the failure sentinel
+      Riconv_close(handle_);
+    }
+  }
+
+ protected:
+  void* handle_;  // NOTE(review): no copy operations are declared, so an implicit copy would share this handle and both destructors would call Riconv_close() on it - confirm copies are never made
+};
+
+struct ReencodeUTF8TransformFunctionWrapper {
+  explicit ReencodeUTF8TransformFunctionWrapper(std::string from)
+      : from_(from), iconv_("UTF-8", from), n_pending_(0) {}
+
+  // This may get copied and we need a fresh RIconvWrapper for each copy.
+  ReencodeUTF8TransformFunctionWrapper(const 
ReencodeUTF8TransformFunctionWrapper& ref)
+      : ReencodeUTF8TransformFunctionWrapper(ref.from_) {}
+
+  arrow::Result<std::shared_ptr<arrow::Buffer>> operator()(
+      const std::shared_ptr<arrow::Buffer>& src) {
+    ARROW_ASSIGN_OR_RAISE(auto dest, arrow::AllocateResizableBuffer(32));
+
+    size_t out_bytes_left = dest->size();
+    char* out_buf = (char*)dest->data();
+    size_t out_bytes_used = 0;
+
+    size_t in_bytes_left;
+    const char* in_buf;
+    int64_t n_src_bytes_in_pending = 0;
+
+    // There may be a few leftover bytes from the last call to iconv. Process 
these first
+    // using the internal buffer as the source. This may also result in a 
partial
+    // character left over but will always get us into the src buffer.
+    if (n_pending_ > 0) {
+      // fill the pending_ buffer with characters and call iconv() once
+      n_src_bytes_in_pending =
+          std::min<int64_t>(sizeof(pending_) - n_pending_, src->size());
+      memcpy(pending_ + n_pending_, src->data(), n_src_bytes_in_pending);
+      in_buf = pending_;
+      in_bytes_left = n_pending_ + n_src_bytes_in_pending;
+
+      iconv_.iconv(&in_buf, &in_bytes_left, &out_buf, &out_bytes_left);
+
+      int64_t chars_read_out = out_buf - ((char*)dest->data());
+      out_bytes_used += chars_read_out;
+
+      int64_t chars_read_in = n_pending_ + n_src_bytes_in_pending - 
in_bytes_left;
+      in_buf = (const char*)src->data() + chars_read_in - n_pending_;
+      in_bytes_left = src->size() + n_pending_ - chars_read_in;
+    } else {
+      in_buf = (const char*)src->data();
+      in_bytes_left = src->size();
+    }
+
+    // UTF-8 has a maximum of 4 bytes per character, so it's OK if we have a 
few bytes
+    // left after processing all of src. If we have more than this, it means 
the
+    // output buffer wasn't big enough.
+    while (in_bytes_left >= 4) {
+      int64_t new_size = std::max<int64_t>(src->size(), dest->size() * 2);
+      auto reserve_result = dest->Resize(new_size);
+      if (!reserve_result.ok()) {
+        return reserve_result;
+      }
+
+      out_buf = (char*)dest->data() + out_bytes_used;
+      out_bytes_left = dest->size() - out_bytes_used;
+
+      const char* in_buf_before = in_buf;
+      char* out_buf_before = out_buf;
+      iconv_.iconv(&in_buf, &in_bytes_left, &out_buf, &out_bytes_left);

Review comment:
       This `iconv()` call often "fails" for expected reasons (invalid input,
too few bytes left in the output buffer, or a partial character at the end of
the input buffer), so its return value alone isn't useful: we need to do
different things depending on which of these happened. I've added errors for
the invalid-input case and comments to clarify each case, and I've also added
a missing check for invalid input in the `pending_` buffer.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to