emkornfield commented on a change in pull request #10201:
URL: https://github.com/apache/arrow/pull/10201#discussion_r635791186
##########
File path: cpp/src/jni/dataset/jni_util.cc
##########
@@ -211,32 +250,183 @@ std::vector<std::string> ToStringVector(JNIEnv* env,
jobjectArray& str_array) {
return vector;
}
-arrow::Result<jbyteArray> ToSchemaByteArray(JNIEnv* env,
- std::shared_ptr<arrow::Schema>
schema) {
- ARROW_ASSIGN_OR_RAISE(
- std::shared_ptr<arrow::Buffer> buffer,
- arrow::ipc::SerializeSchema(*schema, arrow::default_memory_pool()))
+Result<jbyteArray> ToSchemaByteArray(JNIEnv* env, std::shared_ptr<Schema>
schema) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buffer,
+ ipc::SerializeSchema(*schema, default_memory_pool()))
jbyteArray out = env->NewByteArray(buffer->size());
auto src = reinterpret_cast<const jbyte*>(buffer->data());
env->SetByteArrayRegion(out, 0, buffer->size(), src);
return out;
}
-arrow::Result<std::shared_ptr<arrow::Schema>> FromSchemaByteArray(
- JNIEnv* env, jbyteArray schemaBytes) {
- arrow::ipc::DictionaryMemo in_memo;
+Result<std::shared_ptr<Schema>> FromSchemaByteArray(JNIEnv* env, jbyteArray
schemaBytes) {
+ ipc::DictionaryMemo in_memo;
int schemaBytes_len = env->GetArrayLength(schemaBytes);
jbyte* schemaBytes_data = env->GetByteArrayElements(schemaBytes, nullptr);
- auto serialized_schema = std::make_shared<arrow::Buffer>(
+ auto serialized_schema = std::make_shared<Buffer>(
reinterpret_cast<uint8_t*>(schemaBytes_data), schemaBytes_len);
- arrow::io::BufferReader buf_reader(serialized_schema);
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Schema> schema,
- arrow::ipc::ReadSchema(&buf_reader, &in_memo))
+ io::BufferReader buf_reader(serialized_schema);
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Schema> schema,
+ ipc::ReadSchema(&buf_reader, &in_memo))
env->ReleaseByteArrayElements(schemaBytes, schemaBytes_data, JNI_ABORT);
return schema;
}
+Status SetMetadataForSingleField(std::shared_ptr<ArrayData> array_data,
+ std::vector<ipc::internal::FieldMetadata>&
nodes_meta,
+ std::vector<ipc::internal::BufferMetadata>&
buffers_meta,
+ std::shared_ptr<KeyValueMetadata>&
custom_metadata) {
+ nodes_meta.push_back({array_data->length, array_data->null_count, 0L});
+
+ for (size_t i = 0; i < array_data->buffers.size(); i++) {
+ auto buffer = array_data->buffers.at(i);
+ uint8_t* data = nullptr;
+ int64_t size = 0;
+ if (buffer != nullptr) {
+ data = (uint8_t*)buffer->data();
+ size = buffer->size();
+ }
+ ipc::internal::BufferMetadata buffer_metadata{};
+ buffer_metadata.offset = reinterpret_cast<int64_t>(data);
+ buffer_metadata.length = size;
+ // store buffer refs into custom metadata
+ jlong ref = CreateNativeRef(buffer);
+ custom_metadata->Append(
+ "NATIVE_BUFFER_REF_" + std::to_string(i),
+ util::base64_encode(reinterpret_cast<unsigned char*>(&ref),
sizeof(ref)));
Review comment:
What is the performance overhead of this? Could another approach be to
use two record batches: one that contains the data, and one that contains all
the reference pointers?
Or, at the very least, use only one metadata entry and encode all the buffer
references as some sort of flatbuffer, protobuf, or JSON list?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]