This is an automated email from the ASF dual-hosted git repository. kevingurney pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new 71329ce33a GH-37042: [MATLAB] Implement Feather V1 Writer using new MATLAB Interface APIs (#37043) 71329ce33a is described below commit 71329ce33a18a53e322514d0e463677ebad648c9 Author: sgilmore10 <74676073+sgilmor...@users.noreply.github.com> AuthorDate: Mon Aug 7 15:22:03 2023 -0400 GH-37042: [MATLAB] Implement Feather V1 Writer using new MATLAB Interface APIs (#37043) ### Rationale for this change Now that we have the basic building blocks for tabular IO in the MATLAB Interface (`Array`, `Schema`, `RecordBatch`), we can implement a Feather V1 writer in terms of the new APIs. This is the first in a series of pull requests in which we will work on replacing the legacy feather V1 infrastructure with a new implementation that uses the MATLAB Interface APIs. A side effect of doing this work is that we can eventually delete a lot of legacy build infrastructure and code. ### What changes are included in this PR? 1. Added a new class called `arrow.internal.io.feather.Writer` which can be used to write feather V1 files. It has one public property named `Filename` and one public method `write`. Below is an example of its usage: ```matlab >> T = table([1; 2; 3], single([10; 11; 12])); T = 3×2 table Var1 Var2 ____ ____ 1 10 2 11 3 12 >> filename = "/tmp/table.feather"; >> writer = arrow.internal.io.feather.Writer(filename) writer = Writer with properties: Filename: "/tmp/table.feather" >> writer.write(T); ``` 2. Added an `unwrap` method to `proxy::RecordBatch` so that the `FeatherWriter::write` method can access the underlying `RecordBatch` from the proxy. 3. Changed the `SetAccess` and `GetAccess` of the `Proxy` property on `arrow.tabular.RecordBatch` to `private` and `public`, respectively. ### Are these changes tested? Yes, added a new test file called `tRoundTrip.m` in the `matlab/test/arrow/io/feather` folder. ### Are there any user-facing changes? No. ### Future Directions 1. 
Add a new class for reading feather V1 files (See #37041). 2. Integrate this class in the public `featherwrite` function. 3. Once this class is integrated with `featherwrite`, we can delete the legacy build infrastructure and source code. * Closes: #37042 Authored-by: Sarah Gilmore <sgilm...@mathworks.com> Signed-off-by: Kevin Gurney <kgur...@mathworks.com> --- matlab/src/cpp/arrow/matlab/error/error.h | 4 + .../matlab/io/feather/proxy/feather_writer.cc | 90 ++++++++++++++++++++++ .../feather/proxy/feather_writer.h} | 24 +++--- matlab/src/cpp/arrow/matlab/proxy/factory.cc | 2 + .../cpp/arrow/matlab/tabular/proxy/record_batch.cc | 4 + .../cpp/arrow/matlab/tabular/proxy/record_batch.h | 2 + .../matlab/+arrow/+internal/+io/+feather/Writer.m | 48 ++++++++++++ matlab/src/matlab/+arrow/+tabular/RecordBatch.m | 2 +- matlab/test/arrow/io/feather/tRoundTrip.m | 52 +++++++++++++ matlab/tools/cmake/BuildMatlabArrowInterface.cmake | 4 +- 10 files changed, 217 insertions(+), 15 deletions(-) diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index b7c0d7d696..e1d2982f28 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -180,4 +180,8 @@ namespace arrow::matlab::error { static const char* UNKNOWN_PROXY_FOR_ARRAY_TYPE = "arrow:array:UnknownProxyForArrayType"; static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH = "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch"; static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:recordbatch:InvalidNumericColumnIndex"; + static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite"; + static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable"; + static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch"; + } diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc 
b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc new file mode 100644 index 0000000000..a27e1fb0e6 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/matlab/io/feather/proxy/feather_writer.h" +#include "arrow/matlab/tabular/proxy/record_batch.h" +#include "arrow/matlab/error/error.h" + +#include "arrow/result.h" +#include "arrow/table.h" +#include "arrow/util/utf8.h" + +#include "arrow/io/file.h" +#include "arrow/ipc/feather.h" + +#include "libmexclass/proxy/ProxyManager.h" + +namespace arrow::matlab::io::feather::proxy { + + FeatherWriter::FeatherWriter(const std::string& filename) : filename{filename} { + REGISTER_METHOD(FeatherWriter, getFilename); + REGISTER_METHOD(FeatherWriter, write); + } + + libmexclass::proxy::MakeResult FeatherWriter::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + mda::StructArray opts = constructor_arguments[0]; + const mda::StringArray filename_mda = opts[0]["Filename"]; + + const auto filename_utf16 = std::u16string(filename_mda[0]); + MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8, + arrow::util::UTF16StringToUTF8(filename_utf16), + error::UNICODE_CONVERSION_ERROR_ID); + + return std::make_shared<FeatherWriter>(filename_utf8); + } + + void FeatherWriter::getFilename(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename, + arrow::util::UTF8StringToUTF16(filename), + context, + error::UNICODE_CONVERSION_ERROR_ID); + mda::ArrayFactory factory; + auto str_mda = factory.createScalar(utf16_filename); + context.outputs[0] = str_mda; + } + + void FeatherWriter::write(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::StructArray opts = context.inputs[0]; + const mda::TypedArray<uint64_t> record_batch_proxy_id_mda = opts[0]["RecordBatchProxyID"]; + const uint64_t record_batch_proxy_id = record_batch_proxy_id_mda[0]; + + auto proxy = libmexclass::proxy::ProxyManager::getProxy(record_batch_proxy_id); + auto record_batch_proxy = 
std::static_pointer_cast<arrow::matlab::tabular::proxy::RecordBatch>(proxy); + auto record_batch = record_batch_proxy->unwrap(); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto table, + arrow::Table::FromRecordBatches({record_batch}), + context, + error::TABLE_FROM_RECORD_BATCH); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(std::shared_ptr<arrow::io::OutputStream> output_stream, + arrow::io::FileOutputStream::Open(filename), + context, + error::FAILED_TO_OPEN_FILE_FOR_WRITE); + + // Specify the feather file format version as V1 + arrow::ipc::feather::WriteProperties write_props; + write_props.version = arrow::ipc::feather::kFeatherV1Version; + + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(ipc::feather::WriteTable(*table, output_stream.get(), write_props), + context, + error::FEATHER_FAILED_TO_WRITE_TABLE); + } +} diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h similarity index 59% copy from matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h copy to matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h index b5d741060a..dadb479887 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.h @@ -17,27 +17,25 @@ #pragma once -#include "arrow/record_batch.h" +#include "arrow/status.h" #include "libmexclass/proxy/Proxy.h" -namespace arrow::matlab::tabular::proxy { +namespace arrow::matlab::io::feather::proxy { - class RecordBatch : public libmexclass::proxy::Proxy { + class FeatherWriter : public libmexclass::proxy::Proxy { public: - RecordBatch(std::shared_ptr<arrow::RecordBatch> record_batch); + FeatherWriter(const std::string& filename); - virtual ~RecordBatch() {} + ~FeatherWriter() {} - static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); - + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& 
constructor_arguments); + protected: - void toString(libmexclass::proxy::method::Context& context); - void numColumns(libmexclass::proxy::method::Context& context); - void columnNames(libmexclass::proxy::method::Context& context); - void getColumnByIndex(libmexclass::proxy::method::Context& context); + void getFilename(libmexclass::proxy::method::Context& context); + void write(libmexclass::proxy::method::Context& context); - std::shared_ptr<arrow::RecordBatch> record_batch; + private: + const std::string filename; }; - } diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index 7d18c6c6b6..7a2a4f3192 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -25,6 +25,7 @@ #include "arrow/matlab/type/proxy/string_type.h" #include "arrow/matlab/type/proxy/timestamp_type.h" #include "arrow/matlab/type/proxy/field.h" +#include "arrow/matlab/io/feather/proxy/feather_writer.h" #include "factory.h" @@ -60,6 +61,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.type.proxy.BooleanType , arrow::matlab::type::proxy::PrimitiveCType<bool>); REGISTER_PROXY(arrow.type.proxy.StringType , arrow::matlab::type::proxy::StringType); REGISTER_PROXY(arrow.type.proxy.TimestampType , arrow::matlab::type::proxy::TimestampType); + REGISTER_PROXY(arrow.io.feather.proxy.FeatherWriter , arrow::matlab::io::feather::proxy::FeatherWriter); return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name}; }; diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc index ed30472f6c..e159e926ec 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc @@ -56,6 +56,10 @@ namespace arrow::matlab::tabular::proxy { REGISTER_METHOD(RecordBatch, 
getColumnByIndex); } + std::shared_ptr<arrow::RecordBatch> RecordBatch::unwrap() { + return record_batch; + } + void RecordBatch::toString(libmexclass::proxy::method::Context& context) { namespace mda = ::matlab::data; MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_string, arrow::util::UTF8StringToUTF16(record_batch->ToString()), context, error::UNICODE_CONVERSION_ERROR_ID); diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h index b5d741060a..b8c038816b 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h @@ -29,6 +29,8 @@ namespace arrow::matlab::tabular::proxy { virtual ~RecordBatch() {} + std::shared_ptr<arrow::RecordBatch> unwrap(); + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); protected: diff --git a/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m new file mode 100644 index 0000000000..470c41fd5b --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m @@ -0,0 +1,48 @@ +%WRITER Class for writing feather V1 files. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. 
See the License for the specific language governing +% permissions and limitations under the License. +classdef Writer < matlab.mixin.Scalar + + properties(Hidden, SetAccess=private, GetAccess=public) + Proxy + end + + properties(Dependent) + Filename + end + + methods + function obj = Writer(filename) + arguments + filename(1, 1) {mustBeNonmissing, mustBeNonzeroLengthText} + end + + args = struct(Filename=filename); + proxyName = "arrow.io.feather.proxy.FeatherWriter"; + obj.Proxy = arrow.internal.proxy.create(proxyName, args); + end + + function write(obj, T) + rb = arrow.recordbatch(T); + args = struct(RecordBatchProxyID=rb.Proxy.ID); + obj.Proxy.write(args); + end + + function filename = get.Filename(obj) + filename = obj.Proxy.getFilename(); + end + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m index 0d002797f0..be5eee7d89 100644 --- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m +++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m @@ -23,7 +23,7 @@ classdef RecordBatch < matlab.mixin.CustomDisplay & ... ColumnNames end - properties (Access=protected) + properties (Hidden, SetAccess=private, GetAccess=public) Proxy end diff --git a/matlab/test/arrow/io/feather/tRoundTrip.m b/matlab/test/arrow/io/feather/tRoundTrip.m new file mode 100644 index 0000000000..d56152be6d --- /dev/null +++ b/matlab/test/arrow/io/feather/tRoundTrip.m @@ -0,0 +1,52 @@ +%TROUNDTRIP Round trip tests for feather. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. 
You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef tRoundTrip < matlab.unittest.TestCase + + methods(TestClassSetup) + % Delete once arrow.internal.io.feather.Reader is submitted. + function addFeatherFunctionsToMATLABPath(testCase) + import matlab.unittest.fixtures.PathFixture + % Add Feather test utilities to the MATLAB path. + testCase.applyFixture(PathFixture('../../../util')); + % arrow.cpp.call must be on the MATLAB path. + testCase.assertTrue(~isempty(which('arrow.cpp.call')), ... + '''arrow.cpp.call'' must be on the MATLAB path. Use ''addpath'' to add folders to the MATLAB path.'); + end + end + + methods(Test) + function Basic(testCase) + import matlab.unittest.fixtures.TemporaryFolderFixture + + fixture = testCase.applyFixture(TemporaryFolderFixture); + filename = fullfile(fixture.Folder, "temp.feather"); + + DoubleVar = [10; 20; 30; 40]; + SingleVar = single([10; 15; 20; 25]); + tWrite = table(DoubleVar, SingleVar); + + featherwrite(tWrite, filename); + tRead = featherread(filename); + testCase.verifyEqual(tWrite, tRead); + end + end +end + +function featherwrite(T, filename) + writer = arrow.internal.io.feather.Writer(filename); + writer.write(T); +end \ No newline at end of file diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index f4696cfad2..1d57999417 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -55,7 +55,9 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a 
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/string_type.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc") + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc") + set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy")