This is an automated email from the ASF dual-hosted git repository.
kevingurney pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 818f71d085 GH-38418: [MATLAB] Add method for extracting one row of an
`arrow.tabular.Table` as a string (#38463)
818f71d085 is described below
commit 818f71d085b6f820903afc6b1f1e577d8e45ff47
Author: sgilmore10 <[email protected]>
AuthorDate: Thu Oct 26 15:57:48 2023 -0400
GH-38418: [MATLAB] Add method for extracting one row of an
`arrow.tabular.Table` as a string (#38463)
### Rationale for this change
We would like to modify the display of the `arrow.tabular.Table` and
`arrow.tabular.RecordBatch` classes to be more "MATLAB-like". In order to do
this, we need to add a method to their respective C++ Proxy classes that
returns a single row of the Table/RecordBatch as a MATLAB `string` array.
### What changes are included in this PR?
Added new function template:
```cpp
template <typename TabularType>
arrow::Result<std::string> arrow::matlab::tabular::get_row_as_string(
    const std::shared_ptr<TabularType>& tabular_object,
    const int64_t matlab_row_index)
```
This function template returns a string representation of the specified
(1-based) row in `tabular_object`.
Added a new proxy method called `getRowAsString` to both the `Table` and
`RecordBatch` C++ proxy classes. These methods invoke `get_row_as_string` to
return a string representation of one row in the `Table`/`RecordBatch`. Neither
MATLAB class `arrow.tabular.Table` nor `arrow.tabular.RecordBatch` exposes these
methods directly because they will only be used internally for display.
Below is an example output of `getRowAsString()`:
```matlab
>> matlabTable = table([1; 2; 3], ["ABC"; "DE"; "FGH"], datetime(2023, 10, 25) + days(0:2)');
>> arrowTable = arrow.table(matlabTable);
>> rowOneAsString = arrowTable.Proxy.getRowAsString(struct(Index=int64(1)))
rowOneAsString =
"1 | "ABC" | 2023-10-25 00:00:00.000000"
```
### Are these changes tested?
Yes, added a new test class called `tTabularInternal.m`. Because
`getRowAsString()` is not a method on the MATLAB classes `arrow.tabular.Table`
and `arrow.tabular.RecordBatch`, this test class calls `getRowAsString()` on
their `Proxy` properties, which are public but hidden.
### Are there any user-facing changes?
No.
* Closes: #38418
Lead-authored-by: Sarah Gilmore <[email protected]>
Co-authored-by: sgilmore10 <[email protected]>
Co-authored-by: Kevin Gurney <[email protected]>
Signed-off-by: Kevin Gurney <[email protected]>
---
matlab/src/cpp/arrow/matlab/error/error.h | 1 +
.../cpp/arrow/matlab/tabular/get_row_as_string.h | 77 +++++++++++++++
.../cpp/arrow/matlab/tabular/proxy/record_batch.cc | 18 ++++
.../cpp/arrow/matlab/tabular/proxy/record_batch.h | 1 +
matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc | 19 ++++
matlab/src/cpp/arrow/matlab/tabular/proxy/table.h | 1 +
.../+test/+tabular/createAllSupportedArrayTypes.m | 5 +-
matlab/test/arrow/tabular/tTabularInternal.m | 110 +++++++++++++++++++++
8 files changed, 230 insertions(+), 2 deletions(-)
diff --git a/matlab/src/cpp/arrow/matlab/error/error.h
b/matlab/src/cpp/arrow/matlab/error/error.h
index 2d8f5c432c..33e80bca8c 100644
--- a/matlab/src/cpp/arrow/matlab/error/error.h
+++ b/matlab/src/cpp/arrow/matlab/error/error.h
@@ -202,4 +202,5 @@ namespace arrow::matlab::error {
static const char* INDEX_OUT_OF_RANGE = "arrow:index:OutOfRange";
static const char* BUFFER_VIEW_OR_COPY_FAILED =
"arrow:buffer:ViewOrCopyFailed";
static const char* ARRAY_PRETTY_PRINT_FAILED =
"arrow:array:PrettyPrintFailed";
+ static const char* TABULAR_GET_ROW_AS_STRING_FAILED =
"arrow:tabular:GetRowAsStringFailed";
}
diff --git a/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h
b/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h
new file mode 100644
index 0000000000..824b6c19a7
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/pretty_print.h"
+
+#include <sstream>
+
+namespace arrow::matlab::tabular {
+
+    // Builds the PrettyPrintOptions used to print a value on one line: newlines
+    // are skipped and the array / chunked-array delimiters are empty strings.
+    inline arrow::PrettyPrintOptions make_pretty_print_options() {
+        auto opts = arrow::PrettyPrintOptions::Defaults();
+        opts.skip_new_lines = true;
+        opts.array_delimiters.open = "";
+        opts.array_delimiters.close = "";
+        opts.chunked_array_delimiters.open = "";
+        opts.chunked_array_delimiters.close = "";
+        return opts;
+    }
+
+    // Returns row `matlab_row_index` (1-based) of `tabular_object` as one string,
+    // with the column values separated by " | ". Fails with Status::Invalid for an
+    // out-of-range index and Status::NotImplemented for unsupported column types.
+    template <typename TabularType>
+    arrow::Result<std::string> get_row_as_string(const std::shared_ptr<TabularType>& tabular_object, const int64_t matlab_row_index) {
+        std::stringstream ss;
+        const int64_t row_index = matlab_row_index - 1;  // Convert to 0-based.
+        if (row_index >= tabular_object->num_rows() || row_index < 0) {
+            ss << "Invalid Row Index: " << matlab_row_index;
+            return arrow::Status::Invalid(ss.str());
+        }
+
+        const auto opts = make_pretty_print_options();
+        const auto num_columns = tabular_object->num_columns();
+        const auto& columns = tabular_object->columns();
+
+        for (int32_t i = 0; i < num_columns; ++i) {
+            const auto& column = columns[i];
+            const auto type_id = column->type()->id();
+            if (arrow::is_primitive(type_id) || arrow::is_string(type_id)) {
+                auto slice = column->Slice(row_index, 1);
+                ARROW_RETURN_NOT_OK(arrow::PrettyPrint(*slice, opts, &ss));
+            } else if (type_id == arrow::Type::type::STRUCT) {
+                // Placeholder: no good way to display StructArray elements horizontally.
+                ss << "<Struct>";
+            } else if (type_id == arrow::Type::type::LIST) {
+                // Placeholder: no good way to display ListArray elements horizontally.
+                ss << "<List>";
+            } else {
+                return arrow::Status::NotImplemented("Datatype " + column->type()->ToString() + " is not currently supported for display.");
+            }
+
+            if (i + 1 < num_columns) {
+                // Only add the delimiter if at least one more column follows.
+                ss << " | ";
+            }
+        }
+        return ss.str();
+    }
+}
\ No newline at end of file
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
index 679c7382f6..7d24ad01d7 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
+++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc
@@ -23,6 +23,7 @@
#include "arrow/matlab/error/error.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/tabular/proxy/schema.h"
+#include "arrow/matlab/tabular/get_row_as_string.h"
#include "arrow/type.h"
#include "arrow/util/utf8.h"
@@ -58,6 +59,7 @@ namespace arrow::matlab::tabular::proxy {
REGISTER_METHOD(RecordBatch, getColumnByIndex);
REGISTER_METHOD(RecordBatch, getColumnByName);
REGISTER_METHOD(RecordBatch, getSchema);
+ REGISTER_METHOD(RecordBatch, getRowAsString);
}
std::shared_ptr<arrow::RecordBatch> RecordBatch::unwrap() {
@@ -218,4 +220,20 @@ namespace arrow::matlab::tabular::proxy {
context.outputs[0] = schema_proxy_id_mda;
}
+ // Proxy method: formats one row of the wrapped RecordBatch as a string and
+ // stores it, as a UTF-16 scalar, in context.outputs[0].
+ void RecordBatch::getRowAsString(libmexclass::proxy::method::Context&
context) {
+ namespace mda = ::matlab::data;
+ using namespace libmexclass::proxy;
+ mda::ArrayFactory factory;
+
+ // The row index is read from the "Index" field of the first input struct
+ // and forwarded unchanged to get_row_as_string.
+ mda::StructArray args = context.inputs[0];
+ const mda::TypedArray<int64_t> index_mda = args[0]["Index"];
+ const auto matlab_row_index = int64_t(index_mda[0]);
+
+ // Build the row string as UTF-8, then convert it to UTF-16 for the output.
+ // NOTE(review): the macros presumably report the given error ID through
+ // `context` and return early on failure - confirm against their definition.
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf8,
arrow::matlab::tabular::get_row_as_string(record_batch, matlab_row_index),
+ context,
error::TABULAR_GET_ROW_AS_STRING_FAILED);
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf16,
arrow::util::UTF8StringToUTF16(row_str_utf8),
+ context,
error::UNICODE_CONVERSION_ERROR_ID);
+ context.outputs[0] = factory.createScalar(row_str_utf16);
+ }
+
}
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
index b136ad1ea5..c417d8198f 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
+++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h
@@ -41,6 +41,7 @@ namespace arrow::matlab::tabular::proxy {
void getColumnByIndex(libmexclass::proxy::method::Context&
context);
void getColumnByName(libmexclass::proxy::method::Context& context);
void getSchema(libmexclass::proxy::method::Context& context);
+ void getRowAsString(libmexclass::proxy::method::Context& context);
std::shared_ptr<arrow::RecordBatch> record_batch;
};
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
index 228e28dad9..cf628407b1 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
+++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc
@@ -24,6 +24,8 @@
#include "arrow/matlab/error/error.h"
#include "arrow/matlab/tabular/proxy/table.h"
#include "arrow/matlab/tabular/proxy/schema.h"
+#include "arrow/matlab/tabular/get_row_as_string.h"
+
#include "arrow/type.h"
#include "arrow/util/utf8.h"
@@ -57,6 +59,7 @@ namespace arrow::matlab::tabular::proxy {
REGISTER_METHOD(Table, getSchema);
REGISTER_METHOD(Table, getColumnByIndex);
REGISTER_METHOD(Table, getColumnByName);
+ REGISTER_METHOD(Table, getRowAsString);
}
std::shared_ptr<arrow::Table> Table::unwrap() {
@@ -212,4 +215,20 @@ namespace arrow::matlab::tabular::proxy {
context.outputs[0] = chunked_array_proxy_id_mda;
}
+ // Proxy method: formats one row of the wrapped Table as a string and stores
+ // it, as a UTF-16 scalar, in context.outputs[0].
+ void Table::getRowAsString(libmexclass::proxy::method::Context& context) {
+ namespace mda = ::matlab::data;
+ using namespace libmexclass::proxy;
+ mda::ArrayFactory factory;
+
+ // The row index is read from the "Index" field of the first input struct
+ // and forwarded unchanged to get_row_as_string.
+ mda::StructArray args = context.inputs[0];
+ const mda::TypedArray<int64_t> index_mda = args[0]["Index"];
+ const auto matlab_row_index = int64_t(index_mda[0]);
+
+ // Build the row string as UTF-8, then convert it to UTF-16 for the output.
+ // NOTE(review): the macros presumably report the given error ID through
+ // `context` and return early on failure - confirm against their definition.
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf8,
arrow::matlab::tabular::get_row_as_string(table, matlab_row_index),
+ context,
error::TABULAR_GET_ROW_AS_STRING_FAILED);
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf16,
arrow::util::UTF8StringToUTF16(row_str_utf8),
+ context,
error::UNICODE_CONVERSION_ERROR_ID);
+ context.outputs[0] = factory.createScalar(row_str_utf16);
+ }
+
}
diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h
b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h
index dae86a180b..bfcea15bbd 100644
--- a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h
+++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h
@@ -41,6 +41,7 @@ namespace arrow::matlab::tabular::proxy {
void getSchema(libmexclass::proxy::method::Context& context);
void getColumnByIndex(libmexclass::proxy::method::Context&
context);
void getColumnByName(libmexclass::proxy::method::Context& context);
+ void getRowAsString(libmexclass::proxy::method::Context& context);
std::shared_ptr<arrow::Table> table;
};
diff --git
a/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m
b/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m
index ad2f026d64..a9682d3173 100644
---
a/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m
+++
b/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m
@@ -24,8 +24,8 @@ function [arrowArrays, matlabData] =
createAllSupportedArrayTypes(opts)
end
% Seed the random number generator to ensure
- % reproducible results in tests.
- rng(1);
+ % reproducible results in tests across MATLAB sessions.
+ rng(1, "twister");
import arrow.type.ID
import arrow.array.*
@@ -101,6 +101,7 @@ function classes = getArrayClassNames()
% Return the class names as a string array
classes = string({metaClass.Name});
+ classes = sort(classes);
end
function dict = getNumericArrayToMatlabDictionary()
diff --git a/matlab/test/arrow/tabular/tTabularInternal.m
b/matlab/test/arrow/tabular/tTabularInternal.m
new file mode 100644
index 0000000000..28075d7763
--- /dev/null
+++ b/matlab/test/arrow/tabular/tTabularInternal.m
@@ -0,0 +1,110 @@
+%TTABULARINTERNAL Unit tests for internal functionality of tabular types.
+
+% Licensed to the Apache Software Foundation (ASF) under one or more
+% contributor license agreements. See the NOTICE file distributed with
+% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version
+% 2.0 (the "License"); you may not use this file except in compliance
+% with the License. You may obtain a copy of the License at
+%
+% http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+% implied. See the License for the specific language governing
+% permissions and limitations under the License.
+
+classdef tTabularInternal < matlab.unittest.TestCase
+
+    properties(TestParameter)
+        TabularObjectWithAllTypes
+
+        TabularObjectWithOneColumn
+
+        TabularObjectWithThreeRows
+    end
+
+    methods (TestParameterDefinition, Static)
+        function TabularObjectWithAllTypes = initializeTabularObjectWithAllTypes()
+            % One-row Table and RecordBatch built from the same arrays, one
+            % column per supported array type.
+            arrays = arrow.internal.test.tabular.createAllSupportedArrayTypes(NumRows=1);
+            arrowTable = arrow.tabular.Table.fromArrays(arrays{:});
+            arrowRecordBatch = arrow.tabular.RecordBatch.fromArrays(arrays{:});
+            TabularObjectWithAllTypes = struct(Table=arrowTable, ...
+                RecordBatch=arrowRecordBatch);
+        end
+
+        function TabularObjectWithOneColumn = initializeTabularObjectWithOneColumn()
+            % Table and RecordBatch with a single numeric column.
+            t = table((1:3)');
+            arrowTable = arrow.table(t);
+            arrowRecordBatch = arrow.recordBatch(t);
+            TabularObjectWithOneColumn = struct(Table=arrowTable, ...
+                RecordBatch=arrowRecordBatch);
+        end
+
+        function TabularObjectWithThreeRows = initializeTabularObjectWithThreeRows()
+            % Table and RecordBatch with three rows: one numeric column and
+            % one string column.
+            t = table((1:3)', ["A"; "B"; "C"]);
+            arrowTable = arrow.table(t);
+            arrowRecordBatch = arrow.recordBatch(t);
+            TabularObjectWithThreeRows = struct(Table=arrowTable, ...
+                RecordBatch=arrowRecordBatch);
+        end
+    end
+
+    methods (Test)
+        function RowWithAllTypes(testCase, TabularObjectWithAllTypes)
+            % Verify getRowAsString successfully returns the expected string
+            % when called on a Table/RecordBatch that contains all
+            % supported array types.
+            proxy = TabularObjectWithAllTypes.Proxy;
+            columnStrs = ["false", "2024-02-23", "2023-08-24", "78", "38", ...
+                "24", "48", "89", "102", "<List>", """107""", "<Struct>", ...
+                "00:03:44", "00:00:07.000000", "2024-02-10 00:00:00.000000", ...
+                "107", "143", "36", "51"];
+            expectedString = strjoin(columnStrs, " | ");
+            actualString = proxy.getRowAsString(struct(Index=int64(1)));
+            testCase.verifyEqual(actualString, expectedString);
+        end
+
+        function RowWithOneColumn(testCase, TabularObjectWithOneColumn)
+            % Verify getRowAsString successfully returns the expected string
+            % when called on a Table/RecordBatch with one column.
+            proxy = TabularObjectWithOneColumn.Proxy;
+            expectedString = "1";
+            actualString = proxy.getRowAsString(struct(Index=int64(1)));
+            testCase.verifyEqual(actualString, expectedString);
+        end
+
+        function RowIndex(testCase, TabularObjectWithThreeRows)
+            % Verify getRowAsString returns the expected string for
+            % the provided row index.
+            proxy = TabularObjectWithThreeRows.Proxy;
+
+            actualString = proxy.getRowAsString(struct(Index=int64(1)));
+            expectedString = "1 | ""A""";
+            testCase.verifyEqual(actualString, expectedString);
+
+            actualString = proxy.getRowAsString(struct(Index=int64(2)));
+            expectedString = "2 | ""B""";
+            testCase.verifyEqual(actualString, expectedString);
+
+            actualString = proxy.getRowAsString(struct(Index=int64(3)));
+            expectedString = "3 | ""C""";
+            testCase.verifyEqual(actualString, expectedString);
+        end
+
+        function GetRowAsStringFailed(testCase, TabularObjectWithThreeRows)
+            % Verify getRowAsString throws an error with the ID
+            % arrow:tabular:GetRowAsStringFailed if provided invalid index
+            % values.
+            proxy = TabularObjectWithThreeRows.Proxy;
+            fcn = @() proxy.getRowAsString(struct(Index=int64(0)));
+            testCase.verifyError(fcn, "arrow:tabular:GetRowAsStringFailed");
+
+            fcn = @() proxy.getRowAsString(struct(Index=int64(4)));
+            testCase.verifyError(fcn, "arrow:tabular:GetRowAsStringFailed");
+        end
+
+    end
+
+end
\ No newline at end of file