This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 9fd3d53 Add Arrow C Data Interface and nanoarrow (#44)
9fd3d53 is described below
commit 9fd3d53f3cd96b342f2ef16fe9d7e6eee9c49a2a
Author: Gang Wu <[email protected]>
AuthorDate: Fri Mar 21 16:52:52 2025 +0800
Add Arrow C Data Interface and nanoarrow (#44)
Closes #33
---
.github/workflows/cpp-linter.yml | 5 +-
.github/workflows/test.yml | 4 +-
cmake_modules/IcebergThirdpartyToolchain.cmake | 28 ++++++++
src/iceberg/CMakeLists.txt | 27 +++++++-
src/iceberg/arrow_c_data.h | 71 ++++++++++++++++++++
src/iceberg/arrow_c_data_internal.cc | 76 ++++++++++++++++++++++
.../iceberg/arrow_c_data_internal.h | 31 ++++-----
test/CMakeLists.txt | 2 +
test/{ => arrow}/CMakeLists.txt | 17 +++--
test/arrow/arrow_test.cc | 62 ++++++++++++++++++
test/{ => avro}/CMakeLists.txt | 16 ++---
test/{core/avro_unittest.cc => avro/avro_test.cc} | 0
test/core/CMakeLists.txt | 8 ---
13 files changed, 297 insertions(+), 50 deletions(-)
diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml
index da58dca..b31e625 100644
--- a/.github/workflows/cpp-linter.yml
+++ b/.github/workflows/cpp-linter.yml
@@ -30,7 +30,10 @@ jobs:
cpp-linter:
runs-on: ubuntu-24.04
steps:
- - uses: actions/checkout@v4
+ - name: Checkout iceberg-cpp
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ fetch-depth: 0
- name: Run build
run: |
mkdir build && cd build
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5029414..43aebad 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -55,8 +55,8 @@ jobs:
shell: bash
run: ci/scripts/build_example.sh $(pwd)/example
macos:
- name: AArch64 macOS 14
- runs-on: macos-14
+ name: AArch64 macOS 15
+ runs-on: macos-15
timeout-minutes: 30
strategy:
fail-fast: false
diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake
index 0f95ea4..53b20c1 100644
--- a/cmake_modules/IcebergThirdpartyToolchain.cmake
+++ b/cmake_modules/IcebergThirdpartyToolchain.cmake
@@ -65,6 +65,10 @@ function(resolve_arrow_dependency)
set(ARROW_BUILD_STATIC
ON
CACHE BOOL "" FORCE)
+ # To workaround https://github.com/apache/arrow/pull/45513
+ set(ARROW_IPC
+ ON
+ CACHE BOOL "" FORCE)
set(ARROW_FILESYSTEM
OFF
CACHE BOOL "" FORCE)
@@ -198,3 +202,27 @@ endfunction()
if(ICEBERG_AVRO)
resolve_avro_dependency()
endif()
+
+# ----------------------------------------------------------------------
+# Nanoarrow
+
+# It is also possible to vendor nanoarrow using the bundled source code.
+function(resolve_nanoarrow_dependency)
+ prepare_fetchcontent()
+
+ fetchcontent_declare(nanoarrow
+ ${FC_DECLARE_COMMON_OPTIONS}
+ URL "https://dlcdn.apache.org/arrow/apache-arrow-nanoarrow-0.6.0/apache-arrow-nanoarrow-0.6.0.tar.gz"
+ )
+ fetchcontent_makeavailable(nanoarrow)
+
+ set_target_properties(nanoarrow PROPERTIES OUTPUT_NAME "iceberg_vendored_nanoarrow"
+ POSITION_INDEPENDENT_CODE ON)
+ install(TARGETS nanoarrow
+ EXPORT iceberg_targets
+ RUNTIME DESTINATION "${ICEBERG_INSTALL_BINDIR}"
+ ARCHIVE DESTINATION "${ICEBERG_INSTALL_LIBDIR}"
+ LIBRARY DESTINATION "${ICEBERG_INSTALL_LIBDIR}")
+endfunction()
+
+resolve_nanoarrow_dependency()
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 8411c7a..0c1475b 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -15,13 +15,36 @@
# specific language governing permissions and limitations
# under the License.
-set(ICEBERG_SOURCES demo_table.cc schema.cc schema_field.cc type.cc)
+set(ICEBERG_SOURCES
+ arrow_c_data_internal.cc
+ demo_table.cc
+ schema.cc
+ schema_field.cc
+ type.cc)
+
+set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
+set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)
+set(ICEBERG_STATIC_INSTALL_INTERFACE_LIBS)
+set(ICEBERG_SHARED_INSTALL_INTERFACE_LIBS)
+
+list(APPEND ICEBERG_STATIC_BUILD_INTERFACE_LIBS nanoarrow::nanoarrow)
+list(APPEND ICEBERG_SHARED_BUILD_INTERFACE_LIBS nanoarrow::nanoarrow)
+list(APPEND ICEBERG_STATIC_INSTALL_INTERFACE_LIBS "Iceberg::nanoarrow")
+list(APPEND ICEBERG_SHARED_INSTALL_INTERFACE_LIBS "Iceberg::nanoarrow")
add_iceberg_lib(iceberg
SOURCES
${ICEBERG_SOURCES}
PRIVATE_INCLUDES
- ${ICEBERG_INCLUDES})
+ ${ICEBERG_INCLUDES}
+ SHARED_LINK_LIBS
+ ${ICEBERG_SHARED_BUILD_INTERFACE_LIBS}
+ STATIC_LINK_LIBS
+ ${ICEBERG_STATIC_BUILD_INTERFACE_LIBS}
+ STATIC_INSTALL_INTERFACE_LIBS
+ ${ICEBERG_STATIC_INSTALL_INTERFACE_LIBS}
+ SHARED_INSTALL_INTERFACE_LIBS
+ ${ICEBERG_SHARED_INSTALL_INTERFACE_LIBS})
iceberg_install_all_headers(iceberg)
diff --git a/src/iceberg/arrow_c_data.h b/src/iceberg/arrow_c_data.h
new file mode 100644
index 0000000..43c2adb
--- /dev/null
+++ b/src/iceberg/arrow_c_data.h
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/arrow_c_data.h
+/// Arrow C data interface
+///
+/// The Arrow C Data interface (https://arrow.apache.org/docs/format/CDataInterface.html)
+/// is part of the Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// documentation of these structures.
+
+#include <cstdint>
+
+#ifndef ARROW_C_DATA_INTERFACE
+# define ARROW_C_DATA_INTERFACE
+
+extern "C" {
+struct ArrowSchema {
+ // Array type description
+ const char* format;
+ const char* name;
+ const char* metadata;
+ int64_t flags;
+ int64_t n_children;
+ struct ArrowSchema** children;
+ struct ArrowSchema* dictionary;
+
+ // Release callback
+ void (*release)(struct ArrowSchema*);
+ // Opaque producer-specific data
+ void* private_data;
+};
+
+struct ArrowArray {
+ // Array data description
+ int64_t length;
+ int64_t null_count;
+ int64_t offset;
+ int64_t n_buffers;
+ int64_t n_children;
+ const void** buffers;
+ struct ArrowArray** children;
+ struct ArrowArray* dictionary;
+
+ // Release callback
+ void (*release)(struct ArrowArray*);
+ // Opaque producer-specific data
+ void* private_data;
+};
+
+} // extern "C"
+
+#endif // ARROW_C_DATA_INTERFACE
diff --git a/src/iceberg/arrow_c_data_internal.cc b/src/iceberg/arrow_c_data_internal.cc
new file mode 100644
index 0000000..9716b25
--- /dev/null
+++ b/src/iceberg/arrow_c_data_internal.cc
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/arrow_c_data_internal.h"
+
+#include <array>
+#include <string>
+#include <utility>
+
+namespace iceberg::internal {
+
+std::pair<ArrowSchema, ArrowArray> CreateExampleArrowSchemaAndArrayByNanoarrow() {
+ ArrowSchema out_schema;
+
+ // Initializes the root struct schema
+ NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(&out_schema, NANOARROW_TYPE_STRUCT));
+ NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateChildren(&out_schema, 2));
+
+ // Set up the non-nullable int64 field
+ struct ArrowSchema* int64_field = out_schema.children[0];
+ ArrowSchemaInit(int64_field);
+ NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(int64_field, NANOARROW_TYPE_INT64));
+ NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(int64_field, "id"));
+ int64_field->flags &= ~ARROW_FLAG_NULLABLE;
+
+ // Set up the nullable string field
+ struct ArrowSchema* string_field = out_schema.children[1];
+ ArrowSchemaInit(string_field);
+ NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(string_field, NANOARROW_TYPE_STRING));
+ NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(string_field, "name"));
+ string_field->flags |= ARROW_FLAG_NULLABLE;
+
+ constexpr int64_t kNumValues = 3;
+ std::array<int64_t, kNumValues> int64_values = {1, 2, 3};
+ std::array<std::string, kNumValues> string_values = {"a", "b", "c"};
+
+ ArrowArray out_array;
+ NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(&out_array, &out_schema, nullptr));
+ ArrowArray* int64_array = out_array.children[0];
+ ArrowArray* string_array = out_array.children[1];
+
+ NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(int64_array));
+ NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(string_array));
+
+ for (int64_t i = 0; i < kNumValues; i++) {
+ NANOARROW_THROW_NOT_OK(ArrowArrayAppendInt(int64_array, int64_values[i]));
+ NANOARROW_THROW_NOT_OK(
+ ArrowArrayAppendString(string_array, ArrowCharView(string_values[i].c_str())));
+ }
+
+ NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(int64_array, nullptr));
+ NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(string_array, nullptr));
+
+ out_array.length = kNumValues;
+ out_array.null_count = 0;
+
+ return {out_schema, out_array};
+}
+
+} // namespace iceberg::internal
diff --git a/test/core/avro_unittest.cc b/src/iceberg/arrow_c_data_internal.h
similarity index 64%
copy from test/core/avro_unittest.cc
copy to src/iceberg/arrow_c_data_internal.h
index 5bdcfca..2d913c5 100644
--- a/test/core/avro_unittest.cc
+++ b/src/iceberg/arrow_c_data_internal.h
@@ -17,24 +17,17 @@
* under the License.
*/
-#include <gtest/gtest.h>
-#include <iceberg/avro/demo_avro.h>
+#pragma once
-TEST(AVROTest, TestDemoAvro) {
- std::string expected =
- "{\n\
- \"type\": \"record\",\n\
- \"name\": \"testrecord\",\n\
- \"fields\": [\n\
- {\n\
- \"name\": \"testbytes\",\n\
- \"type\": \"bytes\",\n\
- \"default\": \"\"\n\
- }\n\
- ]\n\
-}\n\
-";
+#include <nanoarrow/nanoarrow.hpp>
- auto avro = iceberg::avro::DemoAvro();
- EXPECT_EQ(avro.print(), expected);
-}
+namespace iceberg::internal {
+
+/**
+ * @brief Create a simple schema with non-nullable int64 and nullable string fields.
+ *
+ * This is the example code to demonstrate the usage of nanoarrow API.
+ */
+std::pair<ArrowSchema, ArrowArray> CreateExampleArrowSchemaAndArrayByNanoarrow();
+
+} // namespace iceberg::internal
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c8c7fdf..e29a76e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -23,4 +23,6 @@ fetchcontent_declare(googletest
GTest)
fetchcontent_makeavailable(googletest)
+add_subdirectory(arrow)
+add_subdirectory(avro)
add_subdirectory(core)
diff --git a/test/CMakeLists.txt b/test/arrow/CMakeLists.txt
similarity index 66%
copy from test/CMakeLists.txt
copy to test/arrow/CMakeLists.txt
index c8c7fdf..0ef6586 100644
--- a/test/CMakeLists.txt
+++ b/test/arrow/CMakeLists.txt
@@ -15,12 +15,11 @@
# specific language governing permissions and limitations
# under the License.
-fetchcontent_declare(googletest
- GIT_REPOSITORY https://github.com/google/googletest.git
- GIT_TAG b514bdc898e2951020cbdca1304b75f5950d1f59 # release-1.15.2
- FIND_PACKAGE_ARGS
- NAMES
- GTest)
-fetchcontent_makeavailable(googletest)
-
-add_subdirectory(core)
+if(ICEBERG_ARROW)
+ add_executable(arrow_unittest)
+ target_sources(arrow_unittest PRIVATE arrow_test.cc)
+ target_link_libraries(arrow_unittest PRIVATE iceberg_arrow_static Arrow::arrow_static
+ GTest::gtest_main)
+ target_include_directories(arrow_unittest PRIVATE "${ICEBERG_INCLUDES}")
+ add_test(NAME arrow_unittest COMMAND arrow_unittest)
+endif()
diff --git a/test/arrow/arrow_test.cc b/test/arrow/arrow_test.cc
new file mode 100644
index 0000000..1d730fc
--- /dev/null
+++ b/test/arrow/arrow_test.cc
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow/api.h>
+#include <arrow/c/bridge.h>
+#include <arrow/result.h>
+#include <gtest/gtest.h>
+
+#include "iceberg/arrow_c_data_internal.h"
+
+namespace iceberg {
+
+TEST(ArrowCDataTest, CheckArrowSchemaAndArrayByNanoarrow) {
+ auto [schema, array] = internal::CreateExampleArrowSchemaAndArrayByNanoarrow();
+
+ auto arrow_schema = ::arrow::ImportSchema(&schema).ValueOrDie();
+ EXPECT_EQ(arrow_schema->num_fields(), 2);
+
+ auto id_field = arrow_schema->field(0);
+ EXPECT_EQ(id_field->name(), "id");
+ EXPECT_EQ(id_field->type()->id(), ::arrow::Type::INT64);
+ EXPECT_FALSE(id_field->nullable());
+
+ auto name_field = arrow_schema->field(1);
+ EXPECT_EQ(name_field->name(), "name");
+ EXPECT_EQ(name_field->type()->id(), ::arrow::Type::STRING);
+ EXPECT_TRUE(name_field->nullable());
+
+ auto arrow_record_batch = ::arrow::ImportRecordBatch(&array, arrow_schema).ValueOrDie();
+ EXPECT_EQ(arrow_record_batch->num_rows(), 3);
+ EXPECT_EQ(arrow_record_batch->num_columns(), 2);
+
+ auto id_column = arrow_record_batch->column(0);
+ EXPECT_EQ(id_column->type()->id(), ::arrow::Type::INT64);
+ EXPECT_EQ(id_column->GetScalar(0).ValueOrDie()->ToString(), "1");
+ EXPECT_EQ(id_column->GetScalar(1).ValueOrDie()->ToString(), "2");
+ EXPECT_EQ(id_column->GetScalar(2).ValueOrDie()->ToString(), "3");
+
+ auto name_column = arrow_record_batch->column(1);
+ EXPECT_EQ(name_column->type()->id(), ::arrow::Type::STRING);
+ EXPECT_EQ(name_column->GetScalar(0).ValueOrDie()->ToString(), "a");
+ EXPECT_EQ(name_column->GetScalar(1).ValueOrDie()->ToString(), "b");
+ EXPECT_EQ(name_column->GetScalar(2).ValueOrDie()->ToString(), "c");
+}
+
+} // namespace iceberg
diff --git a/test/CMakeLists.txt b/test/avro/CMakeLists.txt
similarity index 69%
copy from test/CMakeLists.txt
copy to test/avro/CMakeLists.txt
index c8c7fdf..9cd1c0b 100644
--- a/test/CMakeLists.txt
+++ b/test/avro/CMakeLists.txt
@@ -15,12 +15,10 @@
# specific language governing permissions and limitations
# under the License.
-fetchcontent_declare(googletest
- GIT_REPOSITORY https://github.com/google/googletest.git
- GIT_TAG b514bdc898e2951020cbdca1304b75f5950d1f59 # release-1.15.2
- FIND_PACKAGE_ARGS
- NAMES
- GTest)
-fetchcontent_makeavailable(googletest)
-
-add_subdirectory(core)
+if(ICEBERG_AVRO)
+ add_executable(avro_unittest)
+ target_sources(avro_unittest PRIVATE avro_test.cc)
+ target_link_libraries(avro_unittest PRIVATE iceberg_avro_static GTest::gtest_main)
+ target_include_directories(avro_unittest PRIVATE "${ICEBERG_INCLUDES}")
+ add_test(NAME avro_unittest COMMAND avro_unittest)
+endif()
diff --git a/test/core/avro_unittest.cc b/test/avro/avro_test.cc
similarity index 100%
rename from test/core/avro_unittest.cc
rename to test/avro/avro_test.cc
diff --git a/test/core/CMakeLists.txt b/test/core/CMakeLists.txt
index 23b0844..6e82b9b 100644
--- a/test/core/CMakeLists.txt
+++ b/test/core/CMakeLists.txt
@@ -22,14 +22,6 @@ target_link_libraries(core_unittest PRIVATE iceberg_static GTest::gtest_main GTe
target_include_directories(core_unittest PRIVATE "${ICEBERG_INCLUDES}")
add_test(NAME core_unittest COMMAND core_unittest)
-if(ICEBERG_AVRO)
- add_executable(avro_unittest)
- target_sources(avro_unittest PRIVATE avro_unittest.cc)
- target_link_libraries(avro_unittest PRIVATE iceberg_avro_static GTest::gtest_main)
- target_include_directories(avro_unittest PRIVATE "${ICEBERG_INCLUDES}")
- add_test(NAME avro_unittest COMMAND avro_unittest)
-endif()
-
add_executable(expected_test)
target_sources(expected_test PRIVATE expected_test.cc)
target_link_libraries(expected_test PRIVATE iceberg_static GTest::gtest_main)