This is an automated email from the ASF dual-hosted git repository.
amoeba pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-cookbook.git
The following commit(s) were added to refs/heads/main by this push:
new 86555e1 GH-362: Build Arrow from scratch for dev cookbooks (#374)
86555e1 is described below
commit 86555e18e3c0920771b2ba7a8838a16facd461fc
Author: Bryce Mecum <[email protected]>
AuthorDate: Wed Apr 22 18:55:33 2026 -0700
GH-362: Build Arrow from scratch for dev cookbooks (#374)
---
.github/workflows/deploy_development_cookbooks.yml | 2 +
.github/workflows/test_arrow_nightly_cookbook.yml | 2 +
Makefile | 2 +-
cpp/CONTRIBUTING.md | 3 +-
cpp/code/CMakeLists.txt | 99 ++++++++++++++++------
cpp/code/common.h | 21 +----
cpp/code/datasets.cc | 9 +-
cpp/code/flight.cc | 22 +++--
cpp/code/main.cc | 5 ++
cpp/dev.yml | 13 ++-
10 files changed, 116 insertions(+), 62 deletions(-)
diff --git a/.github/workflows/deploy_development_cookbooks.yml
b/.github/workflows/deploy_development_cookbooks.yml
index a8addeb..260ef07 100644
--- a/.github/workflows/deploy_development_cookbooks.yml
+++ b/.github/workflows/deploy_development_cookbooks.yml
@@ -66,6 +66,8 @@ jobs:
run:
echo ${CONDA_PREFIX}
- name: Build cookbook
+ env:  # must be a mapping of NAME: value pairs; a "- NAME: value" sequence is rejected
+ ARROW_NIGHTLY: 1
run:
make cpp
- name: Upload cpp book
diff --git a/.github/workflows/test_arrow_nightly_cookbook.yml
b/.github/workflows/test_arrow_nightly_cookbook.yml
index 5421d0f..b6b0b98 100644
--- a/.github/workflows/test_arrow_nightly_cookbook.yml
+++ b/.github/workflows/test_arrow_nightly_cookbook.yml
@@ -48,6 +48,8 @@ jobs:
test_cpp_dev:
name: "Test C++ Cookbook on Arrow Nightlies"
runs-on: ubuntu-latest
+ env:
+ ARROW_NIGHTLY: 1
defaults:
run:
shell: bash -l {0}
diff --git a/Makefile b/Makefile
index 1265435..06ceff9 100644
--- a/Makefile
+++ b/Makefile
@@ -88,7 +88,7 @@ cpptest:
@echo ">>> Running C++ Tests/Snippets <<<\n"
rm -rf cpp/recipe-test-build
mkdir cpp/recipe-test-build
- cd cpp/recipe-test-build && cmake ../code -DCMAKE_BUILD_TYPE=Release &&
cmake --build . && ctest --output-on-failure -j 1
+ cd cpp/recipe-test-build && cmake ../code -G Ninja
-DCMAKE_BUILD_TYPE=Release && cmake --build . && ctest --output-on-failure -j 1
mkdir -p cpp/build
cp cpp/recipe-test-build/recipes_out.arrow cpp/build
diff --git a/cpp/CONTRIBUTING.md b/cpp/CONTRIBUTING.md
index a82b37c..2b9d849 100644
--- a/cpp/CONTRIBUTING.md
+++ b/cpp/CONTRIBUTING.md
@@ -95,7 +95,7 @@ output block when the recipe is rendered into the cookbook.
## Referencing Arrow C++ Documentation
The Arrow project has its own documentation for the C++ implementation that
-is hosted at https://arrow.apache.org/docs/cpp/index.html. Fortunately,
+is hosted at <https://arrow.apache.org/docs/cpp/index.html>. Fortunately,
this documentation is also built with Sphinx and so we can use the extension
`intersphinx` to reference sections of this documentation. To do so simply
write a standard Sphinx reference like so:
@@ -121,6 +121,7 @@ cmake build. For example:
```
mkdir cpp/code/build
cd cpp/code/build
+# Optional: Run `export ARROW_NIGHTLY=1` to build Arrow from git.
cmake .. -DCMAKE_BUILD_TYPE=Debug
cmake --build .
ctest
diff --git a/cpp/code/CMakeLists.txt b/cpp/code/CMakeLists.txt
index 46a15e9..7c9890f 100644
--- a/cpp/code/CMakeLists.txt
+++ b/cpp/code/CMakeLists.txt
@@ -18,23 +18,66 @@
cmake_minimum_required(VERSION 3.19)
project(arrow-cookbook)
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
endif()
# Add Arrow and other required packages
-find_package(Arrow REQUIRED)
-if(NOT ${ARROW_VERSION} VERSION_GREATER "9.0.0")
- get_filename_component(ARROW_CMAKE_BASE_DIR ${Arrow_CONFIG} DIRECTORY)
- list(INSERT CMAKE_MODULE_PATH 0 ${ARROW_CMAKE_BASE_DIR})
+if(DEFINED ENV{ARROW_NIGHTLY})
+ set(CMAKE_BUILD_TYPE Debug)
+ set(ARROW_BUILD_SHARED True)
+ set(ARROW_DEPENDENCY_SOURCE "AUTO")
+ set(ARROW_ENABLE_THREADING ON)
+ set(ARROW_SIMD_LEVEL NONE) # macOS-specific workaround
+
+ set(ARROW_WITH_SNAPPY ON)
+
+ set(ARROW_ACERO ON)
+ set(ARROW_COMPUTE ON)
+ set(ARROW_DATASET ON)
+ set(ARROW_FILESYSTEM ON)
+ set(ARROW_FLIGHT ON)
+ set(ARROW_IPC ON)
+ set(ARROW_PARQUET ON)
+
+ include(FetchContent)
+
+ FetchContent_Declare(Arrow
+ GIT_REPOSITORY https://github.com/apache/arrow.git
+ GIT_TAG main
+ GIT_SHALLOW TRUE SOURCE_SUBDIR cpp
+ OVERRIDE_FIND_PACKAGE
+ )
+
+ FetchContent_MakeAvailable(Arrow)
+
+ # These are some Linux-only things the FetchContent build needs in order
+ # to compile
+ file(INSTALL "${arrow_BINARY_DIR}/src/arrow/util/config.h"
+ DESTINATION "${arrow_SOURCE_DIR}/cpp/src/arrow/util")
+ file(INSTALL "${arrow_BINARY_DIR}/src/parquet/parquet_version.h"
+ DESTINATION "${arrow_SOURCE_DIR}/cpp/src/parquet")
+ target_include_directories(
+ arrow_shared
+ SYSTEM INTERFACE "$<BUILD_INTERFACE:${arrow_SOURCE_DIR}/cpp/src>"
+ )
+ # Force FetchContent Arrow headers to the front of every target's include
+ # list so they take priority over any system Arrow headers added transitively
+ # (e.g. /opt/homebrew/include from GTest::gtest). Without this the recipe
+ # executables compile against the older installed Arrow headers but link
+ # against the FetchContent Arrow runtime, causing ABI mismatches.
+ include_directories(BEFORE SYSTEM "${arrow_SOURCE_DIR}/cpp/src")
+
+else()
+ find_package(Arrow REQUIRED)
+ find_package(ArrowDataset REQUIRED)
+ find_package(ArrowFlight REQUIRED)
+ find_package(Parquet REQUIRED)
endif()
-find_package(ArrowDataset REQUIRED)
-find_package(ArrowFlight REQUIRED)
-find_package(Parquet REQUIRED)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
- set(CMAKE_CXX_CLANG_TIDY "clang-tidy")
+ set(CMAKE_CXX_CLANG_TIDY "clang-tidy")
endif()
# Create test targets
@@ -44,31 +87,36 @@ find_package(GTest REQUIRED)
include(GoogleTest)
function(RECIPE TARGET)
- add_executable(
+ add_executable(
${TARGET}
${TARGET}.cc
common.cc
main.cc
)
- if(TARGET Arrow::arrow_shared)
- target_link_libraries(
+ if(TARGET Arrow::arrow_shared)
+ target_link_libraries(
${TARGET}
ArrowDataset::arrow_dataset_shared
ArrowFlight::arrow_flight_shared GTest::gtest
)
- else()
- target_link_libraries(parquet_shared INTERFACE arrow_shared)
- target_link_libraries(arrow_dataset_shared INTERFACE parquet_shared)
- target_link_libraries(arrow_flight_shared INTERFACE arrow_shared)
- target_link_libraries(${TARGET} arrow_dataset_shared
arrow_flight_shared GTest::gtest)
+ else()
+ target_link_libraries(parquet_shared INTERFACE arrow_shared)
+ target_link_libraries(arrow_dataset_shared INTERFACE parquet_shared)
+ target_link_libraries(arrow_flight_shared INTERFACE arrow_shared)
+ target_link_libraries(${TARGET} arrow_dataset_shared arrow_flight_shared
GTest::gtest)
+ endif()
+ if (MSVC)
+ target_compile_options(${TARGET} PRIVATE /W4 /WX)
+ else ()
+ target_compile_options(${TARGET} PRIVATE -Wall -Wextra -Wpedantic -Werror)
+ # _Nullable/_Nonnull nullability annotations in absl macros trigger
+ # -Wnullability-extension under -Wpedantic; this is Clang-only.
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ target_compile_options(${TARGET} PRIVATE -Wno-nullability-extension)
endif()
- if (MSVC)
- target_compile_options(${TARGET} PRIVATE /W4 /WX)
- else ()
- target_compile_options(${TARGET} PRIVATE -Wall -Wextra -Wpedantic
-Werror)
- endif ()
+ endif ()
- gtest_discover_tests(${TARGET})
+ gtest_discover_tests(${TARGET})
endfunction()
recipe(basic_arrow)
@@ -76,10 +124,9 @@ recipe(creating_arrow_objects)
recipe(datasets)
recipe(flight)
-
# Add protobuf to flight
-find_package(gRPC CONFIG REQUIRED)
find_package(Threads)
+find_package(gRPC CONFIG REQUIRED)
set(PROTO_FILES
protos/helloworld.proto
diff --git a/cpp/code/common.h b/cpp/code/common.h
index 756ca81..018ee8a 100644
--- a/cpp/code/common.h
+++ b/cpp/code/common.h
@@ -18,30 +18,11 @@
#ifndef ARROW_COOKBOOK_COMMON_H
#define ARROW_COOKBOOK_COMMON_H
-#include <arrow/result.h>
-#include <arrow/status.h>
+#include <arrow/testing/gtest_util.h>
#include <sstream>
#include <string>
-#define ARROW_STRINGIFY(x) #x
-#define ARROW_CONCAT(x, y) x##y
-
-#define ARROW_ASSIGN_OR_RAISE_NAME(x, y) ARROW_CONCAT(x, y)
-
-#define ASSERT_OK(expr) \
- for (const ::arrow::Status _st = ::arrow::ToStatus((expr)); !_st.ok();) \
- FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString()
-
-#define ASSIGN_OR_HANDLE_ERROR_IMPL(handle_error, status_name, lhs, rexpr) \
- auto&& status_name = (rexpr); \
- handle_error(status_name.status()); \
- lhs = std::move(status_name).ValueOrDie();
-
-#define ASSERT_OK_AND_ASSIGN(lhs, rexpr) \
- ASSIGN_OR_HANDLE_ERROR_IMPL( \
- ASSERT_OK, ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__),
lhs, rexpr);
-
inline std::stringstream rout;
void StartRecipe(const std::string& recipe_name);
diff --git a/cpp/code/datasets.cc b/cpp/code/datasets.cc
index 8f0ba8b..3329fde 100644
--- a/cpp/code/datasets.cc
+++ b/cpp/code/datasets.cc
@@ -87,10 +87,15 @@ class DatasetReadingTest : public ::testing::Test {
fs->OpenInputFile(airquality_path));
std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
parquet::ParquetFileReader::Open(file);
- ARROW_ASSIGN_OR_RAISE(auto reader, parquet::arrow::FileReader::Make(
- arrow::default_memory_pool(), std::move(parquet_reader)));
+ ARROW_ASSIGN_OR_RAISE(auto reader,
+
parquet::arrow::FileReader::Make(arrow::default_memory_pool(),
+
std::move(parquet_reader)));
std::shared_ptr<arrow::Table> table;
+#if ARROW_VERSION_MAJOR >= 24
+ ARROW_ASSIGN_OR_RAISE(table, reader->ReadTable());
+#else
ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
+#endif
return table;
}
diff --git a/cpp/code/flight.cc b/cpp/code/flight.cc
index 7cd0317..038f43e 100644
--- a/cpp/code/flight.cc
+++ b/cpp/code/flight.cc
@@ -97,7 +97,11 @@ class ParquetStorageService : public
arrow::flight::FlightServerBase {
parquet::arrow::OpenFile(std::move(input),
arrow::default_memory_pool()));
std::shared_ptr<arrow::Table> table;
+#if ARROW_VERSION_MAJOR >= 24
+ ARROW_ASSIGN_OR_RAISE(table, reader->ReadTable());
+#else
ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
+#endif
// Note that we can't directly pass TableBatchReader to
// RecordBatchStream because TableBatchReader keeps a non-owning
// reference to the underlying Table, which would then get freed
@@ -148,7 +152,7 @@ class ParquetStorageService : public
arrow::flight::FlightServerBase {
endpoint.ticket.ticket = file_info.base_name();
arrow::flight::Location location;
ARROW_ASSIGN_OR_RAISE(location,
- arrow::flight::Location::ForGrpcTcp("localhost", port()));
+ arrow::flight::Location::ForGrpcTcp("localhost",
port()));
endpoint.locations.push_back(location);
int64_t total_records = reader->parquet_reader()->metadata()->num_rows();
@@ -197,7 +201,7 @@ arrow::Status TestPutGetDelete() {
arrow::flight::Location server_location;
ARROW_ASSIGN_OR_RAISE(server_location,
- arrow::flight::Location::ForGrpcTcp("0.0.0.0", 0));
+ arrow::flight::Location::ForGrpcTcp("0.0.0.0", 0));
arrow::flight::FlightServerOptions options(server_location);
auto server = std::unique_ptr<arrow::flight::FlightServerBase>(
@@ -209,7 +213,7 @@ arrow::Status TestPutGetDelete() {
StartRecipe("ParquetStorageService::Connect");
arrow::flight::Location location;
ARROW_ASSIGN_OR_RAISE(location,
- arrow::flight::Location::ForGrpcTcp("localhost", server->port()));
+ arrow::flight::Location::ForGrpcTcp("localhost",
server->port()));
std::unique_ptr<arrow::flight::FlightClient> client;
ARROW_ASSIGN_OR_RAISE(client,
arrow::flight::FlightClient::Connect(location));
@@ -315,7 +319,7 @@ arrow::Status TestClientOptions() {
arrow::flight::Location server_location;
ARROW_ASSIGN_OR_RAISE(server_location,
- arrow::flight::Location::ForGrpcTcp("0.0.0.0", 0));
+ arrow::flight::Location::ForGrpcTcp("0.0.0.0", 0));
arrow::flight::FlightServerOptions options(server_location);
auto server = std::unique_ptr<arrow::flight::FlightServerBase>(
@@ -329,12 +333,12 @@ arrow::Status TestClientOptions() {
arrow::flight::Location location;
ARROW_ASSIGN_OR_RAISE(location,
- arrow::flight::Location::ForGrpcTcp("localhost", server->port()));
+ arrow::flight::Location::ForGrpcTcp("localhost",
server->port()));
std::unique_ptr<arrow::flight::FlightClient> client;
// pass client_options into Connect()
ARROW_ASSIGN_OR_RAISE(client,
- arrow::flight::FlightClient::Connect(location, client_options));
+ arrow::flight::FlightClient::Connect(location,
client_options));
rout << "Connected to " << location.ToString() << std::endl;
EndRecipe("TestClientOptions::Connect");
@@ -352,7 +356,7 @@ arrow::Status TestCustomGrpcImpl() {
StartRecipe("CustomGrpcImpl::StartServer");
arrow::flight::Location server_location;
ARROW_ASSIGN_OR_RAISE(server_location,
- arrow::flight::Location::ForGrpcTcp("0.0.0.0", 5000));
+ arrow::flight::Location::ForGrpcTcp("0.0.0.0", 0));
arrow::flight::FlightServerOptions options(server_location);
auto server = std::unique_ptr<arrow::flight::FlightServerBase>(
@@ -372,8 +376,8 @@ arrow::Status TestCustomGrpcImpl() {
EndRecipe("CustomGrpcImpl::StartServer");
StartRecipe("CustomGrpcImpl::CreateClient");
- auto client_channel =
- grpc::CreateChannel("0.0.0.0:5000", grpc::InsecureChannelCredentials());
+ auto client_channel = grpc::CreateChannel("0.0.0.0:" +
std::to_string(server->port()),
+
grpc::InsecureChannelCredentials());
auto stub = HelloWorldService::NewStub(client_channel);
diff --git a/cpp/code/main.cc b/cpp/code/main.cc
index 3fbe3ea..e32e922 100644
--- a/cpp/code/main.cc
+++ b/cpp/code/main.cc
@@ -19,12 +19,17 @@
#include <filesystem>
+#include <arrow/compute/api.h>
#include <arrow/status.h>
#include "gtest/gtest.h"
#include "common.h"
int main(int argc, char** argv) {
+ if (!arrow::compute::Initialize().ok()) {
+ std::cerr << "Failed to initialize Arrow compute functions" << std::endl;
+ return -1;
+ }
testing::InitGoogleTest(&argc, argv);
int retval = RUN_ALL_TESTS();
if (retval == 0 && HasRecipeOutput()) {
diff --git a/cpp/dev.yml b/cpp/dev.yml
index d461f87..90eda6e 100644
--- a/cpp/dev.yml
+++ b/cpp/dev.yml
@@ -16,15 +16,22 @@
name: cookbook-cpp-dev
channels:
- - arrow-nightlies
- conda-forge
dependencies:
- python=3.10
- compilers
- - arrow-nightlies::libarrow
+ - cmake
+ - ninja
- sphinx
- gtest
- gmock
- - arrow-nightlies::pyarrow
- clang-tools
- zlib
+ - grpc-cpp
+ - protobuf
+ - abseil-cpp
+ - c-ares
+ - re2
+ - thrift-cpp
+ - rapidjson
+ - snappy