This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 24eece3e6 ORC-2102: [C++] Remove HDFS support
24eece3e6 is described below
commit 24eece3e6f3f1bc9903952327136b2dcedfe3abd
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Sun Feb 22 19:03:45 2026 -0800
ORC-2102: [C++] Remove HDFS support
### What changes were proposed in this pull request?
This PR aims to remove HDFS support code from Apache ORC repository.
### Why are the changes needed?
Apache ORC 2.0.1 deprecated HDFS support to address the following issues.
- https://github.com/apache/orc/pull/1885
- #1857
- https://github.com/apache/orc/issues/2134
### How was this patch tested?
Pass the CIs.
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: `Gemini 3.1 Pro (High)` on `Antigravity`
This closes #2136.
Closes #2544 from dongjoon-hyun/ORC-2102.
Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.github/workflows/build_and_test.yml | 1 -
CMakeLists.txt | 3 -
c++/include/orc/OrcFile.hh | 8 --
c++/libs/libhdfspp/imported_timestamp | 10 --
c++/libs/libhdfspp/libhdfspp.tar.gz | Bin 948949 -> 0 bytes
c++/libs/libhdfspp/pull_hdfs.sh | 32 ------
c++/src/CMakeLists.txt | 8 --
c++/src/OrcFile.cc | 10 +-
c++/src/OrcHdfsFile.cc | 178 --------------------------------
cmake_modules/ThirdpartyToolchain.cmake | 68 +-----------
conan/all/conanfile.py | 1 -
11 files changed, 2 insertions(+), 317 deletions(-)
diff --git a/.github/workflows/build_and_test.yml
b/.github/workflows/build_and_test.yml
index c0884f51e..d5e7f7685 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -155,7 +155,6 @@ jobs:
cd build
cmake .. -G "Visual Studio 17 2022" \
-DCMAKE_BUILD_TYPE=RELEASE \
- -DBUILD_LIBHDFSPP=OFF \
-DBUILD_TOOLS=OFF \
-DBUILD_JAVA=OFF \
-DANALYZE_JAVA=OFF \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae0f9b44f..3a23e1258 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,9 +41,6 @@ option (ANALYZE_JAVA
"Run static analysis of the Java"
OFF)
-option (BUILD_LIBHDFSPP
- "Include LIBHDFSPP library in the build process"
- OFF)
option (BUILD_SPARSEHASH
"Include sparsehash library in the build process"
diff --git a/c++/include/orc/OrcFile.hh b/c++/include/orc/OrcFile.hh
index ea71567c5..1b5a61139 100644
--- a/c++/include/orc/OrcFile.hh
+++ b/c++/include/orc/OrcFile.hh
@@ -135,14 +135,6 @@ namespace orc {
std::unique_ptr<InputStream> readLocalFile(const std::string& path,
ReaderMetrics* metrics = nullptr);
- /**
- * Create a stream to an HDFS file.
- * @param path the uri of the file in HDFS
- * @param metrics the metrics of the reader
- */
- [[deprecated("readHdfsFile is deprecated in 2.0.1")]]
std::unique_ptr<InputStream> readHdfsFile(
- const std::string& path, ReaderMetrics* metrics = nullptr);
-
/**
* Create a reader to read the ORC file.
* @param stream the stream to read
diff --git a/c++/libs/libhdfspp/imported_timestamp
b/c++/libs/libhdfspp/imported_timestamp
deleted file mode 100644
index 84965ce4a..000000000
--- a/c++/libs/libhdfspp/imported_timestamp
+++ /dev/null
@@ -1,10 +0,0 @@
-Wed Aug 30 10:56:51 EDT 2017
-HDFS-10787
-commit 9587bb04a818a2661e264f619b09c15ce10ff38e
-Author: Anatoli Shein <[email protected]>
-Date: Wed Aug 30 10:49:42 2017 -0400
-
- fixed warnings3
-diffs: --------------
- --------------
-Wed Aug 30 10:56:51 EDT 2017
diff --git a/c++/libs/libhdfspp/libhdfspp.tar.gz
b/c++/libs/libhdfspp/libhdfspp.tar.gz
deleted file mode 100644
index 35c4d6127..000000000
Binary files a/c++/libs/libhdfspp/libhdfspp.tar.gz and /dev/null differ
diff --git a/c++/libs/libhdfspp/pull_hdfs.sh b/c++/libs/libhdfspp/pull_hdfs.sh
deleted file mode 100755
index a207a93f8..000000000
--- a/c++/libs/libhdfspp/pull_hdfs.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-if [ -z "$1" ]; then
- echo "Usage: pull_hdfs [path_to_hdfs_git_root]"
- exit 1;
-fi
-if [ ! -d "$1" ]; then
- echo "$1 is not a directory"
-fi
-if [ ! -d "$1/hadoop-hdfs-project" ]; then
- echo "$1 is not the root of a hadoop git checkout"
-fi
-
-HADOOP_ROOT=$1
-echo HADOOP_ROOT=$HADOOP_ROOT
-OUT=$(readlink -m `dirname $0`)
-echo OUT=$OUT
-TS=$OUT/imported_timestamp
-
- cd $HADOOP_ROOT &&
- mvn -pl :hadoop-hdfs-native-client -Pnative compile
-Dnative_make_args="copy_hadoop_files"
- (date > $TS; git rev-parse --abbrev-ref HEAD >> $TS; git log -n 1 >> $TS;
\
- echo "diffs: --------------" >> $TS; git diff HEAD >> $TS; \
- echo " --------------" >> $TS)
- cd $OUT &&
- #Delete everything except for pull_hdfs.sh and imported_timestamp
- find . ! -name 'pull_hdfs.sh' ! -name 'imported_timestamp' ! -name '.' !
-name '..' -exec rm -rf {} + &&
- cp -R
$HADOOP_ROOT/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp
. &&
- cp -R
$HADOOP_ROOT/hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/libhdfspp/extern
libhdfspp/ &&
- cd libhdfspp &&
- tar -czf ../libhdfspp.tar.gz * &&
- cd .. &&
- rm -rf libhdfspp &&
- date >> $TS
\ No newline at end of file
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index a1fd549ce..efddae23e 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -191,9 +191,6 @@ set(SOURCE_FILES
Vector.cc
Writer.cc)
-if(BUILD_LIBHDFSPP)
- set(SOURCE_FILES ${SOURCE_FILES} OrcHdfsFile.cc)
-endif(BUILD_LIBHDFSPP)
if(BUILD_ENABLE_AVX512)
set(SOURCE_FILES
@@ -212,7 +209,6 @@ target_link_libraries (orc
$<BUILD_INTERFACE:orc::Snappy>
$<BUILD_INTERFACE:orc::lz4>
$<BUILD_INTERFACE:orc::zstd>
- $<BUILD_INTERFACE:${LIBHDFSPP_LIBRARIES}>
$<BUILD_INTERFACE:$<TARGET_NAME_IF_EXISTS:orc::sparsehash>>
)
@@ -227,16 +223,12 @@ target_include_directories (orc
PRIVATE
${CMAKE_CURRENT_BINARY_DIR}
${CMAKE_CURRENT_SOURCE_DIR}
- ${LIBHDFSPP_INCLUDE_DIR}
)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL
"GNU")
target_compile_options(orc PRIVATE -Wall -Wextra
$<$<BOOL:${STOP_BUILD_ON_WARNING}>:-Werror>)
endif ()
-if (BUILD_LIBHDFSPP)
- target_compile_definitions(orc PUBLIC -DBUILD_LIBHDFSPP)
-endif (BUILD_LIBHDFSPP)
if (BUILD_SPARSEHASH)
target_compile_definitions(orc PUBLIC -DBUILD_SPARSEHASH)
diff --git a/c++/src/OrcFile.cc b/c++/src/OrcFile.cc
index be8672432..470adfc1f 100644
--- a/c++/src/OrcFile.cc
+++ b/c++/src/OrcFile.cc
@@ -103,15 +103,7 @@ namespace orc {
}
std::unique_ptr<InputStream> readFile(const std::string& path,
ReaderMetrics* metrics) {
-#ifdef BUILD_LIBHDFSPP
- if (strncmp(path.c_str(), "hdfs://", 7) == 0) {
- return orc::readHdfsFile(std::string(path), metrics);
- } else {
-#endif
- return orc::readLocalFile(std::string(path), metrics);
-#ifdef BUILD_LIBHDFSPP
- }
-#endif
+ return orc::readLocalFile(std::string(path), metrics);
}
DIAGNOSTIC_POP
diff --git a/c++/src/OrcHdfsFile.cc b/c++/src/OrcHdfsFile.cc
deleted file mode 100644
index d878e276c..000000000
--- a/c++/src/OrcHdfsFile.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "orc/OrcFile.hh"
-
-#include "Adaptor.hh"
-#include "Utils.hh"
-#include "orc/Exceptions.hh"
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include "hdfspp/hdfspp.h"
-
-namespace orc {
-
- DIAGNOSTIC_PUSH
-
-#ifdef __clang__
- DIAGNOSTIC_IGNORE("-Wunused-private-field")
-#endif
-
- class HdfsFileInputStream : public InputStream {
- private:
- std::string filename_;
- std::unique_ptr<hdfs::FileHandle> file_;
- std::unique_ptr<hdfs::FileSystem> fileSystem_;
- uint64_t totalLength_;
- const uint64_t readSize_ = 1024 * 1024; // 1 MB
- ReaderMetrics* metrics_;
-
- public:
- HdfsFileInputStream(std::string filename, ReaderMetrics* metrics) :
metrics_(metrics) {
- filename_ = filename;
-
- // Building a URI object from the given uri_path
- hdfs::URI uri;
- try {
- uri = hdfs::URI::parse_from_string(filename_);
- } catch (const hdfs::uri_parse_error&) {
- throw ParseError("Malformed URI: " + filename_);
- }
-
- // This sets conf path to default "$HADOOP_CONF_DIR" or
"/etc/hadoop/conf"
- // and loads configs core-site.xml and hdfs-site.xml from the conf path
- hdfs::ConfigParser parser;
- if (!parser.LoadDefaultResources()) {
- throw ParseError("Could not load default resources. ");
- }
- auto stats = parser.ValidateResources();
- // validating core-site.xml
- if (!stats[0].second.ok()) {
- throw ParseError(stats[0].first + " is invalid: " +
stats[0].second.ToString());
- }
- // validating hdfs-site.xml
- if (!stats[1].second.ok()) {
- throw ParseError(stats[1].first + " is invalid: " +
stats[1].second.ToString());
- }
- hdfs::Options options;
- if (!parser.get_options(options)) {
- throw ParseError("Could not load Options object. ");
- }
- hdfs::IoService* io_service = hdfs::IoService::New();
- // Wrapping file_system into a unique pointer to guarantee deletion
- fileSystem_ =
- std::unique_ptr<hdfs::FileSystem>(hdfs::FileSystem::New(io_service,
"", options));
- if (fileSystem_.get() == nullptr) {
- throw ParseError("Can't create FileSystem object. ");
- }
- hdfs::Status status;
- // Checking if the user supplied the host
- if (!uri.get_host().empty()) {
- // Using port if supplied, otherwise using "" to look up port in
configs
- std::string port = uri.has_port() ? std::to_string(uri.get_port()) :
"";
- status = fileSystem_->Connect(uri.get_host(), port);
- if (!status.ok()) {
- throw ParseError("Can't connect to " + uri.get_host() + ":" + port +
". " +
- status.ToString());
- }
- } else {
- status = fileSystem_->ConnectToDefaultFs();
- if (!status.ok()) {
- if (!options.defaultFS.get_host().empty()) {
- throw ParseError("Error connecting to " + options.defaultFS.str()
+ ". " +
- status.ToString());
- } else {
- throw ParseError("Error connecting to the cluster: defaultFS is
empty. " +
- status.ToString());
- }
- }
- }
-
- if (fileSystem_.get() == nullptr) {
- throw ParseError("Can't connect the file system. ");
- }
-
- hdfs::FileHandle* file_raw = nullptr;
- status = fileSystem_->Open(uri.get_path(true), &file_raw);
- if (!status.ok()) {
- throw ParseError("Can't open " + uri.get_path(true) + ". " +
status.ToString());
- }
- // Wrapping file_raw into a unique pointer to guarantee deletion
- file_.reset(file_raw);
-
- hdfs::StatInfo stat_info;
- status = fileSystem_->GetFileInfo(uri.get_path(true), stat_info);
- if (!status.ok()) {
- throw ParseError("Can't stat " + uri.get_path(true) + ". " +
status.ToString());
- }
- totalLength_ = stat_info.length;
- }
-
- uint64_t getLength() const override {
- return totalLength_;
- }
-
- uint64_t getNaturalReadSize() const override {
- return readSize_;
- }
-
- void read(void* buf, uint64_t length, uint64_t offset) override {
- SCOPED_STOPWATCH(metrics, IOBlockingLatencyUs, IOCount);
- if (!buf) {
- throw ParseError("Buffer is null");
- }
-
- char* buf_ptr = reinterpret_cast<char*>(buf);
- hdfs::Status status;
- size_t total_bytes_read = 0;
- size_t last_bytes_read = 0;
-
- do {
- status =
- file_->PositionRead(buf_ptr, static_cast<size_t>(length) -
total_bytes_read,
- static_cast<off_t>(offset + total_bytes_read),
&last_bytes_read);
- if (!status.ok()) {
- throw ParseError("Error reading the file: " + status.ToString());
- }
- total_bytes_read += last_bytes_read;
- buf_ptr += last_bytes_read;
- } while (total_bytes_read < length);
- }
-
- const std::string& getName() const override {
- return filename_;
- }
-
- ~HdfsFileInputStream() override;
- };
-
- DIAGNOSTIC_POP
-
- HdfsFileInputStream::~HdfsFileInputStream() {}
-
- std::unique_ptr<InputStream> readHdfsFile(const std::string& path,
ReaderMetrics* metrics) {
- return std::make_unique<HdfsFileInputStream>(path, metrics);
- }
-} // namespace orc
diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
index c03afa8f2..a130b7dfe 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -702,70 +702,4 @@ if(BUILD_SPARSEHASH)
endblock()
endif()
-# ----------------------------------------------------------------------
-# LIBHDFSPP
-if(BUILD_LIBHDFSPP)
- set (BUILD_LIBHDFSPP FALSE)
- if(ORC_CXX_HAS_THREAD_LOCAL)
- find_package(CyrusSASL)
- find_package(OpenSSL)
- find_package(Threads)
- if (CYRUS_SASL_SHARED_LIB AND OPENSSL_LIBRARIES)
- set (BUILD_LIBHDFSPP TRUE)
- set (LIBHDFSPP_PREFIX "${THIRDPARTY_DIR}/libhdfspp_ep-install")
- set (LIBHDFSPP_INCLUDE_DIR "${LIBHDFSPP_PREFIX}/include")
- set (LIBHDFSPP_STATIC_LIB_NAME hdfspp_static)
- set (LIBHDFSPP_STATIC_LIB
"${LIBHDFSPP_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${LIBHDFSPP_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}")
- set (LIBHDFSPP_SRC_URL
"${PROJECT_SOURCE_DIR}/c++/libs/libhdfspp/libhdfspp.tar.gz")
- set (LIBHDFSPP_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
- -DCMAKE_INSTALL_PREFIX=${LIBHDFSPP_PREFIX}
- -DPROTOBUF_INCLUDE_DIR=${PROTOBUF_INCLUDE_DIR}
- -DPROTOBUF_LIBRARY=${PROTOBUF_STATIC_LIB}
- -DPROTOBUF_PROTOC_LIBRARY=${PROTOC_STATIC_LIB}
-
-DPROTOBUF_PROTOC_EXECUTABLE=${PROTOBUF_EXECUTABLE}
- -DOPENSSL_ROOT_DIR=${OPENSSL_ROOT_DIR}
- -DCMAKE_C_FLAGS=${EP_C_FLAGS}
- -DBUILD_SHARED_LIBS=OFF
- -DHDFSPP_LIBRARY_ONLY=TRUE
- -DBUILD_SHARED_HDFSPP=FALSE)
-
- if (BUILD_POSITION_INDEPENDENT_LIB)
- set(LIBHDFSPP_CMAKE_ARGS ${LIBHDFSPP_CMAKE_ARGS}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON)
- endif ()
-
- ExternalProject_Add (libhdfspp_ep
- DEPENDS orc::protobuf
- URL ${LIBHDFSPP_SRC_URL}
- LOG_DOWNLOAD 0
- LOG_CONFIGURE 0
- LOG_BUILD 0
- LOG_INSTALL 0
- BUILD_BYPRODUCTS "${LIBHDFSPP_STATIC_LIB}"
- CMAKE_ARGS ${LIBHDFSPP_CMAKE_ARGS})
-
- orc_add_built_library(libhdfspp_ep libhdfspp ${LIBHDFSPP_STATIC_LIB}
${LIBHDFSPP_INCLUDE_DIR})
-
- set (LIBHDFSPP_LIBRARIES
- libhdfspp
- ${CYRUS_SASL_SHARED_LIB}
- ${OPENSSL_LIBRARIES}
- ${CMAKE_THREAD_LIBS_INIT})
-
- elseif(CYRUS_SASL_SHARED_LIB)
- message(STATUS
- "WARNING: Libhdfs++ library was not built because the required OpenSSL
library was not found")
- elseif(OPENSSL_LIBRARIES)
- message(STATUS
- "WARNING: Libhdfs++ library was not built because the required CyrusSASL
library was not found")
- else ()
- message(STATUS
- "WARNING: Libhdfs++ library was not built because the required CyrusSASL
and OpenSSL libraries were not found")
- endif(CYRUS_SASL_SHARED_LIB AND OPENSSL_LIBRARIES)
- else(ORC_CXX_HAS_THREAD_LOCAL)
- message(STATUS
- "WARNING: Libhdfs++ library was not built because the required feature
- thread_local storage is not supported by your compiler. Known compilers
that
- support this feature: GCC, Visual Studio, Clang (community version),
- Clang (version for iOS 9 and later), Clang (version for Xcode 8 and
later)")
- endif(ORC_CXX_HAS_THREAD_LOCAL)
-endif(BUILD_LIBHDFSPP)
+
diff --git a/conan/all/conanfile.py b/conan/all/conanfile.py
index cc79f5b0d..56c7b3407 100644
--- a/conan/all/conanfile.py
+++ b/conan/all/conanfile.py
@@ -119,7 +119,6 @@ class OrcRecipe(ConanFile):
tc.variables["BUILD_JAVA"] = False
tc.variables["BUILD_CPP_TESTS"] = False
tc.variables["BUILD_TOOLS"] = self.options.build_tools
- tc.variables["BUILD_LIBHDFSPP"] = False
tc.variables["BUILD_POSITION_INDEPENDENT_LIB"] =
bool(self.options.get_safe("fPIC", True))
tc.variables["INSTALL_VENDORED_LIBS"] = False
# AVX512 support is determined by ORC_USER_SIMD_LEVEL env var at
runtime, defaults to off