Repository: arrow Updated Branches: refs/heads/master 3d2e4df21 -> 6178bf7b0
ARROW-350: Added Kerberos to HDFS client Author: Christopher C. Aycock <christopher.ayc...@twosigma.com> Closes #185 from chrisaycock/ARROW-350 and squashes the following commits: c2a4e64 [Christopher C. Aycock] Renamed 'kerb' parameter to 'kerb_ticket' f1d63de [Christopher C. Aycock] ARROW-350: Added Kerberos to HDFS client 8f1052f [Christopher C. Aycock] ARROW-345: Proper locations of libhdfs and libjvm on Mac Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/6178bf7b Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/6178bf7b Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/6178bf7b Branch: refs/heads/master Commit: 6178bf7b0f0cf66f52536f5d5fb5ee104e696f3c Parents: 3d2e4df Author: Christopher C. Aycock <christopher.ayc...@twosigma.com> Authored: Fri Oct 28 21:13:02 2016 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Fri Oct 28 21:13:02 2016 -0400 ---------------------------------------------------------------------- cpp/doc/HDFS.md | 22 ++++++- cpp/src/arrow/io/hdfs.cc | 16 ++++- cpp/src/arrow/io/hdfs.h | 9 +-- cpp/src/arrow/io/libhdfs_shim.cc | 87 ++++++++++++++++++++-------- python/pyarrow/includes/libarrow_io.pxd | 1 + python/pyarrow/io.pyx | 29 +++++++--- 6 files changed, 124 insertions(+), 40 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/doc/HDFS.md ---------------------------------------------------------------------- diff --git a/cpp/doc/HDFS.md b/cpp/doc/HDFS.md index 83311db..6b1bb8c 100644 --- a/cpp/doc/HDFS.md +++ b/cpp/doc/HDFS.md @@ -43,7 +43,7 @@ LD_LIBRARY_PATH), and relies on some environment variables. export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` ``` -#### Setting $JAVA_HOME automatically on OS X +### Mac Specifics The installed location of Java on OS X can vary, however the following snippet will set it automatically for you: @@ -51,3 +51,23 @@ will set it automatically for you: ```shell export JAVA_HOME=$(/usr/libexec/java_home) ``` + +Homebrew's Hadoop does not have native libs. Apache doesn't build these, so +users must build Hadoop to get the native libs. See this Stack Overflow +answer for details: + +http://stackoverflow.com/a/40051353/478288 + +Be sure to include the path to the native libs in `JAVA_LIBRARY_PATH`: + +```shell +export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH +``` + +If you get an error about needing to install Java 6, then add *BundledApp* and +*JNI* to the `JVMCapabilities` in `$JAVA_HOME/../Info.plist`. See + +https://oliverdowling.com.au/2015/10/09/oracles-jre-8-on-mac-os-x-el-capitan/ + +https://derflounder.wordpress.com/2015/08/08/modifying-oracles-java-sdk-to-run-java-applications-on-os-x/ + http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/hdfs.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index b74f846..6490a75 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -287,12 +287,25 @@ class HdfsClient::HdfsClientImpl { Status Connect(const HdfsConnectionConfig* config) { RETURN_NOT_OK(ConnectLibHdfs()); - fs_ = hdfsConnectAsUser(config->host.c_str(), config->port, config->user.c_str()); + // connect to HDFS with the builder object + hdfsBuilder* builder = hdfsNewBuilder(); + if (!config->host.empty()) { + hdfsBuilderSetNameNode(builder, config->host.c_str()); + } + hdfsBuilderSetNameNodePort(builder, config->port); + if (!config->user.empty()) { + hdfsBuilderSetUserName(builder, config->user.c_str()); + } + if (!config->kerb_ticket.empty()) { + hdfsBuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str()); + } + fs_ = hdfsBuilderConnect(builder); if (fs_ == nullptr) { return Status::IOError("HDFS connection failed"); } namenode_host_ = config->host; port_ = config->port; user_ = config->user; + kerb_ticket_ = config->kerb_ticket; return Status::OK(); } @@ -425,6 +438,7 @@ class HdfsClient::HdfsClientImpl { std::string namenode_host_; std::string user_; int port_; + std::string kerb_ticket_; hdfsFS fs_; }; http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/hdfs.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index 4a4e3ec..48699c9 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -60,19 +60,16 @@ struct HdfsConnectionConfig { std::string host; int port; std::string user; - - // TODO: Kerberos, etc. + std::string kerb_ticket; }; class ARROW_EXPORT HdfsClient : public FileSystemClient { public: ~HdfsClient(); - // Connect to an HDFS cluster at indicated host, port, and as user + // Connect to an HDFS cluster given a configuration // - // @param host (in) - // @param port (in) - // @param user (in): user to identify as + // @param config (in): configuration for connecting // @param fs (out): the created client // @returns Status static Status Connect( http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/libhdfs_shim.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc index f256c31..07eb625 100644 --- a/cpp/src/arrow/io/libhdfs_shim.cc +++ b/cpp/src/arrow/io/libhdfs_shim.cc @@ -73,9 +73,17 @@ static HINSTANCE libjvm_handle = NULL; // NOTE(wesm): cpplint does not like use of short and other imprecise C types -static hdfsFS (*ptr_hdfsConnectAsUser)( - const char* host, tPort port, const char* user) = NULL; -static hdfsFS (*ptr_hdfsConnect)(const char* host, tPort port) = NULL; +static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = NULL; +static void (*ptr_hdfsBuilderSetNameNode)( + hdfsBuilder* bld, const char* nn) = NULL; +static void (*ptr_hdfsBuilderSetNameNodePort)( + hdfsBuilder* bld, tPort port) = NULL; +static void (*ptr_hdfsBuilderSetUserName)( + hdfsBuilder* bld, const char* userName) = NULL; +static void (*ptr_hdfsBuilderSetKerbTicketCachePath)( + hdfsBuilder* bld, const char* kerbTicketCachePath) = NULL; +static hdfsFS (*ptr_hdfsBuilderConnect)(hdfsBuilder* bld) = NULL; + static int (*ptr_hdfsDisconnect)(hdfsFS fs) = NULL; static hdfsFile (*ptr_hdfsOpenFile)(hdfsFS fs, const char* path, int flags, @@ -149,18 +157,29 @@ static void* get_symbol(const char* symbol) { #endif } -hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char* user) { - return ptr_hdfsConnectAsUser(host, port, user); +hdfsBuilder* hdfsNewBuilder(void) { + return ptr_hdfsNewBuilder(); } -// Returns NULL on failure -hdfsFS hdfsConnect(const char* host, tPort port) { - if (ptr_hdfsConnect) { - return ptr_hdfsConnect(host, port); - } else { - // TODO: error reporting when shim setup fails - return NULL; - } +void hdfsBuilderSetNameNode(hdfsBuilder* bld, const char* nn) { + ptr_hdfsBuilderSetNameNode(bld, nn); +} + +void hdfsBuilderSetNameNodePort(hdfsBuilder* bld, tPort port) { + ptr_hdfsBuilderSetNameNodePort(bld, port); +} + +void hdfsBuilderSetUserName(hdfsBuilder* bld, const char* userName) { + ptr_hdfsBuilderSetUserName(bld, userName); +} + +void hdfsBuilderSetKerbTicketCachePath(hdfsBuilder* bld, + const char* kerbTicketCachePath) { + ptr_hdfsBuilderSetKerbTicketCachePath(bld , kerbTicketCachePath); +} + +hdfsFS hdfsBuilderConnect(hdfsBuilder* bld) { + return ptr_hdfsBuilderConnect(bld); } int hdfsDisconnect(hdfsFS fs) { @@ -342,18 +361,36 @@ int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime) { } static std::vector<fs::path> get_potential_libhdfs_paths() { - std::vector<fs::path> libhdfs_potential_paths = { - // find one in the local directory - fs::path("./libhdfs.so"), fs::path("./hdfs.dll"), - // find a global libhdfs.so - fs::path("libhdfs.so"), fs::path("hdfs.dll"), + std::vector<fs::path> libhdfs_potential_paths; + std::string file_name; + + // OS-specific file name +#ifdef __WIN32 + file_name = "hdfs.dll"; +#elif __APPLE__ + file_name = "libhdfs.dylib"; +#else + file_name = "libhdfs.so"; +#endif + + // Common paths + std::vector<fs::path> search_paths = { + fs::path(""), + fs::path(".") }; + // Path from environment variable const char* hadoop_home = std::getenv("HADOOP_HOME"); if (hadoop_home != nullptr) { - auto path = fs::path(hadoop_home) / "lib/native/libhdfs.so"; - libhdfs_potential_paths.push_back(path); + auto path = fs::path(hadoop_home) / "lib/native"; + search_paths.push_back(path); } + + // All paths with file name + for (auto& path : search_paths) { + libhdfs_potential_paths.push_back(path / file_name); + } + return libhdfs_potential_paths; } @@ -371,7 +408,7 @@ static std::vector<fs::path> get_potential_libjvm_paths() { file_name = "jvm.dll"; #elif __APPLE__ search_prefixes = {""}; - search_suffixes = {""}; + search_suffixes = {"", "/jre/lib/server"}; file_name = "libjvm.dylib"; // SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now we are @@ -513,8 +550,12 @@ Status ARROW_EXPORT ConnectLibHdfs() { return Status::IOError("Prior attempt to load libhdfs failed"); } - GET_SYMBOL_REQUIRED(hdfsConnect); - GET_SYMBOL_REQUIRED(hdfsConnectAsUser); + GET_SYMBOL_REQUIRED(hdfsNewBuilder); + GET_SYMBOL_REQUIRED(hdfsBuilderSetNameNode); + GET_SYMBOL_REQUIRED(hdfsBuilderSetNameNodePort); + GET_SYMBOL_REQUIRED(hdfsBuilderSetUserName); + GET_SYMBOL_REQUIRED(hdfsBuilderSetKerbTicketCachePath); + GET_SYMBOL_REQUIRED(hdfsBuilderConnect); GET_SYMBOL_REQUIRED(hdfsCreateDirectory); GET_SYMBOL_REQUIRED(hdfsDelete); GET_SYMBOL_REQUIRED(hdfsDisconnect); http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/python/pyarrow/includes/libarrow_io.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd index 8074915..7703415 100644 --- a/python/pyarrow/includes/libarrow_io.pxd +++ b/python/pyarrow/includes/libarrow_io.pxd @@ -93,6 +93,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: c_string host int port c_string user + c_string kerb_ticket cdef cppclass HdfsPathInfo: ObjectType kind; http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/python/pyarrow/io.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index 16ebfa1..0e6b81e 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -288,9 +288,6 @@ cdef class HdfsClient: shared_ptr[CHdfsClient] client cdef readonly: - object host - int port - object user bint is_open def __cinit__(self): @@ -301,6 +298,9 @@ cdef class HdfsClient: self.close() def close(self): + """ + Disconnect from the HDFS cluster + """ self._ensure_client() with nogil: check_status(self.client.get().Disconnect()) @@ -313,14 +313,21 @@ cdef class HdfsClient: raise IOError('HDFS client is closed') @classmethod - def connect(cls, host, port, user): + def connect(cls, host="default", port=0, user=None, kerb_ticket=None): """ + Connect to an HDFS cluster. All parameters are optional and should + only be set if the defaults need to be overridden. + + Authentication should be automatic if the HDFS cluster uses Kerberos. + However, if a username is specified, then the ticket cache will likely + be required. Parameters ---------- - host : - port : - user : + host : NameNode. Set to "default" for fs.defaultFS from core-site.xml. + port : NameNode's port. Set to 0 for default or logical (HA) nodes. + user : Username when connecting to HDFS; None implies login user. + kerb_ticket : Path to Kerberos ticket cache. Notes ----- @@ -335,9 +342,13 @@ cdef class HdfsClient: HdfsClient out = HdfsClient() HdfsConnectionConfig conf - conf.host = tobytes(host) + if host is not None: + conf.host = tobytes(host) conf.port = port - conf.user = tobytes(user) + if user is not None: + conf.user = tobytes(user) + if kerb_ticket is not None: + conf.kerb_ticket = tobytes(kerb_ticket) with nogil: check_status(CHdfsClient.Connect(&conf, &out.client))