pitrou commented on code in PR #46626: URL: https://github.com/apache/arrow/pull/46626#discussion_r2129055443
########## cpp/src/arrow/util/secure_string.cc: ########## @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// __STDC_WANT_LIB_EXT1__ and string.h are required by memset_s: +// https://en.cppreference.com/w/c/string/byte/memset +#define __STDC_WANT_LIB_EXT1__ 1 +#include <string.h> +#include <utility> + +#if defined(ARROW_USE_OPENSSL) +# include <openssl/crypto.h> +# include <openssl/opensslv.h> +#endif + +#include "arrow/util/windows_compatibility.h" +#if defined(_WIN32) +# include <windows.h> +#endif + +#include "arrow/util/logging.h" +#include "arrow/util/secure_string.h" +#include "arrow/util/span.h" + +namespace arrow::util { + +/// Note: +/// A std::string is securely moved into a SecureString in two steps: +/// 1. the std::string is moved via std::move(string) +/// 2. the std::string is securely cleared +/// +/// The std::move has two different effects, depending on the size of the string. +/// A very short string (called local string) stores the string in a local buffer, +/// a long string stores a pointer to allocated memory that stores the string. +/// +/// If the string is a small string, std::move copies the local buffer. +/// If the string is a long string, std::move moves the pointer and then resets the +/// string size to 0 (which turns the string into a local string). +/// +/// In both cases, after a std::move(string), the string uses the local buffer. +/// +/// Thus, after a std::move(string), calling SecureClear(std::string*) only +/// securely clears the **local buffer** of the string. Therefore, std::move(string) +/// must move the pointer of long string into SecureString (which later clears the +/// string). Otherwise, the content of the string cannot be securely cleared. +/// +/// This condition is checked by secure_move. + +void secure_move(std::string& string, std::string& dst) { Review Comment: Nit: rename this to `SecureMove` and put it in the anonymous namespace so that it's not exposed as a public symbol. ########## cpp/src/arrow/util/secure_string_test.cc: ########## @@ -0,0 +1,478 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> +#include <algorithm> +#include <vector> + +#include "arrow/util/secure_string.h" + +namespace arrow::util::test { + +std::string_view StringArea(const std::string& string) { + return {string.data(), string.capacity()}; +} + +// same as GTest ASSERT_PRED_FORMAT2 macro, but without the outer GTEST_ASSERT_ +#define COMPARE(val1, val2) \ + ::testing::internal::EqHelper::Compare(#val1, #val2, val1, val2) + +::testing::AssertionResult AssertSecurelyCleared(const std::string_view& area) { Review Comment: I think we shouldn't call it "AssertSomething" since it doesn't actually assert. Perhaps `SecurelyCleared`? ########## cpp/src/arrow/util/secure_string_test.cc: ########## @@ -0,0 +1,478 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> +#include <algorithm> +#include <vector> + +#include "arrow/util/secure_string.h" + +namespace arrow::util::test { + +std::string_view StringArea(const std::string& string) { + return {string.data(), string.capacity()}; +} + +// same as GTest ASSERT_PRED_FORMAT2 macro, but without the outer GTEST_ASSERT_ +#define COMPARE(val1, val2) \ + ::testing::internal::EqHelper::Compare(#val1, #val2, val1, val2) + +::testing::AssertionResult AssertSecurelyCleared(const std::string_view& area) { + // the entire area is filled with zeros + std::string zeros(area.size(), '\0'); + return COMPARE(area, std::string_view(zeros)); +} + +::testing::AssertionResult AssertSecurelyCleared(const std::string& string) { + return AssertSecurelyCleared(StringArea(string)); +} + +/** + * Checks the area has been securely cleared after some position. + */ +::testing::AssertionResult AssertSecurelyCleared(const std::string_view& area, + const size_t pos) { + // the area after pos is filled with zeros + if (pos < area.size()) { + std::string zeros(area.size() - pos, '\0'); + return COMPARE(area.substr(pos), std::string_view(zeros)); + } + return ::testing::AssertionSuccess(); +} + +/** + * Checks the area has been securely cleared from the secret value. + * Assumes the area has been deallocated, so it might have been reclaimed and changed + * after cleaning. We cannot check for all-zeros, best we can check here is no secret + * character has leaked. If by any chance the modification produced a former key character + * at the right position, this will be false negative / flaky. Therefore, we check for + * three consecutive secret characters before we fail. + */ +::testing::AssertionResult AssertSecurelyCleared(const std::string_view& area, + const std::string& secret_value) { +#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) + return testing::AssertionSuccess() << "Not checking deallocated memory"; +#else + // accessing deallocated memory will fail when running with Address Sanitizer enabled + auto leaks = 0; + for (size_t i = 0; i < std::min(area.length(), secret_value.length()); i++) { + if (area[i] == secret_value[i]) { + leaks++; + } else { + if (leaks >= 3) { + break; + } + leaks = 0; + } + } + if (leaks >= 3) { + return ::testing::AssertionFailure() + << leaks << " characters of secret leaked into " << area; + } + return ::testing::AssertionSuccess(); +#endif +} + +TEST(TestSecureString, AssertSecurelyCleared) { + // This tests AssertSecurelyCleared helper methods is actually able to identify secret + // leakage. It retrieves assertion results and asserts result type and message. + testing::AssertionResult result = testing::AssertionSuccess(); + + // check short string with all zeros + auto short_zeros = std::string(8, '\0'); + short_zeros.resize(short_zeros.capacity(), '\0'); // for string buffers longer than 8 + short_zeros.resize(8); // now the entire string buffer has zeros + // checks the entire string buffer (capacity) + ASSERT_TRUE(AssertSecurelyCleared(short_zeros)); + // checks only 10 bytes (length) + ASSERT_TRUE(AssertSecurelyCleared(std::string_view(short_zeros))); + + // check long string with all zeros + auto long_zeros = std::string(1000, '\0'); + long_zeros.resize(long_zeros.capacity(), '\0'); // for longer string buffers + long_zeros.resize(1000); // now the entire string buffer has zeros + // checks the entire string buffer (capacity) + ASSERT_TRUE(AssertSecurelyCleared(long_zeros)); + // checks only 1000 bytes (length) + ASSERT_TRUE(AssertSecurelyCleared(std::string_view(long_zeros))); + + auto no_zeros = std::string("abcdefghijklmnopqrstuvwxyz"); + // string buffer in no_zeros can be larger than no_zeros.length() + // assert only the area that we can control + auto no_zeros_view = std::string_view(no_zeros); + result = AssertSecurelyCleared(no_zeros_view); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "Expected equality of these values:\n" + " area\n" + " Which is: \"abcdefghijklmnopqrstuvwxyz\"\n" + " std::string_view(zeros)\n" + " Which is: " + "\"\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\" + "0\\0\\0\\0\\0\""); + + // check short string with zeros and non-zeros after string length + auto stars = std::string(12, '*'); + auto short_some_zeros = stars; + memset(short_some_zeros.data(), '\0', 8); + short_some_zeros.resize(8); + // string buffer in short_some_zeros can be larger than 12 + // assert only the area that we can control + auto short_some_zeros_view = std::string_view(short_some_zeros.data(), 12); + result = AssertSecurelyCleared(short_some_zeros_view); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "Expected equality of these values:\n" + " area\n" + " Which is: \"\\0\\0\\0\\0\\0\\0\\0\\0\\0***\"\n" + " std::string_view(zeros)\n" + " Which is: \"\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\""); + + ASSERT_TRUE(AssertSecurelyCleared(short_some_zeros, stars)); +#if !defined(ARROW_VALGRIND) && !defined(ADDRESS_SANITIZER) + result = AssertSecurelyCleared(short_some_zeros_view, stars); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "3 characters of secret leaked into " + "\\0\\0\\0\\0\\0\\0\\0\\0\\0***"); +#endif + + // check long string with zeros and non-zeros after string length + stars = std::string(42, '*'); + auto long_some_zeros = stars; + memset(long_some_zeros.data(), '\0', 32); + long_some_zeros.resize(32); + // string buffer in long_some_zeros can be larger than 42 + // assert only the area that we can control + auto long_some_zeros_view = std::string_view(long_some_zeros.data(), 42); + result = AssertSecurelyCleared(long_some_zeros_view); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "Expected equality of these values:\n" + " area\n" + " Which is: " + "\"\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\" + "0\\0\\0\\0\\0\\0\\0\\0\\0*********\"\n" + " std::string_view(zeros)\n" + " Which is: " + "\"\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\" + "0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\""); + + ASSERT_TRUE(AssertSecurelyCleared(long_some_zeros, stars)); +#if !defined(ARROW_VALGRIND) && !defined(ADDRESS_SANITIZER) + result = AssertSecurelyCleared(long_some_zeros_view, stars); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "9 characters of secret leaked into " + "\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\" + "0\\0\\0\\0\\0\\0\\0\\0\\0*********"); +#endif + + // check string with non-zeros and zeros after string length + auto some_zeros_back = std::string(no_zeros.length() + 3, '\0'); + some_zeros_back = no_zeros; + memset(some_zeros_back.data() + no_zeros.length() * sizeof(char), '\0', 3 + 1); + // string buffer in some_zeros_back can be larger than no_zeros.length() + 3 + // assert only the area that we can control + auto some_zeros_back_view = + std::string_view(some_zeros_back.data(), no_zeros.length() + 3); + ASSERT_TRUE(AssertSecurelyCleared(some_zeros_back_view, no_zeros.length())); +} + +TEST(TestSecureString, SecureClearString) { + // short string + { + std::string tiny("abc"); + auto old_area = StringArea(tiny); + SecureString::SecureClear(&tiny); + ASSERT_TRUE(AssertSecurelyCleared(tiny)); + ASSERT_TRUE(AssertSecurelyCleared(old_area)); + } + + // long string + { + std::string large(1024, 'x'); + large.resize(512, 'y'); + auto old_area = StringArea(large); + SecureString::SecureClear(&large); + ASSERT_TRUE(AssertSecurelyCleared(large)); + ASSERT_TRUE(AssertSecurelyCleared(old_area)); + } + + // empty string + { + // this creates an empty string with some non-zero characters in the string buffer + // we test that all those characters are securely cleared + std::string empty("abcdef"); + empty.resize(0); + auto old_area = StringArea(empty); + SecureString::SecureClear(&empty); + ASSERT_TRUE(AssertSecurelyCleared(empty)); + ASSERT_TRUE(AssertSecurelyCleared(old_area)); + } +} + +TEST(TestSecureString, Construct) { + // move-constructing from a string securely clears that string + std::string string("hello world"); + auto old_string = StringArea(string); + SecureString secret_from_string(std::move(string)); + ASSERT_TRUE(AssertSecurelyCleared(string)); + ASSERT_TRUE(AssertSecurelyCleared(old_string)); + ASSERT_FALSE(secret_from_string.empty()); Review Comment: Perhaps also assert the SecureString contents here? ```suggestion ASSERT_FALSE(secret_from_string.empty()); ASSERT_EQ(secret_from_string.as_view(), "hello world"); ``` ########## cpp/src/arrow/util/secure_string.cc: ########## @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// __STDC_WANT_LIB_EXT1__ and string.h are required by memset_s: +// https://en.cppreference.com/w/c/string/byte/memset +#define __STDC_WANT_LIB_EXT1__ 1 +#include <string.h> +#include <utility> + +#if defined(ARROW_USE_OPENSSL) +# include <openssl/crypto.h> +# include <openssl/opensslv.h> +#endif + +#include "arrow/util/windows_compatibility.h" +#if defined(_WIN32) +# include <windows.h> +#endif + +#include "arrow/util/logging.h" +#include "arrow/util/secure_string.h" +#include "arrow/util/span.h" + +namespace arrow::util { + +/// Note: +/// A std::string is securely moved into a SecureString in two steps: +/// 1. the std::string is moved via std::move(string) +/// 2. the std::string is securely cleared +/// +/// The std::move has two different effects, depending on the size of the string. +/// A very short string (called local string) stores the string in a local buffer, +/// a long string stores a pointer to allocated memory that stores the string. +/// +/// If the string is a small string, std::move copies the local buffer. +/// If the string is a long string, std::move moves the pointer and then resets the +/// string size to 0 (which turns the string into a local string). +/// +/// In both cases, after a std::move(string), the string uses the local buffer. +/// +/// Thus, after a std::move(string), calling SecureClear(std::string*) only +/// securely clears the **local buffer** of the string. Therefore, std::move(string) +/// must move the pointer of long string into SecureString (which later clears the +/// string). Otherwise, the content of the string cannot be securely cleared. +/// +/// This condition is checked by secure_move. + +void secure_move(std::string& string, std::string& dst) { + auto ptr = string.data(); + dst = std::move(string); + + // We require the buffer address string.data() to remain (not be freed), + // or reused by dst. Otherwise, we cannot securely clear string after this move + ARROW_CHECK(string.data() == ptr || dst.data() == ptr); +} + +SecureString::SecureString(SecureString&& other) noexcept { + secure_move(other.secret_, secret_); + other.Dispose(); +} + +SecureString::SecureString(std::string&& secret) noexcept { + secure_move(secret, secret_); + SecureClear(&secret); +} + +SecureString::SecureString(size_t n, char c) noexcept : secret_(n, c) {} + +SecureString& SecureString::operator=(SecureString&& other) noexcept { + if (this == &other) { + // self-assignment + return *this; + } + Dispose(); + secure_move(other.secret_, secret_); + other.Dispose(); + return *this; +} + +SecureString& SecureString::operator=(const SecureString& other) { + if (this == &other) { + // self-assignment + return *this; + } + Dispose(); + secret_ = other.secret_; + return *this; +} + +SecureString& SecureString::operator=(std::string&& secret) noexcept { + Dispose(); + secure_move(secret, secret_); + SecureClear(&secret); + return *this; +} + +bool SecureString::operator==(const SecureString& other) const { + return secret_ == other.secret_; +} + +bool SecureString::operator!=(const SecureString& other) const { + return secret_ != other.secret_; +} + +bool SecureString::empty() const { return secret_.empty(); } + +std::size_t SecureString::size() const { return secret_.size(); } + +std::size_t SecureString::length() const { return secret_.length(); } + +std::size_t SecureString::capacity() const { return secret_.capacity(); } + +span<uint8_t> SecureString::as_span() { + return {reinterpret_cast<uint8_t*>(secret_.data()), secret_.size()}; +} + +span<const uint8_t> SecureString::as_span() const { + return {reinterpret_cast<const uint8_t*>(secret_.data()), secret_.size()}; +} + +std::string_view SecureString::as_view() const { + return {secret_.data(), secret_.size()}; +} + +void SecureString::Dispose() { SecureClear(&secret_); } + +void SecureString::SecureClear(std::string* secret) { + // call SecureClear first just in case secret->clear() frees some memory + SecureClear(reinterpret_cast<uint8_t*>(secret->data()), secret->capacity()); + secret->clear(); +} + +inline void SecureString::SecureClear(uint8_t* data, size_t size) { Review Comment: @EnricoMi That's a good idea, yes. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
