kou commented on code in PR #43601:
URL: https://github.com/apache/arrow/pull/43601#discussion_r1739645940


##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -1433,11 +1455,12 @@ bool IsDirectory(std::string_view key, const 
S3Model::HeadObjectResult& result)
 class ObjectInputFile final : public io::RandomAccessFile {
  public:
   ObjectInputFile(std::shared_ptr<S3ClientHolder> holder, const io::IOContext& 
io_context,
-                  const S3Path& path, int64_t size = kNoSize)
+                  const S3Path& path, int64_t size = kNoSize, const 
std::string& c_key = "")

Review Comment:
   Could you use `sse_customer_key` not `c_key`?



##########
cpp/src/arrow/filesystem/filesystem_test.cc:
##########
@@ -54,6 +54,17 @@ TEST(FileInfo, BaseName) {
   ASSERT_EQ(info.base_name(), "baz.qux");
 }
 
+TEST(CalculateSSECKeyMD5, Sanity) { 
+  ASSERT_FALSE(CalculateSSECKeyMD5("").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("%^H").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("INVALID").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("MTIzNDU2Nzg5").ok());  // invalid, the 
input key size not match

Review Comment:
   Could you use `ASSERT_RAISES_WITH_MESSAGES()` instead?



##########
cpp/src/arrow/filesystem/filesystem_test.cc:
##########
@@ -54,6 +54,17 @@ TEST(FileInfo, BaseName) {
   ASSERT_EQ(info.base_name(), "baz.qux");
 }
 
+TEST(CalculateSSECKeyMD5, Sanity) { 
+  ASSERT_FALSE(CalculateSSECKeyMD5("").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("%^H").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("INVALID").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("MTIzNDU2Nzg5").ok());  // invalid, the 
input key size not match
+  // valid case
+  auto result = 
CalculateSSECKeyMD5("1WH9aTJ0+Tn0NLbTMHZn9aCW3Li3ViAdBsoIldPCREw=");
+  ASSERT_TRUE(result.ok());  // valid case
+  ASSERT_STREQ(result->c_str(), "3HYIM58NCLwrIOdPpWnYwQ==");  // valid case

Review Comment:
   Can we use `ASSERT_EQ()` here?



##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -439,13 +440,31 @@ bool S3Options::Equals(const S3Options& other) const {
           background_writes == other.background_writes &&
           allow_bucket_creation == other.allow_bucket_creation &&
           allow_bucket_deletion == other.allow_bucket_deletion &&
+          sse_customer_key == other.sse_customer_key &&
           default_metadata_equals && GetAccessKey() == other.GetAccessKey() &&
           GetSecretKey() == other.GetSecretKey() &&
           GetSessionToken() == other.GetSessionToken());
 }
 
 namespace {
 
+template <typename S3RequestType>
+Status SetSSECustomerKey(S3RequestType& request,
+                       const std::string& sse_customer_key) {
+  if (sse_customer_key.empty()) {
+    return Status::OK(); // do nothing if the sse_customer_key is not 
configured
+  }
+  auto result = internal::CalculateSSECKeyMD5(sse_customer_key);
+  if (result.ok()) {

Review Comment:
   Could you use `ARROW_ASSIGN_OR_RAISE()`?



##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -1668,6 +1694,8 @@ class ObjectOutputStream final : public io::OutputStream {
     S3Model::CreateMultipartUploadRequest req;
     req.SetBucket(ToAwsString(path_.bucket));
     req.SetKey(ToAwsString(path_.key));
+    RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key));
+

Review Comment:
   Could you revert a needless change?
   
   ```suggestion
   ```



##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -168,6 +168,7 @@ static constexpr const char kAwsEndpointUrlEnvVar[] = 
"AWS_ENDPOINT_URL";
 static constexpr const char kAwsEndpointUrlS3EnvVar[] = "AWS_ENDPOINT_URL_S3";
 static constexpr const char kAwsDirectoryContentType[] = 
"application/x-directory";
 
+

Review Comment:
   Could you revert a needless change?



##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -1433,11 +1455,12 @@ bool IsDirectory(std::string_view key, const 
S3Model::HeadObjectResult& result)
 class ObjectInputFile final : public io::RandomAccessFile {
  public:
   ObjectInputFile(std::shared_ptr<S3ClientHolder> holder, const io::IOContext& 
io_context,
-                  const S3Path& path, int64_t size = kNoSize)
+                  const S3Path& path, int64_t size = kNoSize, const 
std::string& c_key = "")
       : holder_(std::move(holder)),
         io_context_(io_context),
         path_(path),
-        content_length_(size) {}
+        content_length_(size),
+        sse_customer_key(c_key) {}

Review Comment:
   Could you use `XXX_` style (`sse_customer_key_`) for a member variable name?



##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -1769,6 +1797,9 @@ class ObjectOutputStream final : public io::OutputStream {
     S3Model::CompleteMultipartUploadRequest req;
     req.SetBucket(ToAwsString(path_.bucket));
     req.SetKey(ToAwsString(path_.key));
+    RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key));
+
+

Review Comment:
   Could you remove needless empty lines?
   
   ```suggestion
   ```



##########
cpp/src/arrow/filesystem/util_internal.cc:
##########
@@ -260,6 +263,42 @@ Result<FileInfoVector> GlobFiles(const 
std::shared_ptr<FileSystem>& filesystem,
   return out;
 }
 
+Result<std::string> CalculateSSECKeyMD5(const std::string& base64_encoded_key,
+    int expect_input_key_size) {
+  if (base64_encoded_key.size() < 2) {
+    return Status::Invalid("At least 2 bytes needed for the base64 encoded 
string");
+  }
+  // Check if the string contains only valid Base64 characters
+  for (char c : base64_encoded_key) {
+    if (!std::isalnum(c) && c != '+' && c != '/' && c != '=') {
+      return Status::Invalid("Invalid character found in the base64 encoded 
string");
+    }
+  }
+
+  // Decode the Base64-encoded key to get the raw binary key
+  Aws::Utils::ByteBuffer rawKey =
+      Aws::Utils::HashingUtils::Base64Decode(base64_encoded_key);
+
+  // the key needs to be // 256 bits(32 bytes)according to
+  // 
https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html#specifying-s3-c-encryption
+  if (rawKey.GetLength() != expect_input_key_size) {
+    return Status::Invalid("Invalid Length for the key");
+  }
+
+  // Convert the raw binary key to an Aws::String
+  Aws::String rawKeyStr(reinterpret_cast<const 
char*>(rawKey.GetUnderlyingData()),
+                        rawKey.GetLength());
+
+  // Compute the MD5 hash of the raw binary key
+  Aws::Utils::ByteBuffer md5Hash = 
Aws::Utils::HashingUtils::CalculateMD5(rawKeyStr);
+
+  // Base64-encode the MD5 hash
+  Aws::String awsEncodedHash = Aws::Utils::HashingUtils::Base64Encode(md5Hash);

Review Comment:
   Can we use `arrow::util::base64_encode()` instead?



##########
cpp/src/arrow/filesystem/filesystem_test.cc:
##########
@@ -54,6 +54,17 @@ TEST(FileInfo, BaseName) {
   ASSERT_EQ(info.base_name(), "baz.qux");
 }
 
+TEST(CalculateSSECKeyMD5, Sanity) { 
+  ASSERT_FALSE(CalculateSSECKeyMD5("").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("%^H").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("INVALID").ok());  // invalid base64
+  ASSERT_FALSE(CalculateSSECKeyMD5("MTIzNDU2Nzg5").ok());  // invalid, the 
input key size not match
+  // valid case
+  auto result = 
CalculateSSECKeyMD5("1WH9aTJ0+Tn0NLbTMHZn9aCW3Li3ViAdBsoIldPCREw=");
+  ASSERT_TRUE(result.ok());  // valid case

Review Comment:
   Could you use `ASSERT_OK_AND_ASSIGN()` instead?



##########
cpp/src/arrow/filesystem/s3fs.cc:
##########
@@ -1950,6 +1981,8 @@ class ObjectOutputStream final : public io::OutputStream {
     req.SetKey(ToAwsString(path_.key));
     req.SetBody(std::make_shared<StringViewStream>(data, nbytes));
     req.SetContentLength(nbytes);
+    RETURN_NOT_OK(SetSSECustomerKey(req, sse_customer_key));
+

Review Comment:
   Could you remove a needless change?
   
   ```suggestion
   ```



##########
cpp/src/arrow/filesystem/util_internal.cc:
##########
@@ -19,6 +19,9 @@
 
 #include <algorithm>
 #include <cerrno>
+#include <cctype>
+#include <aws/core/utils/HashingUtils.h>
+#include <aws/core/utils/base64/Base64.h>

Review Comment:
   Hmm. `util_internal.cc` is shared by not only the S3 filesystem 
implementation but also all filesystem implementations.
   Can we avoid using `util_internal.cc` for S3 specific utility?



##########
cpp/src/arrow/filesystem/util_internal.cc:
##########
@@ -260,6 +263,42 @@ Result<FileInfoVector> GlobFiles(const 
std::shared_ptr<FileSystem>& filesystem,
   return out;
 }
 
+Result<std::string> CalculateSSECKeyMD5(const std::string& base64_encoded_key,
+    int expect_input_key_size) {
+  if (base64_encoded_key.size() < 2) {
+    return Status::Invalid("At least 2 bytes needed for the base64 encoded 
string");
+  }
+  // Check if the string contains only valid Base64 characters
+  for (char c : base64_encoded_key) {
+    if (!std::isalnum(c) && c != '+' && c != '/' && c != '=') {
+      return Status::Invalid("Invalid character found in the base64 encoded 
string");
+    }
+  }
+
+  // Decode the Base64-encoded key to get the raw binary key
+  Aws::Utils::ByteBuffer rawKey =
+      Aws::Utils::HashingUtils::Base64Decode(base64_encoded_key);
+
+  // the key needs to be // 256 bits(32 bytes)according to

Review Comment:
   Is the `//` garbage?



##########
cpp/src/arrow/filesystem/util_internal.cc:
##########
@@ -260,6 +263,42 @@ Result<FileInfoVector> GlobFiles(const 
std::shared_ptr<FileSystem>& filesystem,
   return out;
 }
 
+Result<std::string> CalculateSSECKeyMD5(const std::string& base64_encoded_key,
+    int expect_input_key_size) {
+  if (base64_encoded_key.size() < 2) {
+    return Status::Invalid("At least 2 bytes needed for the base64 encoded 
string");
+  }
+  // Check if the string contains only valid Base64 characters
+  for (char c : base64_encoded_key) {
+    if (!std::isalnum(c) && c != '+' && c != '/' && c != '=') {
+      return Status::Invalid("Invalid character found in the base64 encoded 
string");
+    }
+  }
+
+  // Decode the Base64-encoded key to get the raw binary key
+  Aws::Utils::ByteBuffer rawKey =
+      Aws::Utils::HashingUtils::Base64Decode(base64_encoded_key);

Review Comment:
   Can we use `arrow::util::base64_decode()` instead?



##########
cpp/src/arrow/filesystem/s3fs_test.cc:
##########
@@ -443,6 +443,7 @@ class TestS3FS : public S3TestMixin {
     // Most tests will create buckets
     options_.allow_bucket_creation = true;
     options_.allow_bucket_deletion = true;
+    options_.sse_customer_key = "1WH9aTJ0+Tn0NLbTMHZn9aCW3Li3ViAdBsoIldPCREw=";

Review Comment:
   Why is this needed?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to