wgtmac commented on code in PR #603:
URL: https://github.com/apache/iceberg-cpp/pull/603#discussion_r2998667206


##########
src/iceberg/CMakeLists.txt:
##########
@@ -64,6 +64,8 @@ set(ICEBERG_SOURCES
     partition_spec.cc
     partition_summary.cc
     puffin/file_metadata.cc
+    puffin/puffin_format.cc
+    puffin/puffin_json_internal.cc

Review Comment:
   Let's rename to `puffin/json_serde_internal.h` and `puffin/json_serde.cc` 
respectively for consistency.



##########
src/iceberg/puffin/puffin_format.h:
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/puffin/puffin_format.h
+/// Puffin file format constants and utilities.
+
+#include <array>
+#include <cstdint>
+#include <span>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/puffin/file_metadata.h"
+#include "iceberg/result.h"
+
+namespace iceberg::puffin {
+
+/// \brief Puffin file format constants.
+struct ICEBERG_EXPORT PuffinFormat {
+  /// Magic bytes: "PFA1" (Puffin Fratercula arctica, version 1)
+  static constexpr std::array<uint8_t, 4> kMagic = {0x50, 0x46, 0x41, 0x31};
+
+  static constexpr int32_t kMagicLength = 4;
+  static constexpr int32_t kFooterStartMagicOffset = 0;
+  static constexpr int32_t kFooterStartMagicLength = kMagicLength;
+  static constexpr int32_t kFooterStructPayloadSizeOffset = 0;
+  static constexpr int32_t kFooterStructFlagsOffset = 
kFooterStructPayloadSizeOffset + 4;
+  static constexpr int32_t kFooterStructFlagsLength = 4;
+  static constexpr int32_t kFooterStructMagicOffset =
+      kFooterStructFlagsOffset + kFooterStructFlagsLength;
+
+  /// Total length of the footer struct: payload_size(4) + flags(4) + magic(4)
+  static constexpr int32_t kFooterStructLength = kFooterStructMagicOffset + 
kMagicLength;
+
+  /// Default compression codec for footer payload.
+  static constexpr PuffinCompressionCodec kFooterCompressionCodec =

Review Comment:
   ```suggestion
     static constexpr PuffinCompressionCodec kDefaultFooterCompressionCodec =
   ```



##########
src/iceberg/test/CMakeLists.txt:
##########
@@ -125,6 +125,10 @@ add_iceberg_test(util_test
 
 add_iceberg_test(roaring_test SOURCES roaring_test.cc)
 
+add_iceberg_test(puffin_format_test SOURCES puffin_format_test.cc)
+
+add_iceberg_test(puffin_json_test SOURCES puffin_json_test.cc)

Review Comment:
   Can we merge them into a single puffin_test executable?



##########
src/iceberg/test/puffin_format_test.cc:
##########
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/puffin/puffin_format.h"
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+namespace iceberg::puffin {
+
+TEST(PuffinFormatTest, ByteOrderRoundTrip) {
+  std::array<uint8_t, 4> buf{};
+  WriteInt32LittleEndian(0x12345678, buf);

Review Comment:
   Should these functions be moved to `endian.h`, since they are not Puffin-specific?



##########
src/iceberg/puffin/puffin_format.cc:
##########
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/puffin/puffin_format.h"
+
+#include <utility>
+
+#include "iceberg/util/endian.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg::puffin {
+
+namespace {
+
+constexpr std::pair<int, int> GetFlagPosition(PuffinFlag flag) {
+  switch (flag) {
+    case PuffinFlag::kFooterPayloadCompressed:
+      return {0, 0};
+  }
+  std::unreachable();
+}
+
+}  // namespace
+
+bool IsFlagSet(std::span<const uint8_t, 4> flags, PuffinFlag flag) {
+  auto [byte_num, bit_num] = GetFlagPosition(flag);
+  return (flags[byte_num] & (1 << bit_num)) != 0;
+}
+
+void SetFlag(std::span<uint8_t, 4> flags, PuffinFlag flag) {
+  auto [byte_num, bit_num] = GetFlagPosition(flag);
+  flags[byte_num] |= (1 << bit_num);
+}
+
+void WriteInt32LittleEndian(int32_t value, std::span<uint8_t, 4> output) {
+  WriteLittleEndian(value, output.data());
+}
+
+int32_t ReadInt32LittleEndian(std::span<const uint8_t, 4> input) {
+  return ReadLittleEndian<int32_t>(input.data());
+}
+
+int32_t ReadInt32LittleEndian(std::span<const uint8_t> data, int32_t offset) {
+  ICEBERG_DCHECK(offset >= 0, "Offset must be non-negative");

Review Comment:
   `ICEBERG_DCHECK` is a debug check and should only be used in rare cases. If 
this is a generic function that cannot guarantee its input, we should return 
`Result<int32_t>` and replace the check with `ICEBERG_PRECHECK` here.



##########
src/iceberg/puffin/puffin_format.h:
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/puffin/puffin_format.h
+/// Puffin file format constants and utilities.
+
+#include <array>
+#include <cstdint>
+#include <span>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/puffin/file_metadata.h"
+#include "iceberg/result.h"
+
+namespace iceberg::puffin {
+
+/// \brief Puffin file format constants.
+struct ICEBERG_EXPORT PuffinFormat {
+  /// Magic bytes: "PFA1" (Puffin Fratercula arctica, version 1)
+  static constexpr std::array<uint8_t, 4> kMagic = {0x50, 0x46, 0x41, 0x31};

Review Comment:
   ```suggestion
     static constexpr std::array<uint8_t, 4> kMagicV1 = {0x50, 0x46, 0x41, 
0x31};
   ```
   
   Let's make V1 explicit.



##########
src/iceberg/puffin/puffin_format.h:
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/puffin/puffin_format.h
+/// Puffin file format constants and utilities.
+
+#include <array>
+#include <cstdint>
+#include <span>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/puffin/file_metadata.h"
+#include "iceberg/result.h"
+
+namespace iceberg::puffin {
+
+/// \brief Puffin file format constants.
+struct ICEBERG_EXPORT PuffinFormat {
+  /// Magic bytes: "PFA1" (Puffin Fratercula arctica, version 1)
+  static constexpr std::array<uint8_t, 4> kMagic = {0x50, 0x46, 0x41, 0x31};
+
+  static constexpr int32_t kMagicLength = 4;
+  static constexpr int32_t kFooterStartMagicOffset = 0;
+  static constexpr int32_t kFooterStartMagicLength = kMagicLength;
+  static constexpr int32_t kFooterStructPayloadSizeOffset = 0;
+  static constexpr int32_t kFooterStructFlagsOffset = 
kFooterStructPayloadSizeOffset + 4;
+  static constexpr int32_t kFooterStructFlagsLength = 4;
+  static constexpr int32_t kFooterStructMagicOffset =
+      kFooterStructFlagsOffset + kFooterStructFlagsLength;
+
+  /// Total length of the footer struct: payload_size(4) + flags(4) + magic(4)
+  static constexpr int32_t kFooterStructLength = kFooterStructMagicOffset + 
kMagicLength;
+
+  /// Default compression codec for footer payload.
+  static constexpr PuffinCompressionCodec kFooterCompressionCodec =
+      PuffinCompressionCodec::kLz4;
+};
+
+/// \brief Footer flags for Puffin files.
+enum class PuffinFlag : uint8_t {
+  /// Whether the footer payload is compressed.
+  kFooterPayloadCompressed = 0,
+};
+
+/// \brief Check if a flag is set in the flags bytes.
+ICEBERG_EXPORT bool IsFlagSet(std::span<const uint8_t, 4> flags, PuffinFlag 
flag);
+
+/// \brief Set a flag in the flags bytes.
+ICEBERG_EXPORT void SetFlag(std::span<uint8_t, 4> flags, PuffinFlag flag);
+
+/// \brief Write a 32-bit integer in little-endian format.
+ICEBERG_EXPORT void WriteInt32LittleEndian(int32_t value, std::span<uint8_t, 
4> output);
+
+/// \brief Read a 32-bit integer from a fixed-size span in little-endian 
format.
+ICEBERG_EXPORT int32_t ReadInt32LittleEndian(std::span<const uint8_t, 4> 
input);
+
+/// \brief Read a 32-bit integer from a buffer at the given offset in 
little-endian
+/// format.
+ICEBERG_EXPORT int32_t ReadInt32LittleEndian(std::span<const uint8_t> data,
+                                             int32_t offset);
+
+/// \brief Compress data using the specified codec.
+ICEBERG_EXPORT Result<std::vector<uint8_t>> Compress(PuffinCompressionCodec 
codec,

Review Comment:
   This is not the right place for compression logic, as it can be shared by 
TableMetadata compression as well. We can consider a single place to unify the 
codec implementation (as well as the codec availability).



##########
src/iceberg/puffin/puffin_format.cc:
##########
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/puffin/puffin_format.h"
+
+#include <utility>
+
+#include "iceberg/util/endian.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg::puffin {
+
+namespace {
+
+constexpr std::pair<int, int> GetFlagPosition(PuffinFlag flag) {

Review Comment:
   Add a comment to improve the readability?



##########
src/iceberg/test/puffin_json_test.cc:
##########
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <string>
+
+#include <gtest/gtest.h>
+#include <nlohmann/json.hpp>
+
+#include "iceberg/puffin/file_metadata.h"
+#include "iceberg/puffin/puffin_json_internal.h"
+
+namespace iceberg::puffin {
+
+TEST(PuffinJsonTest, BlobMetadataRoundTrip) {
+  BlobMetadata blob;
+  blob.type = "apache-datasketches-theta-v1";
+  blob.input_fields = {1, 2};
+  blob.snapshot_id = 12345;
+  blob.sequence_number = 67;
+  blob.offset = 100;
+  blob.length = 200;
+  blob.compression_codec = "zstd";
+  blob.properties = {{"key", "value"}};
+
+  nlohmann::json expected_json = R"({
+    "type": "apache-datasketches-theta-v1",
+    "fields": [1, 2],
+    "snapshot-id": 12345,
+    "sequence-number": 67,
+    "offset": 100,
+    "length": 200,
+    "compression-codec": "zstd",
+    "properties": {"key": "value"}
+  })"_json;
+
+  EXPECT_EQ(ToJson(blob), expected_json);
+
+  auto result = BlobMetadataFromJson(expected_json);
+  ASSERT_TRUE(result.has_value());
+  EXPECT_EQ(result.value(), blob);

Review Comment:
   nit: merge these two checks into `ASSERT_THAT`



##########
src/iceberg/test/puffin_json_test.cc:
##########
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <string>
+
+#include <gtest/gtest.h>
+#include <nlohmann/json.hpp>
+
+#include "iceberg/puffin/file_metadata.h"
+#include "iceberg/puffin/puffin_json_internal.h"
+
+namespace iceberg::puffin {
+
+TEST(PuffinJsonTest, BlobMetadataRoundTrip) {

Review Comment:
   We can improve these test cases by refactoring them into parameterized tests 
with more valid and invalid cases. Many other test files already follow this 
pattern.



##########
src/iceberg/test/puffin_json_test.cc:
##########
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <string>
+
+#include <gtest/gtest.h>
+#include <nlohmann/json.hpp>
+
+#include "iceberg/puffin/file_metadata.h"
+#include "iceberg/puffin/puffin_json_internal.h"
+
+namespace iceberg::puffin {
+
+TEST(PuffinJsonTest, BlobMetadataRoundTrip) {
+  BlobMetadata blob;

Review Comment:
   nit: use aggregate initialization.



##########
src/iceberg/puffin/puffin_format.h:
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/puffin/puffin_format.h
+/// Puffin file format constants and utilities.
+
+#include <array>
+#include <cstdint>
+#include <span>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/puffin/file_metadata.h"
+#include "iceberg/result.h"
+
+namespace iceberg::puffin {
+
+/// \brief Puffin file format constants.
+struct ICEBERG_EXPORT PuffinFormat {
+  /// Magic bytes: "PFA1" (Puffin Fratercula arctica, version 1)
+  static constexpr std::array<uint8_t, 4> kMagic = {0x50, 0x46, 0x41, 0x31};
+
+  static constexpr int32_t kMagicLength = 4;
+  static constexpr int32_t kFooterStartMagicOffset = 0;
+  static constexpr int32_t kFooterStartMagicLength = kMagicLength;
+  static constexpr int32_t kFooterStructPayloadSizeOffset = 0;
+  static constexpr int32_t kFooterStructFlagsOffset = 
kFooterStructPayloadSizeOffset + 4;
+  static constexpr int32_t kFooterStructFlagsLength = 4;
+  static constexpr int32_t kFooterStructMagicOffset =
+      kFooterStructFlagsOffset + kFooterStructFlagsLength;
+
+  /// Total length of the footer struct: payload_size(4) + flags(4) + magic(4)
+  static constexpr int32_t kFooterStructLength = kFooterStructMagicOffset + 
kMagicLength;
+
+  /// Default compression codec for footer payload.
+  static constexpr PuffinCompressionCodec kFooterCompressionCodec =
+      PuffinCompressionCodec::kLz4;
+};
+
+/// \brief Footer flags for Puffin files.
+enum class PuffinFlag : uint8_t {
+  /// Whether the footer payload is compressed.
+  kFooterPayloadCompressed = 0,
+};
+
+/// \brief Check if a flag is set in the flags bytes.
+ICEBERG_EXPORT bool IsFlagSet(std::span<const uint8_t, 4> flags, PuffinFlag 
flag);
+
+/// \brief Set a flag in the flags bytes.
+ICEBERG_EXPORT void SetFlag(std::span<uint8_t, 4> flags, PuffinFlag flag);
+
+/// \brief Write a 32-bit integer in little-endian format.
+ICEBERG_EXPORT void WriteInt32LittleEndian(int32_t value, std::span<uint8_t, 
4> output);
+
+/// \brief Read a 32-bit integer from a fixed-size span in little-endian 
format.
+ICEBERG_EXPORT int32_t ReadInt32LittleEndian(std::span<const uint8_t, 4> 
input);
+
+/// \brief Read a 32-bit integer from a buffer at the given offset in 
little-endian
+/// format.
+ICEBERG_EXPORT int32_t ReadInt32LittleEndian(std::span<const uint8_t> data,
+                                             int32_t offset);
+
+/// \brief Compress data using the specified codec.
+ICEBERG_EXPORT Result<std::vector<uint8_t>> Compress(PuffinCompressionCodec 
codec,
+                                                     std::span<const uint8_t> 
input);
+
+/// \brief Decompress data using the specified codec.
+ICEBERG_EXPORT Result<std::vector<uint8_t>> Decompress(PuffinCompressionCodec 
codec,
+                                                       std::span<const 
uint8_t> input);

Review Comment:
   We're on C++23, so `std::byte` is definitely available. Together with my 
previous comment about a generic codec interface/implementation, we can use 
`std::span<const std::byte>` as input and `std::vector<std::byte>` as output, 
along with overloads that accept `std::string_view` and return `std::string`, 
etc.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to