wgtmac commented on code in PR #562:
URL: https://github.com/apache/iceberg-cpp/pull/562#discussion_r2929984316


##########
src/iceberg/expression/literal.cc:
##########
@@ -193,12 +198,36 @@ Result<Literal> LiteralCaster::CastFromString(
       ICEBERG_ASSIGN_OR_RAISE(auto uuid, Uuid::FromString(str_val));
       return Literal::UUID(uuid);
     }
-    case TypeId::kDate:
-    case TypeId::kTime:
-    case TypeId::kTimestamp:
-    case TypeId::kTimestampTz:
-      return NotImplemented("Cast from String to {} is not implemented yet",
-                            target_type->ToString());
+    case TypeId::kDate: {
+      ICEBERG_ASSIGN_OR_RAISE(auto days, TransformUtil::ParseDay(str_val));
+      return Literal::Date(days);
+    }
+    case TypeId::kTime: {
+      ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTime(str_val));
+      return Literal::Time(micros);
+    }
+    case TypeId::kTimestamp: {
+      ICEBERG_ASSIGN_OR_RAISE(auto micros, 
TransformUtil::ParseTimestamp(str_val));
+      return Literal::Timestamp(micros);
+    }
+    case TypeId::kTimestampTz: {
+      ICEBERG_ASSIGN_OR_RAISE(auto micros,
+                              TransformUtil::ParseTimestampWithZone(str_val));
+      return Literal::TimestampTz(micros);
+    }
+    case TypeId::kBinary: {
+      ICEBERG_ASSIGN_OR_RAISE(auto bytes, 
StringUtils::HexStringToBytes(str_val));
+      return Literal::Binary(std::move(bytes));
+    }
+    case TypeId::kFixed: {
+      ICEBERG_ASSIGN_OR_RAISE(auto bytes, 
StringUtils::HexStringToBytes(str_val));
+      return Literal::Fixed(std::move(bytes));
+    }
+    case TypeId::kDecimal: {
+      const auto& dec_type = internal::checked_cast<const 
DecimalType&>(*target_type);
+      ICEBERG_ASSIGN_OR_RAISE(auto dec, Decimal::FromString(str_val));

Review Comment:
   Same as my other comment, we need to check the parsed scale against 
dec_type.scale()



##########
src/iceberg/util/transform_util.cc:
##########
@@ -92,6 +111,87 @@ std::string TransformUtil::HumanTimestampWithZone(int64_t 
timestamp_micros) {
   }
 }
 
+Result<int32_t> TransformUtil::ParseDay(std::string_view str) {
+  // Expected format: "yyyy-MM-dd" )
+  // Parse year, month, day manually
+  auto dash1 = str.find('-', str[0] == '-' ? 1 : 0);
+  auto dash2 = str.find('-', dash1 + 1);
+  if (str.size() < 10 || dash1 == std::string_view::npos ||
+      dash2 == std::string_view::npos) [[unlikely]] {
+    return InvalidArgument("Invalid date string: '{}'", str);
+  }
+  int32_t year = 0, month = 0, day = 0;
+  auto [_, e1] = std::from_chars(str.data(), str.data() + dash1, year);

Review Comment:
   Let's reuse `ParseNumber` from string_util.h.



##########
src/iceberg/util/transform_util.cc:
##########
@@ -92,6 +111,87 @@ std::string TransformUtil::HumanTimestampWithZone(int64_t 
timestamp_micros) {
   }
 }
 
+Result<int32_t> TransformUtil::ParseDay(std::string_view str) {
+  // Expected format: "yyyy-MM-dd" )
+  // Parse year, month, day manually
+  auto dash1 = str.find('-', str[0] == '-' ? 1 : 0);
+  auto dash2 = str.find('-', dash1 + 1);
+  if (str.size() < 10 || dash1 == std::string_view::npos ||
+      dash2 == std::string_view::npos) [[unlikely]] {
+    return InvalidArgument("Invalid date string: '{}'", str);
+  }
+  int32_t year = 0, month = 0, day = 0;
+  auto [_, e1] = std::from_chars(str.data(), str.data() + dash1, year);
+  auto [__, e2] = std::from_chars(str.data() + dash1 + 1, str.data() + dash2, 
month);
+  auto [___, e3] = std::from_chars(str.data() + dash2 + 1, str.data() + 
str.size(), day);
+
+  if (e1 != std::errc{} || e2 != std::errc{} || e3 != std::errc{}) 
[[unlikely]] {
+    return InvalidArgument("Invalid year in date string: '{}'", str);
+  }
+
+  auto ymd = std::chrono::year{year} / 
std::chrono::month{static_cast<unsigned>(month)} /
+             std::chrono::day{static_cast<unsigned>(day)};
+  if (!ymd.ok()) [[unlikely]] {
+    return InvalidArgument("Invalid date: '{}'", str);
+  }
+
+  auto days = std::chrono::sys_days(ymd) - std::chrono::sys_days(kEpochDate);
+  return static_cast<int32_t>(days.count());
+}
+
+Result<int64_t> TransformUtil::ParseTime(std::string_view str) {
+  int64_t hours = 0, minutes = 0, seconds = 0;
+
+  auto [_, eh] = std::from_chars(str.data(), str.data() + 2, hours);
+
+  auto [__, em] = std::from_chars(str.data() + 3, str.data() + 5, minutes);
+
+  if ((em != std::errc{}) || (eh != std::errc{}) || (str.size()) < 5) 
[[unlikely]] {
+    return InvalidArgument("Invalid time string: '{}'", str);
+  }
+
+  int64_t frac_micros = 0;
+  if (str.size() > 5) {
+    auto [_, es] = std::from_chars(str.data() + 6, str.data() + 8, seconds);
+    if (str[5] != ':' || str.size() < 8 || es != std::errc{}) [[unlikely]] {
+      return InvalidArgument("Invalid time string: '{}'", str);
+    }
+    if (str.size() > 8) {
+      if (str[8] != '.') [[unlikely]] {
+        return InvalidArgument("Invalid time string: '{}'", str);
+      }
+      ICEBERG_ASSIGN_OR_RAISE(frac_micros, 
ParseFractionalMicros(str.substr(9)));
+    }
+  }
+
+  return hours * 3'600 * kMicrosPerSecond + minutes * 60 * kMicrosPerSecond +

Review Comment:
   We need to check `hours < 24, minutes < 60, seconds <= 60`. It seems that 
`99:99:99` can be parsed without issue at the moment.



##########
src/iceberg/expression/json_serde.cc:
##########
@@ -298,10 +298,126 @@ Result<nlohmann::json> ToJson(const Literal& literal) {
   }
 }
 
-Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* 
/*type*/) {
-  // TODO(gangwu): implement type-aware literal parsing equivalent to Java's
-  // SingleValueParser.fromJson(type, node).
-  return LiteralFromJson(json);
+Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* type) {
+  // If {"type": "literal", "value": <actual>} wrapper is present, unwrap it 
first.
+  if (json.is_object() && json.contains(kType) &&
+      json[kType].get<std::string>() == kLiteral && json.contains(kValue)) {
+    return LiteralFromJson(json[kValue], type);
+  }
+  // If no type context is provided, fall back to untyped parsing.
+  if (type == nullptr) return LiteralFromJson(json);
+
+  // Type-aware parsing equivalent to Java's SingleValueParser.fromJson(type, 
node).
+  switch (type->type_id()) {
+    case TypeId::kBoolean:
+      if (!json.is_boolean()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a boolean value", 
SafeDumpJson(json));
+      return Literal::Boolean(json.get<bool>());
+
+    case TypeId::kInt:
+      if (!json.is_number_integer()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as an int value", 
SafeDumpJson(json));
+      return Literal::Int(json.get<int32_t>());
+
+    case TypeId::kLong:
+      if (!json.is_number_integer()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a long value", 
SafeDumpJson(json));
+      return Literal::Long(json.get<int64_t>());
+
+    case TypeId::kFloat:
+      if (!json.is_number()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a float value", 
SafeDumpJson(json));
+      return Literal::Float(json.get<float>());
+
+    case TypeId::kDouble:
+      if (!json.is_number()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a double value", 
SafeDumpJson(json));
+      return Literal::Double(json.get<double>());
+
+    case TypeId::kString:
+      if (!json.is_string()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a string value", 
SafeDumpJson(json));
+      return Literal::String(json.get<std::string>());
+
+    // For temporal types (date, time, timestamp, timestamp_tz), we support 
both integer
+    // and string representations.
+    case TypeId::kDate:
+      if (json.is_number_integer()) return Literal::Date(json.get<int32_t>());
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto days,
+                                
TransformUtil::ParseDay(json.get<std::string>()));
+        return Literal::Date(days);
+      }
+      return JsonParseError("Cannot parse {} as a date value", 
SafeDumpJson(json));
+
+    case TypeId::kTime:
+      if (json.is_number_integer()) return Literal::Time(json.get<int64_t>());
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto micros,
+                                
TransformUtil::ParseTime(json.get<std::string>()));
+        return Literal::Time(micros);
+      }
+      return JsonParseError("Cannot parse {} as a time value", 
SafeDumpJson(json));
+
+    case TypeId::kTimestamp:
+      if (json.is_number_integer()) return 
Literal::Timestamp(json.get<int64_t>());
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto micros,
+                                
TransformUtil::ParseTimestamp(json.get<std::string>()));
+        return Literal::Timestamp(micros);
+      }
+      return JsonParseError("Cannot parse {} as a timestamp value", 
SafeDumpJson(json));
+
+    case TypeId::kTimestampTz:
+      if (json.is_number_integer()) return 
Literal::TimestampTz(json.get<int64_t>());
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(
+            auto micros, 
TransformUtil::ParseTimestampWithZone(json.get<std::string>()));
+        return Literal::TimestampTz(micros);
+      }
+      return JsonParseError("Cannot parse {} as a timestamptz value", 
SafeDumpJson(json));
+
+    case TypeId::kUuid:
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto uuid, 
Uuid::FromString(json.get<std::string>()));
+        return Literal::UUID(uuid);
+      }
+      return JsonParseError("Cannot parse {} as a uuid value", 
SafeDumpJson(json));

Review Comment:
   ```suggestion
         if (!json.is_string()) {
           return JsonParseError("Cannot parse {} as a uuid value", 
SafeDumpJson(json));
         }
         ICEBERG_ASSIGN_OR_RAISE(auto uuid, 
Uuid::FromString(json.get<std::string>()));
         return Literal::UUID(uuid);
   ```
   
   Let's just be consistent as above? Same for below.



##########
src/iceberg/util/transform_util.cc:
##########
@@ -92,6 +111,87 @@ std::string TransformUtil::HumanTimestampWithZone(int64_t 
timestamp_micros) {
   }
 }
 
+Result<int32_t> TransformUtil::ParseDay(std::string_view str) {
+  // Expected format: "yyyy-MM-dd" )
+  // Parse year, month, day manually
+  auto dash1 = str.find('-', str[0] == '-' ? 1 : 0);
+  auto dash2 = str.find('-', dash1 + 1);
+  if (str.size() < 10 || dash1 == std::string_view::npos ||
+      dash2 == std::string_view::npos) [[unlikely]] {
+    return InvalidArgument("Invalid date string: '{}'", str);
+  }
+  int32_t year = 0, month = 0, day = 0;
+  auto [_, e1] = std::from_chars(str.data(), str.data() + dash1, year);

Review Comment:
   BTW, it seems that `20x6-03-03` can be parsed without issue since 
`from_chars` ignores the non-numeric characters. We may need to check the 
returned ptr to see if it consumes all input.



##########
src/iceberg/expression/json_serde.cc:
##########
@@ -298,10 +298,126 @@ Result<nlohmann::json> ToJson(const Literal& literal) {
   }
 }
 
-Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* 
/*type*/) {
-  // TODO(gangwu): implement type-aware literal parsing equivalent to Java's
-  // SingleValueParser.fromJson(type, node).
-  return LiteralFromJson(json);
+Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* type) {
+  // If {"type": "literal", "value": <actual>} wrapper is present, unwrap it 
first.
+  if (json.is_object() && json.contains(kType) &&
+      json[kType].get<std::string>() == kLiteral && json.contains(kValue)) {
+    return LiteralFromJson(json[kValue], type);
+  }
+  // If no type context is provided, fall back to untyped parsing.
+  if (type == nullptr) return LiteralFromJson(json);
+
+  // Type-aware parsing equivalent to Java's SingleValueParser.fromJson(type, 
node).
+  switch (type->type_id()) {
+    case TypeId::kBoolean:
+      if (!json.is_boolean()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a boolean value", 
SafeDumpJson(json));
+      return Literal::Boolean(json.get<bool>());
+
+    case TypeId::kInt:
+      if (!json.is_number_integer()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as an int value", 
SafeDumpJson(json));
+      return Literal::Int(json.get<int32_t>());
+
+    case TypeId::kLong:
+      if (!json.is_number_integer()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a long value", 
SafeDumpJson(json));
+      return Literal::Long(json.get<int64_t>());
+
+    case TypeId::kFloat:
+      if (!json.is_number()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a float value", 
SafeDumpJson(json));
+      return Literal::Float(json.get<float>());
+
+    case TypeId::kDouble:
+      if (!json.is_number()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a double value", 
SafeDumpJson(json));
+      return Literal::Double(json.get<double>());
+
+    case TypeId::kString:
+      if (!json.is_string()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a string value", 
SafeDumpJson(json));
+      return Literal::String(json.get<std::string>());
+
+    // For temporal types (date, time, timestamp, timestamp_tz), we support 
both integer
+    // and string representations.
+    case TypeId::kDate:
+      if (json.is_number_integer()) return Literal::Date(json.get<int32_t>());
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto days,
+                                
TransformUtil::ParseDay(json.get<std::string>()));
+        return Literal::Date(days);
+      }
+      return JsonParseError("Cannot parse {} as a date value", 
SafeDumpJson(json));
+
+    case TypeId::kTime:
+      if (json.is_number_integer()) return Literal::Time(json.get<int64_t>());
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto micros,
+                                
TransformUtil::ParseTime(json.get<std::string>()));
+        return Literal::Time(micros);
+      }
+      return JsonParseError("Cannot parse {} as a time value", 
SafeDumpJson(json));
+
+    case TypeId::kTimestamp:
+      if (json.is_number_integer()) return 
Literal::Timestamp(json.get<int64_t>());
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto micros,
+                                
TransformUtil::ParseTimestamp(json.get<std::string>()));
+        return Literal::Timestamp(micros);
+      }
+      return JsonParseError("Cannot parse {} as a timestamp value", 
SafeDumpJson(json));
+
+    case TypeId::kTimestampTz:
+      if (json.is_number_integer()) return 
Literal::TimestampTz(json.get<int64_t>());
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(
+            auto micros, 
TransformUtil::ParseTimestampWithZone(json.get<std::string>()));
+        return Literal::TimestampTz(micros);
+      }
+      return JsonParseError("Cannot parse {} as a timestamptz value", 
SafeDumpJson(json));
+
+    case TypeId::kUuid:
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto uuid, 
Uuid::FromString(json.get<std::string>()));
+        return Literal::UUID(uuid);
+      }
+      return JsonParseError("Cannot parse {} as a uuid value", 
SafeDumpJson(json));
+
+    case TypeId::kBinary:
+      if (json.is_string()) {
+        ICEBERG_ASSIGN_OR_RAISE(auto bytes,
+                                
StringUtils::HexStringToBytes(json.get<std::string>()));
+        return Literal::Binary(std::move(bytes));
+      }
+      return JsonParseError("Cannot parse {} as a binary value", 
SafeDumpJson(json));
+
+    case TypeId::kFixed: {
+      if (json.is_string()) {
+        const auto& fixed_type = internal::checked_cast<const 
FixedType&>(*type);
+        const std::string& hex = json.get<std::string>();
+        if (hex.size() != static_cast<size_t>(fixed_type.length()) * 2) 
[[unlikely]]
+          return JsonParseError("Cannot parse fixed[{}]: expected {} hex 
chars, got {}",
+                                fixed_type.length(), fixed_type.length() * 2, 
hex.size());
+        ICEBERG_ASSIGN_OR_RAISE(auto bytes, 
StringUtils::HexStringToBytes(hex));
+        return Literal::Fixed(std::move(bytes));
+      }
+      return JsonParseError("Cannot parse {} as a fixed value", 
SafeDumpJson(json));
+    }
+
+    case TypeId::kDecimal: {
+      if (json.is_string()) {
+        const auto& dec_type = internal::checked_cast<const 
DecimalType&>(*type);
+        ICEBERG_ASSIGN_OR_RAISE(auto dec, 
Decimal::FromString(json.get<std::string>()));

Review Comment:
   We need to check the output scale from `Decimal::FromString` to make sure it 
is same as in the type.



##########
src/iceberg/expression/json_serde.cc:
##########
@@ -298,10 +298,126 @@ Result<nlohmann::json> ToJson(const Literal& literal) {
   }
 }
 
-Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* 
/*type*/) {
-  // TODO(gangwu): implement type-aware literal parsing equivalent to Java's
-  // SingleValueParser.fromJson(type, node).
-  return LiteralFromJson(json);
+Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* type) {
+  // If {"type": "literal", "value": <actual>} wrapper is present, unwrap it 
first.
+  if (json.is_object() && json.contains(kType) &&
+      json[kType].get<std::string>() == kLiteral && json.contains(kValue)) {
+    return LiteralFromJson(json[kValue], type);
+  }
+  // If no type context is provided, fall back to untyped parsing.
+  if (type == nullptr) return LiteralFromJson(json);
+
+  // Type-aware parsing equivalent to Java's SingleValueParser.fromJson(type, 
node).
+  switch (type->type_id()) {
+    case TypeId::kBoolean:
+      if (!json.is_boolean()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a boolean value", 
SafeDumpJson(json));
+      return Literal::Boolean(json.get<bool>());
+
+    case TypeId::kInt:
+      if (!json.is_number_integer()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as an int value", 
SafeDumpJson(json));
+      return Literal::Int(json.get<int32_t>());
+
+    case TypeId::kLong:
+      if (!json.is_number_integer()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a long value", 
SafeDumpJson(json));
+      return Literal::Long(json.get<int64_t>());
+
+    case TypeId::kFloat:
+      if (!json.is_number()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a float value", 
SafeDumpJson(json));
+      return Literal::Float(json.get<float>());
+
+    case TypeId::kDouble:
+      if (!json.is_number()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a double value", 
SafeDumpJson(json));
+      return Literal::Double(json.get<double>());
+
+    case TypeId::kString:
+      if (!json.is_string()) [[unlikely]]
+        return JsonParseError("Cannot parse {} as a string value", 
SafeDumpJson(json));
+      return Literal::String(json.get<std::string>());
+
+    // For temporal types (date, time, timestamp, timestamp_tz), we support 
both integer
+    // and string representations.
+    case TypeId::kDate:
+      if (json.is_number_integer()) return Literal::Date(json.get<int32_t>());

Review Comment:
   I'd recommend not support integer representation like this as the timezone 
processing is really tricky in C++. We cannot really trust arbitrary integers 
from timestamp values.



##########
src/iceberg/test/transform_util_test.cc:
##########
@@ -157,4 +159,97 @@ TEST(TransformUtilTest, Base64Encode) {
   EXPECT_EQ("AA==", TransformUtil::Base64Encode({"\x00", 1}));
 }
 
+struct ParseRoundTripParam {
+  std::string name;
+  std::string str;
+  int64_t value;
+  enum Kind { kDay, kTime, kTimestamp, kTimestampTz } kind;
+};
+
+class ParseRoundTripTest : public 
::testing::TestWithParam<ParseRoundTripParam> {};
+
+TEST_P(ParseRoundTripTest, RoundTrip) {
+  const auto& param = GetParam();
+  switch (param.kind) {
+    case ParseRoundTripParam::kDay: {
+      EXPECT_EQ(TransformUtil::HumanDay(static_cast<int32_t>(param.value)), 
param.str);
+      ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseDay(param.str));
+      EXPECT_EQ(parsed, static_cast<int32_t>(param.value));
+      break;
+    }
+    case ParseRoundTripParam::kTime: {
+      EXPECT_EQ(TransformUtil::HumanTime(param.value), param.str);
+      ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTime(param.str));
+      EXPECT_EQ(parsed, param.value);
+      break;
+    }
+    case ParseRoundTripParam::kTimestamp: {
+      EXPECT_EQ(TransformUtil::HumanTimestamp(param.value), param.str);
+      ICEBERG_UNWRAP_OR_FAIL(auto parsed, 
TransformUtil::ParseTimestamp(param.str));
+      EXPECT_EQ(parsed, param.value);
+      break;
+    }
+    case ParseRoundTripParam::kTimestampTz: {
+      EXPECT_EQ(TransformUtil::HumanTimestampWithZone(param.value), param.str);
+      ICEBERG_UNWRAP_OR_FAIL(auto parsed,
+                             TransformUtil::ParseTimestampWithZone(param.str));
+      EXPECT_EQ(parsed, param.value);
+      break;
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    TransformUtilTest, ParseRoundTripTest,
+    ::testing::Values(
+        // Day round-trips
+        ParseRoundTripParam{"DayEpoch", "1970-01-01", 0, 
ParseRoundTripParam::kDay},

Review Comment:
   Thanks for adding these test cases! It would be good to add cases for 
various invalid values.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to