jecsand838 commented on code in PR #9280:
URL: https://github.com/apache/arrow-rs/pull/9280#discussion_r2742748189
##########
arrow-avro/src/reader/mod.rs:
##########
@@ -1163,6 +1167,12 @@ impl ReaderBuilder {
self
}
+ /// Sets the timezone representation for Avro timestamp fields.
+ pub fn with_tz(mut self, tz: Tz) -> Self {
+ self.use_tz = tz;
+ self
+ }
+
Review Comment:
Would you mind adding an e2e test using this?
Also may help to explain in the doc comment what the default value is.
##########
arrow-avro/src/codec.rs:
##########
@@ -953,12 +987,15 @@ impl From<&Codec> for UnionFieldKind {
Codec::Date32 => Self::Date,
Codec::TimeMillis => Self::TimeMillis,
Codec::TimeMicros => Self::TimeMicros,
- Codec::TimestampMillis(true) => Self::TimestampMillisUtc,
- Codec::TimestampMillis(false) => Self::TimestampMillisLocal,
- Codec::TimestampMicros(true) => Self::TimestampMicrosUtc,
- Codec::TimestampMicros(false) => Self::TimestampMicrosLocal,
- Codec::TimestampNanos(true) => Self::TimestampNanosUtc,
- Codec::TimestampNanos(false) => Self::TimestampNanosLocal,
+ Codec::TimestampMillis(Some(Tz::OffsetZero)) =>
Self::TimestampMillisUtc,
+ Codec::TimestampMillis(Some(Tz::Utc)) => Self::TimestampMillisUtc,
+ Codec::TimestampMillis(None) => Self::TimestampMillisLocal,
+ Codec::TimestampMicros(Some(Tz::OffsetZero)) =>
Self::TimestampMicrosUtc,
+ Codec::TimestampMicros(Some(Tz::Utc)) => Self::TimestampMicrosUtc,
+ Codec::TimestampMicros(None) => Self::TimestampMicrosLocal,
+ Codec::TimestampNanos(Some(Tz::OffsetZero)) =>
Self::TimestampNanosUtc,
+ Codec::TimestampNanos(Some(Tz::Utc)) => Self::TimestampNanosUtc,
+ Codec::TimestampNanos(None) => Self::TimestampNanosLocal,
Review Comment:
This maybe a bit cleaner.
```suggestion
Codec::TimestampMillis(Some(_)) => Self::TimestampMillisUtc,
Codec::TimestampMillis(None) => Self::TimestampMillisLocal,
Codec::TimestampMicros(Some(_)) => Self::TimestampMicrosUtc,
Codec::TimestampMicros(None) => Self::TimestampMicrosLocal,
Codec::TimestampNanos(Some(_)) => Self::TimestampNanosUtc,
Codec::TimestampNanos(None) => Self::TimestampNanosLocal,
```
##########
arrow-avro/src/codec.rs:
##########
@@ -611,6 +619,29 @@ impl<'a> AvroFieldBuilder<'a> {
}
}
+/// Timezone representation for timestamps.
+///
+/// Avro only distinguishes between UTC and local time (no timezone), but
Arrow supports
+/// any of the two identifiers of the UTC timezone: "+00:00" and "UTC".
+/// The data types using these time zone IDs behave identically, but are not
logically equal.
+#[derive(Debug, Copy, Clone, PartialEq, Default)]
+pub enum Tz {
+ /// Represent Avro `timestamp-*` logical types with "+00:00" timezone ID
+ #[default]
+ OffsetZero,
+ /// Represent Avro `timestamp-*` logical types with "UTC" timezone ID
+ Utc,
+}
+
+impl Display for Tz {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ Tz::OffsetZero => write!(f, "+00:00"),
+ Tz::Utc => write!(f, "UTC"),
+ }
+ }
+}
Review Comment:
What about doing something like this?
```suggestion
impl Tz {
pub fn as_str(&self) -> &'static str {
match self {
Self::OffsetZero => "+00:00",
Self::Utc => "UTC",
}
}
}
impl Display for Tz {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(self.as_str())
}
}
```
`as_str()` can be used to convert directly to `Arc<str>` without
intermediate allocation. Then we can use it both in `fmt` and again below in
`Codec::data_type`.
Meanwhile we can use `f.write_str` to avoid the overhead of the `write!`
macro when parsing a format string for simple string literals.
##########
arrow-avro/src/codec.rs:
##########
@@ -715,15 +746,18 @@ impl Codec {
Self::Date32 => DataType::Date32,
Self::TimeMillis => DataType::Time32(TimeUnit::Millisecond),
Self::TimeMicros => DataType::Time64(TimeUnit::Microsecond),
- Self::TimestampMillis(is_utc) => {
- DataType::Timestamp(TimeUnit::Millisecond, is_utc.then(||
"+00:00".into()))
- }
- Self::TimestampMicros(is_utc) => {
- DataType::Timestamp(TimeUnit::Microsecond, is_utc.then(||
"+00:00".into()))
- }
- Self::TimestampNanos(is_utc) => {
- DataType::Timestamp(TimeUnit::Nanosecond, is_utc.then(||
"+00:00".into()))
- }
+ Self::TimestampMillis(tz) => DataType::Timestamp(
+ TimeUnit::Millisecond,
+ tz.as_ref().map(|tz| tz.to_string().into()),
+ ),
+ Self::TimestampMicros(tz) => DataType::Timestamp(
+ TimeUnit::Microsecond,
+ tz.as_ref().map(|tz| tz.to_string().into()),
+ ),
+ Self::TimestampNanos(tz) => DataType::Timestamp(
+ TimeUnit::Nanosecond,
+ tz.as_ref().map(|tz| tz.to_string().into()),
+ ),
Review Comment:
Then down here you could do:
```suggestion
Self::TimestampMillis(tz) => DataType::Timestamp(
TimeUnit::Millisecond,
tz.as_ref().map(|tz| tz.as_str().into()),
),
Self::TimestampMicros(tz) => DataType::Timestamp(
TimeUnit::Microsecond,
tz.as_ref().map(|tz| tz.as_str().into()),
),
Self::TimestampNanos(tz) => DataType::Timestamp(
TimeUnit::Nanosecond,
tz.as_ref().map(|tz| tz.as_str().into()),
),
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]