This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ec836382d Reduce duplication and bounds checks in cast kernels (#2284)
ec836382d is described below
commit ec836382d8659901b37255bbbf195b4ba33d847b
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Aug 3 04:23:34 2022 -0400
Reduce duplication and bounds checks in cast kernels (#2284)
---
arrow/src/compute/kernels/cast.rs | 309 ++++++++++++++------------------------
arrow/src/temporal_conversions.rs | 13 +-
2 files changed, 120 insertions(+), 202 deletions(-)
diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs
index e552ce045..8cd531ee0 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -46,6 +46,10 @@ use crate::compute::kernels::arity::unary;
use crate::compute::kernels::cast_utils::string_to_timestamp_nanos;
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
+use crate::temporal_conversions::{
+ EPOCH_DAYS_FROM_CE, MICROSECONDS, MILLISECONDS, MILLISECONDS_IN_DAY,
NANOSECONDS,
+ SECONDS_IN_DAY,
+};
use crate::{array::*, compute::take};
use crate::{buffer::Buffer, util::serialization::lexical_to_string};
use num::cast::AsPrimitive;
@@ -1247,19 +1251,6 @@ const fn time_unit_multiple(unit: &TimeUnit) -> i64 {
}
}
-/// Number of seconds in a day
-const SECONDS_IN_DAY: i64 = 86_400;
-/// Number of milliseconds in a second
-const MILLISECONDS: i64 = 1_000;
-/// Number of microseconds in a second
-const MICROSECONDS: i64 = 1_000_000;
-/// Number of nanoseconds in a second
-const NANOSECONDS: i64 = 1_000_000_000;
-/// Number of milliseconds in a day
-const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS;
-/// Number of days between 0001-01-01 and 1970-01-01
-const EPOCH_DAYS_FROM_CE: i32 = 719_163;
-
/// Cast one type of decimal array to another type of decimal array
fn cast_decimal_to_decimal(
array: &ArrayRef,
@@ -1463,35 +1454,28 @@ where
<T as ArrowPrimitiveType>::Native: lexical_core::FromLexical,
{
if cast_options.safe {
- let iter = (0..from.len()).map(|i| {
- if from.is_null(i) {
- None
- } else {
- lexical_core::parse(from.value(i).as_bytes()).ok()
- }
- });
+ let iter = from
+ .iter()
+ .map(|v| v.and_then(|v| lexical_core::parse(v.as_bytes()).ok()));
// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an
`StringArray`.
Ok(unsafe { PrimitiveArray::<T>::from_trusted_len_iter(iter) })
} else {
- let vec = (0..from.len())
- .map(|i| {
- if from.is_null(i) {
- Ok(None)
- } else {
- let string = from.value(i);
- let result = lexical_core::parse(string.as_bytes());
- Some(result.map_err(|_| {
+ let vec = from
+ .iter()
+ .map(|v| {
+ v.map(|v| {
+ lexical_core::parse(v.as_bytes()).map_err(|_| {
ArrowError::CastError(format!(
"Cannot cast string '{}' to value of {} type",
- string,
+ v,
std::any::type_name::<T>()
))
- }))
- .transpose()
- }
+ })
+ })
+ .transpose()
})
.collect::<Result<Vec<_>>>()?;
// Benefit:
@@ -1514,16 +1498,12 @@ fn cast_string_to_date32<Offset: OffsetSizeTrait>(
.unwrap();
let array = if cast_options.safe {
- let iter = (0..string_array.len()).map(|i| {
- if string_array.is_null(i) {
- None
- } else {
- string_array
- .value(i)
- .parse::<chrono::NaiveDate>()
+ let iter = string_array.iter().map(|v| {
+ v.and_then(|v| {
+ v.parse::<chrono::NaiveDate>()
.map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
.ok()
- }
+ })
});
// Benefit:
@@ -1532,25 +1512,18 @@ fn cast_string_to_date32<Offset: OffsetSizeTrait>(
// The iterator is trustedLen because it comes from an
`StringArray`.
unsafe { Date32Array::from_trusted_len_iter(iter) }
} else {
- let vec = (0..string_array.len())
- .map(|i| {
- if string_array.is_null(i) {
- Ok(None)
- } else {
- let string = string_array
- .value(i);
-
- let result = string
+ let vec = string_array.iter()
+ .map(|v| {
+ v.map(|v| {
+ v
.parse::<chrono::NaiveDate>()
- .map(|date| date.num_days_from_ce() -
EPOCH_DAYS_FROM_CE);
-
- Some(result.map_err(|_| {
+ .map(|date| date.num_days_from_ce() -
EPOCH_DAYS_FROM_CE)
+ .map_err(|_| {
ArrowError::CastError(
- format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Date32Type type", string),
- )
- }))
- .transpose()
- }
+ format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Date32Type type", v))
+ })
+ })
+ .transpose()
})
.collect::<Result<Vec<Option<i32>>>>()?;
@@ -1575,16 +1548,12 @@ fn cast_string_to_date64<Offset: OffsetSizeTrait>(
.unwrap();
let array = if cast_options.safe {
- let iter = (0..string_array.len()).map(|i| {
- if string_array.is_null(i) {
- None
- } else {
- string_array
- .value(i)
- .parse::<chrono::NaiveDateTime>()
+ let iter = string_array.iter().map(|v| {
+ v.and_then(|v| {
+ v.parse::<chrono::NaiveDateTime>()
.map(|datetime| datetime.timestamp_millis())
.ok()
- }
+ })
});
// Benefit:
@@ -1593,25 +1562,19 @@ fn cast_string_to_date64<Offset: OffsetSizeTrait>(
// The iterator is trustedLen because it comes from an
`StringArray`.
unsafe { Date64Array::from_trusted_len_iter(iter) }
} else {
- let vec = (0..string_array.len())
- .map(|i| {
- if string_array.is_null(i) {
- Ok(None)
- } else {
- let string = string_array
- .value(i);
-
- let result = string
+ let vec = string_array.iter()
+ .map(|v| {
+ v.map(|v| {
+ v
.parse::<chrono::NaiveDateTime>()
- .map(|datetime| datetime.timestamp_millis());
-
- Some(result.map_err(|_| {
- ArrowError::CastError(
- format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Date64Type type", string),
+ .map(|datetime| datetime.timestamp_millis())
+ .map_err(|_| {
+ ArrowError::CastError(
+ format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Date64Type type", v),
)
- }))
- .transpose()
- }
+ })
+ })
+ .transpose()
})
.collect::<Result<Vec<Option<i64>>>>()?;
@@ -1639,20 +1602,16 @@ fn cast_string_to_time32second<Offset: OffsetSizeTrait>(
.unwrap();
let array = if cast_options.safe {
- let iter = (0..string_array.len()).map(|i| {
- if string_array.is_null(i) {
- None
- } else {
- string_array
- .value(i)
- .parse::<chrono::NaiveTime>()
+ let iter = string_array.iter().map(|v| {
+ v.and_then(|v| {
+ v.parse::<chrono::NaiveTime>()
.map(|time| {
(time.num_seconds_from_midnight()
+ time.nanosecond() / NANOS_PER_SEC)
as i32
})
.ok()
- }
+ })
});
// Benefit:
@@ -1661,25 +1620,19 @@ fn cast_string_to_time32second<Offset: OffsetSizeTrait>(
// The iterator is trustedLen because it comes from an
`StringArray`.
unsafe { Time32SecondArray::from_trusted_len_iter(iter) }
} else {
- let vec = (0..string_array.len())
- .map(|i| {
- if string_array.is_null(i) {
- Ok(None)
- } else {
- let string = string_array
- .value(i);
- chrono::Duration::days(3);
- let result = string
+ let vec = string_array.iter()
+ .map(|v| {
+ v.map(|v| {
+ v
.parse::<chrono::NaiveTime>()
- .map(|time| (time.num_seconds_from_midnight() +
time.nanosecond() / NANOS_PER_SEC) as i32);
-
- Some(result.map_err(|_| {
- ArrowError::CastError(
- format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Time32SecondType type", string),
+ .map(|time| (time.num_seconds_from_midnight() +
time.nanosecond() / NANOS_PER_SEC) as i32)
+ .map_err(|_| {
+ ArrowError::CastError(
+ format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Time32SecondType type", v),
)
- }))
- .transpose()
- }
+ })
+ })
+ .transpose()
})
.collect::<Result<Vec<Option<i32>>>>()?;
@@ -1709,20 +1662,16 @@ fn cast_string_to_time32millisecond<Offset:
OffsetSizeTrait>(
.unwrap();
let array = if cast_options.safe {
- let iter = (0..string_array.len()).map(|i| {
- if string_array.is_null(i) {
- None
- } else {
- string_array
- .value(i)
- .parse::<chrono::NaiveTime>()
+ let iter = string_array.iter().map(|v| {
+ v.and_then(|v| {
+ v.parse::<chrono::NaiveTime>()
.map(|time| {
(time.num_seconds_from_midnight() * MILLIS_PER_SEC
+ time.nanosecond() / NANOS_PER_MILLI)
as i32
})
.ok()
- }
+ })
});
// Benefit:
@@ -1731,26 +1680,20 @@ fn cast_string_to_time32millisecond<Offset:
OffsetSizeTrait>(
// The iterator is trustedLen because it comes from an
`StringArray`.
unsafe { Time32MillisecondArray::from_trusted_len_iter(iter) }
} else {
- let vec = (0..string_array.len())
- .map(|i| {
- if string_array.is_null(i) {
- Ok(None)
- } else {
- let string = string_array
- .value(i);
-
- let result = string
+ let vec = string_array.iter()
+ .map(|v| {
+ v.map(|v| {
+ v
.parse::<chrono::NaiveTime>()
.map(|time| (time.num_seconds_from_midnight() *
MILLIS_PER_SEC
- + time.nanosecond() / NANOS_PER_MILLI) as i32);
-
- Some(result.map_err(|_| {
+ + time.nanosecond() / NANOS_PER_MILLI) as i32)
+ .map_err(|_| {
ArrowError::CastError(
- format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Time32MillisecondType type", string),
+ format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Time32MillisecondType type", v),
)
- }))
- .transpose()
- }
+ })
+ })
+ .transpose()
})
.collect::<Result<Vec<Option<i32>>>>()?;
@@ -1780,19 +1723,15 @@ fn cast_string_to_time64microsecond<Offset:
OffsetSizeTrait>(
.unwrap();
let array = if cast_options.safe {
- let iter = (0..string_array.len()).map(|i| {
- if string_array.is_null(i) {
- None
- } else {
- string_array
- .value(i)
- .parse::<chrono::NaiveTime>()
+ let iter = string_array.iter().map(|v| {
+ v.and_then(|v| {
+ v.parse::<chrono::NaiveTime>()
.map(|time| {
time.num_seconds_from_midnight() as i64 *
MICROS_PER_SEC
+ time.nanosecond() as i64 / NANOS_PER_MICRO
})
.ok()
- }
+ })
});
// Benefit:
@@ -1801,26 +1740,20 @@ fn cast_string_to_time64microsecond<Offset:
OffsetSizeTrait>(
// The iterator is trustedLen because it comes from an
`StringArray`.
unsafe { Time64MicrosecondArray::from_trusted_len_iter(iter) }
} else {
- let vec = (0..string_array.len())
- .map(|i| {
- if string_array.is_null(i) {
- Ok(None)
- } else {
- let string = string_array
- .value(i);
-
- let result = string
+ let vec = string_array.iter()
+ .map(|v| {
+ v.map(|v| {
+ v
.parse::<chrono::NaiveTime>()
.map(|time| time.num_seconds_from_midnight() as i64 *
MICROS_PER_SEC
- + time.nanosecond() as i64 / NANOS_PER_MICRO);
-
- Some(result.map_err(|_| {
- ArrowError::CastError(
- format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Time64MicrosecondType type", string),
- )
- }))
- .transpose()
- }
+ + time.nanosecond() as i64 / NANOS_PER_MICRO)
+ .map_err(|_| {
+ ArrowError::CastError(
+ format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Time64MicrosecondType type", v),
+ )
+ })
+ })
+ .transpose()
})
.collect::<Result<Vec<Option<i64>>>>()?;
@@ -1848,19 +1781,15 @@ fn cast_string_to_time64nanosecond<Offset:
OffsetSizeTrait>(
.unwrap();
let array = if cast_options.safe {
- let iter = (0..string_array.len()).map(|i| {
- if string_array.is_null(i) {
- None
- } else {
- string_array
- .value(i)
- .parse::<chrono::NaiveTime>()
+ let iter = string_array.iter().map(|v| {
+ v.and_then(|v| {
+ v.parse::<chrono::NaiveTime>()
.map(|time| {
time.num_seconds_from_midnight() as i64 * NANOS_PER_SEC
+ time.nanosecond() as i64
})
.ok()
- }
+ })
});
// Benefit:
@@ -1869,25 +1798,19 @@ fn cast_string_to_time64nanosecond<Offset:
OffsetSizeTrait>(
// The iterator is trustedLen because it comes from an
`StringArray`.
unsafe { Time64NanosecondArray::from_trusted_len_iter(iter) }
} else {
- let vec = (0..string_array.len())
- .map(|i| {
- if string_array.is_null(i) {
- Ok(None)
- } else {
- let string = string_array
- .value(i);
-
- let result = string
+ let vec = string_array.iter()
+ .map(|v| {
+ v.map(|v| {
+ v
.parse::<chrono::NaiveTime>()
- .map(|time| time.num_seconds_from_midnight() as i64 *
NANOS_PER_SEC + time.nanosecond() as i64);
-
- Some(result.map_err(|_| {
- ArrowError::CastError(
- format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Time64NanosecondType type", string),
+ .map(|time| time.num_seconds_from_midnight() as i64 *
NANOS_PER_SEC + time.nanosecond() as i64)
+ .map_err(|_| {
+ ArrowError::CastError(
+ format!("Cannot cast string '{}' to value of
arrow::datatypes::types::Time64NanosecondType type", v),
)
- }))
- .transpose()
- }
+ })
+ })
+ .transpose()
})
.collect::<Result<Vec<Option<i64>>>>()?;
@@ -1912,28 +1835,18 @@ fn cast_string_to_timestamp_ns<Offset: OffsetSizeTrait>(
.unwrap();
let array = if cast_options.safe {
- let iter = (0..string_array.len()).map(|i| {
- if string_array.is_null(i) {
- None
- } else {
- string_to_timestamp_nanos(string_array.value(i)).ok()
- }
- });
+ let iter = string_array
+ .iter()
+ .map(|v| v.and_then(|v| string_to_timestamp_nanos(v).ok()));
// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an
`StringArray`.
unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) }
} else {
- let vec = (0..string_array.len())
- .map(|i| {
- if string_array.is_null(i) {
- Ok(None)
- } else {
- let result =
string_to_timestamp_nanos(string_array.value(i));
- Some(result).transpose()
- }
- })
+ let vec = string_array
+ .iter()
+ .map(|v| v.map(string_to_timestamp_nanos).transpose())
.collect::<Result<Vec<Option<i64>>>>()?;
// Benefit:
diff --git a/arrow/src/temporal_conversions.rs b/arrow/src/temporal_conversions.rs
index 2d6d6776f..fda004a6d 100644
--- a/arrow/src/temporal_conversions.rs
+++ b/arrow/src/temporal_conversions.rs
@@ -20,13 +20,18 @@
use chrono::{Duration, NaiveDateTime, NaiveTime};
/// Number of seconds in a day
-const SECONDS_IN_DAY: i64 = 86_400;
+pub(crate) const SECONDS_IN_DAY: i64 = 86_400;
/// Number of milliseconds in a second
-const MILLISECONDS: i64 = 1_000;
+pub(crate) const MILLISECONDS: i64 = 1_000;
/// Number of microseconds in a second
-const MICROSECONDS: i64 = 1_000_000;
+pub(crate) const MICROSECONDS: i64 = 1_000_000;
/// Number of nanoseconds in a second
-const NANOSECONDS: i64 = 1_000_000_000;
+pub(crate) const NANOSECONDS: i64 = 1_000_000_000;
+
+/// Number of milliseconds in a day
+pub(crate) const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS;
+/// Number of days between 0001-01-01 and 1970-01-01
+pub(crate) const EPOCH_DAYS_FROM_CE: i32 = 719_163;
/// converts a `i32` representing a `date32` to [`NaiveDateTime`]
#[inline]