This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new f373a866ce Add initial support for Utf8View and BinaryView types
(#10925)
f373a866ce is described below
commit f373a866ce849679f6726699e795ffcb6638609c
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jun 17 15:31:21 2024 -0400
Add initial support for Utf8View and BinaryView types (#10925)
* add view types
* Add slt tests
* comment out failing test
* update vendored code
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/common/src/scalar/mod.rs | 93 ++++++++++++++++++----
datafusion/functions/src/core/arrow_cast.rs | 4 +
.../proto-common/proto/datafusion_common.proto | 4 +
datafusion/proto-common/src/from_proto/mod.rs | 4 +
datafusion/proto-common/src/generated/pbjson.rs | 55 +++++++++++++
datafusion/proto-common/src/generated/prost.rs | 12 ++-
datafusion/proto-common/src/to_proto/mod.rs | 10 +++
.../proto/src/generated/datafusion_proto_common.rs | 12 ++-
datafusion/sql/src/unparser/expr.rs | 8 ++
.../sqllogictest/test_files/arrow_typeof.slt | 10 +++
10 files changed, 193 insertions(+), 19 deletions(-)
diff --git a/datafusion/common/src/scalar/mod.rs
b/datafusion/common/src/scalar/mod.rs
index 8073b21cdd..3daf347ae4 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -221,10 +221,14 @@ pub enum ScalarValue {
UInt64(Option<u64>),
/// utf-8 encoded string.
Utf8(Option<String>),
+ /// utf-8 encoded string whose arrow type is `Utf8View` (string view array).
+ Utf8View(Option<String>),
/// utf-8 encoded string representing a LargeString's arrow type.
LargeUtf8(Option<String>),
/// binary
Binary(Option<Vec<u8>>),
+ /// binary whose arrow type is `BinaryView` (binary view array).
+ BinaryView(Option<Vec<u8>>),
/// fixed size binary
FixedSizeBinary(i32, Option<Vec<u8>>),
/// large binary
@@ -345,10 +349,14 @@ impl PartialEq for ScalarValue {
(UInt64(_), _) => false,
(Utf8(v1), Utf8(v2)) => v1.eq(v2),
(Utf8(_), _) => false,
+ (Utf8View(v1), Utf8View(v2)) => v1.eq(v2),
+ (Utf8View(_), _) => false,
(LargeUtf8(v1), LargeUtf8(v2)) => v1.eq(v2),
(LargeUtf8(_), _) => false,
(Binary(v1), Binary(v2)) => v1.eq(v2),
(Binary(_), _) => false,
+ (BinaryView(v1), BinaryView(v2)) => v1.eq(v2),
+ (BinaryView(_), _) => false,
(FixedSizeBinary(_, v1), FixedSizeBinary(_, v2)) => v1.eq(v2),
(FixedSizeBinary(_, _), _) => false,
(LargeBinary(v1), LargeBinary(v2)) => v1.eq(v2),
@@ -470,8 +478,12 @@ impl PartialOrd for ScalarValue {
(Utf8(_), _) => None,
(LargeUtf8(v1), LargeUtf8(v2)) => v1.partial_cmp(v2),
(LargeUtf8(_), _) => None,
+ (Utf8View(v1), Utf8View(v2)) => v1.partial_cmp(v2),
+ (Utf8View(_), _) => None,
(Binary(v1), Binary(v2)) => v1.partial_cmp(v2),
(Binary(_), _) => None,
+ (BinaryView(v1), BinaryView(v2)) => v1.partial_cmp(v2),
+ (BinaryView(_), _) => None,
(FixedSizeBinary(_, v1), FixedSizeBinary(_, v2)) =>
v1.partial_cmp(v2),
(FixedSizeBinary(_, _), _) => None,
(LargeBinary(v1), LargeBinary(v2)) => v1.partial_cmp(v2),
@@ -667,11 +679,10 @@ impl std::hash::Hash for ScalarValue {
UInt16(v) => v.hash(state),
UInt32(v) => v.hash(state),
UInt64(v) => v.hash(state),
- Utf8(v) => v.hash(state),
- LargeUtf8(v) => v.hash(state),
- Binary(v) => v.hash(state),
- FixedSizeBinary(_, v) => v.hash(state),
- LargeBinary(v) => v.hash(state),
+ Utf8(v) | LargeUtf8(v) | Utf8View(v) => v.hash(state),
+ Binary(v) | FixedSizeBinary(_, v) | LargeBinary(v) | BinaryView(v)
=> {
+ v.hash(state)
+ }
List(arr) => {
hash_nested_array(arr.to_owned() as ArrayRef, state);
}
@@ -1107,7 +1118,9 @@ impl ScalarValue {
ScalarValue::Float64(_) => DataType::Float64,
ScalarValue::Utf8(_) => DataType::Utf8,
ScalarValue::LargeUtf8(_) => DataType::LargeUtf8,
+ ScalarValue::Utf8View(_) => DataType::Utf8View,
ScalarValue::Binary(_) => DataType::Binary,
+ ScalarValue::BinaryView(_) => DataType::BinaryView,
ScalarValue::FixedSizeBinary(sz, _) =>
DataType::FixedSizeBinary(*sz),
ScalarValue::LargeBinary(_) => DataType::LargeBinary,
ScalarValue::List(arr) => arr.data_type().to_owned(),
@@ -1310,11 +1323,13 @@ impl ScalarValue {
ScalarValue::UInt16(v) => v.is_none(),
ScalarValue::UInt32(v) => v.is_none(),
ScalarValue::UInt64(v) => v.is_none(),
- ScalarValue::Utf8(v) => v.is_none(),
- ScalarValue::LargeUtf8(v) => v.is_none(),
- ScalarValue::Binary(v) => v.is_none(),
- ScalarValue::FixedSizeBinary(_, v) => v.is_none(),
- ScalarValue::LargeBinary(v) => v.is_none(),
+ ScalarValue::Utf8(v)
+ | ScalarValue::Utf8View(v)
+ | ScalarValue::LargeUtf8(v) => v.is_none(),
+ ScalarValue::Binary(v)
+ | ScalarValue::BinaryView(v)
+ | ScalarValue::FixedSizeBinary(_, v)
+ | ScalarValue::LargeBinary(v) => v.is_none(),
// arr.len() should be 1 for a list scalar, but we don't seem to
// enforce that anywhere, so we still check against array length.
ScalarValue::List(arr) => arr.len() == arr.null_count(),
@@ -2002,6 +2017,12 @@ impl ScalarValue {
}
None => new_null_array(&DataType::Utf8, size),
},
+ ScalarValue::Utf8View(e) => match e {
+ Some(value) => {
+
Arc::new(StringViewArray::from_iter_values(repeat(value).take(size)))
+ }
+ None => new_null_array(&DataType::Utf8View, size),
+ },
ScalarValue::LargeUtf8(e) => match e {
Some(value) => {
Arc::new(LargeStringArray::from_iter_values(repeat(value).take(size)))
@@ -2018,6 +2039,16 @@ impl ScalarValue {
Arc::new(repeat(None::<&str>).take(size).collect::<BinaryArray>())
}
},
+ ScalarValue::BinaryView(e) => match e {
+ Some(value) => Arc::new(
+ repeat(Some(value.as_slice()))
+ .take(size)
+ .collect::<BinaryViewArray>(),
+ ),
+ None => {
+
Arc::new(repeat(None::<&str>).take(size).collect::<BinaryViewArray>())
+ }
+ },
ScalarValue::FixedSizeBinary(s, e) => match e {
Some(value) => Arc::new(
FixedSizeBinaryArray::try_from_sparse_iter_with_size(
@@ -2361,10 +2392,14 @@ impl ScalarValue {
DataType::LargeBinary => {
typed_cast!(array, index, LargeBinaryArray, LargeBinary)?
}
+ DataType::BinaryView => {
+ typed_cast!(array, index, BinaryViewArray, BinaryView)?
+ }
DataType::Utf8 => typed_cast!(array, index, StringArray, Utf8)?,
DataType::LargeUtf8 => {
typed_cast!(array, index, LargeStringArray, LargeUtf8)?
}
+ DataType::Utf8View => typed_cast!(array, index, StringViewArray,
Utf8View)?,
DataType::List(_) => {
let list_array = array.as_list::<i32>();
let nested_array = list_array.value(index);
@@ -2652,12 +2687,18 @@ impl ScalarValue {
ScalarValue::Utf8(val) => {
eq_array_primitive!(array, index, StringArray, val)?
}
+ ScalarValue::Utf8View(val) => {
+ eq_array_primitive!(array, index, StringViewArray, val)?
+ }
ScalarValue::LargeUtf8(val) => {
eq_array_primitive!(array, index, LargeStringArray, val)?
}
ScalarValue::Binary(val) => {
eq_array_primitive!(array, index, BinaryArray, val)?
}
+ ScalarValue::BinaryView(val) => {
+ eq_array_primitive!(array, index, BinaryViewArray, val)?
+ }
ScalarValue::FixedSizeBinary(_, val) => {
eq_array_primitive!(array, index, FixedSizeBinaryArray, val)?
}
@@ -2790,7 +2831,9 @@ impl ScalarValue {
| ScalarValue::DurationMillisecond(_)
| ScalarValue::DurationMicrosecond(_)
| ScalarValue::DurationNanosecond(_) => 0,
- ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => {
+ ScalarValue::Utf8(s)
+ | ScalarValue::LargeUtf8(s)
+ | ScalarValue::Utf8View(s) => {
s.as_ref().map(|s| s.capacity()).unwrap_or_default()
}
ScalarValue::TimestampSecond(_, s)
@@ -2801,7 +2844,8 @@ impl ScalarValue {
}
ScalarValue::Binary(b)
| ScalarValue::FixedSizeBinary(_, b)
- | ScalarValue::LargeBinary(b) => {
+ | ScalarValue::LargeBinary(b)
+ | ScalarValue::BinaryView(b) => {
b.as_ref().map(|b| b.capacity()).unwrap_or_default()
}
ScalarValue::List(arr) => arr.get_array_memory_size(),
@@ -3068,7 +3112,9 @@ impl TryFrom<&DataType> for ScalarValue {
}
DataType::Utf8 => ScalarValue::Utf8(None),
DataType::LargeUtf8 => ScalarValue::LargeUtf8(None),
+ DataType::Utf8View => ScalarValue::Utf8View(None),
DataType::Binary => ScalarValue::Binary(None),
+ DataType::BinaryView => ScalarValue::BinaryView(None),
DataType::FixedSizeBinary(len) =>
ScalarValue::FixedSizeBinary(*len, None),
DataType::LargeBinary => ScalarValue::LargeBinary(None),
DataType::Date32 => ScalarValue::Date32(None),
@@ -3190,11 +3236,13 @@ impl fmt::Display for ScalarValue {
ScalarValue::TimestampMillisecond(e, _) => format_option!(f, e)?,
ScalarValue::TimestampMicrosecond(e, _) => format_option!(f, e)?,
ScalarValue::TimestampNanosecond(e, _) => format_option!(f, e)?,
- ScalarValue::Utf8(e) => format_option!(f, e)?,
- ScalarValue::LargeUtf8(e) => format_option!(f, e)?,
+ ScalarValue::Utf8(e)
+ | ScalarValue::LargeUtf8(e)
+ | ScalarValue::Utf8View(e) => format_option!(f, e)?,
ScalarValue::Binary(e)
| ScalarValue::FixedSizeBinary(_, e)
- | ScalarValue::LargeBinary(e) => match e {
+ | ScalarValue::LargeBinary(e)
+ | ScalarValue::BinaryView(e) => match e {
Some(l) => write!(
f,
"{}",
@@ -3318,10 +3366,14 @@ impl fmt::Debug for ScalarValue {
}
ScalarValue::Utf8(None) => write!(f, "Utf8({self})"),
ScalarValue::Utf8(Some(_)) => write!(f, "Utf8(\"{self}\")"),
+ ScalarValue::Utf8View(None) => write!(f, "Utf8View({self})"),
+ ScalarValue::Utf8View(Some(_)) => write!(f,
"Utf8View(\"{self}\")"),
ScalarValue::LargeUtf8(None) => write!(f, "LargeUtf8({self})"),
ScalarValue::LargeUtf8(Some(_)) => write!(f,
"LargeUtf8(\"{self}\")"),
ScalarValue::Binary(None) => write!(f, "Binary({self})"),
ScalarValue::Binary(Some(_)) => write!(f, "Binary(\"{self}\")"),
+ ScalarValue::BinaryView(None) => write!(f, "BinaryView({self})"),
+ ScalarValue::BinaryView(Some(_)) => write!(f,
"BinaryView(\"{self}\")"),
ScalarValue::FixedSizeBinary(size, None) => {
write!(f, "FixedSizeBinary({size}, {self})")
}
@@ -5393,6 +5445,17 @@ mod tests {
ScalarValue::Utf8(None),
DataType::Dictionary(Box::new(DataType::Int32),
Box::new(DataType::Utf8)),
);
+
+ // needs https://github.com/apache/arrow-rs/issues/5893
+ /*
+ check_scalar_cast(ScalarValue::Utf8(None), DataType::Utf8View);
+ check_scalar_cast(ScalarValue::from("foo"), DataType::Utf8View);
+ check_scalar_cast(
+ ScalarValue::from("larger than 12 bytes string"),
+ DataType::Utf8View,
+ );
+
+ */
}
// mimics how casting work on scalar values by `casting` `scalar` to
`desired_type`
diff --git a/datafusion/functions/src/core/arrow_cast.rs
b/datafusion/functions/src/core/arrow_cast.rs
index d641389e0a..9c410d4e18 100644
--- a/datafusion/functions/src/core/arrow_cast.rs
+++ b/datafusion/functions/src/core/arrow_cast.rs
@@ -564,7 +564,9 @@ impl<'a> Tokenizer<'a> {
"Utf8" => Token::SimpleType(DataType::Utf8),
"LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
+ "Utf8View" => Token::SimpleType(DataType::Utf8View),
"Binary" => Token::SimpleType(DataType::Binary),
+ "BinaryView" => Token::SimpleType(DataType::BinaryView),
"LargeBinary" => Token::SimpleType(DataType::LargeBinary),
"Float16" => Token::SimpleType(DataType::Float16),
@@ -772,11 +774,13 @@ mod test {
DataType::Interval(IntervalUnit::DayTime),
DataType::Interval(IntervalUnit::MonthDayNano),
DataType::Binary,
+ DataType::BinaryView,
DataType::FixedSizeBinary(0),
DataType::FixedSizeBinary(1234),
DataType::FixedSizeBinary(-432),
DataType::LargeBinary,
DataType::Utf8,
+ DataType::Utf8View,
DataType::LargeUtf8,
DataType::Decimal128(7, 12),
DataType::Decimal256(6, 13),
diff --git a/datafusion/proto-common/proto/datafusion_common.proto
b/datafusion/proto-common/proto/datafusion_common.proto
index 29a348283f..e523ef1a5e 100644
--- a/datafusion/proto-common/proto/datafusion_common.proto
+++ b/datafusion/proto-common/proto/datafusion_common.proto
@@ -248,6 +248,7 @@ message ScalarValue{
bool bool_value = 1;
string utf8_value = 2;
string large_utf8_value = 3;
+ string utf8_view_value = 23;
int32 int8_value = 4;
int32 int16_value = 5;
int32 int32_value = 6;
@@ -281,6 +282,7 @@ message ScalarValue{
ScalarDictionaryValue dictionary_value = 27;
bytes binary_value = 28;
bytes large_binary_value = 29;
+ bytes binary_view_value = 22;
ScalarTime64Value time64_value = 30;
IntervalDayTimeValue interval_daytime_value = 25;
IntervalMonthDayNanoValue interval_month_day_nano = 31;
@@ -318,8 +320,10 @@ message ArrowType{
EmptyMessage FLOAT32 = 12 ;
EmptyMessage FLOAT64 = 13 ;
EmptyMessage UTF8 = 14 ;
+ EmptyMessage UTF8_VIEW = 35;
EmptyMessage LARGE_UTF8 = 32;
EmptyMessage BINARY = 15 ;
+ EmptyMessage BINARY_VIEW = 34;
int32 FIXED_SIZE_BINARY = 16 ;
EmptyMessage LARGE_BINARY = 31;
EmptyMessage DATE32 = 17 ;
diff --git a/datafusion/proto-common/src/from_proto/mod.rs
b/datafusion/proto-common/src/from_proto/mod.rs
index 25c1502ee7..be87123fb1 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -224,8 +224,10 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for
DataType {
arrow_type::ArrowTypeEnum::Float32(_) => DataType::Float32,
arrow_type::ArrowTypeEnum::Float64(_) => DataType::Float64,
arrow_type::ArrowTypeEnum::Utf8(_) => DataType::Utf8,
+ arrow_type::ArrowTypeEnum::Utf8View(_) => DataType::Utf8View,
arrow_type::ArrowTypeEnum::LargeUtf8(_) => DataType::LargeUtf8,
arrow_type::ArrowTypeEnum::Binary(_) => DataType::Binary,
+ arrow_type::ArrowTypeEnum::BinaryView(_) => DataType::BinaryView,
arrow_type::ArrowTypeEnum::FixedSizeBinary(size) => {
DataType::FixedSizeBinary(*size)
}
@@ -361,6 +363,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
Ok(match value {
Value::BoolValue(v) => Self::Boolean(Some(*v)),
Value::Utf8Value(v) => Self::Utf8(Some(v.to_owned())),
+ Value::Utf8ViewValue(v) => Self::Utf8View(Some(v.to_owned())),
Value::LargeUtf8Value(v) => Self::LargeUtf8(Some(v.to_owned())),
Value::Int8Value(v) => Self::Int8(Some(*v as i8)),
Value::Int16Value(v) => Self::Int16(Some(*v as i16)),
@@ -571,6 +574,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
Self::Dictionary(Box::new(index_type), Box::new(value))
}
Value::BinaryValue(v) => Self::Binary(Some(v.clone())),
+ Value::BinaryViewValue(v) => Self::BinaryView(Some(v.clone())),
Value::LargeBinaryValue(v) => Self::LargeBinary(Some(v.clone())),
Value::IntervalDaytimeValue(v) => Self::IntervalDayTime(Some(
IntervalDayTimeType::make_value(v.days, v.milliseconds),
diff --git a/datafusion/proto-common/src/generated/pbjson.rs
b/datafusion/proto-common/src/generated/pbjson.rs
index 6f8409b82a..ead29d9b92 100644
--- a/datafusion/proto-common/src/generated/pbjson.rs
+++ b/datafusion/proto-common/src/generated/pbjson.rs
@@ -125,12 +125,18 @@ impl serde::Serialize for ArrowType {
arrow_type::ArrowTypeEnum::Utf8(v) => {
struct_ser.serialize_field("UTF8", v)?;
}
+ arrow_type::ArrowTypeEnum::Utf8View(v) => {
+ struct_ser.serialize_field("UTF8VIEW", v)?;
+ }
arrow_type::ArrowTypeEnum::LargeUtf8(v) => {
struct_ser.serialize_field("LARGEUTF8", v)?;
}
arrow_type::ArrowTypeEnum::Binary(v) => {
struct_ser.serialize_field("BINARY", v)?;
}
+ arrow_type::ArrowTypeEnum::BinaryView(v) => {
+ struct_ser.serialize_field("BINARYVIEW", v)?;
+ }
arrow_type::ArrowTypeEnum::FixedSizeBinary(v) => {
struct_ser.serialize_field("FIXEDSIZEBINARY", v)?;
}
@@ -216,9 +222,13 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
"FLOAT32",
"FLOAT64",
"UTF8",
+ "UTF8_VIEW",
+ "UTF8VIEW",
"LARGE_UTF8",
"LARGEUTF8",
"BINARY",
+ "BINARY_VIEW",
+ "BINARYVIEW",
"FIXED_SIZE_BINARY",
"FIXEDSIZEBINARY",
"LARGE_BINARY",
@@ -258,8 +268,10 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
Float32,
Float64,
Utf8,
+ Utf8View,
LargeUtf8,
Binary,
+ BinaryView,
FixedSizeBinary,
LargeBinary,
Date32,
@@ -312,8 +324,10 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
"FLOAT32" => Ok(GeneratedField::Float32),
"FLOAT64" => Ok(GeneratedField::Float64),
"UTF8" => Ok(GeneratedField::Utf8),
+ "UTF8VIEW" | "UTF8_VIEW" =>
Ok(GeneratedField::Utf8View),
"LARGEUTF8" | "LARGE_UTF8" =>
Ok(GeneratedField::LargeUtf8),
"BINARY" => Ok(GeneratedField::Binary),
+ "BINARYVIEW" | "BINARY_VIEW" =>
Ok(GeneratedField::BinaryView),
"FIXEDSIZEBINARY" | "FIXED_SIZE_BINARY" =>
Ok(GeneratedField::FixedSizeBinary),
"LARGEBINARY" | "LARGE_BINARY" =>
Ok(GeneratedField::LargeBinary),
"DATE32" => Ok(GeneratedField::Date32),
@@ -449,6 +463,13 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
return
Err(serde::de::Error::duplicate_field("UTF8"));
}
arrow_type_enum__ =
map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Utf8)
+;
+ }
+ GeneratedField::Utf8View => {
+ if arrow_type_enum__.is_some() {
+ return
Err(serde::de::Error::duplicate_field("UTF8VIEW"));
+ }
+ arrow_type_enum__ =
map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Utf8View)
;
}
GeneratedField::LargeUtf8 => {
@@ -463,6 +484,13 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
return
Err(serde::de::Error::duplicate_field("BINARY"));
}
arrow_type_enum__ =
map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Binary)
+;
+ }
+ GeneratedField::BinaryView => {
+ if arrow_type_enum__.is_some() {
+ return
Err(serde::de::Error::duplicate_field("BINARYVIEW"));
+ }
+ arrow_type_enum__ =
map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::BinaryView)
;
}
GeneratedField::FixedSizeBinary => {
@@ -6255,6 +6283,9 @@ impl serde::Serialize for ScalarValue {
scalar_value::Value::LargeUtf8Value(v) => {
struct_ser.serialize_field("largeUtf8Value", v)?;
}
+ scalar_value::Value::Utf8ViewValue(v) => {
+ struct_ser.serialize_field("utf8ViewValue", v)?;
+ }
scalar_value::Value::Int8Value(v) => {
struct_ser.serialize_field("int8Value", v)?;
}
@@ -6348,6 +6379,10 @@ impl serde::Serialize for ScalarValue {
#[allow(clippy::needless_borrow)]
struct_ser.serialize_field("largeBinaryValue",
pbjson::private::base64::encode(&v).as_str())?;
}
+ scalar_value::Value::BinaryViewValue(v) => {
+ #[allow(clippy::needless_borrow)]
+ struct_ser.serialize_field("binaryViewValue",
pbjson::private::base64::encode(&v).as_str())?;
+ }
scalar_value::Value::Time64Value(v) => {
struct_ser.serialize_field("time64Value", v)?;
}
@@ -6383,6 +6418,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
"utf8Value",
"large_utf8_value",
"largeUtf8Value",
+ "utf8_view_value",
+ "utf8ViewValue",
"int8_value",
"int8Value",
"int16_value",
@@ -6439,6 +6476,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
"binaryValue",
"large_binary_value",
"largeBinaryValue",
+ "binary_view_value",
+ "binaryViewValue",
"time64_value",
"time64Value",
"interval_daytime_value",
@@ -6457,6 +6496,7 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
BoolValue,
Utf8Value,
LargeUtf8Value,
+ Utf8ViewValue,
Int8Value,
Int16Value,
Int32Value,
@@ -6485,6 +6525,7 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
DictionaryValue,
BinaryValue,
LargeBinaryValue,
+ BinaryViewValue,
Time64Value,
IntervalDaytimeValue,
IntervalMonthDayNano,
@@ -6515,6 +6556,7 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
"boolValue" | "bool_value" =>
Ok(GeneratedField::BoolValue),
"utf8Value" | "utf8_value" =>
Ok(GeneratedField::Utf8Value),
"largeUtf8Value" | "large_utf8_value" =>
Ok(GeneratedField::LargeUtf8Value),
+ "utf8ViewValue" | "utf8_view_value" =>
Ok(GeneratedField::Utf8ViewValue),
"int8Value" | "int8_value" =>
Ok(GeneratedField::Int8Value),
"int16Value" | "int16_value" =>
Ok(GeneratedField::Int16Value),
"int32Value" | "int32_value" =>
Ok(GeneratedField::Int32Value),
@@ -6543,6 +6585,7 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
"dictionaryValue" | "dictionary_value" =>
Ok(GeneratedField::DictionaryValue),
"binaryValue" | "binary_value" =>
Ok(GeneratedField::BinaryValue),
"largeBinaryValue" | "large_binary_value" =>
Ok(GeneratedField::LargeBinaryValue),
+ "binaryViewValue" | "binary_view_value" =>
Ok(GeneratedField::BinaryViewValue),
"time64Value" | "time64_value" =>
Ok(GeneratedField::Time64Value),
"intervalDaytimeValue" | "interval_daytime_value"
=> Ok(GeneratedField::IntervalDaytimeValue),
"intervalMonthDayNano" | "interval_month_day_nano"
=> Ok(GeneratedField::IntervalMonthDayNano),
@@ -6595,6 +6638,12 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
}
value__ =
map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::LargeUtf8Value);
}
+ GeneratedField::Utf8ViewValue => {
+ if value__.is_some() {
+ return
Err(serde::de::Error::duplicate_field("utf8ViewValue"));
+ }
+ value__ =
map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Utf8ViewValue);
+ }
GeneratedField::Int8Value => {
if value__.is_some() {
return
Err(serde::de::Error::duplicate_field("int8Value"));
@@ -6772,6 +6821,12 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
}
value__ =
map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x|
scalar_value::Value::LargeBinaryValue(x.0));
}
+ GeneratedField::BinaryViewValue => {
+ if value__.is_some() {
+ return
Err(serde::de::Error::duplicate_field("binaryViewValue"));
+ }
+ value__ =
map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x|
scalar_value::Value::BinaryViewValue(x.0));
+ }
GeneratedField::Time64Value => {
if value__.is_some() {
return
Err(serde::de::Error::duplicate_field("time64Value"));
diff --git a/datafusion/proto-common/src/generated/prost.rs
b/datafusion/proto-common/src/generated/prost.rs
index ff17a40738..b306f3212a 100644
--- a/datafusion/proto-common/src/generated/prost.rs
+++ b/datafusion/proto-common/src/generated/prost.rs
@@ -326,7 +326,7 @@ pub struct ScalarFixedSizeBinary {
pub struct ScalarValue {
#[prost(
oneof = "scalar_value::Value",
- tags = "33, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 32, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 30, 25, 31, 34, 42"
+ tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 32, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34,
42"
)]
pub value: ::core::option::Option<scalar_value::Value>,
}
@@ -345,6 +345,8 @@ pub mod scalar_value {
Utf8Value(::prost::alloc::string::String),
#[prost(string, tag = "3")]
LargeUtf8Value(::prost::alloc::string::String),
+ #[prost(string, tag = "23")]
+ Utf8ViewValue(::prost::alloc::string::String),
#[prost(int32, tag = "4")]
Int8Value(i32),
#[prost(int32, tag = "5")]
@@ -402,6 +404,8 @@ pub mod scalar_value {
BinaryValue(::prost::alloc::vec::Vec<u8>),
#[prost(bytes, tag = "29")]
LargeBinaryValue(::prost::alloc::vec::Vec<u8>),
+ #[prost(bytes, tag = "22")]
+ BinaryViewValue(::prost::alloc::vec::Vec<u8>),
#[prost(message, tag = "30")]
Time64Value(super::ScalarTime64Value),
#[prost(message, tag = "25")]
@@ -440,7 +444,7 @@ pub struct Decimal256 {
pub struct ArrowType {
#[prost(
oneof = "arrow_type::ArrowTypeEnum",
- tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 32, 15, 16, 31,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33"
+ tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34,
16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33"
)]
pub arrow_type_enum: ::core::option::Option<arrow_type::ArrowTypeEnum>,
}
@@ -482,10 +486,14 @@ pub mod arrow_type {
Float64(super::EmptyMessage),
#[prost(message, tag = "14")]
Utf8(super::EmptyMessage),
+ #[prost(message, tag = "35")]
+ Utf8View(super::EmptyMessage),
#[prost(message, tag = "32")]
LargeUtf8(super::EmptyMessage),
#[prost(message, tag = "15")]
Binary(super::EmptyMessage),
+ #[prost(message, tag = "34")]
+ BinaryView(super::EmptyMessage),
#[prost(int32, tag = "16")]
FixedSizeBinary(i32),
#[prost(message, tag = "31")]
diff --git a/datafusion/proto-common/src/to_proto/mod.rs
b/datafusion/proto-common/src/to_proto/mod.rs
index 8e7ee9a7d6..a3dc826a79 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -347,6 +347,11 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue {
Value::LargeUtf8Value(s.to_owned())
})
}
+ ScalarValue::Utf8View(val) => {
+ create_proto_scalar(val.as_ref(), &data_type, |s| {
+ Value::Utf8ViewValue(s.to_owned())
+ })
+ }
ScalarValue::List(arr) => {
encode_scalar_nested_value(arr.to_owned() as ArrayRef, val)
}
@@ -461,6 +466,11 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue {
Value::BinaryValue(s.to_owned())
})
}
+ ScalarValue::BinaryView(val) => {
+ create_proto_scalar(val.as_ref(), &data_type, |s| {
+ Value::BinaryViewValue(s.to_owned())
+ })
+ }
ScalarValue::LargeBinary(val) => {
create_proto_scalar(val.as_ref(), &data_type, |s| {
Value::LargeBinaryValue(s.to_owned())
diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs
b/datafusion/proto/src/generated/datafusion_proto_common.rs
index ff17a40738..b306f3212a 100644
--- a/datafusion/proto/src/generated/datafusion_proto_common.rs
+++ b/datafusion/proto/src/generated/datafusion_proto_common.rs
@@ -326,7 +326,7 @@ pub struct ScalarFixedSizeBinary {
pub struct ScalarValue {
#[prost(
oneof = "scalar_value::Value",
- tags = "33, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 32, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 30, 25, 31, 34, 42"
+ tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 32, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34,
42"
)]
pub value: ::core::option::Option<scalar_value::Value>,
}
@@ -345,6 +345,8 @@ pub mod scalar_value {
Utf8Value(::prost::alloc::string::String),
#[prost(string, tag = "3")]
LargeUtf8Value(::prost::alloc::string::String),
+ #[prost(string, tag = "23")]
+ Utf8ViewValue(::prost::alloc::string::String),
#[prost(int32, tag = "4")]
Int8Value(i32),
#[prost(int32, tag = "5")]
@@ -402,6 +404,8 @@ pub mod scalar_value {
BinaryValue(::prost::alloc::vec::Vec<u8>),
#[prost(bytes, tag = "29")]
LargeBinaryValue(::prost::alloc::vec::Vec<u8>),
+ #[prost(bytes, tag = "22")]
+ BinaryViewValue(::prost::alloc::vec::Vec<u8>),
#[prost(message, tag = "30")]
Time64Value(super::ScalarTime64Value),
#[prost(message, tag = "25")]
@@ -440,7 +444,7 @@ pub struct Decimal256 {
pub struct ArrowType {
#[prost(
oneof = "arrow_type::ArrowTypeEnum",
- tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 32, 15, 16, 31,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33"
+ tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34,
16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33"
)]
pub arrow_type_enum: ::core::option::Option<arrow_type::ArrowTypeEnum>,
}
@@ -482,10 +486,14 @@ pub mod arrow_type {
Float64(super::EmptyMessage),
#[prost(message, tag = "14")]
Utf8(super::EmptyMessage),
+ #[prost(message, tag = "35")]
+ Utf8View(super::EmptyMessage),
#[prost(message, tag = "32")]
LargeUtf8(super::EmptyMessage),
#[prost(message, tag = "15")]
Binary(super::EmptyMessage),
+ #[prost(message, tag = "34")]
+ BinaryView(super::EmptyMessage),
#[prost(int32, tag = "16")]
FixedSizeBinary(i32),
#[prost(message, tag = "31")]
diff --git a/datafusion/sql/src/unparser/expr.rs
b/datafusion/sql/src/unparser/expr.rs
index 12c48054f1..1d197f3a0d 100644
--- a/datafusion/sql/src/unparser/expr.rs
+++ b/datafusion/sql/src/unparser/expr.rs
@@ -709,12 +709,20 @@ impl Unparser<'_> {
ast::Value::SingleQuotedString(str.to_string()),
)),
ScalarValue::Utf8(None) => Ok(ast::Expr::Value(ast::Value::Null)),
+ ScalarValue::Utf8View(Some(str)) => Ok(ast::Expr::Value(
+ ast::Value::SingleQuotedString(str.to_string()),
+ )),
+ ScalarValue::Utf8View(None) =>
Ok(ast::Expr::Value(ast::Value::Null)),
ScalarValue::LargeUtf8(Some(str)) => Ok(ast::Expr::Value(
ast::Value::SingleQuotedString(str.to_string()),
)),
ScalarValue::LargeUtf8(None) =>
Ok(ast::Expr::Value(ast::Value::Null)),
ScalarValue::Binary(Some(_)) => not_impl_err!("Unsupported scalar:
{v:?}"),
ScalarValue::Binary(None) =>
Ok(ast::Expr::Value(ast::Value::Null)),
+ ScalarValue::BinaryView(Some(_)) => {
+ not_impl_err!("Unsupported scalar: {v:?}")
+ }
+ ScalarValue::BinaryView(None) =>
Ok(ast::Expr::Value(ast::Value::Null)),
ScalarValue::FixedSizeBinary(..) => {
not_impl_err!("Unsupported scalar: {v:?}")
}
diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt
b/datafusion/sqllogictest/test_files/arrow_typeof.slt
index c928b96e03..ab4ff9e2ce 100644
--- a/datafusion/sqllogictest/test_files/arrow_typeof.slt
+++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt
@@ -422,3 +422,13 @@ query ?
select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)');
----
[1, 2, 3]
+
+# Tests for Utf8View
+query ?T
+select arrow_cast('MyAwesomeString', 'Utf8View'),
arrow_typeof(arrow_cast('MyAwesomeString', 'Utf8View'))
+----
+MyAwesomeString Utf8View
+
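+# Fails until we update arrow-rs with support for
https://github.com/apache/arrow-rs/pull/5894 (NOTE: the statement below has no leading `select`, so the error asserted here is raised by the SQL parser, not by the BinaryView cast itself)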
+query error DataFusion error: SQL error: ParserError\("Expected an SQL
statement, found: arrow_cast"\)
+arrow_cast('MyAwesomeString', 'BinaryView'),
arrow_typeof(arrow_cast('MyAwesomeString', 'BinaryView'))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]