alamb commented on code in PR #14834:
URL: https://github.com/apache/datafusion/pull/14834#discussion_r1969616455
##########
datafusion/functions/src/math/gcd.rs:
##########
@@ -75,36 +77,73 @@ impl ScalarUDFImpl for GcdFunc {
}
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
- Ok(Int64)
+ Ok(DataType::Int64)
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
- make_scalar_function(gcd, vec![])(&args.args)
+ let args: [ColumnarValue; 2] = args.args.try_into().map_err(|_| {
+ internal_datafusion_err!("Expected 2 arguments for function gcd")
+ })?;
+
+ match args {
+ [ColumnarValue::Array(a), ColumnarValue::Array(b)] => {
+ compute_gcd_for_arrays(&a, &b)
+ }
+ [ColumnarValue::Scalar(ScalarValue::Int64(a)),
ColumnarValue::Scalar(ScalarValue::Int64(b))] => {
+ match (a, b) {
+ (Some(a), Some(b)) =>
Ok(ColumnarValue::Scalar(ScalarValue::Int64(
+ Some(compute_gcd(a, b)?),
+ ))),
+ _ => Ok(ColumnarValue::Scalar(ScalarValue::Int64(None))),
+ }
+ }
+ [ColumnarValue::Array(a),
ColumnarValue::Scalar(ScalarValue::Int64(b))] => {
+ compute_gcd_with_scalar(&a, b)
+ }
+ [ColumnarValue::Scalar(ScalarValue::Int64(a)),
ColumnarValue::Array(b)] => {
+ compute_gcd_with_scalar(&b, a)
+ }
+ _ => exec_err!("Unsupported argument types for function gcd"),
+ }
}
fn documentation(&self) -> Option<&Documentation> {
self.doc()
}
}
-/// Gcd SQL function
-fn gcd(args: &[ArrayRef]) -> Result<ArrayRef> {
- match args[0].data_type() {
- Int64 => {
- let arg1 = downcast_named_arg!(&args[0], "x", Int64Array);
- let arg2 = downcast_named_arg!(&args[1], "y", Int64Array);
+fn compute_gcd_for_arrays(a: &ArrayRef, b: &ArrayRef) -> Result<ColumnarValue>
{
+ let result: Result<Int64Array> = a
+ .as_primitive::<Int64Type>()
+ .iter()
+ .zip(b.as_primitive::<Int64Type>().iter())
+ .map(|(a, b)| match (a, b) {
+ (Some(a), Some(b)) => Ok(Some(compute_gcd(a, b)?)),
+ _ => Ok(None),
+ })
+ .collect();
+
+ result.map(|arr| ColumnarValue::Array(Arc::new(arr) as ArrayRef))
+}
Review Comment:
With my proposal the two array version seems to go 25% faster than this PR.
Here is a PR with that improvement:
- https://github.com/jayzhan211/datafusion/pull/5
```
Gnuplot not found, using plotters backend
Benchmarking gcd both array: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase
target time to 5.9s, enable flat sampling, or reduce sample count to 60.
gcd both array time: [1.1940 ms 1.2007 ms 1.2081 ms]
change: [-27.574% -27.080% -26.470%] (p = 0.00 <
0.05)
Performance has improved.
Found 9 outliers among 100 measurements (9.00%)
6 (6.00%) high mild
3 (3.00%) high severe
gcd array and scalar time: [1.9644 ms 1.9705 ms 1.9781 ms]
change: [-1.3293% -0.9077% -0.4670%] (p = 0.00 <
0.05)
Change within noise threshold.
Found 6 outliers among 100 measurements (6.00%)
3 (3.00%) high mild
3 (3.00%) high severe
gcd both scalar time: [26.094 ns 26.253 ns 26.467 ns]
change: [-1.1099% -0.6987% -0.2362%] (p = 0.00 <
0.05)
Change within noise threshold.
Found 8 outliers among 100 measurements (8.00%)
4 (4.00%) high mild
4 (4.00%) high severe
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]