[arrow-datafusion] branch master updated (1eb46df49 -> 556282a8b)

2023-01-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


from 1eb46df49 Covariance single row input & null skipping (#4852)
 add 556282a8b Support for SQL Natural Join (#4863)

No new revisions were added by this update.

Summary of changes:
 datafusion/sql/src/relation/join.rs  | 27 ++
 datafusion/sql/tests/integration_test.rs | 48 
 2 files changed, 70 insertions(+), 5 deletions(-)
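
For context: NATURAL JOIN joins two tables on every column name they share and emits each shared column once. A minimal sketch against the DataFusion SQL frontend (the tables, data, and scaffolding here are illustrative, not taken from the patch):

    use datafusion::prelude::*;

    #[tokio::main]
    async fn main() -> datafusion::error::Result<()> {
        let ctx = SessionContext::new();
        // Two tiny tables that share the column name "id".
        ctx.sql("CREATE TABLE t1 AS SELECT column1 AS id, column2 AS a FROM (VALUES (1, 10))")
            .await?;
        ctx.sql("CREATE TABLE t2 AS SELECT column1 AS id, column2 AS b FROM (VALUES (1, 20))")
            .await?;
        // Joins on "id" implicitly; equivalent to
        // SELECT t1.id, a, b FROM t1 JOIN t2 ON t1.id = t2.id
        let df = ctx.sql("SELECT * FROM t1 NATURAL JOIN t2").await?;
        df.show().await?;
        Ok(())
    }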



[arrow-datafusion] branch master updated: Covariance single row input & null skipping (#4852)

2023-01-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new 1eb46df49 Covariance single row input & null skipping (#4852)
1eb46df49 is described below

commit 1eb46df49fcd2479235500810c0562b10da77c90
Author: Eduard Karacharov <13005055+kor...@users.noreply.github.com>
AuthorDate: Tue Jan 10 09:44:17 2023 +0300

Covariance single row input & null skipping (#4852)

* covariance & correlation single row & null skipping

* Apply suggestions from code review

Co-authored-by: Mehmet Ozan Kabak 

* unwrap_or_internal_err macro instead of unwrap

Co-authored-by: Mehmet Ozan Kabak 
---
 .../tests/sqllogictests/test_files/aggregate.slt   |  82 
 .../physical-expr/src/aggregate/correlation.rs |  99 +
 .../physical-expr/src/aggregate/covariance.rs  | 221 ++---
 3 files changed, 292 insertions(+), 110 deletions(-)
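
The expected values in the new tests below follow from the definitions once NULL rows are skipped: covar_samp divides the co-deviation sum by n - 1 (so a single surviving row yields NULL), while covar_pop divides by n (so a single row yields 0). A standalone sketch of that logic, not the accumulator code itself:

    // Illustrative null-skipping covariance over (x, y) pairs.
    fn covariances(pairs: &[(Option<f64>, Option<f64>)]) -> (Option<f64>, Option<f64>) {
        // Skip any row where either side is NULL.
        let kept: Vec<(f64, f64)> = pairs.iter().filter_map(|&(x, y)| Some((x?, y?))).collect();
        if kept.is_empty() {
            return (None, None); // all-NULL input -> NULL, NULL
        }
        let n = kept.len() as f64;
        let mx = kept.iter().map(|p| p.0).sum::<f64>() / n;
        let my = kept.iter().map(|p| p.1).sum::<f64>() / n;
        let sxy: f64 = kept.iter().map(|&(x, y)| (x - mx) * (y - my)).sum();
        let samp = if kept.len() < 2 { None } else { Some(sxy / (n - 1.0)) };
        (samp, Some(sxy / n)) // (covar_samp, covar_pop)
    }

For the rows (1,4), (2,5), (3,6) that survive NULL skipping in the tests below, sxy = 2, so covar_samp = 1 and covar_pop = 2/3.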

diff --git a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt 
b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
index dbb3b69ca..5ddec784e 100644
--- a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
@@ -36,12 +36,94 @@ SELECT covar(c2, c12) FROM aggregate_test_100
 ----
 -0.07996901247859442
 
+# single_row_query_covar_1
+query R
+select covar_samp(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq
+----
+NULL
+
+# single_row_query_covar_2
+query R
+select covar_pop(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq
+----
+0
+
+# all_nulls_query_covar
+query R
+with data as (
+  select null::int as f, null::int as b
+  union all
+  select null::int as f, null::int as b
+)
+select covar_samp(f, b), covar_pop(f, b)
+from data
+----
+NULL NULL
+
+# covar_query_with_nulls
+query R
+with data as (
+  select 1 as f,    4 as b
+  union all
+  select null as f, 99 as b
+  union all
+  select 2 as f,    5 as b
+  union all
+  select 98 as f,   null as b
+  union all
+  select 3 as f,    6 as b
+  union all
+  select null as f, null as b
+)
+select covar_samp(f, b), covar_pop(f, b)
+from data
+----
+1 0.6666666666666666
+
 # csv_query_correlation
 query R
 SELECT corr(c2, c12) FROM aggregate_test_100
 ----
 -0.19064544190576607
 
+# single_row_query_correlation
+query R
+select corr(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq
+----
+0
+
+# all_nulls_query_correlation
+query R
+with data as (
+  select null::int as f, null::int as b
+  union all
+  select null::int as f, null::int as b
+)
+select corr(f, b)
+from data
+----
+NULL
+
+# correlation_query_with_nulls
+query R
+with data as (
+  select 1 as f,    4 as b
+  union all
+  select null as f, 99 as b
+  union all
+  select 2 as f,    5 as b
+  union all
+  select 98 as f,   null as b
+  union all
+  select 3 as f,    6 as b
+  union all
+  select null as f, null as b
+)
+select corr(f, b)
+from data
+----
+1
+
 # csv_query_variance_1
 query R
 SELECT var_pop(c2) FROM aggregate_test_100
diff --git a/datafusion/physical-expr/src/aggregate/correlation.rs 
b/datafusion/physical-expr/src/aggregate/correlation.rs
index 6211c578f..1bed3fe03 100644
--- a/datafusion/physical-expr/src/aggregate/correlation.rs
+++ b/datafusion/physical-expr/src/aggregate/correlation.rs
@@ -22,7 +22,11 @@ use crate::aggregate::stats::StatsType;
 use crate::aggregate::stddev::StddevAccumulator;
 use crate::expressions::format_state_name;
 use crate::{AggregateExpr, PhysicalExpr};
-use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
+use arrow::{
+    array::ArrayRef,
+    compute::{and, filter, is_not_null},
+    datatypes::{DataType, Field},
+};
 use datafusion_common::Result;
 use datafusion_common::ScalarValue;
 use datafusion_expr::Accumulator;
@@ -145,14 +149,39 @@ impl Accumulator for CorrelationAccumulator {
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        self.covar.update_batch(values)?;
+        // TODO: null input skipping logic duplicated across Correlation
+        // and its children accumulators.
+        // This could be simplified by splitting up input filtering and
+        // calculation logic in children accumulators, and calling only
+        // calculation part from Correlation
+        let values = if values[0].null_count() != 0 || values[1].null_count() != 0 {
+            let mask = and(&is_not_null(&values[0])?, &is_not_null(&values[1])?)?;
+            let values1 = filter(&values[0], &mask)?;
+            let values2 = filter(&values[1], &mask)?;
+
+            vec![values1, values2]
+        } else {
+            values.to_vec()
+        };
+
+        self.covar.update_batch(&values)?;
         self.stddev1.update_batch(&values[0..1])?;
         self.stddev2.update_batch(&values[1..2])?;
         Ok(())
     }
 
     fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {

[arrow-datafusion] branch master updated: Support using var/var_pop/stddev/stddev_pop in window expressions with custom frames (#4848)

2023-01-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new 292eb954f Support using var/var_pop/stddev/stddev_pop in window 
expressions with custom frames (#4848)
292eb954f is described below

commit 292eb954fc0bad3a1febc597233ba26cb60bda3e
Author: Jon Mease 
AuthorDate: Tue Jan 10 01:37:41 2023 -0500

Support using var/var_pop/stddev/stddev_pop in window expressions with 
custom frames (#4848)

* Wire up retract_batch for Stddev/StddevPop/Variance/VariancePop to

* Add test for Stddev/StddevPop/Variance/VariancePop with window frame
---
 datafusion/core/tests/sql/window.rs| 28 ++
 datafusion/physical-expr/src/aggregate/stddev.rs   | 12 ++
 datafusion/physical-expr/src/aggregate/variance.rs | 10 
 3 files changed, 50 insertions(+)
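
For background: create_sliding_accumulator and retract_batch are what let an aggregate run over a moving window frame. As the frame advances, rows entering it are added via update_batch and rows leaving it are removed via retract_batch, avoiding a full recompute at every frame position. A toy sketch of the pairing for a running sum (illustrative only; the real accumulators operate on ArrayRef batches, and the variance/stddev versions maintain count, mean, and M2 analogously):

    // Toy sliding accumulator; mirrors the Accumulator trait only loosely.
    struct SlidingSum {
        sum: f64,
    }

    impl SlidingSum {
        fn update_batch(&mut self, values: &[f64]) {
            self.sum += values.iter().sum::<f64>(); // rows entering the frame
        }
        fn retract_batch(&mut self, values: &[f64]) {
            self.sum -= values.iter().sum::<f64>(); // rows leaving the frame
        }
    }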

diff --git a/datafusion/core/tests/sql/window.rs 
b/datafusion/core/tests/sql/window.rs
index 0c3ecfa59..1167d57a4 100644
--- a/datafusion/core/tests/sql/window.rs
+++ b/datafusion/core/tests/sql/window.rs
@@ -524,6 +524,34 @@ async fn window_frame_rows_preceding() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn window_frame_rows_preceding_stddev_variance() -> Result<()> {
+    let ctx = SessionContext::new();
+    register_aggregate_csv(&ctx).await?;
+    let sql = "SELECT \
+               VAR(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\
+               VAR_POP(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\
+               STDDEV(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\
+               STDDEV_POP(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)\
+               FROM aggregate_test_100 \
+               ORDER BY c9 \
+               LIMIT 5";
+    let actual = execute_to_batches(&ctx, sql).await;
+    let expected = vec![
+        "+---------------------------------+------------------------------------+-------------------------------+----------------------------------+",
+        "| VARIANCE(aggregate_test_100.c4) | VARIANCEPOP(aggregate_test_100.c4) | STDDEV(aggregate_test_100.c4) | STDDEVPOP(aggregate_test_100.c4) |",
+        "+---------------------------------+------------------------------------+-------------------------------+----------------------------------+",
+        "| 46721.174                       | 31147.4496                         | 216.15118166073427            | 176.4867007894773                |",
+        "| 2639429.2                       | 1759619.48                         | 1624.6320609089714            | 1326.5065229977404               |",
+        "| 746202.24                       | 497468.16                          | 863.8300372951455             | 705.3142719541563                |",
+        "| 768422.81                       | 512281.88                          | 876.5973990378925             | 715.7387791645767                |",
+        "| 66526.333288                    | 44350.587                          | 257.9269922542594             | 210.5965073045749                |",
+        "+---------------------------------+------------------------------------+-------------------------------+----------------------------------+",
+    ];
+    assert_batches_eq!(expected, &actual);
+    Ok(())
+}
+
 #[tokio::test]
 async fn window_frame_rows_preceding_with_partition_unique_order_by() -> Result<()> {
     let ctx = SessionContext::new();
diff --git a/datafusion/physical-expr/src/aggregate/stddev.rs 
b/datafusion/physical-expr/src/aggregate/stddev.rs
index 4c9e46644..dab84b14a 100644
--- a/datafusion/physical-expr/src/aggregate/stddev.rs
+++ b/datafusion/physical-expr/src/aggregate/stddev.rs
@@ -73,6 +73,10 @@ impl AggregateExpr for Stddev {
         Ok(Box::new(StddevAccumulator::try_new(StatsType::Sample)?))
     }
 
+    fn create_sliding_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        Ok(Box::new(StddevAccumulator::try_new(StatsType::Sample)?))
+    }
+
     fn state_fields(&self) -> Result<Vec<Field>> {
         Ok(vec![
             Field::new(
@@ -128,6 +132,10 @@ impl AggregateExpr for StddevPop {
         Ok(Box::new(StddevAccumulator::try_new(StatsType::Population)?))
     }
 
+    fn create_sliding_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        Ok(Box::new(StddevAccumulator::try_new(StatsType::Population)?))
+    }
+
     fn state_fields(&self) -> Result<Vec<Field>> {
         Ok(vec![
             Field::new(
@@ -184,6 +192,10 @@ impl Accumulator for StddevAccumulator {
         self.variance.update_batch(values)
     }
 
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.variance.retract_batch(values)
+    }
+
     fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
         self.variance.merge_batch(states)
     }
diff --git a/datafusion/physical-expr/src/aggregate/variance.rs 

[arrow-datafusion] branch master updated: Implement retract_batch for AvgAccumulator (#4846)

2023-01-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new 13fb42efe Implement retract_batch for AvgAccumulator (#4846)
13fb42efe is described below

commit 13fb42efec4b5ab7f9aa251f1705fdcf89057d23
Author: Jon Mease 
AuthorDate: Tue Jan 10 01:24:31 2023 -0500

Implement retract_batch for AvgAccumulator (#4846)

* Implement retract_batch for AvgAccumulator,

Add avg to custom window frame tests

* fmt
---
 datafusion/core/tests/sql/window.rs   | 38 ---
 datafusion/physical-expr/src/aggregate/average.rs | 13 
 2 files changed, 33 insertions(+), 18 deletions(-)
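
The average case retracts by reversing both the running sum and the row count; a toy sketch under the same caveats as the stddev note above (the real AvgAccumulator tracks a ScalarValue sum and a u64 count over ArrayRef batches):

    struct SlidingAvg {
        sum: f64,
        count: u64,
    }

    impl SlidingAvg {
        fn retract_batch(&mut self, values: &[f64]) {
            self.sum -= values.iter().sum::<f64>();
            self.count -= values.len() as u64;
        }
        fn evaluate(&self) -> Option<f64> {
            (self.count > 0).then(|| self.sum / self.count as f64)
        }
    }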

diff --git a/datafusion/core/tests/sql/window.rs 
b/datafusion/core/tests/sql/window.rs
index 5ca49cff2..0c3ecfa59 100644
--- a/datafusion/core/tests/sql/window.rs
+++ b/datafusion/core/tests/sql/window.rs
@@ -503,21 +503,22 @@ async fn window_frame_rows_preceding() -> Result<()> {
     register_aggregate_csv(&ctx).await?;
     let sql = "SELECT \
                SUM(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\
+               AVG(c4) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\
                COUNT(*) OVER(ORDER BY c4 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)\
                FROM aggregate_test_100 \
                ORDER BY c9 \
                LIMIT 5";
     let actual = execute_to_batches(&ctx, sql).await;
     let expected = vec![
-        "+----------------------------+-----------------+",
-        "| SUM(aggregate_test_100.c4) | COUNT(UInt8(1)) |",
-        "+----------------------------+-----------------+",
-        "| -48302                     | 3               |",
-        "| 11243                      | 3               |",
-        "| -51311                     | 3               |",
-        "| -2391                      | 3               |",
-        "| 46756                      | 3               |",
-        "+----------------------------+-----------------+",
+        "+----------------------------+----------------------------+-----------------+",
+        "| SUM(aggregate_test_100.c4) | AVG(aggregate_test_100.c4) | COUNT(UInt8(1)) |",
+        "+----------------------------+----------------------------+-----------------+",
+        "| -48302                     | -16100.                    | 3               |",
+        "| 11243                      | 3747.5                     | 3               |",
+        "| -51311                     | -17103.6668                | 3               |",
+        "| -2391                      | -797                       | 3               |",
+        "| 46756                      | 15585.3334                 | 3               |",
+        "+----------------------------+----------------------------+-----------------+",
     ];
     assert_batches_eq!(expected, &actual);
     Ok(())
@@ -529,21 +530,22 @@ async fn window_frame_rows_preceding_with_partition_unique_order_by() -> Result<
     register_aggregate_csv(&ctx).await?;
     let sql = "SELECT \
                SUM(c4) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\
+               AVG(c4) OVER(PARTITION BY c1 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),\
                COUNT(*) OVER(PARTITION BY c2 ORDER BY c9 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)\
                FROM aggregate_test_100 \
                ORDER BY c9 \
                LIMIT 5";
     let actual = execute_to_batches(&ctx, sql).await;
     let expected = vec![
-        "+----------------------------+-----------------+",
-        "| SUM(aggregate_test_100.c4) | COUNT(UInt8(1)) |",
-        "+----------------------------+-----------------+",
-        "| -38611                     | 2               |",
-        "| 17547                      | 2               |",
-        "| -1301                      | 2               |",
-        "| 26638                      | 3               |",
-        "| 26861                      | 3               |",
-        "+----------------------------+-----------------+",
+        "+----------------------------+----------------------------+-----------------+",
+        "| SUM(aggregate_test_100.c4) | AVG(aggregate_test_100.c4) | COUNT(UInt8(1)) |",
+        "+----------------------------+----------------------------+-----------------+",
+        "| -38611                     | -19305.5                   | 2               |",
+        "| 17547                      | 8773.5                     | 2               |",
+        "| -1301                      | -650.5                     | 2               |",
+        "| 26638                      | 13319                      | 3               |",
+        "| 26861                      | 8953.                      | 3               |",
+
[arrow] branch master updated: GH-15200: [C++] Created benchmarks for round kernels. (#15201)

2023-01-09 Thread yibocai
This is an automated email from the ASF dual-hosted git repository.

yibocai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 85b167c05c GH-15200: [C++] Created benchmarks for round kernels. 
(#15201)
85b167c05c is described below

commit 85b167c05c2f93a95b23e8ac4fd4da576ea5b899
Author: David Sisson 
AuthorDate: Mon Jan 9 20:59:02 2023 -0800

GH-15200: [C++] Created benchmarks for round kernels. (#15201)

The four existing kernel functions Ceil, Floor, Round, and Trunc gain 
benchmarks with this change.
* Closes: #15200

Lead-authored-by: David Sisson 
Co-authored-by: Will Jones 
Signed-off-by: Yibo Cai 
---
 cpp/src/arrow/compute/kernels/CMakeLists.txt   |   1 +
 .../compute/kernels/scalar_arithmetic_benchmark.cc |   2 +
 .../compute/kernels/scalar_round_benchmark.cc  | 120 +
 3 files changed, 123 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt 
b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index e2f869750d..5eadf5d0ea 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -40,6 +40,7 @@ add_arrow_benchmark(scalar_cast_benchmark PREFIX 
"arrow-compute")
 add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_if_else_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_random_benchmark PREFIX "arrow-compute")
+add_arrow_benchmark(scalar_round_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_temporal_benchmark PREFIX "arrow-compute")
diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc 
b/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc
index 01d9ec944e..4b678da5f1 100644
--- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc
@@ -107,6 +107,8 @@ static void ArrayArrayKernel(benchmark::State& state) {
 }
 
 void SetArgs(benchmark::internal::Benchmark* bench) {
+  bench->ArgNames({"size", "inverse_null_proportion"});
+
   for (const auto inverse_null_proportion : std::vector<int64_t>({100, 0})) {
     bench->Args({static_cast<int64_t>(kL2Size), inverse_null_proportion});
   }
diff --git a/cpp/src/arrow/compute/kernels/scalar_round_benchmark.cc 
b/cpp/src/arrow/compute/kernels/scalar_round_benchmark.cc
new file mode 100644
index 00..dd9ba04a0e
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/scalar_round_benchmark.cc
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include 
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/test_util.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/util/benchmark_util.h"
+
+namespace arrow {
+namespace compute {
+
+// Use a fixed hash to ensure consistent results from run to run.
+constexpr auto kSeed = 0x94378165;
+
+template 
+static void RoundArrayBenchmark(benchmark::State& state, const std::string& 
func_name) {
+  RegressionArgs args(state);
+
+  const int64_t array_size = args.size / sizeof(CType);
+  auto rand = random::RandomArrayGenerator(kSeed);
+
+  // Choose values so as to avoid overflow on all ops and types.
+  auto min = static_cast<CType>(6);
+  auto max = static_cast<CType>(min + 15);
+  auto val = std::static_pointer_cast<NumericArray<ArrowType>>(
+      rand.Numeric<ArrowType>(array_size, min, max, args.null_proportion));
+  RoundOptions options;
+  options.round_mode = static_cast<RoundMode>(Mode);
+
+  for (auto _ : state) {
+    ABORT_NOT_OK(CallFunction(func_name, {val}, &options));
+  }
+  state.SetItemsProcessed(state.iterations() * array_size);
+}
+
+void SetRoundArgs(benchmark::internal::Benchmark* bench) {
+  bench->ArgNames({"size", "inverse_null_proportion"});
+
+  for (const auto inverse_null_proportion : std::vector<int64_t>({100, 0})) {
+    bench->Args({static_cast<int64_t>(kL2Size), inverse_null_proportion});
+  }
+}
+
+template 
+static void Ceil(benchmark::State& state) {
+  

[arrow] branch master updated (7e02fde652 -> 33d677c480)

2023-01-09 Thread yibocai
This is an automated email from the ASF dual-hosted git repository.

yibocai pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 7e02fde652 MINOR: [C++] Mark TypeHolder operator bool() as const 
(#15273)
 add 33d677c480 GH-15249:  [Documentation] Add PR template (#15250)

No new revisions were added by this update.

Summary of changes:
 .github/pull_request_template.md  | 62 +++
 dev/release/rat_exclude_files.txt |  1 +
 2 files changed, 63 insertions(+)
 create mode 100644 .github/pull_request_template.md



[arrow] branch ARROW-17715a updated (019740ad9f -> 145e167753)

2023-01-09 Thread kiszk
This is an automated email from the ASF dual-hosted git repository.

kiszk pushed a change to branch ARROW-17715a
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 019740ad9f change LLVM version
 add 145e167753 disable JEMALLOC and PLASMA

No new revisions were added by this update.

Summary of changes:
 .travis.yml | 4 
 1 file changed, 4 insertions(+)



[arrow-datafusion] branch master updated: Remove tests from sql_integration that were ported to sqllogictest (#4836)

2023-01-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new 42f7dd509 Remove tests from sql_integration that were ported to 
sqllogictest  (#4836)
42f7dd509 is described below

commit 42f7dd50913c4f6b6d830b88d55dfd4a8d16d44f
Author: Matt Willian 
AuthorDate: Mon Jan 9 14:13:17 2023 -0800

Remove tests from sql_integration that were ported to sqllogictest  (#4836)

* delete tests duplicated between sqllogictests and aggregates / 
arrow_typeof

* recomment failing test

* remove typo

* add back in test that's broken in sqllogictest

* remove arrow module

Co-authored-by: Matt 
---
 datafusion/core/tests/sql/aggregates.rs|  72 ---
 datafusion/core/tests/sql/arrow_typeof.rs  | 139 -
 datafusion/core/tests/sql/mod.rs   |   1 -
 .../tests/sqllogictests/test_files/aggregate.slt   |  28 ++---
 4 files changed, 13 insertions(+), 227 deletions(-)

diff --git a/datafusion/core/tests/sql/aggregates.rs b/datafusion/core/tests/sql/aggregates.rs
index 89077ae19..9911df9c7 100644
--- a/datafusion/core/tests/sql/aggregates.rs
+++ b/datafusion/core/tests/sql/aggregates.rs
@@ -20,24 +20,6 @@ use datafusion::scalar::ScalarValue;
 use datafusion::test_util::scan_empty;
 use datafusion_common::cast::as_float64_array;
 
-#[tokio::test]
-async fn csv_query_avg_multi_batch() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT avg(c12) FROM aggregate_test_100";
-    let dataframe = ctx.sql(sql).await.unwrap();
-    let results = dataframe.collect().await.unwrap();
-    let batch = &results[0];
-    let column = batch.column(0);
-    let array = as_float64_array(column)?;
-    let actual = array.value(0);
-    let expected = 0.5089725;
-    // Due to float number's accuracy, different batch size will lead to different
-    // answers.
-    assert!((expected - actual).abs() < 0.01);
-    Ok(())
-}
-
 #[tokio::test]
 #[ignore] // https://github.com/apache/arrow-datafusion/issues/3353
 async fn csv_query_approx_count() -> Result<()> {
@@ -120,60 +102,6 @@ async fn csv_query_approx_percentile_cont_with_histogram_bins() -> Result<()> {
     Ok(())
 }
 
-#[tokio::test]
-async fn csv_query_array_agg() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql =
-        "SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 ORDER BY c13 LIMIT 2) test";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+------------------------------------------------------------------+",
-        "| ARRAYAGG(test.c13)                                               |",
-        "+------------------------------------------------------------------+",
-        "| [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB] |",
-        "+------------------------------------------------------------------+",
-    ];
-    assert_batches_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_array_agg_empty() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql =
-        "SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 LIMIT 0) test";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+--------------------+",
-        "| ARRAYAGG(test.c13) |",
-        "+--------------------+",
-        "| []                 |",
-        "+--------------------+",
-    ];
-    assert_batches_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_array_agg_one() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql =
-        "SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 ORDER BY c13 LIMIT 1) test";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----------------------------------+",
-        "| ARRAYAGG(test.c13)               |",
-        "+----------------------------------+",
-        "| [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm] |",
-        "+----------------------------------+",
-    ];
-    assert_batches_eq!(expected, &actual);
-    Ok(())
-}
-
 #[tokio::test]
 async fn csv_query_array_agg_unsupported() -> Result<()> {
     let ctx = SessionContext::new();
diff --git a/datafusion/core/tests/sql/arrow_typeof.rs 
b/datafusion/core/tests/sql/arrow_typeof.rs
deleted file mode 100644
index 4477ad53c..0
--- a/datafusion/core/tests/sql/arrow_typeof.rs
+++ /dev/null
@@ -1,139 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright 

[arrow-datafusion] branch master updated: Fix push_down_projection through a distinct (#4849)

2023-01-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new c4f4dffec Fix push_down_projection through a distinct (#4849)
c4f4dffec is described below

commit c4f4dffec79d3ecc7e177a3d07fcf60312d300db
Author: Jeffrey <22608443+jefff...@users.noreply.github.com>
AuthorDate: Tue Jan 10 09:12:55 2023 +1100

Fix push_down_projection through a distinct (#4849)
---
 datafusion/optimizer/src/push_down_projection.rs | 38 +++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/datafusion/optimizer/src/push_down_projection.rs 
b/datafusion/optimizer/src/push_down_projection.rs
index 482c08fb4..21396821c 100644
--- a/datafusion/optimizer/src/push_down_projection.rs
+++ b/datafusion/optimizer/src/push_down_projection.rs
@@ -374,6 +374,24 @@ fn optimize_plan(
 )?;
 from_plan(plan, (), &[child])
 }
+            // at a distinct, all columns are required
+            LogicalPlan::Distinct(distinct) => {
+                let new_required_columns = distinct
+                    .input
+                    .schema()
+                    .fields()
+                    .iter()
+                    .map(|f| f.qualified_column())
+                    .collect();
+                let child = optimize_plan(
+                    _optimizer,
+                    distinct.input.as_ref(),
+                    &new_required_columns,
+                    has_projection,
+                    _config,
+                )?;
+                from_plan(plan, &[], &[child])
+            }
 // all other nodes: Add any additional columns used by
 // expressions in this node to the list of required columns
 LogicalPlan::Limit(_)
@@ -392,7 +410,6 @@ fn optimize_plan(
 | LogicalPlan::DropView(_)
 | LogicalPlan::SetVariable(_)
 | LogicalPlan::CrossJoin(_)
-| LogicalPlan::Distinct(_)
 | LogicalPlan::Extension { .. }
 | LogicalPlan::Prepare(_) => {
 let expr = plan.expressions();
@@ -1009,6 +1026,25 @@ mod tests {
 Ok(())
 }
 
+    #[test]
+    fn pushdown_through_distinct() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("a"), col("b")])?
+            .distinct()?
+            .project(vec![col("a")])?
+            .build()?;
+
+        let expected = "Projection: test.a\
+        \n  Distinct:\
+        \n    TableScan: test projection=[a, b]";
+
+        assert_optimized_plan_eq(&plan, expected);
+
+        Ok(())
+    }
+
     fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) {
         let optimized_plan = optimize(plan).expect("failed to optimize plan");
         let formatted_plan = format!("{optimized_plan:?}");



[arrow-datafusion] branch master updated: Orthogonalize distribution and sort enforcement rules into `EnforceDistribution` and `EnforceSorting` (#4839)

2023-01-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new ceff6cb44 Orthogonalize distribution and sort enforcement rules into 
`EnforceDistribution` and `EnforceSorting` (#4839)
ceff6cb44 is described below

commit ceff6cb44ad621f89c9c4e9c0bd34cb204246910
Author: Mustafa akur <106137913+mustafasr...@users.noreply.github.com>
AuthorDate: Tue Jan 10 01:10:43 2023 +0300

Orthogonalize distribution and sort enforcement rules into 
`EnforceDistribution` and `EnforceSorting` (#4839)

* Separate sort rule

* Migrate to clearer file name, tidy up comments

* Add a note about tests verifying EnforceDistribution/EnforceSorting 
jointly

* Address review, fix the stale comment

Co-authored-by: Mehmet Ozan Kabak 
---
 datafusion/core/src/execution/context.rs   | 39 ++--
 .../{enforcement.rs => dist_enforcement.rs}| 73 +-
 datafusion/core/src/physical_optimizer/mod.rs  |  4 +-
 .../core/src/physical_optimizer/repartition.rs | 10 ++-
 .../{optimize_sorts.rs => sort_enforcement.rs} | 41 ++--
 datafusion/core/src/physical_plan/limit.rs |  4 ++
 datafusion/expr/src/logical_plan/builder.rs|  3 +-
 7 files changed, 87 insertions(+), 87 deletions(-)
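
A conceptual sketch of the rule ordering this change settles on. The rule names and the reasoning come from the diff below, but the import paths for the trait and for JoinSelection, and the exact constructors, are assumptions on my part; the real list in SessionState contains additional rules (PipelineFixer, PipelineChecker, and others), so treat this as an outline rather than the actual pipeline:

    use std::sync::Arc;
    use datafusion::physical_optimizer::{
        dist_enforcement::EnforceDistribution, global_sort_selection::GlobalSortSelection,
        join_selection::JoinSelection, repartition::Repartition,
        sort_enforcement::EnforceSorting, PhysicalOptimizerRule,
    };

    // Order matters: Repartition may alter partitioning and GlobalSortSelection
    // may alter ordering, so both run before the two enforcement passes that
    // then satisfy any remaining distribution and sort requirements.
    fn sketch_rule_order() -> Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>> {
        vec![
            Arc::new(Repartition::new()),
            Arc::new(GlobalSortSelection::new()),
            Arc::new(JoinSelection::new()),
            Arc::new(EnforceDistribution::new()),
            Arc::new(EnforceSorting::new()),
        ]
    }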

diff --git a/datafusion/core/src/execution/context.rs 
b/datafusion/core/src/execution/context.rs
index 2687902a3..98fe6ff79 100644
--- a/datafusion/core/src/execution/context.rs
+++ b/datafusion/core/src/execution/context.rs
@@ -68,7 +68,7 @@ use crate::physical_optimizer::repartition::Repartition;
 
 use crate::config::ConfigOptions;
 use crate::execution::{runtime_env::RuntimeEnv, FunctionRegistry};
-use crate::physical_optimizer::enforcement::BasicEnforcement;
+use crate::physical_optimizer::dist_enforcement::EnforceDistribution;
 use crate::physical_plan::file_format::{plan_to_csv, plan_to_json, plan_to_parquet};
 use crate::physical_plan::planner::DefaultPhysicalPlanner;
 use crate::physical_plan::udaf::AggregateUDF;
@@ -91,9 +91,9 @@ use crate::catalog::listing_schema::ListingSchemaProvider;
 use crate::datasource::object_store::ObjectStoreUrl;
 use crate::execution::memory_pool::MemoryPool;
 use crate::physical_optimizer::global_sort_selection::GlobalSortSelection;
-use crate::physical_optimizer::optimize_sorts::OptimizeSorts;
 use crate::physical_optimizer::pipeline_checker::PipelineChecker;
 use crate::physical_optimizer::pipeline_fixer::PipelineFixer;
+use crate::physical_optimizer::sort_enforcement::EnforceSorting;
 use datafusion_optimizer::OptimizerConfig;
 use datafusion_sql::planner::object_name_to_table_reference;
 use uuid::Uuid;
@@ -1448,37 +1448,36 @@ impl SessionState {
             // output partitioning of some operators in the plan tree, which will influence
             // other rules. Therefore, it should run as soon as possible. It is optional because:
             // - It's not used for the distributed engine, Ballista.
-            // - It's conflicted with some parts of the BasicEnforcement, since it will
-            //   introduce additional repartitioning while the BasicEnforcement aims at
-            //   reducing unnecessary repartitioning.
+            // - It's conflicted with some parts of the EnforceDistribution, since it will
+            //   introduce additional repartitioning while EnforceDistribution aims to
+            //   reduce unnecessary repartitioning.
             Arc::new(Repartition::new()),
             // - Currently it will depend on the partition number to decide whether to change the
             // single node sort to parallel local sort and merge. Therefore, GlobalSortSelection
             // should run after the Repartition.
             // - Since it will change the output ordering of some operators, it should run
-            // before JoinSelection and BasicEnforcement, which may depend on that.
+            // before JoinSelection and EnforceSorting, which may depend on that.
             Arc::new(GlobalSortSelection::new()),
             // Statistics-based join selection will change the Auto mode to a real join implementation,
-            // like collect left, or hash join, or future sort merge join, which will
-            // influence the BasicEnforcement to decide whether to add additional repartition
-            // and local sort to meet the distribution and ordering requirements.
-            // Therefore, it should run before BasicEnforcement.
+            // like collect left, or hash join, or future sort merge join, which will influence the
+            // EnforceDistribution and EnforceSorting rules as they decide whether to add additional
+            // repartitioning and local sorting steps to meet distribution and ordering requirements.
+            //

[arrow-rs] branch master updated: Fix: Added support to cast string without time (#3494)

2023-01-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
 new fb36dd980 Fix: Added support to cast string without time (#3494)
fb36dd980 is described below

commit fb36dd980b398deabe5547af114982326b97e078
Author: Wenjun L <47608857+csph...@users.noreply.github.com>
AuthorDate: Mon Jan 9 22:59:07 2023 +0100

Fix: Added support to cast string without time (#3494)

* Fix: Added support casting strings without time to timestamp

* Fix: Added support casting strings without time to timestamp

Co-authored-by: Wenjun Liu 
---
 arrow-cast/src/parse.rs | 22 ++
 1 file changed, 22 insertions(+)
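
A usage sketch of the new behavior. The function and module path are real arrow-cast items shown in the diff below; the epoch constant is hand-computed and assumes the date-only string is interpreted as 00:00:00 UTC, matching the added test:

    use arrow_cast::parse::string_to_timestamp_nanos;

    fn main() {
        // Date-only strings now parse as midnight of that day.
        let ns = string_to_timestamp_nanos("2020-09-08").unwrap();
        // 2020-09-08T00:00:00 is 1_599_523_200 seconds after the Unix epoch.
        assert_eq!(ns, 1_599_523_200_000_000_000);
    }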

diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index 6de336351..e885ec5b6 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -37,6 +37,7 @@ use chrono::prelude::*;
 /// * `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified
 /// * `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset
 /// * `1997-01-31 09:26:56`     # close to RFC3339, no fractional seconds
+/// * `1997-01-31`              # close to RFC3339, only date no time
 //
 /// Internally, this function uses the `chrono` library for the
 /// datetime parsing
@@ -121,6 +122,14 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
         return Ok(ts.timestamp_nanos());
     }
 
+    // without a timezone specifier as a local time, only date
+    // Example: 2020-09-08
+    if let Ok(dt) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
+        if let Some(ts) = dt.and_hms_opt(0, 0, 0) {
+            return Ok(ts.timestamp_nanos());
+        }
+    }
+
+
 // Note we don't pass along the error message from the underlying
 // chrono parsing because we tried several different format
 // strings and we don't know which the user was trying to
@@ -494,6 +503,19 @@ mod tests {
 naive_datetime_whole_secs.timestamp_nanos(),
 parse_timestamp("2020-09-08 13:42:29").unwrap()
 );
+
+// ensure without time work
+// no time, should be the nano second at
+// 2020-09-08 0:0:0
+let naive_datetime_no_time = NaiveDateTime::new(
+NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(),
+NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
+);
+
+assert_eq!(
+naive_datetime_no_time.timestamp_nanos(),
+parse_timestamp("2020-09-08").unwrap()
+)
 }
 
 #[test]



[arrow] branch master updated (92895c9b54 -> 7e02fde652)

2023-01-09 Thread apitrou
This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 92895c9b54 MINOR: [Documentation] Fine tune Java IPC doc (#15263)
 add 7e02fde652 MINOR: [C++] Mark TypeHolder operator bool() as const 
(#15273)

No new revisions were added by this update.

Summary of changes:
 cpp/src/arrow/type.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)



[arrow-adbc] branch asf-site updated: publish documentation

2023-01-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new f3b7a41  publish documentation
f3b7a41 is described below

commit f3b7a41e81cf4147d0c1b142f91570915255e1bf
Author: github-actions[bot] 
AuthorDate: Mon Jan 9 16:54:53 2023 +

publish documentation



[arrow-adbc] branch main updated: ci: only run Apache RAT on staged/committed files (#325)

2023-01-09 Thread lidavidm
This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git


The following commit(s) were added to refs/heads/main by this push:
 new a8b2893  ci: only run Apache RAT on staged/committed files (#325)
a8b2893 is described below

commit a8b28933499c5d01030d1d6e01b667a762c4a93d
Author: David Li 
AuthorDate: Mon Jan 9 11:49:52 2023 -0500

ci: only run Apache RAT on staged/committed files (#325)

Fixes #313.
---
 .pre-commit-config.yaml   |  2 +-
 ci/scripts/run_rat_local.sh   | 48 +++
 dev/release/rat_exclude_files.txt |  3 +++
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5bece5d..8523fa2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -82,6 +82,6 @@ repos:
   name: Check for unapproved licenses
   language: script
   pass_filenames: false
-  entry: "./dev/release/run-rat.sh ."
+  entry: "./ci/scripts/run_rat_local.sh"
 
 exclude: "^c/vendor/.*"
diff --git a/ci/scripts/run_rat_local.sh b/ci/scripts/run_rat_local.sh
new file mode 100755
index 000..4a68c99
--- /dev/null
+++ b/ci/scripts/run_rat_local.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Run Apache RAT on the local repository to check for licenses
+
+set -e
+
+main() {
+local -r source_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+local -r source_top_dir="$(cd "${source_dir}/../../" && pwd)"
+
+pushd "${source_top_dir}"
+
+# Generate a temporary archive and validate that, ignores build
+# artifacts (RAT itself doesn't respect .gitignore)
+export ARCHIVE=$(mktemp adbc.rat)
+trap 'rm -f "$ARCHIVE"' ERR EXIT INT TERM
+
+# Need to prefix with directory since check-rat-report.py always removes 
one layer
+if [[ "$(uname)" = "Darwin" ]]; then
+git ls-files | tar -s '|.*|adbc/~|' --files-from=- --create --gzip 
--file="${ARCHIVE}"
+else
+git ls-files | tar --transform='s|^|adbc/|' --files-from=- --create 
--gzip --file="${ARCHIVE}"
+fi
+
+"${source_top_dir}/dev/release/run-rat.sh" "${ARCHIVE}"
+
+popd
+}
+
+main "$@"
diff --git a/dev/release/rat_exclude_files.txt 
b/dev/release/rat_exclude_files.txt
index 2b1587e..9d68138 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -3,6 +3,9 @@ ci/linux-packages/changelog
 ci/linux-packages/*.install
 dev/release/rat_exclude_files.txt
 filtered_rat.txt
+go/adbc/drivermgr/adbc.h
+go/adbc/drivermgr/adbc_driver_manager.cc
+go/adbc/drivermgr/adbc_driver_manager.h
 go/adbc/status_string.go
 go/adbc/go.sum
 rat.txt



[arrow-datafusion] branch master updated: Add DataFrame::into_view instead of implementing TableProvider (#2659) (#4778)

2023-01-09 Thread tustvold
This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new c5e2594e9 Add DataFrame::into_view instead of implementing 
TableProvider (#2659) (#4778)
c5e2594e9 is described below

commit c5e2594e99b01c12d4f6903cb998a62a5479455c
Author: Raphael Taylor-Davies <1781103+tustv...@users.noreply.github.com>
AuthorDate: Mon Jan 9 17:27:41 2023 +0100

Add DataFrame::into_view instead of implementing TableProvider (#2659) 
(#4778)
---
 datafusion/core/src/dataframe.rs| 41 -
 datafusion/core/src/datasource/view.rs  |  4 +--
 datafusion/expr/src/logical_plan/builder.rs | 10 +++
 3 files changed, 35 insertions(+), 20 deletions(-)
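
The API change in a nutshell, as exercised by the updated tests below (a sketch; the wrapper function and the already-built DataFrame are assumed):

    use datafusion::dataframe::DataFrame;
    use datafusion::error::Result;
    use datafusion::prelude::SessionContext;

    async fn register_as_view(ctx: &SessionContext, df: DataFrame) -> Result<()> {
        // into_view() yields an Arc<dyn TableProvider>, replacing the old
        // pattern ctx.register_table("t1", Arc::new(df)) that created a
        // reference cycle (#2659).
        ctx.register_table("t1", df.into_view())?;
        // The view can now be queried like any other table.
        let _df = ctx.table("t1").await?;
        Ok(())
    }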

diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs
index fe417593a..e9773dbdf 100644
--- a/datafusion/core/src/dataframe.rs
+++ b/datafusion/core/src/dataframe.rs
@@ -528,6 +528,15 @@ impl DataFrame {
         self.session_state.optimize(&self.plan)
 }
 
+    /// Converts this [`DataFrame`] into a [`TableProvider`] that can be registered
+    /// as a table view using [`SessionContext::register_table`].
+    ///
+    /// Note: This discards the [`SessionState`] associated with this
+    /// [`DataFrame`] in favour of the one passed to [`TableProvider::scan`]
+    pub fn into_view(self) -> Arc<dyn TableProvider> {
+        Arc::new(DataFrameTableProvider { plan: self.plan })
+    }
+
 /// Return the optimized logical plan represented by this DataFrame.
 ///
 /// Note: This method should not be used outside testing, as it loses the 
snapshot
@@ -766,9 +775,12 @@ impl DataFrame {
 }
 }
 
-// TODO: This will introduce a ref cycle (#2659)
+struct DataFrameTableProvider {
+    plan: LogicalPlan,
+}
+
 #[async_trait]
-impl TableProvider for DataFrame {
+impl TableProvider for DataFrameTableProvider {
     fn as_any(&self) -> &dyn Any {
         self
     }
@@ -796,20 +808,14 @@ impl TableProvider for DataFrame {
 
     async fn scan(
         &self,
-        _state: &SessionState,
+        state: &SessionState,
         projection: Option<&Vec<usize>>,
         filters: &[Expr],
         limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let mut expr = self.clone();
+        let mut expr = LogicalPlanBuilder::from(self.plan.clone());
         if let Some(p) = projection {
-            let schema = TableProvider::schema(&self).project(p)?;
-            let names = schema
-                .fields()
-                .iter()
-                .map(|field| field.name().as_str())
-                .collect::<Vec<_>>();
-            expr = expr.select_columns(names.as_slice())?;
+            expr = expr.select(p.iter().copied())?
         }
 
 // Add filter when given
@@ -817,13 +823,12 @@ impl TableProvider for DataFrame {
 if let Some(filter) = filter {
 expr = expr.filter(filter)?
 }
+        // add a limit if given
         if let Some(l) = limit {
             expr = expr.limit(0, Some(l))?
         }
-        // add a limit if given
-        Self::new(self.session_state.clone(), expr.plan)
-            .create_physical_plan()
-            .await
+        let plan = expr.build()?;
+        state.create_physical_plan(&plan).await
 }
 }
 
@@ -1098,7 +1103,7 @@ mod tests {
 let df_impl = DataFrame::new(ctx.state(), df.plan.clone());
 
 // register a dataframe as a table
-ctx.register_table("test_table", Arc::new(df_impl.clone()))?;
+ctx.register_table("test_table", df_impl.clone().into_view())?;
 
 // pull the table out
 let table = ctx.table("test_table").await?;
@@ -1297,7 +1302,7 @@ mod tests {
 let df = test_table().await?.select_columns(&["c1", "c2", "c3"])?;
 let ctx = SessionContext::new();
 
-let table = Arc::new(df);
+let table = df.into_view();
 ctx.register_table("t1", table.clone())?;
 ctx.register_table("t2", table)?;
 let df = ctx
@@ -1386,7 +1391,7 @@ mod tests {
 )
 .await?;
 
-ctx.register_table("t1", Arc::new(ctx.table("test").await?))?;
+ctx.register_table("t1", ctx.table("test").await?.into_view())?;
 
 let df = ctx
 .table("t1")
diff --git a/datafusion/core/src/datasource/view.rs 
b/datafusion/core/src/datasource/view.rs
index 2d2f33dc2..524ad9f5c 100644
--- a/datafusion/core/src/datasource/view.rs
+++ b/datafusion/core/src/datasource/view.rs
@@ -428,7 +428,7 @@ mod tests {
 )
 .await?;
 
-ctx.register_table("t1", Arc::new(ctx.table("test").await?))?;
+ctx.register_table("t1", ctx.table("test").await?.into_view())?;
 
 ctx.sql("CREATE VIEW t2 as SELECT * FROM t1").await?;
 
@@ -458,7 +458,7 @@ mod tests {
 )
 .await?;
 
-ctx.register_table("t1", Arc::new(ctx.table("test").await?))?;
+ctx.register_table("t1", 

[arrow] branch master updated (154de48f0b -> 92895c9b54)

2023-01-09 Thread lidavidm
This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 154de48f0b GH-14976: [Python] Avoid dependency on exec plan in 
Table.sort_by to fix minimal tests (#15268)
 add 92895c9b54 MINOR: [Documentation] Fine tune Java IPC doc (#15263)

No new revisions were added by this update.

Summary of changes:
 docs/source/java/ipc.rst | 149 ++-
 1 file changed, 82 insertions(+), 67 deletions(-)



[arrow-datafusion] branch master updated: Document ability to select directly from files in datafusion-cli (#4851)

2023-01-09 Thread jakevin
This is an automated email from the ASF dual-hosted git repository.

jakevin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new 4bea81b5d Document ability to select directly from files in 
datafusion-cli (#4851)
4bea81b5d is described below

commit 4bea81b5d1c7b2f81cc6c140abc7d927220bec91
Author: Andrew Lamb 
AuthorDate: Mon Jan 9 09:16:21 2023 -0500

Document ability to select directly from files in datafusion-cli (#4851)

* Document ability to select directly from files in datafusion-cli

* prettier

* Update docs/source/user-guide/cli.md

Co-authored-by: Liang-Chi Hsieh 

Co-authored-by: Liang-Chi Hsieh 
---
 docs/source/user-guide/cli.md | 63 +++
 1 file changed, 46 insertions(+), 17 deletions(-)

diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md
index 3a4c453a7..d3512a6dc 100644
--- a/docs/source/user-guide/cli.md
+++ b/docs/source/user-guide/cli.md
@@ -19,30 +19,51 @@
 
 # DataFusion Command-line SQL Utility
 
-The DataFusion CLI is a command-line interactive SQL utility that allows
-queries to be executed against any supported data files. It is a convenient 
way to
+The DataFusion CLI is a command-line interactive SQL utility for executing
+queries against any supported data files. It is a convenient way to
 try DataFusion out with your own data sources, and test out its SQL support.
 
 ## Example
 
 Create a CSV file to query.
 
-```bash
-$ echo "1,2" > data.csv
+```shell
+$ echo "a,b" > data.csv
+$ echo "1,2" >> data.csv
 ```
 
-```bash
+Query that single file (the CLI also supports parquet, compressed csv, avro, 
json and more)
+
+```shell
 $ datafusion-cli
-DataFusion CLI v12.0.0
-❯ CREATE EXTERNAL TABLE foo STORED AS CSV LOCATION 'data.csv';
-0 rows in set. Query took 0.017 seconds.
-❯ select * from foo;
-+--+--+
-| column_1 | column_2 |
-+--+--+
-| 1| 2|
-+--+--+
-1 row in set. Query took 0.012 seconds.
+DataFusion CLI v17.0.0
+❯ select * from 'data.csv';
++---+---+
+| a | b |
++---+---+
+| 1 | 2 |
++---+---+
+1 row in set. Query took 0.007 seconds.
+```
+
+You can also query directories of files with compatible schemas:
+
+```shell
+$ ls data_dir/
+data.csv   data2.csv
+```
+
+```shell
+$ datafusion-cli
+DataFusion CLI v16.0.0
+❯ select * from 'data_dir';
++---+---+
+| a | b |
++---+---+
+| 3 | 4 |
+| 1 | 2 |
++---+---+
+2 rows in set. Query took 0.007 seconds.
 ```
 
 ## Installation
@@ -87,6 +108,8 @@ docker run -it -v $(your_data_location):/data datafusion-cli
 
 ## Usage
 
+See the current usage using `datafusion-cli --help`:
+
 ```bash
 Apache Arrow 
 Command Line Client for DataFusion query engine.
@@ -104,10 +127,16 @@ OPTIONS:
 -q, --quiet  Reduce printing other than the results 
and work quietly
 -r, --rc ... Run the provided files on startup instead 
of ~/.datafusionrc
 -V, --versionPrint version information
-
-Type `exit` or `quit` to exit the CLI.
 ```
 
+## Selecting files directly
+
+Files can be queried directly by enclosing the file or
+directory name in single `'` quotes as shown in the example.
+
+It is also possible to create a table backed by files explicitly
+via `CREATE EXTERNAL TABLE` as shown below.
+
 ## Registering Parquet Data Sources
 
 Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` 
SQL statement. It is not necessary to provide schema information for Parquet 
files.



[arrow] branch master updated (5f3df1255e -> 154de48f0b)

2023-01-09 Thread kou
This is an automated email from the ASF dual-hosted git repository.

kou pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 5f3df1255e ARROW-16728: [Python] ParquetDataset to still take legacy 
code path when old filesystem is passed (#15269)
 add 154de48f0b GH-14976: [Python] Avoid dependency on exec plan in 
Table.sort_by to fix minimal tests (#15268)

No new revisions were added by this update.

Summary of changes:
 python/pyarrow/table.pxi | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)



[arrow] branch master updated (0121ae73f4 -> 5f3df1255e)

2023-01-09 Thread kou
This is an automated email from the ASF dual-hosted git repository.

kou pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 0121ae73f4 MINOR: [Java] Doc & javadoc improvements (#15222)
 add 5f3df1255e ARROW-16728: [Python] ParquetDataset to still take legacy 
code path when old filesystem is passed (#15269)

No new revisions were added by this update.

Summary of changes:
 python/pyarrow/parquet/core.py | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)



[arrow-rs] branch dependabot/cargo/master/base64-0.21 created (now 094c09ec1)

2023-01-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/master/base64-0.21
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


  at 094c09ec1 Update base64 requirement from 0.20 to 0.21

No new revisions were added by this update.



[arrow] branch master updated (11d286eafb -> 0121ae73f4)

2023-01-09 Thread lidavidm
This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 11d286eafb GH-15254: [GLib] garrow_execute_plain_wait() checks the 
finished status (#15255)
 add 0121ae73f4 MINOR: [Java] Doc & javadoc improvements (#15222)

No new revisions were added by this update.

Summary of changes:
 docs/source/java/quickstartguide.rst  | 4 ++--
 .../src/main/java/org/apache/arrow/vector/BaseIntVector.java  | 8 
 2 files changed, 6 insertions(+), 6 deletions(-)



[arrow] branch master updated: GH-15254: [GLib] garrow_execute_plain_wait() checks the finished status (#15255)

2023-01-09 Thread kou
This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 11d286eafb GH-15254: [GLib] garrow_execute_plain_wait() checks the 
finished status (#15255)
11d286eafb is described below

commit 11d286eafb72b630baf897e619c84ecdfc6b723f
Author: Sutou Kouhei 
AuthorDate: Mon Jan 9 21:59:25 2023 +0900

GH-15254: [GLib] garrow_execute_plain_wait() checks the finished status 
(#15255)


* Closes: #15254

Authored-by: Sutou Kouhei 
Signed-off-by: Sutou Kouhei 
---
 c_glib/arrow-glib/compute.cpp | 9 +++--
 c_glib/arrow-glib/compute.h   | 5 +++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp
index 3554fdf115..41deee9b6e 100644
--- a/c_glib/arrow-glib/compute.cpp
+++ b/c_glib/arrow-glib/compute.cpp
@@ -1931,16 +1931,21 @@ garrow_execute_plan_stop(GArrowExecutePlan *plan)
 /**
  * garrow_execute_plan_wait:
  * @plan: A #GArrowExecutePlan.
+ * @error: (nullable): Return location for a #GError or %NULL.
  *
  * Waits for finishing this plan.
  *
+ * Returns: %TRUE on success, %FALSE on error.
+ *
  * Since: 6.0.0
  */
-void
-garrow_execute_plan_wait(GArrowExecutePlan *plan)
+gboolean
+garrow_execute_plan_wait(GArrowExecutePlan *plan, GError **error)
 {
   auto arrow_plan = garrow_execute_plan_get_raw(plan);
   arrow_plan->finished().Wait();
+  return garrow::check(error, arrow_plan->finished().status(),
+   "[execute-plan][wait]");
 }
 
 
diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h
index a9ba6c2af9..360ae3857e 100644
--- a/c_glib/arrow-glib/compute.h
+++ b/c_glib/arrow-glib/compute.h
@@ -352,8 +352,9 @@ GARROW_AVAILABLE_IN_6_0
 void
 garrow_execute_plan_stop(GArrowExecutePlan *plan);
 GARROW_AVAILABLE_IN_6_0
-void
-garrow_execute_plan_wait(GArrowExecutePlan *plan);
+gboolean
+garrow_execute_plan_wait(GArrowExecutePlan *plan,
+ GError **error);
 
 
 GArrowCastOptions *garrow_cast_options_new(void);



[arrow] branch master updated (878d5cac09 -> 4d31b1ef70)

2023-01-09 Thread kou
This is an automated email from the ASF dual-hosted git repository.

kou pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 878d5cac09 ARROW-15470: [R] Set null value in CSV writer (#14679)
 add 4d31b1ef70 GH-15218:  [Python] Remove auto generated pyarrow_api.h and 
pyarrow_lib.h (#15219)

No new revisions were added by this update.

Summary of changes:
 python/CMakeLists.txt |  48 --
 python/pyarrow/src/arrow/python/pyarrow_api.h | 224 +-
 python/pyarrow/src/arrow/python/pyarrow_lib.h |  67 +---
 3 files changed, 36 insertions(+), 303 deletions(-)



[GitHub] [arrow-site] kou commented on a diff in pull request #295: [Website] Add links to UKV

2023-01-09 Thread GitBox


kou commented on code in PR #295:
URL: https://github.com/apache/arrow-site/pull/295#discussion_r1064587271


##
powered_by.md:
##
@@ -184,6 +184,14 @@ short description of your use case.
   Database Connectivity (ODBC) interface. It provides the ability to return
   Arrow Tables and RecordBatches in addition to the Python Database API
   Specification 2.0.
+* **[UKV][45]:** Open NoSQL binary database interface, with support for
+  LevelDB, RocksDB, UDisk, and in-memory Key-Value Stores. It extends
+  their functionality to support Document Collections, Graphs, and Vector
+  Search, similar to RedisJSON, RedisGraph, and RediSearch, and brings
+  familiar structured bindings on top, mimicking tools like Pandas and 
NetworkX.

Review Comment:
   ```suggestion
 familiar structured bindings on top, mimicking tools like pandas and 
NetworkX.
   ```



##
use_cases.md:
##
@@ -64,7 +64,9 @@ The Arrow format also defines a [C data interface]({% 
post_url 2020-05-04-introd
 which allows zero-copy data sharing inside a single process without any
 build-time or link-time dependency requirements. This allows, for example,
 [R users to access `pyarrow`-based projects]({{ site.baseurl 
}}/docs/r/articles/python.html)
-using the `reticulate` package.
+using the `reticulate` package. Similarly, it empowers 
[UKV](https://unum.cloud/ukv)
+to forward persisted data from RocksDB, LevelDB, and UDisk, into Python
+runtime and `pyarrow` without copies.

Review Comment:
   Could you revert this? It seems that we use use cases only in Apache Arrow 
project.



##
use_cases.md:
##
@@ -81,7 +83,8 @@ and [others]({{ site.baseurl }}/powered_by/) also use Arrow 
similarly.
 
 The Arrow project also defines [Flight]({% post_url 
2019-09-30-introducing-arrow-flight %}),
 a client-server RPC framework to build rich services exchanging data according
-to application-defined semantics.
+to application-defined semantics. Flight RPC is used by 
[UKV](https://unum.cloud/ukv)
+to exchange tables, documents, and graphs, between server application and 
client SDKs.

Review Comment:
   Could you revert this? We refer the `powered_by/` page in the above 
paragraph. UKV is introduced in the page.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



[arrow] branch master updated: ARROW-15470: [R] Set null value in CSV writer (#14679)

2023-01-09 Thread thisisnic
This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 878d5cac09 ARROW-15470: [R] Set null value in CSV writer (#14679)
878d5cac09 is described below

commit 878d5cac09073cee72de000a7e8418ce8a3a31b8
Author: Will Jones 
AuthorDate: Mon Jan 9 04:19:15 2023 -0800

ARROW-15470: [R] Set null value in CSV writer (#14679)

Authored-by: Will Jones 
Signed-off-by: Nic Crane 
---
 r/R/csv.R| 29 +---
 r/R/dataset-format.R | 32 +-
 r/man/CsvReadOptions.Rd  |  2 ++
 r/man/write_csv_arrow.Rd |  4 
 r/src/csv.cpp|  1 +
 r/tests/testthat/_snaps/dataset-write.md |  2 +-
 r/tests/testthat/test-csv.R  | 39 
 r/tests/testthat/test-dataset-csv.R  | 22 ++
 8 files changed, 116 insertions(+), 15 deletions(-)

diff --git a/r/R/csv.R b/r/R/csv.R
index fef8723fb2..6f53a060f5 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -418,6 +418,8 @@ CsvTableReader$create <- function(file,
 #' The `CsvWriteOptions$create()` factory method takes the following arguments:
 #' - `include_header` Whether to write an initial header line with column names
 #' - `batch_size` Maximum number of rows processed at a time. Default is 1024.
+#' - `null_string` The string to be written for null values. Must not contain
+#'   quotation marks. Default is an empty string (`""`).
 #'
 #' @section Active bindings:
 #'
@@ -455,25 +457,32 @@ CsvReadOptions$create <- function(use_threads = 
option_use_threads(),
   options
 }
 
-readr_to_csv_write_options <- function(include_header,
-   batch_size = 1024L) {
-  assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
-  assert_that(is.logical(include_header))
+readr_to_csv_write_options <- function(include_header = TRUE,
+   batch_size = 1024L,
+   na = "") {
   CsvWriteOptions$create(
 include_header = include_header,
-batch_size = as.integer(batch_size)
+batch_size = batch_size,
+null_string = na
   )
 }
 
 #' @rdname CsvReadOptions
 #' @export
 CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) {
+CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, 
null_string = "") {
   assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+  assert_that(is.logical(include_header))
+  assert_that(is.character(null_string))
+  assert_that(!is.na(null_string))
+  assert_that(length(null_string) == 1)
+  assert_that(!grepl('"', null_string), msg = "na argument must not contain 
quote characters.")
+
   csv___WriteOptions__initialize(
 list(
   include_header = include_header,
-  batch_size = as.integer(batch_size)
+  batch_size = as.integer(batch_size),
+  null_string = as.character(null_string)
 )
   )
 }
@@ -665,6 +674,8 @@ readr_to_csv_convert_options <- function(na,
 #' @param col_names identical to `include_header`. Specify this or
 #' `include_headers`, not both.
 #' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#' @param na value to write for NA values. Must not contain quote marks. Default
+#' is `""`.
 #' @param write_options see [file reader options][CsvWriteOptions]
 #' @param ... additional parameters
 #'
@@ -682,6 +693,7 @@ write_csv_arrow <- function(x,
 include_header = TRUE,
 col_names = NULL,
 batch_size = 1024L,
+na = "",
 write_options = NULL,
 ...) {
   unsupported_passed_args <- names(list(...))
@@ -723,7 +735,8 @@ write_csv_arrow <- function(x,
   if (is.null(write_options)) {
 write_options <- readr_to_csv_write_options(
   include_header = include_header,
-  batch_size = batch_size
+  batch_size = batch_size,
+  na = na
 )
   }
 
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index aacde187c4..c1d2730bb6 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -452,7 +452,10 @@ FileWriteOptions <- R6Class("FileWriteOptions",
 "null_fallback"
   )
 } else if (format == "csv") {
-  supported_args <- names(formals(CsvWriteOptions$create))
+  supported_args <- c(
+names(formals(CsvWriteOptions$create)),
+names(formals(readr_to_csv_write_options))
+  )
 }
 
 unsupported_passed_args <- setdiff(passed_args, supported_args)
@@ -470,7 +473,7 @@ 
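A rough Rust analog of the new `na` option, for readers following the same feature from the arrow-rs side: a minimal sketch against the `arrow` crate's CSV writer, assuming its `WriterBuilder::with_null` option maps onto the C++ `null_string`; the file name and the "NA" token are illustrative.

use std::{fs::File, sync::Arc};

use arrow::array::{ArrayRef, Int32Array};
use arrow::csv::WriterBuilder;
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One nullable column so the null representation is visible in the output.
    let col: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
    let batch = RecordBatch::try_from_iter(vec![("a", col)])?;

    // Write nulls as "NA" instead of the default empty string, the same
    // effect as write_csv_arrow(x, ..., na = "NA") in the R bindings.
    let file = File::create("out.csv")?;
    let mut writer = WriterBuilder::new().with_null("NA".to_string()).build(file);
    writer.write(&batch)?;
    Ok(())
}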

[arrow] branch master updated (37f5a3584a -> 211925c92e)

2023-01-09 Thread thisisnic
This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 37f5a3584a GH-15259:  [CI] component assignment fails due to typo 
(#15260)
 add 211925c92e ARROW-15812: [R] Accept col_names in open_dataset for CSV 
(#14705)

No new revisions were added by this update.

Summary of changes:
 r/R/csv.R   |  2 +-
 r/R/dataset-format.R| 78 -
 r/tests/testthat/test-dataset-csv.R | 60 +++-
 3 files changed, 111 insertions(+), 29 deletions(-)



[arrow-rs] branch master updated: feat: add `parquet-rewrite` CLI (#3477)

2023-01-09 Thread tustvold
This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
 new 592d7a360 feat: add `parquet-rewrite` CLI (#3477)
592d7a360 is described below

commit 592d7a3601b1b7876ab5753abde66113f1a9dc23
Author: Marco Neumann 
AuthorDate: Mon Jan 9 11:57:52 2023 +0100

feat: add `parquet-rewrite` CLI (#3477)

* feat: add `parquet-rewrite` CLI

Closes #3476.

* refactor: init ArrowWriter early
---
 parquet/Cargo.toml |   4 +
 parquet/src/bin/parquet-rewrite.rs | 293 +
 2 files changed, 297 insertions(+)

diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 7a76ff64e..2aa744978 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -118,6 +118,10 @@ required-features = ["arrow"]
 name = "parquet-read"
 required-features = ["cli"]
 
+[[bin]]
+name = "parquet-rewrite"
+required-features = ["arrow", "cli"]
+
 [[bin]]
 name = "parquet-schema"
 required-features = ["cli"]
diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs
new file mode 100644
index 0..cd60225ca
--- /dev/null
+++ b/parquet/src/bin/parquet-rewrite.rs
@@ -0,0 +1,293 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Binary file to rewrite parquet files.
+//!
+//! # Install
+//!
+//! `parquet-rewrite` can be installed using `cargo`:
+//! ```
+//! cargo install parquet --features=cli
+//! ```
+//! After this `parquet-rewrite` should be available:
+//! ```
+//! parquet-rewrite -i XYZ.parquet -o XYZ2.parquet
+//! ```
+//!
+//! The binary can also be built from the source code and run as follows:
+//! ```
+//! cargo run --features=cli --bin parquet-rewrite -- -i XYZ.parquet -o XYZ2.parquet
+//! ```
+
+use std::fs::File;
+
+use arrow_array::RecordBatchReader;
+use clap::{builder::PossibleValue, Parser, ValueEnum};
+use parquet::{
+arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter},
+basic::Compression,
+file::{
+properties::{EnabledStatistics, WriterProperties, WriterVersion},
+reader::FileReader,
+serialized_reader::SerializedFileReader,
+},
+};
+
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)]
+enum CompressionArgs {
+/// No compression.
+None,
+
+/// Snappy
+Snappy,
+
+/// GZip
+Gzip,
+
+/// LZO
+Lzo,
+
+/// Brotli
+Brotli,
+
+/// LZ4
+Lz4,
+
+/// Zstd
+Zstd,
+
+/// LZ4 Raw
+Lz4Raw,
+}
+
+impl From<CompressionArgs> for Compression {
+fn from(value: CompressionArgs) -> Self {
+match value {
+CompressionArgs::None => Self::UNCOMPRESSED,
+CompressionArgs::Snappy => Self::SNAPPY,
+CompressionArgs::Gzip => Self::GZIP,
+CompressionArgs::Lzo => Self::LZO,
+CompressionArgs::Brotli => Self::BROTLI,
+CompressionArgs::Lz4 => Self::LZ4,
+CompressionArgs::Zstd => Self::ZSTD,
+CompressionArgs::Lz4Raw => Self::LZ4_RAW,
+}
+}
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)]
+enum EnabledStatisticsArgs {
+/// Compute no statistics
+None,
+
+/// Compute chunk-level statistics but not page-level
+Chunk,
+
+/// Compute page-level and chunk-level statistics
+Page,
+}
+
+impl From<EnabledStatisticsArgs> for EnabledStatistics {
+fn from(value: EnabledStatisticsArgs) -> Self {
+match value {
+EnabledStatisticsArgs::None => Self::None,
+EnabledStatisticsArgs::Chunk => Self::Chunk,
+EnabledStatisticsArgs::Page => Self::Page,
+}
+}
+}
+
+#[derive(Clone, Copy, Debug)]
+enum WriterVersionArgs {
+Parquet1_0,
+Parquet2_0,
+}
+
+impl ValueEnum for WriterVersionArgs {
+fn value_variants<'a>() -> &'a [Self] {
+&[Self::Parquet1_0, Self::Parquet2_0]
+}
+
+fn to_possible_value(&self) -> Option<PossibleValue> {
+match self {
+WriterVersionArgs::Parquet1_0 => Some(PossibleValue::new("1.0")),
+WriterVersionArgs::Parquet2_0 => Some(PossibleValue::new("2.0")),
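Stripped of the CLI parsing, the loop such a rewrite tool is built around is: decode record batches from the input file, then re-encode them under new writer properties. A minimal sketch, assuming defaults are acceptable; the paths and the Snappy codec are placeholders.

use std::fs::File;

use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter};
use parquet::basic::Compression;
use parquet::file::properties::WriterProperties;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build an Arrow record batch reader over the input file.
    let input = File::open("in.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(input)?;
    let schema = builder.schema().clone();
    let reader = builder.build()?;

    // Re-encode every batch under new writer properties.
    let props = WriterProperties::builder()
        .set_compression(Compression::SNAPPY)
        .build();
    let output = File::create("out.parquet")?;
    let mut writer = ArrowWriter::try_new(output, schema, Some(props))?;
    for batch in reader {
        writer.write(&batch?)?;
    }
    writer.close()?;
    Ok(())
}

Any knob the WriterProperties builder exposes (compression, writer version, statistics level) can be swapped in the same way.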

[arrow-rs] branch master updated: feat: Allow providing a service account key directly for GCS (#3489)

2023-01-09 Thread tustvold
This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
 new eae993fd1 feat: Allow providing a service account key directly for GCS 
(#3489)
eae993fd1 is described below

commit eae993fd196d0a8df8a90857bc4a7ae8f5a3e845
Author: Sean Smith 
AuthorDate: Mon Jan 9 04:25:29 2023 -0600

feat: Allow providing a service account key directly for GCS (#3489)

* feat: Allow providing a service account key directly for GCP

Use case:

We're storing service account keys external to where the object store client is
being created. We do not want to have to write the key to a file before creating
the object store client. This change allows for providing the key directly.

* Add additional aliases for specifying service account path

"google_service_account_path" and "service_account_path" can now be used.

* Add test asserting aliases set appropriate config option
---
 object_store/src/gcp/mod.rs | 144 +++-
 1 file changed, 128 insertions(+), 16 deletions(-)

diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs
index 177812fa8..28972c4a6 100644
--- a/object_store/src/gcp/mod.rs
+++ b/object_store/src/gcp/mod.rs
@@ -121,8 +121,13 @@ enum Error {
 #[snafu(display("Missing bucket name"))]
 MissingBucketName {},
 
-#[snafu(display("Missing service account path"))]
-MissingServiceAccountPath,
+#[snafu(display("Missing service account path or key"))]
+MissingServiceAccountPathOrKey,
+
+#[snafu(display(
+"One of service account path or service account key may be provided."
+))]
+ServiceAccountPathAndKeyProvided,
 
 #[snafu(display("GCP credential error: {}", source))]
 Credential { source: credential::Error },
@@ -800,14 +805,15 @@ pub struct GoogleCloudStorageBuilder {
bucket_name: Option<String>,
url: Option<String>,
service_account_path: Option<String>,
+service_account_key: Option<String>,
 retry_config: RetryConfig,
 client_options: ClientOptions,
 }
 
 /// Configuration keys for [`GoogleCloudStorageBuilder`]
 ///
-/// Configuration via keys can be dome via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option)
-/// or [`with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder.
+/// Configuration via keys can be done via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option)
+/// or [`try_with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder.
 ///
 /// # Example
 /// ```
@@ -835,8 +841,17 @@ pub enum GoogleConfigKey {
 /// Supported keys:
 /// - `google_service_account`
 /// - `service_account`
+/// - `google_service_account_path`
+/// - `service_account_path`
 ServiceAccount,
 
+/// The serialized service account key.
+///
+/// Supported keys:
+/// - `google_service_account_key`
+/// - `service_account_key`
+ServiceAccountKey,
+
 /// Bucket name
 ///
 /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details.
@@ -853,6 +868,7 @@ impl AsRef<str> for GoogleConfigKey {
fn as_ref(&self) -> &str {
 match self {
 Self::ServiceAccount => "google_service_account",
+Self::ServiceAccountKey => "google_service_account_key",
 Self::Bucket => "google_bucket",
 }
 }
@@ -863,7 +879,13 @@ impl FromStr for GoogleConfigKey {
 
fn from_str(s: &str) -> Result<Self, Self::Err> {
 match s {
-"google_service_account" | "service_account" => 
Ok(Self::ServiceAccount),
+"google_service_account"
+| "service_account"
+| "google_service_account_path"
+| "service_account_path" => Ok(Self::ServiceAccount),
+"google_service_account_key" | "service_account_key" => {
+Ok(Self::ServiceAccountKey)
+}
 "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" 
=> {
 Ok(Self::Bucket)
 }
@@ -877,6 +899,7 @@ impl Default for GoogleCloudStorageBuilder {
 Self {
 bucket_name: None,
 service_account_path: None,
+service_account_key: None,
 retry_config: Default::default(),
 client_options: ClientOptions::new().with_allow_http(true),
 url: None,
@@ -894,13 +917,17 @@ impl GoogleCloudStorageBuilder {
 ///
 /// Variables extracted from environment:
 /// * GOOGLE_SERVICE_ACCOUNT: location of service account file
+/// * GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file
 /// * SERVICE_ACCOUNT: (alias) location of service account file
+/// * GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key
+/// * GOOGLE_BUCKET: bucket name
+/// * 
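Exercising the new option through the config-key interface looks roughly like this; a minimal sketch assuming `GoogleConfigKey` is publicly exported, with a placeholder bucket name and key path.

use object_store::gcp::{GoogleCloudStorage, GoogleCloudStorageBuilder, GoogleConfigKey};

fn build_store() -> Result<GoogleCloudStorage, Box<dyn std::error::Error>> {
    // Load the serialized service account key from wherever it is kept; a file
    // is used here, but the point of the change is that no file is required.
    let key_json = std::fs::read_to_string("service-account.json")?;

    let store = GoogleCloudStorageBuilder::new()
        .with_bucket_name("my-bucket")
        .try_with_option(GoogleConfigKey::ServiceAccountKey, key_json)?
        .build()?;
    Ok(store)
}

The same value can instead come from the GOOGLE_SERVICE_ACCOUNT_KEY environment variable listed above when the builder is constructed with from_env().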

[arrow] branch ARROW-17715a updated (194f3f249a -> 019740ad9f)

2023-01-09 Thread kiszk
This is an automated email from the ASF dual-hosted git repository.

kiszk pushed a change to branch ARROW-17715a
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 194f3f249a add CLANG_TOOLS
 add 019740ad9f change LLVM version

No new revisions were added by this update.

Summary of changes:
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)



[GitHub] [arrow-site] github-actions[bot] commented on pull request #295: [Website] Add links to UKV

2023-01-09 Thread GitBox


github-actions[bot] commented on PR #295:
URL: https://github.com/apache/arrow-site/pull/295#issuecomment-1375332242

   
   
   Thanks for opening a pull request!

   Could you open an issue for this pull request on JIRA?
   https://issues.apache.org/jira/browse/ARROW

   Then could you also rename the pull request title in the following format?

   ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY}

   See also:

 * [Other pull requests](https://github.com/apache/arrow-site/pulls/)
 * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches)





[GitHub] [arrow-site] ashvardanian opened a new pull request, #295: [Website] Add links to UKV

2023-01-09 Thread GitBox


ashvardanian opened a new pull request, #295:
URL: https://github.com/apache/arrow-site/pull/295

We have been integrating Apache Arrow across all of our projects during 2022 and are hoping to share them with the broader community.





[arrow] branch ARROW-17715a updated (0f753003a1 -> 194f3f249a)

2023-01-09 Thread kiszk
This is an automated email from the ASF dual-hosted git repository.

kiszk pushed a change to branch ARROW-17715a
in repository https://gitbox.apache.org/repos/asf/arrow.git


from 0f753003a1 disable to build COMPUTE and GANDIVA
 add 194f3f249a add CLANG_TOOLS

No new revisions were added by this update.

Summary of changes:
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)



[arrow] branch ARROW-17715a created (now 0f753003a1)

2023-01-09 Thread kiszk
This is an automated email from the ASF dual-hosted git repository.

kiszk pushed a change to branch ARROW-17715a
in repository https://gitbox.apache.org/repos/asf/arrow.git


  at 0f753003a1 disable to build COMPUTE and GANDIVA

This branch includes the following new commits:

 new 0f753003a1 disable to build COMPUTE and GANDIVA

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




[arrow] 01/01: disable to build COMPUTE and GANDIVA

2023-01-09 Thread kiszk
This is an automated email from the ASF dual-hosted git repository.

kiszk pushed a commit to branch ARROW-17715a
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 0f753003a1557c4e554ea879464ca001888e5c2f
Author: Kazuaki Ishizaki 
AuthorDate: Mon Jan 9 04:01:38 2023 -0500

disable to build COMPUTE and GANDIVA

reduce parallelism to 1
---
 .travis.yml | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a96e07f0c4..b508d60609 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -93,14 +93,16 @@ jobs:
 # aws-sdk-cpp.
 DOCKER_RUN_ARGS: >-
   "
+  -e ARROW_COMPUTE=OFF
   -e ARROW_FLIGHT=ON
+  -e ARROW_GANDIVA=OFF
   -e ARROW_GCS=OFF
   -e ARROW_MIMALLOC=OFF
   -e ARROW_ORC=OFF
   -e ARROW_PARQUET=OFF
   -e ARROW_S3=OFF
   -e ARROW_SUBSTRAIT=OFF
-  -e CMAKE_BUILD_PARALLEL_LEVEL=2
+  -e CMAKE_BUILD_PARALLEL_LEVEL=1
   -e CMAKE_UNITY_BUILD=ON
   -e PARQUET_BUILD_EXAMPLES=OFF
   -e PARQUET_BUILD_EXECUTABLES=OFF
@@ -144,14 +146,16 @@ jobs:
 # aws-sdk-cpp.
 DOCKER_RUN_ARGS: >-
   "
+  -e ARROW_COMPUTE=OFF
   -e ARROW_FLIGHT=ON
+  -e ARROW_GANDIVA=OFF
   -e ARROW_GCS=OFF
   -e ARROW_MIMALLOC=OFF
   -e ARROW_ORC=OFF
   -e ARROW_PARQUET=OFF
   -e ARROW_PYTHON=ON
   -e ARROW_S3=OFF
-  -e CMAKE_BUILD_PARALLEL_LEVEL=2
+  -e CMAKE_BUILD_PARALLEL_LEVEL=1
   -e CMAKE_UNITY_BUILD=ON
   -e PARQUET_BUILD_EXAMPLES=OFF
   -e PARQUET_BUILD_EXECUTABLES=OFF



[arrow-datafusion] branch dependabot/cargo/master/arrow-schema-30.0.1 created (now ab918a39a)

2023-01-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/master/arrow-schema-30.0.1
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


  at ab918a39a Update arrow-schema requirement from 29.0.0 to 30.0.1

No new revisions were added by this update.



[arrow-datafusion] branch dependabot/cargo/master/arrow-30.0.1 created (now 14c69e06f)

2023-01-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/master/arrow-30.0.1
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


  at 14c69e06f Update arrow requirement from 29.0.0 to 30.0.1

No new revisions were added by this update.



[arrow-datafusion] branch dependabot/cargo/master/arrow-buffer-30.0.1 created (now 4ff8b26ee)

2023-01-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/master/arrow-buffer-30.0.1
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


  at 4ff8b26ee Update arrow-buffer requirement from 29.0.0 to 30.0.1

No new revisions were added by this update.



[arrow-datafusion] branch dependabot/cargo/master/parquet-30.0.1 created (now 2443b1ecb)

2023-01-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/master/parquet-30.0.1
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


  at 2443b1ecb Update parquet requirement from 29.0.0 to 30.0.1

No new revisions were added by this update.