[arrow-ballista] branch master updated: Fix documentation example (#288)

agrove Thu, 29 Sep 2022 06:53:21 -0700

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-ballista.git



The following commit(s) were added to refs/heads/master by this push:
     new 093909ee Fix documentation example (#288)
093909ee is described below

commit 093909eed5d23c3d3559216edd67b8526db9fbd9
Author: Stefan Stanciulescu <[email protected]>
AuthorDate: Thu Sep 29 06:53:07 2022 -0700

    Fix documentation example (#288)
    
    * Replace the wrong code snippet for the distributed SQL example
    
    * The New York Taxi and Limousine Commission now provides only Parquet files. Change the old CSV code to use the Parquet code, and also provide the SQL query together with the expected output
    
    * Update the README file after running prettier and fixing the errors
---
 ballista/rust/client/README.md | 84 ++++++++++++++++++++++++++----------------
 examples/README.md             | 26 ++++++++++---
 2 files changed, 73 insertions(+), 37 deletions(-)

diff --git a/ballista/rust/client/README.md b/ballista/rust/client/README.md
index f5fe094d..1c6a15ce 100644
--- a/ballista/rust/client/README.md
+++ b/ballista/rust/client/README.md
@@ -84,50 +84,72 @@ To build a simple ballista example, add the following dependencies to your `Carg
 
 ```toml
 [dependencies]
-ballista = "0.6"
-datafusion = "7.0"
+ballista = "0.8"
+datafusion = "12.0.0"
 tokio = "1.0"
 ```
 
-The following example runs a simple aggregate SQL query against a CSV file from the
+The following example runs a simple aggregate SQL query against a Parquet file (`yellow_tripdata_2022-01.parquet`) from the
 [New York Taxi and Limousine Commission](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
-data set.
+data set. Download the file and add it to the `testdata` folder before running the example.
 
 ```rust,no_run
 use ballista::prelude::*;
+use datafusion::prelude::{col, min, max, avg, sum, ParquetReadOptions};
 use datafusion::arrow::util::pretty;
 use datafusion::prelude::CsvReadOptions;
 
 #[tokio::main]
 async fn main() -> Result<()> {
-   // create configuration
-   let config = BallistaConfig::builder()
-       .set("ballista.shuffle.partitions", "4")
-       .build()?;
-
-   // connect to Ballista scheduler
-   let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
-
-   // register csv file with the execution context
-   ctx.register_csv(
-       "tripdata",
-       "/path/to/yellow_tripdata_2020-01.csv",
-       CsvReadOptions::new(),
-   ).await?;
-
-   // execute the query
-   let df = ctx.sql(
-       "SELECT passenger_count, MIN(fare_amount), MAX(fare_amount), AVG(fare_amount), SUM(fare_amount)
-       FROM tripdata
-       GROUP BY passenger_count
-       ORDER BY passenger_count",
-   ).await?;
-
-   // collect the results and print them to stdout
-   let results = df.collect().await?;
-   pretty::print_batches(&results)?;
-   Ok(())
+    // create configuration
+    let config = BallistaConfig::builder()
+        .set("ballista.shuffle.partitions", "4")
+        .build()?;
+
+    // connect to Ballista scheduler
+    let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
+
+    let filename = "testdata/yellow_tripdata_2022-01.parquet";
+
+    // define the query using the DataFrame trait
+    let df = ctx
+        .read_parquet(filename, ParquetReadOptions::default())
+        .await?
+        .select_columns(&["passenger_count", "fare_amount"])?
+        .aggregate(vec![col("passenger_count")], vec![min(col("fare_amount")), max(col("fare_amount")), avg(col("fare_amount")), sum(col("fare_amount"))])?
+        .sort(vec![col("passenger_count").sort(true,true)])?;
+
+    // this is equivalent to the following SQL
+    // SELECT passenger_count, MIN(fare_amount), MAX(fare_amount), AVG(fare_amount), SUM(fare_amount)
+    //     FROM tripdata
+    //     GROUP BY passenger_count
+    //     ORDER BY passenger_count
+
+    // print the results
+    df.show().await?;
+
+    Ok(())
 }
 ```
 
+The output should look similar to the following table.
+
+```{r eval=FALSE}
++-----------------+--------------------------+--------------------------+--------------------------+--------------------------+
+| passenger_count | MIN(?table?.fare_amount) | MAX(?table?.fare_amount) | AVG(?table?.fare_amount) | SUM(?table?.fare_amount) |
++-----------------+--------------------------+--------------------------+--------------------------+--------------------------+
+|                 | -159.5                   | 285.2                    | 17.60577640099004        | 1258865.829999991        |
+| 0               | -115                     | 500                      | 11.794859107585335       | 614052.1600000001        |
+| 1               | -480                     | 401092.32                | 12.61028389876563        | 22623542.879999973       |
+| 2               | -250                     | 640.5                    | 13.79501011585127        | 4732047.139999998        |
+| 3               | -130                     | 480                      | 13.473184817311106       | 1139427.2400000002       |
+| 4               | -250                     | 464                      | 14.232650547832726       | 502711.4499999997        |
+| 5               | -52                      | 668                      | 12.160378472086954       | 624289.51                |
+| 6               | -52                      | 252.5                    | 12.576583325529857       | 402916                   |
+| 7               | 7                        | 79                       | 61.77777777777778        | 556                      |
+| 8               | 8.3                      | 115                      | 79.9125                  | 639.3                    |
+| 9               | 9.3                      | 96.5                     | 65.26666666666667        | 195.8                    |
++-----------------+--------------------------+--------------------------+--------------------------+--------------------------+
+```
+
 More [examples](https://github.com/apache/arrow-datafusion/tree/master/ballista-examples)
 can be found in the arrow-datafusion repository.
diff --git a/examples/README.md b/examples/README.md
index e80bbd29..3e6a00ce 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -96,6 +96,11 @@ cargo run --release --bin sql
 ### Source code for distributed SQL example
 
 ```rust
+use ballista::prelude::*;
+use datafusion::prelude::CsvReadOptions;
+
+/// This example demonstrates executing a simple query against an Arrow data source (CSV) and
+/// fetching results, using SQL
 #[tokio::main]
 async fn main() -> Result<()> {
     let config = BallistaConfig::builder()
@@ -103,14 +108,23 @@ async fn main() -> Result<()> {
         .build()?;
     let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
 
-    let filename = "testdata/alltypes_plain.parquet";
+    // register csv file with the execution context
+    ctx.register_csv(
+        "test",
+        "testdata/aggregate_test_100.csv",
+        CsvReadOptions::new(),
+    )
+    .await?;
 
-    // define the query using the DataFrame trait
+    // execute the query
     let df = ctx
-        .read_parquet(filename, ParquetReadOptions::default())
-        .await?
-        .select_columns(&["id", "bool_col", "timestamp_col"])?
-        .filter(col("id").gt(lit(1)))?;
+        .sql(
+            "SELECT c1, MIN(c12), MAX(c12) \
+        FROM test \
+        WHERE c11 > 0.1 AND c11 < 0.9 \
+        GROUP BY c1",
+        )
+        .await?;
 
     // print the results
     df.show().await?;

[arrow-ballista] branch master updated: Fix documentation example (#288)

Reply via email to