[arrow-datafusion] branch master updated: Add DataFrame reference to the user guide (#3067)

agrove Tue, 09 Aug 2022 09:20:56 -0700

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/master by this push:
     new 1e4441783 Add DataFrame reference to the user guide (#3067)
1e4441783 is described below

commit 1e44417839a0d7ebdf31a44c0e6516924ea37d39
Author: Andy Grove <[email protected]>
AuthorDate: Tue Aug 9 10:20:40 2022 -0600

    Add DataFrame reference to the user guide (#3067)
    
    * DataFrame reference for user guide
    
    * Add DataFrame methods
    
    * improvements
    
    * improvements
    
    * improvements
    
    * formatting
    
    * Update docs/source/user-guide/dataframe.md
    
    Co-authored-by: Loïc Sharma <[email protected]>
    
    * Update docs/source/user-guide/dataframe.md
    
    Co-authored-by: Loïc Sharma <[email protected]>
    
    * more docs
    
    * more docs
    
    * Update docs/source/user-guide/dataframe.md
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * Update docs/source/user-guide/dataframe.md
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * Update docs/source/user-guide/dataframe.md
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * Update docs/source/user-guide/dataframe.md
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * add note on lazy evaluation
    
    Co-authored-by: Loïc Sharma <[email protected]>
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/core/src/dataframe.rs    |   2 +-
 datafusion/expr/src/expr_fn.rs      |  52 ++++---
 docs/source/index.rst               |   1 +
 docs/source/user-guide/dataframe.md | 284 ++++++++++++++++++++++++++++++++++++
 4 files changed, 315 insertions(+), 24 deletions(-)

diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs
index 9084619e4..46ea63882 100644
--- a/datafusion/core/src/dataframe.rs
+++ b/datafusion/core/src/dataframe.rs
@@ -621,7 +621,7 @@ impl DataFrame {
         plan_to_json(&state, plan, path).await
     }
 
-    /// Create a projection based on arbitrary expressions.
+    /// Add an additional column to the DataFrame.
     ///
     /// ```
     /// # use datafusion::prelude::*;
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index abe823b57..171bd70a9 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -251,12 +251,10 @@ pub fn rollup(exprs: Vec<Expr>) -> Expr {
     Expr::GroupingSet(GroupingSet::Rollup(exprs))
 }
 
-// TODO(kszucs): this seems buggy, unary_scalar_expr! is used for many
-// varying arity functions
 /// Create an convenience function representing a unary scalar function
 macro_rules! unary_scalar_expr {
-    ($ENUM:ident, $FUNC:ident) => {
-        #[doc = concat!("Unary scalar function definition for ", 
stringify!($FUNC) ) ]
+    ($ENUM:ident, $FUNC:ident, $DOC:expr) => {
+        #[doc = $DOC ]
         pub fn $FUNC(e: Expr) -> Expr {
             Expr::ScalarFunction {
                 fun: built_in_function::BuiltinScalarFunction::$ENUM,
@@ -293,25 +291,32 @@ macro_rules! nary_scalar_expr {
 // generate methods for creating the supported unary/binary expressions
 
 // math functions
-unary_scalar_expr!(Sqrt, sqrt);
-unary_scalar_expr!(Sin, sin);
-unary_scalar_expr!(Cos, cos);
-unary_scalar_expr!(Tan, tan);
-unary_scalar_expr!(Asin, asin);
-unary_scalar_expr!(Acos, acos);
-unary_scalar_expr!(Atan, atan);
-unary_scalar_expr!(Floor, floor);
-unary_scalar_expr!(Ceil, ceil);
-unary_scalar_expr!(Now, now);
-unary_scalar_expr!(Round, round);
-unary_scalar_expr!(Trunc, trunc);
-unary_scalar_expr!(Abs, abs);
-unary_scalar_expr!(Signum, signum);
-unary_scalar_expr!(Exp, exp);
-unary_scalar_expr!(Log2, log2);
-unary_scalar_expr!(Log10, log10);
-unary_scalar_expr!(Ln, ln);
-unary_scalar_expr!(NullIf, nullif);
+unary_scalar_expr!(Sqrt, sqrt, "square root of a number");
+unary_scalar_expr!(Sin, sin, "sine");
+unary_scalar_expr!(Cos, cos, "cosine");
+unary_scalar_expr!(Tan, tan, "tangent");
+unary_scalar_expr!(Asin, asin, "inverse sine");
+unary_scalar_expr!(Acos, acos, "inverse cosine");
+unary_scalar_expr!(Atan, atan, "inverse tangent");
+unary_scalar_expr!(
+    Floor,
+    floor,
+    "nearest integer less than or equal to argument"
+);
+unary_scalar_expr!(
+    Ceil,
+    ceil,
+    "nearest integer greater than or equal to argument"
+);
+unary_scalar_expr!(Round, round, "round to nearest integer");
+unary_scalar_expr!(Trunc, trunc, "truncate toward zero");
+unary_scalar_expr!(Abs, abs, "absolute value");
+unary_scalar_expr!(Signum, signum, "sign of the argument (-1, 0, +1) ");
+unary_scalar_expr!(Exp, exp, "base 2 logarithm");
+unary_scalar_expr!(Log2, log2, "base 10 logarithm");
+unary_scalar_expr!(Log10, log10, "base 10 logarithm");
+unary_scalar_expr!(Ln, ln, "natural logarithm");
+unary_scalar_expr!(NullIf, nullif, "The NULLIF function returns a null value 
if value1 equals value2; otherwise it returns value1. This can be used to 
perform the inverse operation of the COALESCE expression."); //TODO this is not 
a unary expression https://github.com/apache/arrow-datafusion/issues/3069
 scalar_expr!(Power, power, base, exponent);
 scalar_expr!(Atan2, atan2, y, x);
 
@@ -357,6 +362,7 @@ nary_scalar_expr!(Concat, concat_expr);
 nary_scalar_expr!(Now, now_expr);
 
 // date functions
+unary_scalar_expr!(Now, now, "current time"); //TODO this is not a unary 
expression https://github.com/apache/arrow-datafusion/issues/3069
 scalar_expr!(DatePart, date_part, part, date);
 scalar_expr!(DateTrunc, date_trunc, part, date);
 scalar_expr!(DateBin, date_bin, stride, source, origin);
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8bced2d06..76ffe8ecd 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -42,6 +42,7 @@ Table of Contents
    user-guide/example-usage
    user-guide/library
    user-guide/cli
+   user-guide/dataframe
    user-guide/sql/index
    user-guide/configs
    user-guide/faq
diff --git a/docs/source/user-guide/dataframe.md 
b/docs/source/user-guide/dataframe.md
new file mode 100644
index 000000000..7eeb7d463
--- /dev/null
+++ b/docs/source/user-guide/dataframe.md
@@ -0,0 +1,284 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# DataFrame API
+
+A DataFrame represents a logical set of rows with the same named columns, 
similar to a [Pandas 
DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html)
 or
+[Spark 
DataFrame](https://spark.apache.org/docs/latest/sql-programming-guide.html).
+
+DataFrames are typically created by calling a method on
+`SessionContext`, such as `read_csv`, and can then be modified
+by calling the transformation methods, such as `filter`, `select`, 
`aggregate`, and `limit`
+to build up a query definition.
+
+The query can be executed by calling the `collect` method.
+
+The API is well documented in the [API reference on 
docs.rs](https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html)
+
+The DataFrame struct is part of DataFusion's prelude and can be imported with 
the following statement.
+
+```rust
+use datafusion::prelude::*;
+```
+
+Here is a minimal example showing the execution of a query using the DataFrame 
API.
+
+```rust
+let ctx = SessionContext::new();
+let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
+let df = df.filter(col("a").lt_eq(col("b")))?
+           .aggregate(vec![col("a")], vec![min(col("b"))])?
+           .limit(None, Some(100))?;
+// Print results
+df.show();
+```
+
+## DataFrame Transformations
+
+These methods create a new DataFrame after applying a transformation to the 
logical plan that the DataFrame represents.
+
+DataFusion DataFrames use lazy evaluation, meaning that each transformation is 
just creating a new query plan and
+not actually performing any transformations. This approach allows for the 
overall plan to be optimized before
+execution. The plan is evaluated (executed) when an action method is invoked, 
such as `collect`.
+
+| Function            | Notes                                                  
                                                                                
    |
+| ------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------
 |
+| aggregate           | Perform an aggregate query with optional grouping 
expressions.                                                                    
         |
+| distinct            | Filter out duplicate rows.                             
                                                                                
    |
+| except              | Calculate the exception of two DataFrames. The two 
DataFrames must have exactly the same schema                                    
        |
+| filter              | Filter a DataFrame to only include rows that match the 
specified filter expression.                                                    
    |
+| intersect           | Calculate the intersection of two DataFrames. The two 
DataFrames must have exactly the same schema                                    
     |
+| join                | Join this DataFrame with another DataFrame using the 
specified columns as join keys.                                                 
      |
+| limit               | Limit the number of rows returned from this DataFrame. 
                                                                                
    |
+| repartition         | Repartition a DataFrame based on a logical 
partitioning scheme.                                                            
                |
+| sort                | Sort the DataFrame by the specified sorting 
expressions. Any expression can be turned into a sort expression by calling its 
`sort` method. |
+| select              | Create a projection based on arbitrary expressions. 
Example: `df..select(vec![col("c1"), abs(col("c2"))])?`                         
       |
+| select_columns      | Create a projection based on column names. Example: 
`df.select_columns(&["id", "name"])?`.                                          
       |
+| union               | Calculate the union of two DataFrames, preserving 
duplicate rows. The two DataFrames must have exactly the same schema.           
         |
+| union_distinct      | Calculate the distinct union of two DataFrames. The 
two DataFrames must have exactly the same schema.                               
       |
+| with_column         | Add an additional column to the DataFrame.             
                                                                                
    |
+| with_column_renamed | Rename one column by applying a new projection.        
                                                                                
    |
+
+## DataFrame Actions
+
+These methods execute the logical plan represented by the DataFrame and either 
collects the results into memory, prints them to stdout, or writes them to disk.
+
+| Function                   | Notes                                           
                                                                            |
+| -------------------------- | 
---------------------------------------------------------------------------------------------------------------------------
 |
+| collect                    | Executes this DataFrame and collects all 
results into a vector of RecordBatch.                                           
   |
+| collect_partitioned        | Executes this DataFrame and collects all 
results into a vector of vector of RecordBatch maintaining the input 
partitioning. |
+| execute_stream             | Executes this DataFrame and returns a stream 
over a single partition.                                                       |
+| execute_stream_partitioned | Executes this DataFrame and returns one stream 
per partition.                                                               |
+| show                       | Execute this DataFrame and print the results to 
stdout.                                                                     |
+| show_limit                 | Execute this DataFrame and print a subset of 
results to stdout.                                                             |
+| write_csv                  | Execute this DataFrame and write the results to 
disk in CSV format.                                                         |
+| write_json                 | Execute this DataFrame and write the results to 
disk in JSON format.                                                        |
+| write_parquet              | Execute this DataFrame and write the results to 
disk in Parquet format.                                                     |
+
+## Other DataFrame Methods
+
+| Function        | Notes                                                      
                                                                                
                  |
+| --------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------
 |
+| explain         | Return a DataFrame with the explanation of its plan so 
far.                                                                            
                      |
+| registry        | Return a `FunctionRegistry` used to plan udf's calls.      
                                                                                
                  |
+| schema          | Returns the schema describing the output of this DataFrame 
in terms of columns returned, where each column has a name, data type, and 
nullability attribute. |
+| to_logical_plan | Return the logical plan represented by this DataFrame.     
                                                                                
                  |
+
+# Expressions
+
+DataFrame methods such as `select` and `filter` accept one or more logical 
expressions and there are many functions
+available for creating logical expressions. These are documented below.
+
+Expressions can be chained together using a fluent-style API:
+
+```rust
+// create the expression `(a > 5) AND (b < 7)`
+col("a").gt(lit(5)).and(col("b").lt(lit(7)))
+```
+
+## Identifiers
+
+| Function | Notes                                        |
+| -------- | -------------------------------------------- |
+| col      | Reference a column in a dataframe `col("a")` |
+
+## Literal Values
+
+| Function | Notes                                              |
+| -------- | -------------------------------------------------- |
+| lit      | Literal value such as `lit(123)` or `lit("hello")` |
+
+## Boolean Expressions
+
+| Function | Notes                                     |
+| -------- | ----------------------------------------- |
+| and      | `and(expr1, expr2)` or `expr1.and(expr2)` |
+| or       | `or(expr1, expr2)` or `expr1.or(expr2)`   |
+| not      | `not(expr)` or `expr.not()`               |
+
+## Comparison Expressions
+
+| Function | Notes                 |
+| -------- | --------------------- |
+| eq       | `expr1.eq(expr2)`     |
+| gt       | `expr1.gt(expr2)`     |
+| gt_eq    | `expr1.gt_eq(expr2)`  |
+| lt       | `expr1.lt(expr2)`     |
+| lt_eq    | `expr1.lt_eq(expr2)`  |
+| not_eq   | `expr1.not_eq(expr2)` |
+
+## Math Functions
+
+In addition to the math functions listed here, some Rust operators are 
implemented for expressions, allowing
+expressions such as `col("a") + col("b")` to be used.
+
+| Function              | Notes                                             |
+| --------------------- | ------------------------------------------------- |
+| abs(x)                | absolute value                                    |
+| acos(x)               | inverse cosine                                    |
+| asin(x)               | inverse sine                                      |
+| atan(x)               | inverse tangent                                   |
+| atan2(y, x)           | inverse tangent of y / x                          |
+| ceil(x)               | nearest integer greater than or equal to argument |
+| cos(x)                | cosine                                            |
+| exp(x)                | exponential                                       |
+| floor(x)              | nearest integer less than or equal to argument    |
+| ln(x)                 | natural logarithm                                 |
+| log10(x)              | base 10 logarithm                                 |
+| log2(x)               | base 2 logarithm                                  |
+| power(base, exponent) | base raised to the power of exponent              |
+| round(x)              | round to nearest integer                          |
+| signum(x)             | sign of the argument (-1, 0, +1)                  |
+| sin(x)                | sine                                              |
+| sqrt(x)               | square root                                       |
+| tan(x)                | tangent                                           |
+| trunc(x)              | truncate toward zero                              |
+
+## Conditional Expressions
+
+| Function | Notes                                                             
                                                                                
                                                       |
+| -------- | 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 |
+| coalesce | Returns the first of its arguments that is not null. Null is 
returned only if all arguments are null. It is often used to substitute a 
default value for null values when data is retrieved for display. |
+| case     | CASE expression. Example: `case(expr).when(expr, expr).when(expr, 
expr).otherwise(expr).end()`.                                                   
                                                       |
+| nullif   | Returns a null value if `value1` equals `value2`; otherwise it 
returns `value1`. This can be used to perform the inverse operation of the 
`coalesce` expression.                                         |
+
+## String Expressions
+
+| Function         | Notes |
+| ---------------- | ----- |
+| ascii            |       |
+| bit_length       |       |
+| btrim            |       |
+| char_length      |       |
+| character_length |       |
+| concat           |       |
+| concat_ws        |       |
+| chr              |       |
+| initcap          |       |
+| left             |       |
+| length           |       |
+| lower            |       |
+| lpad             |       |
+| ltrim            |       |
+| md5              |       |
+| octet_length     |       |
+| repeat           |       |
+| replace          |       |
+| reverse          |       |
+| right            |       |
+| rpad             |       |
+| rtrim            |       |
+| digest           |       |
+| split_part       |       |
+| starts_with      |       |
+| strpos           |       |
+| substr           |       |
+| translate        |       |
+| trim             |       |
+| upper            |       |
+
+## Regular Expressions
+
+| Function       | Notes |
+| -------------- | ----- |
+| regexp_match   |       |
+| regexp_replace |       |
+
+## Temporal Expressions
+
+| Function             | Notes        |
+| -------------------- | ------------ |
+| date_part            |              |
+| date_trunc           |              |
+| from_unixtime        |              |
+| to_timestamp         |              |
+| to_timestamp_millis  |              |
+| to_timestamp_micros  |              |
+| to_timestamp_seconds |              |
+| now()                | current time |
+
+## Other Expressions
+
+| Function | Notes |
+| -------- | ----- |
+| array    |       |
+| in_list  |       |
+| random   |       |
+| sha224   |       |
+| sha256   |       |
+| sha384   |       |
+| sha512   |       |
+| struct   |       |
+| to_hex   |       |
+
+## Aggregate Functions
+
+| Function                           | Notes |
+| ---------------------------------- | ----- |
+| avg                                |       |
+| approx_distinct                    |       |
+| approx_median                      |       |
+| approx_percentile_cont             |       |
+| approx_percentile_cont_with_weight |       |
+| count                              |       |
+| count_distinct                     |       |
+| cube                               |       |
+| grouping_set                       |       |
+| max                                |       |
+| median                             |       |
+| min                                |       |
+| rollup                             |       |
+| sum                                |       |
+
+## Subquery Expressions
+
+| Function        | Notes                                                      
                                   |
+| --------------- | 
---------------------------------------------------------------------------------------------
 |
+| exists          |                                                            
                                   |
+| in_subquery     | `df1.filter(in_subquery(col("foo"), df2))?` is the 
equivalent of the SQL `WHERE foo IN <df2>` |
+| not_exists      |                                                            
                                   |
+| not_in_subquery |                                                            
                                   |
+| scalar_subquery |                                                            
                                   |
+
+## User-Defined Function Expressions
+
+| Function    | Notes |
+| ----------- | ----- |
+| create_udf  |       |
+| create_udaf |       |

[arrow-datafusion] branch master updated: Add DataFrame reference to the user guide (#3067)

Reply via email to