This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 0534382b99 Expose `parser` on DFParser to enable user controlled
parsing (#9729)
0534382b99 is described below
commit 0534382b9984146a06dbab7a5a9cea3f105d11dd
Author: Trent Hauck <[email protected]>
AuthorDate: Wed Mar 27 19:36:38 2024 -0700
Expose `parser` on DFParser to enable user controlled parsing (#9729)
* poc: custom parser
* play with extension statement
* tweak
* Revert "tweak"
This reverts commit e57006e089c2378ca6cdad5c628a6b7c8d90a884.
* Revert "play with extension statement"
This reverts commit 86588e4513fc787c01c567c40bb76cb598a55ad1.
* style: cargo fmt
* Update datafusion-examples/examples/sql_parsing.rs
Co-authored-by: Andrew Lamb <[email protected]>
* Apply suggestions from code review
Co-authored-by: Andrew Lamb <[email protected]>
* style: cargo fmt
* refactor: less nesting in parse statement
* docs: better example description
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion-examples/README.md | 21 ++---
datafusion-examples/examples/sql_dialect.rs | 134 ++++++++++++++++++++++++++++
datafusion/sql/src/parser.rs | 2 +-
3 files changed, 146 insertions(+), 11 deletions(-)
diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
index dbc8050555..7ca90463cf 100644
--- a/datafusion-examples/README.md
+++ b/datafusion-examples/README.md
@@ -42,36 +42,37 @@ cargo run --example csv_sql
## Single Process
+- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more
complicated User Defined Aggregate Function (UDAF)
+- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more
complicated User Defined Scalar Function (UDF)
+- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more
complicated User Defined Window Function (UDWF)
- [`avro_sql.rs`](examples/avro_sql.rs): Build and run a query plan from a SQL
statement against a local AVRO file
+- [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog
- [`csv_sql.rs`](examples/csv_sql.rs): Build and run a query plan from a SQL
statement against a local CSV file
- [`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a
streaming query plan from a SQL statement against a local CSV file
-- [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog
- [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against
a custom datasource (TableProvider)
-- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame
against a local parquet file
- [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run
a query using a DataFrame against a parquet file from s3 and writing back to s3
-- [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods
which write data out from a DataFrame
+- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame
against a local parquet file
- [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query
using a DataFrame against data in memory
+- [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods
which write data out from a DataFrame
- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert
query results into rust structs using serde
- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and analyze
`Expr`s
- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run
DataFusion as a standalone process and execute SQL queries from JDBC clients
- [`function_factory.rs`](examples/function_factory.rs): Register `CREATE
FUNCTION` handler to implement SQL macros
- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date
function
- [`memtable.rs`](examples/memtable.rs): Create and query data in memory using
SQL and `RecordBatch`es
-- [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based
on statistics
- [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from
a SQL statement against a local Parquet file
- [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs):
Build and run a query plan from a SQL statement against multiple local Parquet
files
+- [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based
on statistics
- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure
`object_store` and run a query against files stored in AWS S3
- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store`
and run a query against files via HTTP
- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression
functions
- [`rewrite_expr.rs`](examples/rewrite_expr.rs): Define and invoke a custom
Query Optimizer pass
-- [`to_char.rs`](examples/to_char.rs): Examples of using the to_char function
-- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using
to_timestamp functions
-- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined
Scalar Function (UDF)
-- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more
complicated User Defined Scalar Function (UDF)
- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User
Defined Aggregate Function (UDAF)
-- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more
complicated User Defined Aggregate Function (UDAF)
+- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined
Scalar Function (UDF)
- [`simple_udwf.rs`](examples/simple_udwf.rs): Define and invoke a User
Defined Window Function (UDWF)
-- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more
complicated User Defined Window Function (UDWF)
+- [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a
custom SQL dialect on top of `DFParser`
+- [`to_char.rs`](examples/to_char.rs): Examples of using the to_char function
+- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using
to_timestamp functions
## Distributed
diff --git a/datafusion-examples/examples/sql_dialect.rs
b/datafusion-examples/examples/sql_dialect.rs
new file mode 100644
index 0000000000..259f38216b
--- /dev/null
+++ b/datafusion-examples/examples/sql_dialect.rs
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::Display;
+
+use datafusion::error::Result;
+use datafusion_sql::{
+ parser::{CopyToSource, CopyToStatement, DFParser, Statement},
+ sqlparser::{keywords::Keyword, parser::ParserError, tokenizer::Token},
+};
+
+/// This example demonstrates how to use the DFParser to parse a statement in a custom way
+///
+/// This technique can be used to implement a custom SQL dialect, for example.
+#[tokio::main]
+async fn main() -> Result<()> {
+ let mut my_parser =
+ MyParser::new("COPY source_table TO 'file.fasta' STORED AS FASTA")?;
+
+ let my_statement = my_parser.parse_statement()?;
+
+ match my_statement {
+ MyStatement::DFStatement(s) => println!("df: {}", s),
+ MyStatement::MyCopyTo(s) => println!("my_copy: {}", s),
+ }
+
+ Ok(())
+}
+
+/// Here we define a Parser for our new SQL dialect that wraps the existing `DFParser`
+struct MyParser<'a> {
+ df_parser: DFParser<'a>,
+}
+
+impl MyParser<'_> {
+ fn new(sql: &str) -> Result<Self> {
+ let df_parser = DFParser::new(sql)?;
+ Ok(Self { df_parser })
+ }
+
+ /// Returns true if the next token is `COPY` keyword, false otherwise
+ fn is_copy(&self) -> bool {
+ matches!(
+ self.df_parser.parser.peek_token().token,
+ Token::Word(w) if w.keyword == Keyword::COPY
+ )
+ }
+
+ /// This is the entry point to our parser -- it handles `COPY` statements specially
+ /// but otherwise delegates to the existing DataFusion parser.
+ pub fn parse_statement(&mut self) -> Result<MyStatement, ParserError> {
+ if self.is_copy() {
+ self.df_parser.parser.next_token(); // COPY
+ let df_statement = self.df_parser.parse_copy()?;
+
+ if let Statement::CopyTo(s) = df_statement {
+ Ok(MyStatement::from(s))
+ } else {
+ Ok(MyStatement::DFStatement(Box::from(df_statement)))
+ }
+ } else {
+ let df_statement = self.df_parser.parse_statement()?;
+ Ok(MyStatement::from(df_statement))
+ }
+ }
+}
+
+enum MyStatement {
+ DFStatement(Box<Statement>),
+ MyCopyTo(MyCopyToStatement),
+}
+
+impl Display for MyStatement {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ MyStatement::DFStatement(s) => write!(f, "{}", s),
+ MyStatement::MyCopyTo(s) => write!(f, "{}", s),
+ }
+ }
+}
+
+impl From<Statement> for MyStatement {
+ fn from(s: Statement) -> Self {
+ Self::DFStatement(Box::from(s))
+ }
+}
+
+impl From<CopyToStatement> for MyStatement {
+ fn from(s: CopyToStatement) -> Self {
+ if s.stored_as == Some("FASTA".to_string()) {
+ Self::MyCopyTo(MyCopyToStatement::from(s))
+ } else {
+ Self::DFStatement(Box::from(Statement::CopyTo(s)))
+ }
+ }
+}
+
+struct MyCopyToStatement {
+ pub source: CopyToSource,
+ pub target: String,
+}
+
+impl From<CopyToStatement> for MyCopyToStatement {
+ fn from(s: CopyToStatement) -> Self {
+ Self {
+ source: s.source,
+ target: s.target,
+ }
+ }
+}
+
+impl Display for MyCopyToStatement {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(
+ f,
+ "COPY {} TO '{}' STORED AS FASTA",
+ self.source, self.target
+ )
+ }
+}
diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs
index a5d7970495..c585917a1e 100644
--- a/datafusion/sql/src/parser.rs
+++ b/datafusion/sql/src/parser.rs
@@ -278,7 +278,7 @@ fn ensure_not_set<T>(field: &Option<T>, name: &str) ->
Result<(), ParserError> {
/// `CREATE EXTERNAL TABLE` have special syntax in DataFusion. See
/// [`Statement`] for a list of this special syntax
pub struct DFParser<'a> {
- parser: Parser<'a>,
+ pub parser: Parser<'a>,
}
impl<'a> DFParser<'a> {