This is an automated email from the ASF dual-hosted git repository.

iffyio pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 269967a6 Support underscore separators in numbers for Clickhouse. Fixes #1659 (#1677)
269967a6 is described below

commit 269967a6ac4f4d9799cccb6c97142823123ed2c5
Author: Paul Grau <[email protected]>
AuthorDate: Tue Jan 28 15:26:08 2025 +0200

    Support underscore separators in numbers for Clickhouse. Fixes #1659 (#1677)
---
 src/dialect/clickhouse.rs     |  4 +++
 src/dialect/mod.rs            |  5 +++
 src/dialect/postgresql.rs     |  4 +++
 src/tokenizer.rs              | 74 +++++++++++++++++++++++++++++++++++++++++--
 tests/sqlparser_clickhouse.rs | 15 +++++++++
 5 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs
index 884dfcbc..830b3da9 100644
--- a/src/dialect/clickhouse.rs
+++ b/src/dialect/clickhouse.rs
@@ -59,6 +59,10 @@ impl Dialect for ClickHouseDialect {
         true
     }
 
+    fn supports_numeric_literal_underscores(&self) -> bool {
+        true
+    }
+
     // ClickHouse uses this for some FORMAT expressions in `INSERT` context, e.g. when inserting
     // with FORMAT JSONEachRow a raw JSON key-value expression is valid and expected.
     //
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index bc3c0c96..817f5f32 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -304,6 +304,11 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns true if the dialect supports numbers containing underscores, e.g. `10_000_000`
+    fn supports_numeric_literal_underscores(&self) -> bool {
+        false
+    }
+
     /// Returns true if the dialects supports specifying null treatment
     /// as part of a window function's parameter list as opposed
     /// to after the parameter list.
diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs
index d4f2a032..5ce4250f 100644
--- a/src/dialect/postgresql.rs
+++ b/src/dialect/postgresql.rs
@@ -249,6 +249,10 @@ impl Dialect for PostgreSqlDialect {
     fn supports_string_escape_constant(&self) -> bool {
         true
     }
+
+    fn supports_numeric_literal_underscores(&self) -> bool {
+        true
+    }
 }
 
 pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 309f09d8..7742e8fa 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1136,12 +1136,24 @@ impl<'a> Tokenizer<'a> {
                 }
                 // numbers and period
                 '0'..='9' | '.' => {
-                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());
+                    // Some dialects support underscore as number separator
+                    // There can only be one at a time and it must be followed by another digit
+                    let is_number_separator = |ch: char, next_char: Option<char>| {
+                        self.dialect.supports_numeric_literal_underscores()
+                            && ch == '_'
+                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
+                    };
+
+                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
+                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
+                    });
 
                     // match binary literal that starts with 0x
                     if s == "0" && chars.peek() == Some(&'x') {
                         chars.next();
-                        let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
+                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
+                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
+                        });
                         return Ok(Some(Token::HexStringLiteral(s2)));
                     }
 
@@ -1150,7 +1162,10 @@ impl<'a> Tokenizer<'a> {
                         s.push('.');
                         chars.next();
                     }
-                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());
+
+                    s += &peeking_next_take_while(chars, |ch, next_ch| {
+                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
+                    });
 
                     // No number -> Token::Period
                     if s == "." {
@@ -1946,6 +1961,24 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
     s
 }
 
+/// Same as peeking_take_while, but also passes the next character to the predicate.
+fn peeking_next_take_while(
+    chars: &mut State,
+    mut predicate: impl FnMut(char, Option<char>) -> bool,
+) -> String {
+    let mut s = String::new();
+    while let Some(&ch) = chars.peek() {
+        let next_char = chars.peekable.clone().nth(1);
+        if predicate(ch, next_char) {
+            chars.next(); // consume
+            s.push(ch);
+        } else {
+            break;
+        }
+    }
+    s
+}
+
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
     Unescape::new(chars).unescape()
 }
@@ -2227,6 +2260,41 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_numeric_literal_underscore() {
+        let dialect = GenericDialect {};
+        let sql = String::from("SELECT 10_000");
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let tokens = tokenizer.tokenize().unwrap();
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("10".to_string(), false),
+            Token::make_word("_000", None),
+        ];
+        compare(expected, tokens);
+
+        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
+            "SELECT 10_000, _10_000, 10_00_, 10___0",
+            vec![
+                Token::make_keyword("SELECT"),
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("10_000".to_string(), false),
+                Token::Comma,
+                Token::Whitespace(Whitespace::Space),
+                Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
+                Token::Comma,
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("10_00".to_string(), false),
+                Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects)
+                Token::Comma,
+                Token::Whitespace(Whitespace::Space),
+                Token::Number("10".to_string(), false),
+                Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects)
+            ],
+        );
+    }
+
     #[test]
     fn tokenize_select_exponent() {
         let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
diff --git a/tests/sqlparser_clickhouse.rs b/tests/sqlparser_clickhouse.rs
index 0f22db38..5b0638a4 100644
--- a/tests/sqlparser_clickhouse.rs
+++ b/tests/sqlparser_clickhouse.rs
@@ -1649,6 +1649,21 @@ fn parse_table_sample() {
     clickhouse().verified_stmt("SELECT * FROM tbl SAMPLE 1 / 10 OFFSET 1 / 2");
 }
 
+#[test]
+fn parse_numbers_with_underscore() {
+    let canonical = if cfg!(feature = "bigdecimal") {
+        "SELECT 10000"
+    } else {
+        "SELECT 10_000"
+    };
+    let select = clickhouse().verified_only_select_with_canonical("SELECT 10_000", canonical);
+
+    assert_eq!(
+        select.projection,
+        vec![SelectItem::UnnamedExpr(Expr::Value(number("10_000")))]
+    )
+}
+
 fn clickhouse() -> TestedDialects {
     TestedDialects::new(vec![Box::new(ClickHouseDialect {})])
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to