This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch 
gh-readonly-queue/main/pr-2107-1b842d3b6a76eadd0a8dc9bfecc8cb1dcd0bd3c0
in repository https://gitbox.apache.org/repos/asf/datafusion-sqlparser-rs.git

commit 9b8a2d1e226a024758a4dbbaaf47fafe67a9619d
Author: xitep <[email protected]>
AuthorDate: Tue Dec 16 12:30:30 2025 +0100

    Extract source comments (#2107)
    
    Co-authored-by: Ifeanyi Ubah <[email protected]>
---
 src/ast/comments.rs         | 329 ++++++++++++++++++++++++++++++++++++++++++++
 src/ast/mod.rs              |   1 +
 src/ast/spans.rs            |   8 +-
 src/parser/mod.rs           |  53 ++++++-
 tests/sqlparser_comments.rs |  75 ++++++++++
 5 files changed, 459 insertions(+), 7 deletions(-)

diff --git a/src/ast/comments.rs b/src/ast/comments.rs
new file mode 100644
index 00000000..1f5b3102
--- /dev/null
+++ b/src/ast/comments.rs
@@ -0,0 +1,329 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Provides a representation of source code comments in parsed SQL code.
+//!
+//! See [Comments::find] for an example.
+
+#[cfg(not(feature = "std"))]
+use alloc::{string::String, vec::Vec};
+
+use core::{
+    ops::{Bound, Deref, RangeBounds},
+    slice,
+};
+
+use crate::tokenizer::{Location, Span};
+
+/// An opaque container for the comments of a parsed SQL source code.
+#[derive(Default, Debug)]
+pub struct Comments(Vec<CommentWithSpan>);
+
+impl Comments {
+    /// Accepts `comment` if it is the first one or if its span compares
+    /// strictly greater than the span of the last accepted comment.  In other
+    /// words, this method silently skips a comment that arrives out of order
+    /// (relative to the order encountered in the parsed source code.)
+    pub(crate) fn offer(&mut self, comment: CommentWithSpan) {
+        if self
+            .0
+            .last()
+            .map(|last| last.span < comment.span)
+            .unwrap_or(true)
+        {
+            self.0.push(comment);
+        }
+    }
+
+    /// Finds comments starting within the given location range. The order of
+    /// the iterator reflects the order of the comments as encountered in the 
parsed
+    /// source code.
+    ///
+    /// # Example
+    /// ```rust
+    /// use sqlparser::{dialect::GenericDialect, parser::Parser, 
tokenizer::Location};
+    ///
+    /// let sql = r#"/*
+    ///  header comment ...
+    ///  ... spanning multiple lines
+    /// */
+    ///
+    ///  -- first statement
+    ///  SELECT 'hello' /* world */ FROM DUAL;
+    ///
+    ///  -- second statement
+    ///  SELECT 123 FROM DUAL;
+    ///
+    ///  -- trailing comment
+    /// "#;
+    ///
+    /// let (ast, comments) = Parser::parse_sql_with_comments(&GenericDialect, 
sql).unwrap();
+    ///
+    /// // all comments appearing before line seven, i.e. before the first 
statement itself
+    /// assert_eq!(
+    ///    &comments.find(..Location::new(7, 1)).map(|c| 
c.as_str()).collect::<Vec<_>>(),
+    ///    &["\n header comment ...\n ... spanning multiple lines\n", " first 
statement\n"]);
+    ///
+    /// // all comments appearing within the first statement
+    /// assert_eq!(
+    ///    &comments.find(Location::new(7, 1)..Location::new(8,1)).map(|c| 
c.as_str()).collect::<Vec<_>>(),
+    ///    &[" world "]);
+    ///
+    /// // all comments appearing within or after the first statement
+    /// assert_eq!(
+    ///    &comments.find(Location::new(7, 1)..).map(|c| 
c.as_str()).collect::<Vec<_>>(),
+    ///    &[" world ", " second statement\n", " trailing comment\n"]);
+    /// ```
+    ///
+    /// The [Spanned](crate::ast::Spanned) trait allows you to access location
+    /// information for certain AST nodes.
+    pub fn find<R: RangeBounds<Location>>(&self, range: R) -> Iter<'_> {
+        let (start, end) = (
+            self.start_index(range.start_bound()),
+            self.end_index(range.end_bound()),
+        );
+        debug_assert!((0..=self.0.len()).contains(&start));
+        debug_assert!((0..=self.0.len()).contains(&end));
+        // a reversed range (start after end) yields nothing instead of panicking on the slice
+        Iter(if start <= end {
+            self.0[start..end].iter()
+        } else {
+            self.0[0..0].iter()
+        })
+    }
+
+    /// Find the index of the first comment starting at or (for an excluded bound, strictly) after the given 
location.
+    ///
+    /// The returned index is _inclusive_ and within the range of 
`0..=self.0.len()`.
+    fn start_index(&self, location: Bound<&Location>) -> usize {
+        match location {
+            Bound::Included(location) => {
+                match self.0.binary_search_by(|c| c.span.start.cmp(location)) {
+                    Ok(i) => i,
+                    Err(i) => i,
+                }
+            }
+            Bound::Excluded(location) => {
+                match self.0.binary_search_by(|c| c.span.start.cmp(location)) {
+                    Ok(i) => i + 1, // skip the comment starting exactly at the excluded bound
+                    Err(i) => i,
+                }
+            }
+            Bound::Unbounded => 0,
+        }
+    }
+
+    /// Find the index just past the last comment starting before or (for an included bound) at the given 
location.
+    ///
+    /// The returned index is _exclusive_ and within the range of 
`0..=self.0.len()`.
+    fn end_index(&self, location: Bound<&Location>) -> usize {
+        match location {
+            Bound::Included(location) => {
+                match self.0.binary_search_by(|c| c.span.start.cmp(location)) {
+                    Ok(i) => i + 1, // keep the comment starting exactly at the included bound
+                    Err(i) => i,
+                }
+            }
+            Bound::Excluded(location) => {
+                match self.0.binary_search_by(|c| c.span.start.cmp(location)) {
+                    Ok(i) => i,
+                    Err(i) => i,
+                }
+            }
+            Bound::Unbounded => self.0.len(),
+        }
+    }
+}
+
+impl From<Comments> for Vec<CommentWithSpan> {
+    fn from(comments: Comments) -> Self {
+        comments.0
+    }
+}
+
+/// A source code comment with information of its entire span.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CommentWithSpan {
+    /// The source code comment itself
+    pub comment: Comment,
+    /// The span of the comment including its markers
+    pub span: Span,
+}
+
+impl Deref for CommentWithSpan {
+    type Target = Comment;
+
+    fn deref(&self) -> &Self::Target {
+        &self.comment
+    }
+}
+
+/// A unified type of the different source code comment formats.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Comment {
+    /// A single line comment, typically introduced with a prefix and spanning
+    /// until end-of-line or end-of-file in the source code.
+    ///
+    /// Note: `content` will include the terminating new-line character, if 
any.
+    SingleLine { content: String, prefix: String },
+
+    /// A multi-line comment, typically enclosed in `/* .. */` markers. The
+    /// string represents the content excluding the markers.
+    MultiLine(String),
+}
+
+impl Comment {
+    /// Retrieves the content of the comment as string slice.
+    pub fn as_str(&self) -> &str {
+        match self {
+            Comment::SingleLine { content, prefix: _ } => content.as_str(),
+            Comment::MultiLine(content) => content.as_str(),
+        }
+    }
+}
+
+impl Deref for Comment {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
+/// An opaque iterator implementation over comments served by [Comments::find].
+pub struct Iter<'a>(slice::Iter<'a, CommentWithSpan>);
+
+impl<'a> Iterator for Iter<'a> {
+    type Item = &'a CommentWithSpan;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.0.next()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_find() {
+        let comments = {
+            // ```
+            // -- abc
+            //   /* hello */--, world
+            // /* def
+            //  ghi
+            //  jkl
+            // */
+            // ```
+            let mut c = Comments(Vec::new());
+            c.offer(CommentWithSpan {
+                comment: Comment::SingleLine {
+                    content: " abc".into(),
+                    prefix: "--".into(),
+                },
+                span: Span::new((1, 1).into(), (1, 7).into()),
+            });
+            c.offer(CommentWithSpan {
+                comment: Comment::MultiLine(" hello ".into()),
+                span: Span::new((2, 3).into(), (2, 14).into()),
+            });
+            c.offer(CommentWithSpan {
+                comment: Comment::SingleLine {
+                    content: ", world".into(),
+                    prefix: "--".into(),
+                },
+                span: Span::new((2, 14).into(), (2, 21).into()),
+            });
+            c.offer(CommentWithSpan {
+                comment: Comment::MultiLine(" def\n ghi\n jkl\n".into()),
+                span: Span::new((3, 3).into(), (7, 1).into()),
+            });
+            c
+        };
+
+        fn find<R: RangeBounds<Location>>(comments: &Comments, range: R) -> 
Vec<&str> {
+            comments.find(range).map(|c| c.as_str()).collect::<Vec<_>>()
+        }
+
+        // ~ end-points only --------------------------------------------------
+        assert_eq!(find(&comments, ..Location::new(0, 0)), Vec::<&str>::new());
+        assert_eq!(find(&comments, ..Location::new(2, 1)), vec![" abc"]);
+        assert_eq!(find(&comments, ..Location::new(2, 3)), vec![" abc"]);
+        assert_eq!(
+            find(&comments, ..=Location::new(2, 3)),
+            vec![" abc", " hello "]
+        );
+        assert_eq!(
+            find(&comments, ..=Location::new(2, 3)),
+            vec![" abc", " hello "]
+        );
+        assert_eq!(
+            find(&comments, ..Location::new(2, 15)),
+            vec![" abc", " hello ", ", world"]
+        );
+
+        // ~ start-points only ------------------------------------------------
+        assert_eq!(
+            find(&comments, Location::new(1000, 1000)..),
+            Vec::<&str>::new()
+        );
+        assert_eq!(
+            find(&comments, Location::new(2, 14)..),
+            vec![", world", " def\n ghi\n jkl\n"]
+        );
+        assert_eq!(
+            find(&comments, Location::new(2, 15)..),
+            vec![" def\n ghi\n jkl\n"]
+        );
+        assert_eq!(
+            find(&comments, Location::new(0, 0)..),
+            vec![" abc", " hello ", ", world", " def\n ghi\n jkl\n"]
+        );
+        assert_eq!(
+            find(&comments, Location::new(1, 1)..),
+            vec![" abc", " hello ", ", world", " def\n ghi\n jkl\n"]
+        );
+
+        // ~ ranges -----------------------------------------------------------
+        assert_eq!(
+            find(&comments, Location::new(2, 1)..Location::new(1, 1)),
+            Vec::<&str>::new()
+        );
+        assert_eq!(
+            find(&comments, Location::new(1, 1)..Location::new(2, 3)),
+            vec![" abc"]
+        );
+        assert_eq!(
+            find(&comments, Location::new(1, 1)..=Location::new(2, 3)),
+            vec![" abc", " hello "]
+        );
+        assert_eq!(
+            find(&comments, Location::new(1, 1)..=Location::new(2, 10)),
+            vec![" abc", " hello "]
+        );
+        assert_eq!(
+            find(&comments, Location::new(1, 1)..=Location::new(2, 14)),
+            vec![" abc", " hello ", ", world"]
+        );
+        assert_eq!(
+            find(&comments, Location::new(1, 1)..Location::new(2, 15)),
+            vec![" abc", " hello ", ", world"]
+        );
+
+        // ~ find everything --------------------------------------------------
+        assert_eq!(
+            find(&comments, ..),
+            vec![" abc", " hello ", ", world", " def\n ghi\n jkl\n"]
+        );
+    }
+}
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 6cb4c336..23cde478 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -136,6 +136,7 @@ mod query;
 mod spans;
 pub use spans::Spanned;
 
+pub mod comments;
 mod trigger;
 mod value;
 
diff --git a/src/ast/spans.rs b/src/ast/spans.rs
index de8fba75..2ec797db 100644
--- a/src/ast/spans.rs
+++ b/src/ast/spans.rs
@@ -28,7 +28,7 @@ use core::iter;
 use crate::tokenizer::Span;
 
 use super::{
-    dcl::SecondaryRoles, value::ValueWithSpan, AccessExpr, 
AlterColumnOperation,
+    comments, dcl::SecondaryRoles, value::ValueWithSpan, AccessExpr, 
AlterColumnOperation,
     AlterIndexOperation, AlterTableOperation, Analyze, Array, Assignment, 
AssignmentTarget,
     AttachedToken, BeginEndStatements, CaseStatement, CloseCursor, 
ClusteredIndex, ColumnDef,
     ColumnOption, ColumnOptionDef, ConditionalStatementBlock, 
ConditionalStatements,
@@ -2477,6 +2477,12 @@ impl Spanned for OutputClause {
     }
 }
 
+impl Spanned for comments::CommentWithSpan {
+    fn span(&self) -> Span {
+        self.span
+    }
+}
+
 #[cfg(test)]
 pub mod tests {
     use crate::dialect::{Dialect, GenericDialect, SnowflakeDialect};
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 54fb3273..2b82d009 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -32,14 +32,17 @@ use recursion::RecursionCounter;
 use IsLateral::*;
 use IsOptional::*;
 
-use crate::ast::helpers::{
-    key_value_options::{
-        KeyValueOption, KeyValueOptionKind, KeyValueOptions, 
KeyValueOptionsDelimiter,
-    },
-    stmt_create_table::{CreateTableBuilder, CreateTableConfiguration},
-};
 use crate::ast::Statement::CreatePolicy;
 use crate::ast::*;
+use crate::ast::{
+    comments,
+    helpers::{
+        key_value_options::{
+            KeyValueOption, KeyValueOptionKind, KeyValueOptions, 
KeyValueOptionsDelimiter,
+        },
+        stmt_create_table::{CreateTableBuilder, CreateTableConfiguration},
+    },
+};
 use crate::dialect::*;
 use crate::keywords::{Keyword, ALL_KEYWORDS};
 use crate::tokenizer::*;
@@ -530,6 +533,44 @@ impl<'a> Parser<'a> {
         Parser::new(dialect).try_with_sql(sql)?.parse_statements()
     }
 
+    /// Parses the given `sql` into an Abstract Syntax Tree (AST), also
+    /// returning the source code comments encountered along the way.
+    ///
+    /// See [Parser::parse_sql].
+    pub fn parse_sql_with_comments(
+        dialect: &'a dyn Dialect,
+        sql: &str,
+    ) -> Result<(Vec<Statement>, comments::Comments), ParserError> {
+        let mut p = Parser::new(dialect).try_with_sql(sql)?;
+        p.parse_statements().map(|stmts| (stmts, p.into_comments()))
+    }
+
+    /// Consumes this parser returning comments from the parsed token stream.
+    fn into_comments(self) -> comments::Comments {
+        let mut comments = comments::Comments::default();
+        for t in self.tokens.into_iter() {
+            match t.token {
+                Token::Whitespace(Whitespace::SingleLineComment { comment, 
prefix }) => {
+                    comments.offer(comments::CommentWithSpan {
+                        comment: comments::Comment::SingleLine {
+                            content: comment,
+                            prefix,
+                        },
+                        span: t.span,
+                    });
+                }
+                Token::Whitespace(Whitespace::MultiLineComment(comment)) => {
+                    comments.offer(comments::CommentWithSpan {
+                        comment: comments::Comment::MultiLine(comment),
+                        span: t.span,
+                    });
+                }
+                _ => {}
+            }
+        }
+        comments
+    }
+
     /// Parse a single top-level statement (such as SELECT, INSERT, CREATE, 
etc.),
     /// stopping before the statement separator, if any.
     pub fn parse_statement(&mut self) -> Result<Statement, ParserError> {
diff --git a/tests/sqlparser_comments.rs b/tests/sqlparser_comments.rs
new file mode 100644
index 00000000..34442ca3
--- /dev/null
+++ b/tests/sqlparser_comments.rs
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#![warn(clippy::all)]
+//! Test comment extraction from SQL source code.
+
+#[cfg(test)]
+use pretty_assertions::assert_eq;
+
+use sqlparser::{
+    ast::comments::{Comment, CommentWithSpan},
+    dialect::GenericDialect,
+    parser::Parser,
+    tokenizer::Span,
+};
+
+#[test]
+fn parse_sql_with_comments() {
+    let sql = r#"
+-- second line comment
+select * from /* inline comment after `from` */ dual;
+
+/*select
+some
+more*/
+
+  -- end-of-script-with-no-newline"#;
+
+    let comments = match Parser::parse_sql_with_comments(&GenericDialect, sql) 
{
+        Ok((_, comments)) => comments,
+        Err(e) => panic!("Invalid sql script: {e}"),
+    };
+
+    assert_eq!(
+        Vec::from(comments),
+        vec![
+            CommentWithSpan {
+                comment: Comment::SingleLine {
+                    content: " second line comment\n".into(),
+                    prefix: "--".into()
+                },
+                span: Span::new((2, 1).into(), (3, 1).into()),
+            },
+            CommentWithSpan {
+                comment: Comment::MultiLine(" inline comment after `from` 
".into()),
+                span: Span::new((3, 15).into(), (3, 48).into()),
+            },
+            CommentWithSpan {
+                comment: Comment::MultiLine("select\nsome\nmore".into()),
+                span: Span::new((5, 1).into(), (7, 7).into())
+            },
+            CommentWithSpan {
+                comment: Comment::SingleLine {
+                    content: " end-of-script-with-no-newline".into(),
+                    prefix: "--".into()
+                },
+                span: Span::new((9, 3).into(), (9, 35).into()),
+            }
+        ]
+    );
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to