iffyio commented on code in PR #2075:
URL: 
https://github.com/apache/datafusion-sqlparser-rs/pull/2075#discussion_r2577017573


##########
src/tokenizer.rs:
##########
@@ -1783,96 +1786,115 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize dollar preceded value (i.e: a string/placeholder)
-    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> 
Result<Token, TokenizerError> {
-        let mut s = String::new();
-        let mut value = String::new();
+    fn tokenize_dollar_preceded_value(
+        &self,
+        chars: &mut State<'a>,
+    ) -> Result<Token, TokenizerError> {
+        chars.next(); // consume first $
 
-        chars.next();
+        // Case 1: $$text$$ (untagged dollar-quoted string)
+        if matches!(chars.peek(), Some('$')) && 
!self.dialect.supports_dollar_placeholder() {
+            let (value, tag) = 
self.tokenize_dollar_quoted_string_borrowed(chars, None)?;
+            return Ok(Token::DollarQuotedString(DollarQuotedString {
+                value: value.into_owned(),
+                tag: tag.map(|t| t.into_owned()),
+            }));
+        }
 
-        // If the dialect does not support dollar-quoted strings, then `$$` is 
rather a placeholder.
+        // If it's not $$ we have 2 options :
+        //   Case 2: $tag$text$tag$ (tagged dollar-quoted string) if dialect 
supports it
+        //   Case 3: $placeholder (e.g., $1, $name)
+        let tag_start = chars.byte_pos;
+        let _tag_slice = peeking_take_while_ref(chars, |ch| {
+            ch.is_alphanumeric()
+                || ch == '_'
+                || matches!(ch, '$' if 
self.dialect.supports_dollar_placeholder())
+        });
+        let tag_end = chars.byte_pos;
+
+        // Case 2: $tag$text$tag$ (tagged dollar-quoted string)
         if matches!(chars.peek(), Some('$')) && 
!self.dialect.supports_dollar_placeholder() {
-            chars.next();
+            let tag_value = &chars.source[tag_start..tag_end];
+            let (value, tag) =
+                self.tokenize_dollar_quoted_string_borrowed(chars, 
Some(tag_value))?;
+            return Ok(Token::DollarQuotedString(DollarQuotedString {
+                value: value.into_owned(),
+                tag: tag.map(|t| t.into_owned()),
+            }));
+        }
 
-            let mut is_terminated = false;
-            let mut prev: Option<char> = None;
+        // Case 3: $placeholder (e.g., $1, $name)
+        let tag_value = &chars.source[tag_start..tag_end];
+        Ok(Token::Placeholder(format!("${}", tag_value)))
+    }
 
-            while let Some(&ch) = chars.peek() {
-                if prev == Some('$') {
-                    if ch == '$' {
-                        chars.next();
-                        is_terminated = true;
-                        break;
-                    } else {
-                        s.push('$');
-                        s.push(ch);
+    /// Tokenize a dollar-quoted string ($$text$$ or $tag$text$tag$), 
returning borrowed slices.
+    /// tag_prefix: None for $$, Some("tag") for $tag$
+    /// Returns (value: Cow<'a, str>, tag: Option<Cow<'a, str>>)
+    fn tokenize_dollar_quoted_string_borrowed(
+        &self,
+        chars: &mut State<'a>,
+        tag_prefix: Option<&'a str>,
+    ) -> Result<(Cow<'a, str>, Option<Cow<'a, str>>), TokenizerError> {
+        chars.next(); // consume $ after tag (or second $ for $$)
+        let content_start = chars.byte_pos;
+
+        match tag_prefix {
+            None => {
+                // Case: $$text$$
+                let mut prev: Option<char> = None;
+
+                while let Some(&ch) = chars.peek() {
+                    if prev == Some('$') && ch == '$' {
+                        chars.next(); // consume final $
+                                      // content_end is before the first $ of 
$$
+                        let content_end = chars.byte_pos - 2;
+                        let value = &chars.source[content_start..content_end];

Review Comment:
   Can we do a pass through the places where we index into the buffer, to 
ensure that the indexing is safe, and return an error otherwise? Maybe we 
could have a helper function for this so it can be reused.



##########
src/tokenizer.rs:
##########
@@ -1783,96 +1786,115 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize dollar preceded value (i.e: a string/placeholder)
-    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> 
Result<Token, TokenizerError> {
-        let mut s = String::new();
-        let mut value = String::new();
+    fn tokenize_dollar_preceded_value(
+        &self,
+        chars: &mut State<'a>,
+    ) -> Result<Token, TokenizerError> {
+        chars.next(); // consume first $

Review Comment:
   Similar comment as below that we can explicitly check what the token is 
before consuming it



##########
src/tokenizer.rs:
##########
@@ -1783,96 +1786,115 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize dollar preceded value (i.e: a string/placeholder)
-    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> 
Result<Token, TokenizerError> {
-        let mut s = String::new();
-        let mut value = String::new();
+    fn tokenize_dollar_preceded_value(
+        &self,
+        chars: &mut State<'a>,
+    ) -> Result<Token, TokenizerError> {
+        chars.next(); // consume first $
 
-        chars.next();
+        // Case 1: $$text$$ (untagged dollar-quoted string)
+        if matches!(chars.peek(), Some('$')) && 
!self.dialect.supports_dollar_placeholder() {
+            let (value, tag) = 
self.tokenize_dollar_quoted_string_borrowed(chars, None)?;
+            return Ok(Token::DollarQuotedString(DollarQuotedString {
+                value: value.into_owned(),
+                tag: tag.map(|t| t.into_owned()),
+            }));
+        }
 
-        // If the dialect does not support dollar-quoted strings, then `$$` is 
rather a placeholder.
+        // If it's not $$ we have 2 options :
+        //   Case 2: $tag$text$tag$ (tagged dollar-quoted string) if dialect 
supports it
+        //   Case 3: $placeholder (e.g., $1, $name)
+        let tag_start = chars.byte_pos;
+        let _tag_slice = peeking_take_while_ref(chars, |ch| {
+            ch.is_alphanumeric()
+                || ch == '_'
+                || matches!(ch, '$' if 
self.dialect.supports_dollar_placeholder())
+        });
+        let tag_end = chars.byte_pos;
+
+        // Case 2: $tag$text$tag$ (tagged dollar-quoted string)
         if matches!(chars.peek(), Some('$')) && 
!self.dialect.supports_dollar_placeholder() {
-            chars.next();
+            let tag_value = &chars.source[tag_start..tag_end];
+            let (value, tag) =
+                self.tokenize_dollar_quoted_string_borrowed(chars, 
Some(tag_value))?;
+            return Ok(Token::DollarQuotedString(DollarQuotedString {
+                value: value.into_owned(),
+                tag: tag.map(|t| t.into_owned()),
+            }));
+        }
 
-            let mut is_terminated = false;
-            let mut prev: Option<char> = None;
+        // Case 3: $placeholder (e.g., $1, $name)
+        let tag_value = &chars.source[tag_start..tag_end];
+        Ok(Token::Placeholder(format!("${}", tag_value)))
+    }
 
-            while let Some(&ch) = chars.peek() {
-                if prev == Some('$') {
-                    if ch == '$' {
-                        chars.next();
-                        is_terminated = true;
-                        break;
-                    } else {
-                        s.push('$');
-                        s.push(ch);
+    /// Tokenize a dollar-quoted string ($$text$$ or $tag$text$tag$), 
returning borrowed slices.
+    /// tag_prefix: None for $$, Some("tag") for $tag$
+    /// Returns (value: Cow<'a, str>, tag: Option<Cow<'a, str>>)
+    fn tokenize_dollar_quoted_string_borrowed(
+        &self,
+        chars: &mut State<'a>,
+        tag_prefix: Option<&'a str>,
+    ) -> Result<(Cow<'a, str>, Option<Cow<'a, str>>), TokenizerError> {
+        chars.next(); // consume $ after tag (or second $ for $$)

Review Comment:
   Can we explicitly check here that we have a `$` token being consumed? I.e. 
something like
   ```
   if chars.peek() != Some('$') {
     return Err(...)
   }
   chars.next(); // Consume '$'
   ```
   It looks incorrect otherwise from the method's POV, since it's potentially 
blindly skipping a token.



##########
src/tokenizer.rs:
##########
@@ -2304,7 +2386,69 @@ fn peeking_next_take_while(
 }
 
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
-    Unescape::new(chars).unescape()
+    borrow_or_unescape_single_quoted_string(chars, true).map(|cow| 
cow.into_owned())
+}
+
+/// Scans a single-quoted string and returns either a borrowed slice or an 
unescaped owned string.
+///
+/// Strategy: Scan once to find the end and detect escape sequences.
+/// - If no escapes exist (or unescape=false), return Cow::Borrowed
+/// - If escapes exist and unescape=true, reprocess using existing Unescape 
logic
+fn borrow_or_unescape_single_quoted_string<'a>(
+    chars: &mut State<'a>,
+    unescape: bool,
+) -> Option<Cow<'a, str>> {
+    let content_start = chars.byte_pos;
+    chars.next(); // consume opening '
+
+    // Scan to find end and check for escape sequences
+    let mut has_escapes = false;
+
+    loop {
+        match chars.next() {
+            Some('\'') => {
+                // Check for doubled single quote (escape)
+                if chars.peek() == Some(&'\'') {
+                    has_escapes = true;
+                    chars.next(); // consume the second '
+                } else {
+                    // End of string found (including closing ')
+                    let content_end = chars.byte_pos;
+                    let full_content = 
&chars.source[content_start..content_end];
+
+                    // If no unescaping needed, return borrowed (without 
quotes)
+                    if !unescape || !has_escapes {
+                        // Strip opening and closing quotes
+                        return 
Some(Cow::Borrowed(&full_content[1..full_content.len() - 1]));

Review Comment:
   Similar comment here re ensuring that the indexing is safe.



##########
src/tokenizer.rs:
##########
@@ -2304,7 +2386,69 @@ fn peeking_next_take_while(
 }
 
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
-    Unescape::new(chars).unescape()
+    borrow_or_unescape_single_quoted_string(chars, true).map(|cow| 
cow.into_owned())
+}
+
+/// Scans a single-quoted string and returns either a borrowed slice or an 
unescaped owned string.
+///
+/// Strategy: Scan once to find the end and detect escape sequences.
+/// - If no escapes exist (or unescape=false), return Cow::Borrowed

Review Comment:
   ```suggestion
   /// - If no escapes exist (or unescape=false), return [Cow::Borrowed]
   ```



##########
src/tokenizer.rs:
##########
@@ -2304,7 +2386,69 @@ fn peeking_next_take_while(
 }
 
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
-    Unescape::new(chars).unescape()
+    borrow_or_unescape_single_quoted_string(chars, true).map(|cow| 
cow.into_owned())
+}
+
+/// Scans a single-quoted string and returns either a borrowed slice or an 
unescaped owned string.
+///
+/// Strategy: Scan once to find the end and detect escape sequences.
+/// - If no escapes exist (or unescape=false), return Cow::Borrowed
+/// - If escapes exist and unescape=true, reprocess using existing Unescape 
logic

Review Comment:
   ```suggestion
   /// - If escapes exist and unescape=true, reprocess using existing 
[Unescape] logic
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to