martin-g commented on code in PR #20278:
URL: https://github.com/apache/datafusion/pull/20278#discussion_r2816618201


##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -266,17 +292,28 @@ where
                     continue;
                 }
 
-                // Reuse buffer by clearing and refilling
-                graphemes_buf.clear();
-                graphemes_buf.extend(string.graphemes(true));
-
-                if length < graphemes_buf.len() {
-                    builder.append_value(graphemes_buf[..length].concat());
+                if string.is_ascii() {
+                    // ASCII fast path: byte length == character length
+                    let str_len = string.len();
+                    if length < str_len {

Review Comment:
   ```suggestion
                       if length == str_len {
                           builder.append_value(string);
                       } else if length < str_len {
   ```
   this would avoid `builder.write_str(" ".repeat(length - str_len).as_str())?;`



##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -273,27 +287,49 @@ where
                                     );
                                 }
                                 let length = if length < 0 { 0 } else { length 
as usize };
-                                // Reuse buffer by clearing and refilling
-                                graphemes_buf.clear();
-                                graphemes_buf.extend(string.graphemes(true));
-
-                                if length < graphemes_buf.len() {
-                                    builder
-                                        
.append_value(graphemes_buf[..length].concat());
-                                } else if fill.is_empty() {
-                                    builder.append_value(string);
+                                if string.is_ascii() && fill.is_ascii() {
+                                    // ASCII fast path: byte length == 
character length,
+                                    // so we skip expensive grapheme 
segmentation.
+                                    let str_len = string.len();
+                                    if length < str_len {
+                                        
builder.append_value(&string[..length]);
+                                    } else if fill.is_empty() {
+                                        builder.append_value(string);
+                                    } else {
+                                        let pad_len = length - str_len;
+                                        let fill_len = fill.len();
+                                        let full_reps = pad_len / fill_len;
+                                        let remainder = pad_len % fill_len;
+                                        builder.write_str(string)?;
+                                        for _ in 0..full_reps {
+                                            builder.write_str(fill)?;
+                                        }
+                                        
builder.append_value(&fill[..remainder]);

Review Comment:
   ```suggestion
                                           
builder.append_value(&fill[..remainder])?;
                                           if remainder > 0 {
                                               
builder.write_str(&fill[..remainder])?;
                                           }
   ```
   as at 
https://github.com/apache/datafusion/pull/20278/changes#diff-4f47738c719ccd61cdf07e6eaee63e914ad552f9f0e87b736c431cfff64cddd2R247



##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -459,6 +495,17 @@ mod tests {
             Utf8,
             StringArray
         );
+        test_function!(
+            RPadFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::from("hello")),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("he")),
+            &str,
+            Utf8,
+            StringArray
+        );

Review Comment:
   ```suggestion
           );
            test_function!(
               RPadFunc::new(),
               vec![
                   ColumnarValue::Scalar(ScalarValue::from("hi")),
                   ColumnarValue::Scalar(ScalarValue::from(6i64)),
                   ColumnarValue::Scalar(ScalarValue::from("xy")),
               ],
               Ok(Some("hixyxy")),
               &str,
               Utf8,
               StringArray
           );
   ```
   to test `remainder == 0`



##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -234,6 +238,17 @@ where
                             let length = if length < 0 { 0 } else { length as 
usize };
                             if length == 0 {
                                 builder.append_value("");
+                            } else if string.is_ascii() {
+                                // ASCII fast path: byte length == character 
length
+                                let str_len = string.len();
+                                if length < str_len {

Review Comment:
   ```suggestion
                       if length == str_len {
                           builder.append_value(string);
                       } else if length < str_len {
   ```
   this would avoid `builder.write_str(" ".repeat(length - str_len).as_str())?;`



##########
datafusion/functions/benches/pad.rs:
##########
@@ -30,6 +33,51 @@ use std::hint::black_box;
 use std::sync::Arc;
 use std::time::Duration;
 
+const UNICODE_STRINGS: &[&str] = &[
+    "Ñandú",
+    "Íslensku",
+    "Þjóðarinnar",
+    "Ελληνική",
+    "Иванович",
+    "データフュージョン",
+    "José García",
+    "Ölçü bïrïmï",
+    "Ÿéšṱëṟḏàÿ",
+    "Ährenstraße",
+];
+
+fn create_unicode_string_array<O: OffsetSizeTrait>(
+    size: usize,
+    null_density: f32,
+) -> arrow::array::GenericStringArray<O> {
+    let mut rng = rand::rng();
+    let mut builder = GenericStringBuilder::<O>::new();
+    for i in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
+        }
+    }
+    builder.finish()
+}
+
+fn create_unicode_string_view_array(

Review Comment:
   nit: The function is almost identical to `create_unicode_string_array()`. 
Maybe merge them into a more generic one ?



##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -273,27 +287,49 @@ where
                                     );
                                 }
                                 let length = if length < 0 { 0 } else { length 
as usize };
-                                // Reuse buffer by clearing and refilling
-                                graphemes_buf.clear();
-                                graphemes_buf.extend(string.graphemes(true));
-
-                                if length < graphemes_buf.len() {
-                                    builder
-                                        
.append_value(graphemes_buf[..length].concat());
-                                } else if fill.is_empty() {
-                                    builder.append_value(string);
+                                if string.is_ascii() && fill.is_ascii() {
+                                    // ASCII fast path: byte length == 
character length,
+                                    // so we skip expensive grapheme 
segmentation.
+                                    let str_len = string.len();
+                                    if length < str_len {
+                                        
builder.append_value(&string[..length]);
+                                    } else if fill.is_empty() {
+                                        builder.append_value(string);
+                                    } else {
+                                        let pad_len = length - str_len;
+                                        let fill_len = fill.len();
+                                        let full_reps = pad_len / fill_len;
+                                        let remainder = pad_len % fill_len;
+                                        builder.write_str(string)?;
+                                        for _ in 0..full_reps {
+                                            builder.write_str(fill)?;
+                                        }
+                                        
builder.append_value(&fill[..remainder]);
+                                    }
                                 } else {
-                                    builder.write_str(string)?;
-                                    // Reuse fill_chars_buf by clearing and 
refilling
-                                    fill_chars_buf.clear();
-                                    fill_chars_buf.extend(fill.chars());
-                                    for l in 0..length - graphemes_buf.len() {
-                                        let c = *fill_chars_buf
-                                            .get(l % fill_chars_buf.len())
-                                            .unwrap();
-                                        builder.write_char(c)?;
+                                    // Reuse buffer by clearing and refilling
+                                    graphemes_buf.clear();
+                                    
graphemes_buf.extend(string.graphemes(true));
+
+                                    if length < graphemes_buf.len() {
+                                        builder.append_value(
+                                            graphemes_buf[..length].concat(),
+                                        );

Review Comment:
   ```suggestion
                                           )?;
   ```



##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -234,6 +238,17 @@ where
                             let length = if length < 0 { 0 } else { length as 
usize };
                             if length == 0 {
                                 builder.append_value("");
+                            } else if string.is_ascii() {
+                                // ASCII fast path: byte length == character 
length
+                                let str_len = string.len();
+                                if length < str_len {
+                                    builder.append_value(&string[..length]);
+                                } else {
+                                    builder.write_str(string)?;
+                                    builder.append_value(
+                                        " ".repeat(length - str_len).as_str(),
+                                    );

Review Comment:
   ```suggestion
                                       for _ in 0..(length - str_len) {
                                           builder.write_char(' ')?;
                                       }
   ```
   to avoid the String allocation.
   If the bench test shows that it is better then please apply it also in all 
other places. 



##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -273,27 +287,49 @@ where
                                     );
                                 }
                                 let length = if length < 0 { 0 } else { length 
as usize };
-                                // Reuse buffer by clearing and refilling
-                                graphemes_buf.clear();
-                                graphemes_buf.extend(string.graphemes(true));
-
-                                if length < graphemes_buf.len() {
-                                    builder
-                                        
.append_value(graphemes_buf[..length].concat());
-                                } else if fill.is_empty() {
-                                    builder.append_value(string);
+                                if string.is_ascii() && fill.is_ascii() {
+                                    // ASCII fast path: byte length == 
character length,
+                                    // so we skip expensive grapheme 
segmentation.
+                                    let str_len = string.len();
+                                    if length < str_len {
+                                        
builder.append_value(&string[..length]);
+                                    } else if fill.is_empty() {
+                                        builder.append_value(string);
+                                    } else {
+                                        let pad_len = length - str_len;
+                                        let fill_len = fill.len();
+                                        let full_reps = pad_len / fill_len;
+                                        let remainder = pad_len % fill_len;
+                                        builder.write_str(string)?;
+                                        for _ in 0..full_reps {
+                                            builder.write_str(fill)?;
+                                        }
+                                        
builder.append_value(&fill[..remainder]);
+                                    }
                                 } else {
-                                    builder.write_str(string)?;
-                                    // Reuse fill_chars_buf by clearing and 
refilling
-                                    fill_chars_buf.clear();
-                                    fill_chars_buf.extend(fill.chars());
-                                    for l in 0..length - graphemes_buf.len() {
-                                        let c = *fill_chars_buf
-                                            .get(l % fill_chars_buf.len())
-                                            .unwrap();
-                                        builder.write_char(c)?;
+                                    // Reuse buffer by clearing and refilling
+                                    graphemes_buf.clear();
+                                    
graphemes_buf.extend(string.graphemes(true));
+
+                                    if length < graphemes_buf.len() {
+                                        builder.append_value(
+                                            graphemes_buf[..length].concat(),
+                                        );
+                                    } else if fill.is_empty() {
+                                        builder.append_value(string);

Review Comment:
   ```suggestion
                                           builder.append_value(string)?;
   ```



##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -523,6 +560,11 @@ mod tests {
             None,
             Ok(None)
         );
+        test_lpad!(
+            Some("hello".into()),
+            ScalarValue::Int64(Some(2i64)),
+            Ok(Some("he"))
+        );

Review Comment:
   ```suggestion
           );
           test_lpad!(
               Some("hi".into()),
               ScalarValue::Int64(Some(6i64)),
               Some("xy".into()),
               Ok(Some("xyxyhi"))
           );
   ```
   to test `remainder == 0`



##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -273,27 +287,49 @@ where
                                     );
                                 }
                                 let length = if length < 0 { 0 } else { length 
as usize };
-                                // Reuse buffer by clearing and refilling
-                                graphemes_buf.clear();
-                                graphemes_buf.extend(string.graphemes(true));
-
-                                if length < graphemes_buf.len() {
-                                    builder
-                                        
.append_value(graphemes_buf[..length].concat());
-                                } else if fill.is_empty() {
-                                    builder.append_value(string);
+                                if string.is_ascii() && fill.is_ascii() {
+                                    // ASCII fast path: byte length == 
character length,
+                                    // so we skip expensive grapheme 
segmentation.
+                                    let str_len = string.len();
+                                    if length < str_len {
+                                        
builder.append_value(&string[..length]);
+                                    } else if fill.is_empty() {
+                                        builder.append_value(string);
+                                    } else {
+                                        let pad_len = length - str_len;
+                                        let fill_len = fill.len();
+                                        let full_reps = pad_len / fill_len;
+                                        let remainder = pad_len % fill_len;
+                                        builder.write_str(string)?;
+                                        for _ in 0..full_reps {
+                                            builder.write_str(fill)?;
+                                        }
+                                        
builder.append_value(&fill[..remainder]);
+                                    }
                                 } else {
-                                    builder.write_str(string)?;
-                                    // Reuse fill_chars_buf by clearing and 
refilling
-                                    fill_chars_buf.clear();
-                                    fill_chars_buf.extend(fill.chars());
-                                    for l in 0..length - graphemes_buf.len() {
-                                        let c = *fill_chars_buf
-                                            .get(l % fill_chars_buf.len())
-                                            .unwrap();
-                                        builder.write_char(c)?;
+                                    // Reuse buffer by clearing and refilling
+                                    graphemes_buf.clear();
+                                    
graphemes_buf.extend(string.graphemes(true));
+
+                                    if length < graphemes_buf.len() {
+                                        builder.append_value(
+                                            graphemes_buf[..length].concat(),
+                                        );
+                                    } else if fill.is_empty() {
+                                        builder.append_value(string);
+                                    } else {
+                                        builder.write_str(string)?;
+                                        // Reuse fill_chars_buf by clearing 
and refilling
+                                        fill_chars_buf.clear();
+                                        fill_chars_buf.extend(fill.chars());
+                                        for l in 0..length - 
graphemes_buf.len() {
+                                            let c = *fill_chars_buf
+                                                .get(l % fill_chars_buf.len())
+                                                .unwrap();
+                                            builder.write_char(c)?;
+                                        }
+                                        builder.append_value("");

Review Comment:
   ```suggestion
                                           builder.append_value("")?;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to