[ https://issues.apache.org/jira/browse/SPARK-41905?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Sandeep Singh updated SPARK-41905: ---------------------------------- Description: {code:java} df = self.spark.createDataFrame( [ ( [1, 2, 3], 2, 2, ), ( [4, 5], 2, 2, ), ], ["x", "index", "len"], ) expected = [Row(sliced=[2, 3]), Row(sliced=[5])] self.assertTrue( all( [ df.select(slice(df.x, 2, 2).alias("sliced")).collect() == expected, df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect() == expected, df.select(slice("x", "index", "len").alias("sliced")).collect() == expected, ] ) ) self.assertEqual( df.select(slice(df.x, size(df.x) - 1, lit(1)).alias("sliced")).collect(), [Row(sliced=[2]), Row(sliced=[4])], ) self.assertEqual( df.select(slice(df.x, lit(1), size(df.x) - 1).alias("sliced")).collect(), [Row(sliced=[1, 2]), Row(sliced=[4])], ){code} {code:java} Traceback (most recent call last): File "/Users/s.singh/personal/spark-oss/python/pyspark/sql/tests/test_functions.py", line 596, in test_slice df.select(slice("x", "index", "len").alias("sliced")).collect() == expected, File "/Users/s.singh/personal/spark-oss/python/pyspark/sql/utils.py", line 332, in wrapped return getattr(functions, f.__name__)(*args, **kwargs) File "/Users/s.singh/personal/spark-oss/python/pyspark/sql/connect/functions.py", line 1525, in slice raise TypeError(f"start should be a Column or int, but got {type(start).__name__}") TypeError: start should be a Column or int, but got str{code} was: {code:java} from pyspark.sql import Window from pyspark.sql.functions import nth_value df = self.spark.createDataFrame( [ ("a", 0, None), ("a", 1, "x"), ("a", 2, "y"), ("a", 3, "z"), ("a", 4, None), ("b", 1, None), ("b", 2, None), ], schema=("key", "order", "value"), ) w = Window.partitionBy("key").orderBy("order") rs = df.select( df.key, df.order, nth_value("value", 2).over(w), nth_value("value", 2, False).over(w), nth_value("value", 2, True).over(w), ).collect() expected = [ ("a", 0, None, None, None), ("a", 1, "x", "x", None), ("a", 2, "x", "x", "y"), ("a", 3, "x", "x", "y"), ("a", 4, 
"x", "x", "y"), ("b", 1, None, None, None), ("b", 2, None, None, None), ] for r, ex in zip(sorted(rs), sorted(expected)): self.assertEqual(tuple(r), ex[: len(r)]){code} {code:java} Traceback (most recent call last): File "/Users/s.singh/personal/spark-oss/python/pyspark/sql/tests/test_functions.py", line 755, in test_nth_value self.assertEqual(tuple(r), ex[: len(r)]) AssertionError: Tuples differ: ('a', 1, 'x', None) != ('a', 1, 'x', 'x') First differing element 3: None 'x' - ('a', 1, 'x', None) ? ^^^^ + ('a', 1, 'x', 'x') ? ^^^ {code} > Function `slice` should handle string in params > ----------------------------------------------- > > Key: SPARK-41905 > URL: https://issues.apache.org/jira/browse/SPARK-41905 > Project: Spark > Issue Type: Sub-task > Components: Connect > Affects Versions: 3.4.0 > Reporter: Sandeep Singh > Priority: Major > > {code:java} > df = self.spark.createDataFrame( > [ > ( > [1, 2, 3], > 2, > 2, > ), > ( > [4, 5], > 2, > 2, > ), > ], > ["x", "index", "len"], > ) > expected = [Row(sliced=[2, 3]), Row(sliced=[5])] > self.assertTrue( > all( > [ > df.select(slice(df.x, 2, 2).alias("sliced")).collect() == > expected, > df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect() > == expected, > df.select(slice("x", "index", "len").alias("sliced")).collect() > == expected, > ] > ) > ) > self.assertEqual( > df.select(slice(df.x, size(df.x) - 1, lit(1)).alias("sliced")).collect(), > [Row(sliced=[2]), Row(sliced=[4])], > ) > self.assertEqual( > df.select(slice(df.x, lit(1), size(df.x) - 1).alias("sliced")).collect(), > [Row(sliced=[1, 2]), Row(sliced=[4])], > ){code} > {code:java} > Traceback (most recent call last): > File > "/Users/s.singh/personal/spark-oss/python/pyspark/sql/tests/test_functions.py", > line 596, in test_slice > df.select(slice("x", "index", "len").alias("sliced")).collect() == > expected, > File "/Users/s.singh/personal/spark-oss/python/pyspark/sql/utils.py", line > 332, in wrapped > return getattr(functions, 
f.__name__)(*args, **kwargs) > File > "/Users/s.singh/personal/spark-oss/python/pyspark/sql/connect/functions.py", > line 1525, in slice > raise TypeError(f"start should be a Column or int, but got > {type(start).__name__}") > TypeError: start should be a Column or int, but got str{code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org