[ https://issues.apache.org/jira/browse/SPARK-31788?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Hyukjin Kwon updated SPARK-31788:
---------------------------------
    Target Version/s: 3.0.0
    Affects Version/s: (was: 3.0.1)

Description:

Creating the union of pair RDDs fails in PySpark:

{code}
rdd1 = sc.parallelize([1,2,3,4,5])
rdd2 = sc.parallelize([6,7,8,9,10])
pairRDD1 = rdd1.zip(rdd2)
unionRDD1 = sc.union([pairRDD1, pairRDD1])
{code}

{code}
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/gs/spark/latest/python/pyspark/context.py", line 870, in union
    jrdds[i] = rdds[i]._jrdd
  File "/home/gs/spark/latest/python/lib/py4j-0.10.9-src.zip/py4j/java_collections.py", line 238, in __setitem__
  File "/home/gs/spark/latest/python/lib/py4j-0.10.9-src.zip/py4j/java_collections.py", line 221, in __set_item
  File "/home/gs/spark/latest/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 332, in get_return_value
py4j.protocol.Py4JError: An error occurred while calling None.None. Trace:
py4j.Py4JException: Cannot convert org.apache.spark.api.java.JavaPairRDD to org.apache.spark.api.java.JavaRDD
	at py4j.commands.ArrayCommand.convertArgument(ArrayCommand.java:166)
	at py4j.commands.ArrayCommand.setArray(ArrayCommand.java:144)
	at py4j.commands.ArrayCommand.execute(ArrayCommand.java:97)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
{code}

Two distinct pair RDDs fail the same way:

{code}
rdd3 = sc.parallelize([11,12,13,14,15])
pairRDD2 = rdd3.zip(rdd3)
unionRDD2 = sc.union([pairRDD1, pairRDD2])
{code}

This raises the identical {{Py4JError}} ("Cannot convert org.apache.spark.api.java.JavaPairRDD to org.apache.spark.api.java.JavaRDD").

This variant, however, succeeds:

{code}
rdd4 = sc.parallelize(range(5))
pairRDD3 = rdd4.zip(rdd4)
unionRDD3 = sc.union([pairRDD1, pairRDD3])
unionRDD3.collect()
[(1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]
{code}

Spark 2.4.5 does not have this regression.
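A likely mechanism, offered as analysis rather than a confirmed diagnosis: {{rdd1.zip(rdd2)}} is backed by a JVM-side {{JavaPairRDD}} (that is what {{JavaRDD.zip}} returns), while {{SparkContext.union}} in 3.0.0 allocates a py4j array typed as {{JavaRDD}} and assigns each {{_jrdd}} into it, so py4j rejects the {{JavaPairRDD}} elements. The third example presumably succeeds because {{pairRDD1}} and {{pairRDD3}} have different deserializers, which makes {{union}} reserialize both inputs into plain {{JavaRDD}}-backed RDDs before the array is built. Assuming that failure mode, a workaround is to fold with the RDD-level {{union}}, which calls {{union()}} on the actual JVM object and never builds a typed array. A minimal, untested sketch:

{code}
from functools import reduce

# Chain pairwise unions; RDD.union dispatches on the underlying JVM
# object (here a JavaPairRDD), so no JavaRDD[] array is constructed.
rdds = [pairRDD1, pairRDD1]
unionRDD1 = reduce(lambda a, b: a.union(b), rdds)
unionRDD1.collect()
{code}

The trade-off is a linear chain of pairwise unions instead of the single flat {{UnionRDD}} that {{sc.union}} builds, which is fine for a few RDDs but less efficient for many.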
> Error when creating UnionRDD of PairRDDs
> ----------------------------------------
>
>                 Key: SPARK-31788
>                 URL: https://issues.apache.org/jira/browse/SPARK-31788
>             Project: Spark
>          Issue Type: Bug
>          Components: DStreams, PySpark, Spark Core
>    Affects Versions: 3.0.0
>            Reporter: Sanket Reddy
>            Assignee: Hyukjin Kwon
>            Priority: Blocker
>
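For reference, a sketch of one possible direction for a fix in {{pyspark/context.py}}: choose the py4j array element class from the runtime class of the first underlying JVM RDD instead of hard-coding {{JavaRDD}}. This is illustrative only; the helper name is hypothetical, it is not necessarily the patch that was merged, and it assumes {{JavaSparkContext.union}} has overloads for each wrapper class.

{code}
from py4j.java_gateway import is_instance_of

def _union_jrdds(sc, rdds):
    # Hypothetical helper, not actual Spark code.
    gw = sc._gateway
    jvm = sc._jvm
    # JVM-side wrapper classes that an RDD's _jrdd may be an instance
    # of; zip()-backed RDDs carry a JavaPairRDD.
    candidates = [
        jvm.org.apache.spark.api.java.JavaRDD,
        jvm.org.apache.spark.api.java.JavaPairRDD,
        jvm.org.apache.spark.api.java.JavaDoubleRDD,
    ]
    cls = next(c for c in candidates
               if is_instance_of(gw, rdds[0]._jrdd, c))
    # Build the array with the matching element class, so the
    # assignments below no longer need a JavaPairRDD -> JavaRDD cast.
    jrdds = gw.new_array(cls, len(rdds))
    for i, rdd in enumerate(rdds):
        jrdds[i] = rdd._jrdd
    # Assumes the JVM side exposes union(...) for this array type.
    return sc._jsc.union(jrdds)
{code}

A real fix would also need to verify that all inputs share the same wrapper class and raise a clear {{TypeError}} otherwise.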
--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org