[jira] [Updated] (SPARK-45714) Spark UI throws 500 error when StructuredStreaming query filter is selected
[ https://issues.apache.org/jira/browse/SPARK-45714?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Satyam Raj updated SPARK-45714: --- Description: h2. When I'm trying to do the following inside the listener class, and go to the UI to check the streaming query tab and click on Sort by Latest Batch, I get the below error. {code:java} class MyListener extends StreamingQueryListener { private val kafkaAdminClient = try { Some(Admin.create(getKafkaProp())) } catch { case _: Throwable => None } private val registeredConsumerGroups = kafkaAdminClient.get.listConsumerGroups().all().get() override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { val eventName = event.progress.name commitConsumerOffset(event.progress.sources.head.endOffset, eventName) } private def getKafkaProp(eventName: String = "dummy-admin"): Properties = { val props: Properties = new Properties() props.put("bootstrap.servers", Configuration.configurationMap("kafka.broker")) props.put("group.id", eventName) props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer") props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer") props.put("security.protocol", Configuration.configurationMap("kafka.security.protocol")) props.put("sasl.mechanism", Configuration.configurationMap("kafka.sasl.mechanism")) props.put("sasl.jaas.config", Configuration.configurationMap("kafka.sasl.jaas.config")) props } private def getTopicPartitionMap(topic: String, jsonOffsets: Map[String, Map[String, Long]]) = { val offsets = jsonOffsets.head._2 val topicPartitionMap = new util.HashMap[TopicPartition, OffsetAndMetadata]() offsets.keys.foreach(partition => { val tp = new TopicPartition(topic, partition.toInt) val oam = new OffsetAndMetadata(offsets(partition).asInstanceOf[Number].longValue()) topicPartitionMap.put(tp, oam) }) topicPartitionMap } private def 
commitConsumerOffset(endOffset: String, eventName: String) = { val jsonOffsets = mapper.readValue(endOffset, classOf[Map[String, Map[String, Long]]]) val topicPartitionMap = getTopicPartitionMap(eventName, jsonOffsets) if (!registeredConsumerGroups.contains(eventName)) { print("topic consumer not created! creating") val kafkaConsumer = new KafkaConsumer[String, String](getKafkaProp(eventName)) kafkaConsumer.commitSync(topicPartitionMap) kafkaConsumer.close() } else { print("committing cg offsets using admin") kafkaAdminClient.get.alterConsumerGroupOffsets(eventName, topicPartitionMap).all().get() } } }{code} h2. HTTP ERROR 500 java.lang.NullPointerException ||URI:|/StreamingQuery/active| ||STATUS:|500| ||MESSAGE:|java.lang.NullPointerException| ||SERVLET:|org.apache.spark.ui.JettyUtils$$anon$1-522a82dd| ||CAUSED BY:|java.lang.NullPointerException| h3. Caused by: {code:java} java.lang.NullPointerException at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9(StreamingQueryPage.scala:258) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9$adapted(StreamingQueryPage.scala:258) at scala.math.Ordering$$anon$5.compare(Ordering.scala:253) at java.base/java.util.TimSort.binarySort(TimSort.java:296) at java.base/java.util.TimSort.sort(TimSort.java:239) at java.base/java.util.Arrays.sort(Arrays.java:1441) at scala.collection.SeqLike.sorted(SeqLike.scala:659) at scala.collection.SeqLike.sorted$(SeqLike.scala:647) at scala.collection.AbstractSeq.sorted(Seq.scala:45) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.(StreamingQueryPage.scala:223) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.dataSource(StreamingQueryPage.scala:149) at org.apache.spark.ui.PagedTable.table(PagedTable.scala:101) at org.apache.spark.ui.PagedTable.table$(PagedTable.scala:100) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.table(StreamingQueryPage.scala:114) at 
org.apache.spark.sql.streaming.ui.StreamingQueryPage.queryTable(StreamingQueryPage.scala:101) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.generateStreamingQueryTable(StreamingQueryPage.scala:60) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.render(StreamingQueryPage.scala:38) at org.apache.spark.ui.WebUI.$anonfun$attachPage$1(WebUI.scala:90) at org.apache.spark.ui.JettyUtils$$anon$1.doGet(JettyUtils.scala:81) at javax.servlet.http.HttpServlet.service(HttpServlet.java:503) at javax.servlet.http.HttpServlet.service(HttpServlet.java:590) at org.sparkproject.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) at org.sparkproject.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626) at
[jira] [Updated] (SPARK-45714) Spark UI throws 500 error when StructuredStreaming query filter is selected
[ https://issues.apache.org/jira/browse/SPARK-45714?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Satyam Raj updated SPARK-45714: --- Description: h2. HTTP ERROR 500 java.lang.NullPointerException ||URI:|/StreamingQuery/active| ||STATUS:|500| ||MESSAGE:|java.lang.NullPointerException| ||SERVLET:|org.apache.spark.ui.JettyUtils$$anon$1-522a82dd| ||CAUSED BY:|java.lang.NullPointerException| h3. Caused by: {code:java} java.lang.NullPointerException at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9(StreamingQueryPage.scala:258) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9$adapted(StreamingQueryPage.scala:258) at scala.math.Ordering$$anon$5.compare(Ordering.scala:253) at java.base/java.util.TimSort.binarySort(TimSort.java:296) at java.base/java.util.TimSort.sort(TimSort.java:239) at java.base/java.util.Arrays.sort(Arrays.java:1441) at scala.collection.SeqLike.sorted(SeqLike.scala:659) at scala.collection.SeqLike.sorted$(SeqLike.scala:647) at scala.collection.AbstractSeq.sorted(Seq.scala:45) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.(StreamingQueryPage.scala:223) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.dataSource(StreamingQueryPage.scala:149) at org.apache.spark.ui.PagedTable.table(PagedTable.scala:101) at org.apache.spark.ui.PagedTable.table$(PagedTable.scala:100) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.table(StreamingQueryPage.scala:114) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.queryTable(StreamingQueryPage.scala:101) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.generateStreamingQueryTable(StreamingQueryPage.scala:60) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.render(StreamingQueryPage.scala:38) at org.apache.spark.ui.WebUI.$anonfun$attachPage$1(WebUI.scala:90) at org.apache.spark.ui.JettyUtils$$anon$1.doGet(JettyUtils.scala:81) at 
javax.servlet.http.HttpServlet.service(HttpServlet.java:503) at javax.servlet.http.HttpServlet.service(HttpServlet.java:590) at org.sparkproject.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) at org.sparkproject.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626) at org.apache.spark.ui.HttpSecurityFilter.doFilter(HttpSecurityFilter.scala:95) at org.sparkproject.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.sparkproject.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601) at org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.doFilter(AmIpFilter.java:185) at org.sparkproject.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.sparkproject.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601) at org.sparkproject.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:548) at org.sparkproject.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) at org.sparkproject.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1434) at org.sparkproject.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) at org.sparkproject.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501) at org.sparkproject.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) at org.sparkproject.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1349) at org.sparkproject.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) at org.sparkproject.jetty.server.handler.gzip.GzipHandler.handle(GzipHandler.java:763) at org.sparkproject.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234) at org.sparkproject.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) at org.sparkproject.jetty.server.Server.handle(Server.java:516) at org.sparkproject.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388) at 
org.sparkproject.jetty.server.HttpChannel.dispatch(HttpChannel.java:633) at org.sparkproject.jetty.server.HttpChannel.handle(HttpChannel.java:380) at org.sparkproject.jetty.server.HttpConnection.onFillable(HttpConnection.java:277) at org.sparkproject.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) at org.sparkproject.jetty.io.FillInterest.fillable(FillInterest.java:105) at org.sparkproject.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131) at org.sparkproject.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:386) at
[jira] [Created] (SPARK-45714) Spark UI throws 500 error when StructuredStreaming query filter is selected
Satyam Raj created SPARK-45714: -- Summary: Spark UI throws 500 error when StructuredStreaming query filter is selected Key: SPARK-45714 URL: https://issues.apache.org/jira/browse/SPARK-45714 Project: Spark Issue Type: Bug Components: Java API Affects Versions: 3.4.0 Reporter: Satyam Raj ``` h2. HTTP ERROR 500 java.lang.NullPointerException ||URI:|/StreamingQuery/active| ||STATUS:|500| ||MESSAGE:|java.lang.NullPointerException| ||SERVLET:|org.apache.spark.ui.JettyUtils$$anon$1-522a82dd| ||CAUSED BY:|java.lang.NullPointerException| h3. Caused by: {code:java} java.lang.NullPointerException at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9(StreamingQueryPage.scala:258) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9$adapted(StreamingQueryPage.scala:258) at scala.math.Ordering$$anon$5.compare(Ordering.scala:253) at java.base/java.util.TimSort.binarySort(TimSort.java:296) at java.base/java.util.TimSort.sort(TimSort.java:239) at java.base/java.util.Arrays.sort(Arrays.java:1441) at scala.collection.SeqLike.sorted(SeqLike.scala:659) at scala.collection.SeqLike.sorted$(SeqLike.scala:647) at scala.collection.AbstractSeq.sorted(Seq.scala:45) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.(StreamingQueryPage.scala:223) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.dataSource(StreamingQueryPage.scala:149) at org.apache.spark.ui.PagedTable.table(PagedTable.scala:101) at org.apache.spark.ui.PagedTable.table$(PagedTable.scala:100) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.table(StreamingQueryPage.scala:114) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.queryTable(StreamingQueryPage.scala:101) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.generateStreamingQueryTable(StreamingQueryPage.scala:60) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.render(StreamingQueryPage.scala:38) at 
org.apache.spark.ui.WebUI.$anonfun$attachPage$1(WebUI.scala:90) at org.apache.spark.ui.JettyUtils$$anon$1.doGet(JettyUtils.scala:81) at javax.servlet.http.HttpServlet.service(HttpServlet.java:503) at javax.servlet.http.HttpServlet.service(HttpServlet.java:590) at org.sparkproject.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) at org.sparkproject.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626) at org.apache.spark.ui.HttpSecurityFilter.doFilter(HttpSecurityFilter.scala:95) at org.sparkproject.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.sparkproject.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601) at org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.doFilter(AmIpFilter.java:185) at org.sparkproject.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.sparkproject.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601) at org.sparkproject.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:548) at org.sparkproject.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) at org.sparkproject.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1434) at org.sparkproject.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) at org.sparkproject.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501) at org.sparkproject.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) at org.sparkproject.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1349) at org.sparkproject.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) at org.sparkproject.jetty.server.handler.gzip.GzipHandler.handle(GzipHandler.java:763) at org.sparkproject.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234) at org.sparkproject.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) at org.sparkproject.jetty.server.Server.handle(Server.java:516) at 
org.sparkproject.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388) at org.sparkproject.jetty.server.HttpChannel.dispatch(HttpChannel.java:633) at org.sparkproject.jetty.server.HttpChannel.handle(HttpChannel.java:380) at org.sparkproject.jetty.server.HttpConnection.onFillable(HttpConnection.java:277) at org.sparkproject.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) at org.sparkproject.jetty.io.FillInterest.fillable(FillInterest.java:105) at org.sparkproject.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at
[jira] [Updated] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] L. C. Hsieh updated SPARK-45678: Fix Version/s: 3.4.2 3.5.1 > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Bug > Components: Spark Core >Affects Versions: 3.4.1, 3.5.0, 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > Fix For: 3.4.2, 4.0.0, 3.5.1 > > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] L. C. Hsieh updated SPARK-45678: Issue Type: Bug (was: Improvement) > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Bug > Components: Spark Core >Affects Versions: 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] L. C. Hsieh updated SPARK-45678: Affects Version/s: 3.5.0 3.4.1 > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Bug > Components: Spark Core >Affects Versions: 3.4.1, 3.5.0, 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Chao Sun reassigned SPARK-45678: Assignee: L. C. Hsieh > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Improvement > Components: Spark Core >Affects Versions: 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Chao Sun resolved SPARK-45678. -- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43543 [https://github.com/apache/spark/pull/43543] > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Improvement > Components: Spark Core >Affects Versions: 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45705) Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED
[ https://issues.apache.org/jira/browse/SPARK-45705?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] XiDuo You resolved SPARK-45705. --- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43554 [https://github.com/apache/spark/pull/43554] > Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED > - > > Key: SPARK-45705 > URL: https://issues.apache.org/jira/browse/SPARK-45705 > Project: Spark > Issue Type: Improvement > Components: Tests >Affects Versions: 4.0.0 >Reporter: Kent Yao >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45713) Support registering Python data sources
[ https://issues.apache.org/jira/browse/SPARK-45713?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45713: --- Labels: pull-request-available (was: ) > Support registering Python data sources > --- > > Key: SPARK-45713 > URL: https://issues.apache.org/jira/browse/SPARK-45713 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > Labels: pull-request-available > > Support registering Python data sources. > Users can register a Python data source and later use reference it using its > name. > {code:java} > class MyDataSource(DataSource): > @classmethod > def name(cls): > return "my-data-source" > spark.dataSource.register(MyDataSource){code} > Users can then use the name of the data source as the format (will be > supported in SPARK-45639) > {code:java} > spark.read.format("my-data-source").load(){code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45713) Support registering Python data sources
[ https://issues.apache.org/jira/browse/SPARK-45713?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Allison Wang updated SPARK-45713: - Description: Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format (will be supported in SPARK-45639) {code:java} spark.read.format("my-data-source").load(){code} was: Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format SPARK-45639 {code:java} spark.read.format("my-data-source").load(){code} > Support registering Python data sources > --- > > Key: SPARK-45713 > URL: https://issues.apache.org/jira/browse/SPARK-45713 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > > Support registering Python data sources. > Users can register a Python data source and later use reference it using its > name. > {code:java} > class MyDataSource(DataSource): > @classmethod > def name(cls): > return "my-data-source" > spark.dataSource.register(MyDataSource){code} > Users can then use the name of the data source as the format (will be > supported in SPARK-45639) > {code:java} > spark.read.format("my-data-source").load(){code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45713) Support registering Python data sources
[ https://issues.apache.org/jira/browse/SPARK-45713?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Allison Wang updated SPARK-45713: - Description: Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format SPARK-45639 {code:java} spark.read.format("my-data-source").load(){code} was: Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format {code:java} spark.read.format("my-data-source").load(){code} > Support registering Python data sources > --- > > Key: SPARK-45713 > URL: https://issues.apache.org/jira/browse/SPARK-45713 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > > Support registering Python data sources. > Users can register a Python data source and later use reference it using its > name. > {code:java} > class MyDataSource(DataSource): > @classmethod > def name(cls): > return "my-data-source" > spark.dataSource.register(MyDataSource){code} > Users can then use the name of the data source as the format SPARK-45639 > {code:java} > spark.read.format("my-data-source").load(){code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45713) Support registering Python data sources
Allison Wang created SPARK-45713: Summary: Support registering Python data sources Key: SPARK-45713 URL: https://issues.apache.org/jira/browse/SPARK-45713 Project: Spark Issue Type: Sub-task Components: PySpark Affects Versions: 4.0.0 Reporter: Allison Wang Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format {code:java} spark.read.format("my-data-source").load(){code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45639) Support loading Python data sources in DataFrameReader
[ https://issues.apache.org/jira/browse/SPARK-45639?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Allison Wang updated SPARK-45639: - Description: Allow users to read from a Python data source using `spark.read.format(...).load()` in PySpark. For example Users can extend the DataSource and the DataSourceReader classes to create their own Python data source reader and use them in PySpark: {code:java} class MyReader(DataSourceReader): def read(self, partition): yield (0, 1) class MyDataSource(DataSource): def schema(self): return "id INT, value INT" def reader(self, schema): return MyReader() df = spark.read.format("MyDataSource").load() df.show() +---+-+ | id|value| +---+-+ | 0| 1| +---+-+ {code} was: Allow users to read from a Python data source using `spark.read.format(...).load()` For example Users can extend the DataSource and the DataSourceReader classes to create their own Python data source reader and use them in PySpark: {code:java} class MyReader(DataSourceReader): def read(self, partition): yield (0, 1) class MyDataSource(DataSource): def schema(self): return "id INT, value INT" def reader(self, schema): return MyReader() df = spark.read.format("MyDataSource").load() df.show() +---+-+ | id|value| +---+-+ | 0| 1| +---+-+ {code} > Support loading Python data sources in DataFrameReader > -- > > Key: SPARK-45639 > URL: https://issues.apache.org/jira/browse/SPARK-45639 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > > Allow users to read from a Python data source using > `spark.read.format(...).load()` in PySpark. 
For example > Users can extend the DataSource and the DataSourceReader classes to create > their own Python data source reader and use them in PySpark: > {code:java} > class MyReader(DataSourceReader): > def read(self, partition): > yield (0, 1) > class MyDataSource(DataSource): > def schema(self): > return "id INT, value INT" > > def reader(self, schema): > return MyReader() > df = spark.read.format("MyDataSource").load() > df.show() > +---+-+ > | id|value| > +---+-+ > | 0| 1| > +---+-+ > {code} > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45639) Support loading Python data sources in DataFrameReader
[ https://issues.apache.org/jira/browse/SPARK-45639?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Allison Wang updated SPARK-45639: - Summary: Support loading Python data sources in DataFrameReader (was: Support Python data source in DataFrameReader) > Support loading Python data sources in DataFrameReader > -- > > Key: SPARK-45639 > URL: https://issues.apache.org/jira/browse/SPARK-45639 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > > Allow users to read from a Python data source using > `spark.read.format(...).load()` > For example > Users can extend the DataSource and the DataSourceReader classes to create > their own Python data source reader and use them in PySpark: > {code:java} > class MyReader(DataSourceReader): > def read(self, partition): > yield (0, 1) > class MyDataSource(DataSource): > def schema(self): > return "id INT, value INT" > def reader(self, schema): > return MyReader() > df = spark.read.format("MyDataSource").load() > df.show() > +---+-+ > | id|value| > +---+-+ > | 0| 1| > +---+-+ > {code} > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45712) Provide a command line flag to override the log4j properties file
Holden Karau created SPARK-45712: Summary: Provide a command line flag to override the log4j properties file Key: SPARK-45712 URL: https://issues.apache.org/jira/browse/SPARK-45712 Project: Spark Issue Type: Improvement Components: Spark Core Affects Versions: 4.0.0 Reporter: Holden Karau Overriding log4j properties is kind of annoying and depends on putting a file in the right place; we should let users specify which log4j properties file to use in spark-submit and friends. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45684) Clean up the deprecated API usage related to `SeqOps`
[ https://issues.apache.org/jira/browse/SPARK-45684?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yang Jie updated SPARK-45684: - Description: * method transform in trait SeqOps is deprecated (since 2.13.0) * method reverseMap in trait SeqOps is deprecated (since 2.13.0) * method union in trait SeqOps is deprecated (since 2.13.0) {code:java} [warn] /Users/yangjie01/SourceCode/git/spark-mine-sbt/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala:675:15: method transform in trait SeqOps is deprecated (since 2.13.0): Use `mapInPlace` on an `IndexedSeq` instead [warn] Applicable -Wconf / @nowarn filters for this warning: msg=, cat=deprecation, site=org.apache.spark.ml.classification.LogisticRegression.train.$anonfun, origin=scala.collection.mutable.SeqOps.transform, version=2.13.0 [warn] centers.transform(_ / numCoefficientSets) [warn] ^ {code} was: * method transform in trait SeqOps is deprecated (since 2.13.0) * method reverseMap in trait SeqOps is deprecated (since 2.13.0) * method retain in trait SetOps is deprecated (since 2.13.0) * method union in trait SeqOps is deprecated (since 2.13.0) {code:java} [warn] /Users/yangjie01/SourceCode/git/spark-mine-sbt/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala:675:15: method transform in trait SeqOps is deprecated (since 2.13.0): Use `mapInPlace` on an `IndexedSeq` instead [warn] Applicable -Wconf / @nowarn filters for this warning: msg=, cat=deprecation, site=org.apache.spark.ml.classification.LogisticRegression.train.$anonfun, origin=scala.collection.mutable.SeqOps.transform, version=2.13.0 [warn] centers.transform(_ / numCoefficientSets) [warn] ^ {code} > Clean up the deprecated API usage related to `SeqOps` > - > > Key: SPARK-45684 > URL: https://issues.apache.org/jira/browse/SPARK-45684 > Project: Spark > Issue Type: Sub-task > Components: Build, Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Priority: Major > > * 
method transform in trait SeqOps is deprecated (since 2.13.0) > * method reverseMap in trait SeqOps is deprecated (since 2.13.0) > * method union in trait SeqOps is deprecated (since 2.13.0) > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala:675:15: > method transform in trait SeqOps is deprecated (since 2.13.0): Use > `mapInPlace` on an `IndexedSeq` instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.ml.classification.LogisticRegression.train.$anonfun, > origin=scala.collection.mutable.SeqOps.transform, version=2.13.0 > [warn] centers.transform(_ / numCoefficientSets) > [warn] ^ {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45685) Use `LazyList` instead of `Stream`
[ https://issues.apache.org/jira/browse/SPARK-45685?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45685: --- Labels: pull-request-available (was: ) > Use `LazyList` instead of `Stream` > -- > > Key: SPARK-45685 > URL: https://issues.apache.org/jira/browse/SPARK-45685 > Project: Spark > Issue Type: Sub-task > Components: Build, Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Priority: Major > Labels: pull-request-available > > * class Stream in package immutable is deprecated (since 2.13.0) > * object Stream in package immutable is deprecated (since 2.13.0) > * type Stream in package scala is deprecated (since 2.13.0) > * value Stream in package scala is deprecated (since 2.13.0) > * method append in class Stream is deprecated (since 2.13.0) > * method toStream in trait IterableOnceOps is deprecated (since 2.13.0) > > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/sql/core/src/test/scala/org/apache/spark/sql/GenTPCDSData.scala:49:20: > class Stream in package immutable is deprecated (since 2.13.0): Use LazyList > (which is fully lazy) instead of Stream (which has a lazy tail only) > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.sql.BlockingLineStream.BlockingStreamed.stream, > origin=scala.collection.immutable.Stream, version=2.13.0 > [warn] val stream: () => Stream[T]) > [warn] ^ {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45649) Unify the prepare framework for `OffsetWindowFunctionFrame`
[ https://issues.apache.org/jira/browse/SPARK-45649?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Wenchen Fan resolved SPARK-45649. - Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43507 [https://github.com/apache/spark/pull/43507] > Unify the prepare framework for `OffsetWindowFunctionFrame` > --- > > Key: SPARK-45649 > URL: https://issues.apache.org/jira/browse/SPARK-45649 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Currently, the implementation the `prepare` of all the > `OffsetWindowFunctionFrame` have the same code logic show below. > ``` > override def prepare(rows: ExternalAppendOnlyUnsafeRowArray): Unit = { > if (offset > rows.length) { > fillDefaultValue(EmptyRow) > } else { > resetStates(rows) > if (ignoreNulls) { > ... > } else { > ... > } > } > } > ``` -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45711) Introduce a mapper for avro compression codecs
[ https://issues.apache.org/jira/browse/SPARK-45711?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45711: --- Labels: pull-request-available (was: ) > Introduce a mapper for avro compression codecs > -- > > Key: SPARK-45711 > URL: https://issues.apache.org/jira/browse/SPARK-45711 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > Labels: pull-request-available > > Currently, Spark supported all the avro compression codecs, but the avro > supported compression codecs and spark supported are not completely > one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. > There are a lot of magic strings copy from avro compression codecs. This > issue lead to developers need to manually maintain its consistency. It is > easy to make mistakes and reduce development efficiency. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45711) Introduce a mapper for avro compression codecs
[ https://issues.apache.org/jira/browse/SPARK-45711?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Jiaan Geng updated SPARK-45711: --- Description: Currently, Spark supported all the avro compression codecs, but the avro supported compression codecs and spark supported are not completely one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. There are a lot of magic strings copy from avro compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. was: Currently, Spark supported all the avro compression codecs, but the avro supported compression codecs and spark supported are not completely one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. There are a lot of magic strings copy from orc compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. > Introduce a mapper for avro compression codecs > -- > > Key: SPARK-45711 > URL: https://issues.apache.org/jira/browse/SPARK-45711 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > > Currently, Spark supported all the avro compression codecs, but the avro > supported compression codecs and spark supported are not completely > one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. > There are a lot of magic strings copy from avro compression codecs. This > issue lead to developers need to manually maintain its consistency. It is > easy to make mistakes and reduce development efficiency. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45711) Introduce a mapper for avro compression codecs
[ https://issues.apache.org/jira/browse/SPARK-45711?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Jiaan Geng updated SPARK-45711: --- Description: Currently, Spark supported all the avro compression codecs, but the avro supported compression codecs and spark supported are not completely one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. There are a lot of magic strings copy from orc compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. was: Currently, Spark supported all the orc compression codecs, but the orc supported compression codecs and spark supported are not completely one-on-one due to Spark introduce two compression codecs none and UNCOMPRESSED. There are a lot of magic strings copy from orc compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. > Introduce a mapper for avro compression codecs > -- > > Key: SPARK-45711 > URL: https://issues.apache.org/jira/browse/SPARK-45711 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > > Currently, Spark supported all the avro compression codecs, but the avro > supported compression codecs and spark supported are not completely > one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. > There are a lot of magic strings copy from orc compression codecs. This issue > lead to developers need to manually maintain its consistency. It is easy to > make mistakes and reduce development efficiency. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45711) Introduce a mapper for avro compression codecs
[ https://issues.apache.org/jira/browse/SPARK-45711?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Jiaan Geng updated SPARK-45711: --- Description: Currently, Spark supported all the orc compression codecs, but the orc supported compression codecs and spark supported are not completely one-on-one due to Spark introduce two compression codecs none and UNCOMPRESSED. There are a lot of magic strings copy from orc compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. > Introduce a mapper for avro compression codecs > -- > > Key: SPARK-45711 > URL: https://issues.apache.org/jira/browse/SPARK-45711 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > > Currently, Spark supported all the orc compression codecs, but the orc > supported compression codecs and spark supported are not completely > one-on-one due to Spark introduce two compression codecs none and > UNCOMPRESSED. > There are a lot of magic strings copy from orc compression codecs. This issue > lead to developers need to manually maintain its consistency. It is easy to > make mistakes and reduce development efficiency. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45706) Makes entire Binder build fails fast during setting up
[ https://issues.apache.org/jira/browse/SPARK-45706?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon resolved SPARK-45706. -- Fix Version/s: 4.0.0 3.5.1 Assignee: Hyukjin Kwon Resolution: Fixed Fixed in https://github.com/apache/spark/pull/43553 > Makes entire Binder build fails fast during setting up > -- > > Key: SPARK-45706 > URL: https://issues.apache.org/jira/browse/SPARK-45706 > Project: Spark > Issue Type: New Feature > Components: Documentation, PySpark >Affects Versions: 3.5.0 >Reporter: Hyukjin Kwon >Assignee: Hyukjin Kwon >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0, 3.5.1 > > > Binder build is currently broken: > https://mybinder.org/v2/gh/apache/spark/ce5ddad9903?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb > Seems like we uploaded PySpark late into PyPI, and the installation steps > just slightly ignored the failure. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45710) Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62]
[ https://issues.apache.org/jira/browse/SPARK-45710?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Deng Ziming updated SPARK-45710: Description: Choose a proper name for the error class {*}_LEGACY_ERROR_TEMP_21[59,60,61,62]{*}defined in {*}core/src/main/resources/error/error-classes.json{*}. The name should be short but complete (look at the example in error-classes.json). Add a test which triggers the error from user code if such test still doesn't exist. Check exception fields by using {*}checkError(){*}. The last function checks valuable error fields only, and avoids dependencies from error text message. In this way, tech editors can modify error format in error-classes.json, and don't worry of Spark's internal tests. Migrate other tests that might trigger the error onto checkError(). If you cannot reproduce the error from user space (using SQL query), replace the error by an internal error, see {*}SparkException.internalError(){*}. Improve the error message format in error-classes.json if the current is not clear. Propose a solution to users how to avoid and fix such kind of errors. Please, look at the PR below as examples: * [https://github.com/apache/spark/pull/38685] * [https://github.com/apache/spark/pull/38656] * [https://github.com/apache/spark/pull/38490] was: Choose a proper name for the error class *_LEGACY_ERROR_TEMP_2153* defined in {*}core/src/main/resources/error/error-classes.json{*}. The name should be short but complete (look at the example in error-classes.json). Add a test which triggers the error from user code if such test still doesn't exist. Check exception fields by using {*}checkError(){*}. The last function checks valuable error fields only, and avoids dependencies from error text message. In this way, tech editors can modify error format in error-classes.json, and don't worry of Spark's internal tests. Migrate other tests that might trigger the error onto checkError(). 
If you cannot reproduce the error from user space (using SQL query), replace the error by an internal error, see {*}SparkException.internalError(){*}. Improve the error message format in error-classes.json if the current is not clear. Propose a solution to users how to avoid and fix such kind of errors. Please, look at the PR below as examples: * [https://github.com/apache/spark/pull/38685] * [https://github.com/apache/spark/pull/38656] * [https://github.com/apache/spark/pull/38490] > Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62] > - > > Key: SPARK-45710 > URL: https://issues.apache.org/jira/browse/SPARK-45710 > Project: Spark > Issue Type: Sub-task > Components: SQL >Affects Versions: 3.5.0 >Reporter: Deng Ziming >Assignee: Deng Ziming >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Choose a proper name for the error class > {*}_LEGACY_ERROR_TEMP_21[59,60,61,62]{*}defined in > {*}core/src/main/resources/error/error-classes.json{*}. The name should be > short but complete (look at the example in error-classes.json). > Add a test which triggers the error from user code if such test still doesn't > exist. Check exception fields by using {*}checkError(){*}. The last function > checks valuable error fields only, and avoids dependencies from error text > message. In this way, tech editors can modify error format in > error-classes.json, and don't worry of Spark's internal tests. Migrate other > tests that might trigger the error onto checkError(). > If you cannot reproduce the error from user space (using SQL query), replace > the error by an internal error, see {*}SparkException.internalError(){*}. > Improve the error message format in error-classes.json if the current is not > clear. Propose a solution to users how to avoid and fix such kind of errors. 
> Please, look at the PR below as examples: > * [https://github.com/apache/spark/pull/38685] > * [https://github.com/apache/spark/pull/38656] > * [https://github.com/apache/spark/pull/38490] -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45710) Assign name to _LEGACY_ERROR_TEMP__LEGACY_ERROR_TEMP_21[59,60,61,62]
Deng Ziming created SPARK-45710: --- Summary: Assign name to _LEGACY_ERROR_TEMP__LEGACY_ERROR_TEMP_21[59,60,61,62] Key: SPARK-45710 URL: https://issues.apache.org/jira/browse/SPARK-45710 Project: Spark Issue Type: Sub-task Components: SQL Affects Versions: 3.5.0 Reporter: Deng Ziming Assignee: Deng Ziming Fix For: 4.0.0 Choose a proper name for the error class *_LEGACY_ERROR_TEMP_2153* defined in {*}core/src/main/resources/error/error-classes.json{*}. The name should be short but complete (look at the example in error-classes.json). Add a test which triggers the error from user code if such test still doesn't exist. Check exception fields by using {*}checkError(){*}. The last function checks valuable error fields only, and avoids dependencies from error text message. In this way, tech editors can modify error format in error-classes.json, and don't worry of Spark's internal tests. Migrate other tests that might trigger the error onto checkError(). If you cannot reproduce the error from user space (using SQL query), replace the error by an internal error, see {*}SparkException.internalError(){*}. Improve the error message format in error-classes.json if the current is not clear. Propose a solution to users how to avoid and fix such kind of errors. Please, look at the PR below as examples: * [https://github.com/apache/spark/pull/38685] * [https://github.com/apache/spark/pull/38656] * [https://github.com/apache/spark/pull/38490] -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45710) Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62]
[ https://issues.apache.org/jira/browse/SPARK-45710?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Deng Ziming updated SPARK-45710: Summary: Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62] (was: Assign name to _LEGACY_ERROR_TEMP__LEGACY_ERROR_TEMP_21[59,60,61,62]) > Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62] > - > > Key: SPARK-45710 > URL: https://issues.apache.org/jira/browse/SPARK-45710 > Project: Spark > Issue Type: Sub-task > Components: SQL >Affects Versions: 3.5.0 >Reporter: Deng Ziming >Assignee: Deng Ziming >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Choose a proper name for the error class *_LEGACY_ERROR_TEMP_2153* defined in > {*}core/src/main/resources/error/error-classes.json{*}. The name should be > short but complete (look at the example in error-classes.json). > Add a test which triggers the error from user code if such test still doesn't > exist. Check exception fields by using {*}checkError(){*}. The last function > checks valuable error fields only, and avoids dependencies from error text > message. In this way, tech editors can modify error format in > error-classes.json, and don't worry of Spark's internal tests. Migrate other > tests that might trigger the error onto checkError(). > If you cannot reproduce the error from user space (using SQL query), replace > the error by an internal error, see {*}SparkException.internalError(){*}. > Improve the error message format in error-classes.json if the current is not > clear. Propose a solution to users how to avoid and fix such kind of errors. > Please, look at the PR below as examples: > * [https://github.com/apache/spark/pull/38685] > * [https://github.com/apache/spark/pull/38656] > * [https://github.com/apache/spark/pull/38490] -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45708) Retry mvn deploy failures
[ https://issues.apache.org/jira/browse/SPARK-45708?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45708: --- Labels: pull-request-available (was: ) > Retry mvn deploy failures > - > > Key: SPARK-45708 > URL: https://issues.apache.org/jira/browse/SPARK-45708 > Project: Spark > Issue Type: Bug > Components: Build >Affects Versions: 4.0.0 >Reporter: Enrico Minack >Priority: Major > Labels: pull-request-available > > It is common to see {{408 Request Timeout}} and {{502 Proxy Error}} when > deploying artifacts to Apache snapshot repository: > https://github.com/apache/spark/actions/runs/6437635317 > {quote} > 2023-10-07T01:09:51.1719360Z [ERROR] Failed to execute goal > org.apache.maven.plugins:maven-deploy-plugin:3.0.0-M1:deploy (default-deploy) > on project spark-streaming_2.13: ArtifactDeployerException: Failed to deploy > artifacts: Could not transfer artifact > org.apache.spark:spark-streaming_2.13:jar:tests:3.3.4-20231007.005815-57 > from/to apache.snapshots.https > (https://repository.apache.org/content/repositories/snapshots): transfer > failed for > https://repository.apache.org/content/repositories/snapshots/org/apache/spark/spark-streaming_2.13/3.3.4-SNAPSHOT/spark-streaming_2.13-3.3.4-20231007.005815-57-tests.jar, > status: 502 Proxy Error -> [Help 1] > {quote} > {quote} > 2023-10-07T01:11:48.5651501Z [ERROR] Failed to execute goal > org.apache.maven.plugins:maven-deploy-plugin:3.0.0:deploy (default-deploy) on > project spark-connect-common_2.12: Failed to deploy artifacts: Could not > transfer artifact > org.apache.spark:spark-connect-common_2.12:xml:cyclonedx:3.4.2-20231007.001102-103 > from/to apache.snapshots.https > (https://repository.apache.org/content/repositories/snapshots): transfer > failed for > https://repository.apache.org/content/repositories/snapshots/org/apache/spark/spark-connect-common_2.12/3.4.2-SNAPSHOT/spark-connect-common_2.12-3.4.2-20231007.001102-103-cyclonedx.xml, > status: 408 
Request Timeout -> [Help 1] > {quote} > Such errors should be retried by `mvn deploy`. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45708) Retry mvn deploy failures
Enrico Minack created SPARK-45708: - Summary: Retry mvn deploy failures Key: SPARK-45708 URL: https://issues.apache.org/jira/browse/SPARK-45708 Project: Spark Issue Type: Bug Components: Build Affects Versions: 4.0.0 Reporter: Enrico Minack It is common to see {{408 Request Timeout}} and {{502 Proxy Error}} when deploying artifacts to Apache snapshot repository: https://github.com/apache/spark/actions/runs/6437635317 {quote} 2023-10-07T01:09:51.1719360Z [ERROR] Failed to execute goal org.apache.maven.plugins:maven-deploy-plugin:3.0.0-M1:deploy (default-deploy) on project spark-streaming_2.13: ArtifactDeployerException: Failed to deploy artifacts: Could not transfer artifact org.apache.spark:spark-streaming_2.13:jar:tests:3.3.4-20231007.005815-57 from/to apache.snapshots.https (https://repository.apache.org/content/repositories/snapshots): transfer failed for https://repository.apache.org/content/repositories/snapshots/org/apache/spark/spark-streaming_2.13/3.3.4-SNAPSHOT/spark-streaming_2.13-3.3.4-20231007.005815-57-tests.jar, status: 502 Proxy Error -> [Help 1] {quote} {quote} 2023-10-07T01:11:48.5651501Z [ERROR] Failed to execute goal org.apache.maven.plugins:maven-deploy-plugin:3.0.0:deploy (default-deploy) on project spark-connect-common_2.12: Failed to deploy artifacts: Could not transfer artifact org.apache.spark:spark-connect-common_2.12:xml:cyclonedx:3.4.2-20231007.001102-103 from/to apache.snapshots.https (https://repository.apache.org/content/repositories/snapshots): transfer failed for https://repository.apache.org/content/repositories/snapshots/org/apache/spark/spark-connect-common_2.12/3.4.2-SNAPSHOT/spark-connect-common_2.12-3.4.2-20231007.001102-103-cyclonedx.xml, status: 408 Request Timeout -> [Help 1] {quote} Such errors should be retried by `mvn deploy`. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45707) Simplify `DataFrameStatFunctions. countMinSketch` with `CountMinSketchAgg `
[ https://issues.apache.org/jira/browse/SPARK-45707?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45707: --- Labels: pull-request-available (was: ) > Simplify `DataFrameStatFunctions. countMinSketch` with `CountMinSketchAgg ` > --- > > Key: SPARK-45707 > URL: https://issues.apache.org/jira/browse/SPARK-45707 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Ruifeng Zheng >Priority: Minor > Labels: pull-request-available > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45707) Simplify `DataFrameStatFunctions. countMinSketch` with `CountMinSketchAgg `
Ruifeng Zheng created SPARK-45707: - Summary: Simplify `DataFrameStatFunctions. countMinSketch` with `CountMinSketchAgg ` Key: SPARK-45707 URL: https://issues.apache.org/jira/browse/SPARK-45707 Project: Spark Issue Type: Improvement Components: SQL Affects Versions: 4.0.0 Reporter: Ruifeng Zheng -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45696) Fix `method tryCompleteWith in trait Promise is deprecated`
[ https://issues.apache.org/jira/browse/SPARK-45696?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45696: --- Labels: pull-request-available (was: ) > Fix `method tryCompleteWith in trait Promise is deprecated` > --- > > Key: SPARK-45696 > URL: https://issues.apache.org/jira/browse/SPARK-45696 > Project: Spark > Issue Type: Sub-task > Components: Spark Core >Affects Versions: 4.0.0 >Reporter: Yang Jie >Priority: Major > Labels: pull-request-available > > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/FutureAction.scala:190:32: > method tryCompleteWith in trait Promise is deprecated (since 2.13.0): Since > this method is semantically equivalent to `completeWith`, use that instead. > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, site=org.apache.spark.ComplexFutureAction.p, > origin=scala.concurrent.Promise.tryCompleteWith, version=2.13.0 > [warn] private val p = Promise[T]().tryCompleteWith(run(jobSubmitter)) > [warn] ^ {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45698) Clean up the deprecated API usage related to `Buffer`
[ https://issues.apache.org/jira/browse/SPARK-45698?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Max Gekk resolved SPARK-45698. -- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43551 [https://github.com/apache/spark/pull/43551] > Clean up the deprecated API usage related to `Buffer` > - > > Key: SPARK-45698 > URL: https://issues.apache.org/jira/browse/SPARK-45698 > Project: Spark > Issue Type: Sub-task > Components: Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Assignee: Yang Jie >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > * method append in trait Buffer is deprecated (since 2.13.0) > * method prepend in trait Buffer is deprecated (since 2.13.0) > * method trimEnd in trait Buffer is deprecated (since 2.13.4) > * method trimStart in trait Buffer is deprecated (since 2.13.4) > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala:319:18: > method append in trait Buffer is deprecated (since 2.13.0): Use appendAll > instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.deploy.IvyTestUtils.createLocalRepository, > origin=scala.collection.mutable.Buffer.append, version=2.13.0 > [warn] allFiles.append(rFiles: _*) > [warn] ^ > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala:183:13: > method trimEnd in trait Buffer is deprecated (since 2.13.4): use > dropRightInPlace instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.util.SizeEstimator.SearchState.dequeue, > origin=scala.collection.mutable.Buffer.trimEnd, version=2.13.4 > [warn] stack.trimEnd(1) > [warn] ^{code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For 
additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-45698) Clean up the deprecated API usage related to `Buffer`
[ https://issues.apache.org/jira/browse/SPARK-45698?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Max Gekk reassigned SPARK-45698: Assignee: Yang Jie > Clean up the deprecated API usage related to `Buffer` > - > > Key: SPARK-45698 > URL: https://issues.apache.org/jira/browse/SPARK-45698 > Project: Spark > Issue Type: Sub-task > Components: Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Assignee: Yang Jie >Priority: Major > Labels: pull-request-available > > * method append in trait Buffer is deprecated (since 2.13.0) > * method prepend in trait Buffer is deprecated (since 2.13.0) > * method trimEnd in trait Buffer is deprecated (since 2.13.4) > * method trimStart in trait Buffer is deprecated (since 2.13.4) > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala:319:18: > method append in trait Buffer is deprecated (since 2.13.0): Use appendAll > instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.deploy.IvyTestUtils.createLocalRepository, > origin=scala.collection.mutable.Buffer.append, version=2.13.0 > [warn] allFiles.append(rFiles: _*) > [warn] ^ > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala:183:13: > method trimEnd in trait Buffer is deprecated (since 2.13.4): use > dropRightInPlace instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.util.SizeEstimator.SearchState.dequeue, > origin=scala.collection.mutable.Buffer.trimEnd, version=2.13.4 > [warn] stack.trimEnd(1) > [warn] ^{code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-45651) Snapshots of some packages are not published any more
[ https://issues.apache.org/jira/browse/SPARK-45651?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yang Jie reassigned SPARK-45651: Assignee: Enrico Minack > Snapshots of some packages are not published any more > - > > Key: SPARK-45651 > URL: https://issues.apache.org/jira/browse/SPARK-45651 > Project: Spark > Issue Type: Bug > Components: Build >Affects Versions: 4.0.0 >Reporter: Enrico Minack >Assignee: Enrico Minack >Priority: Major > Labels: pull-request-available > > Snapshots of some packages have not been published anymore, e.g. > spark-sql_2.13-4.0.0 has not been published since Sep, 13th: > https://repository.apache.org/content/groups/snapshots/org/apache/spark/spark-sql_2.13/4.0.0-SNAPSHOT/ > There have been some attempts to fix CI: SPARK-45535 SPARK-45536 > Assumption is that memory consumption during build exceeds the available > memory of the Github host. > The following could be attempted: > - enable manual trigger of the {{publish_snapshots.yml}} workflow > - enable some memory use logging to prove that exceeded memory is the root > cause > - attempt to reduce memory footprint and see impact in above logging > - revert memory use logging -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45651) Snapshots of some packages are not published any more
[ https://issues.apache.org/jira/browse/SPARK-45651?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yang Jie resolved SPARK-45651. -- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43555 [https://github.com/apache/spark/pull/43555] > Snapshots of some packages are not published any more > - > > Key: SPARK-45651 > URL: https://issues.apache.org/jira/browse/SPARK-45651 > Project: Spark > Issue Type: Bug > Components: Build >Affects Versions: 4.0.0 >Reporter: Enrico Minack >Assignee: Enrico Minack >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Snapshots of some packages have not been published anymore, e.g. > spark-sql_2.13-4.0.0 has not been published since Sep, 13th: > https://repository.apache.org/content/groups/snapshots/org/apache/spark/spark-sql_2.13/4.0.0-SNAPSHOT/ > There have been some attempts to fix CI: SPARK-45535 SPARK-45536 > Assumption is that memory consumption during build exceeds the available > memory of the Github host. > The following could be attempted: > - enable manual trigger of the {{publish_snapshots.yml}} workflow > - enable some memory use logging to prove that exceeded memory is the root > cause > - attempt to reduce memory footprint and see impact in above logging > - revert memory use logging -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45705) Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED
[ https://issues.apache.org/jira/browse/SPARK-45705?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45705: --- Labels: pull-request-available (was: ) > Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED > - > > Key: SPARK-45705 > URL: https://issues.apache.org/jira/browse/SPARK-45705 > Project: Spark > Issue Type: Improvement > Components: Tests >Affects Versions: 4.0.0 >Reporter: Kent Yao >Priority: Major > Labels: pull-request-available > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45706) Makes entire Binder build fails fast during setting up
[ https://issues.apache.org/jira/browse/SPARK-45706?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45706: --- Labels: pull-request-available (was: ) > Makes entire Binder build fails fast during setting up > -- > > Key: SPARK-45706 > URL: https://issues.apache.org/jira/browse/SPARK-45706 > Project: Spark > Issue Type: New Feature > Components: Documentation, PySpark >Affects Versions: 3.5.0 >Reporter: Hyukjin Kwon >Priority: Major > Labels: pull-request-available > > Binder build is currently broken: > https://mybinder.org/v2/gh/apache/spark/ce5ddad9903?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb > Seems like we uploaded PySpark late into PyPI, and the installation steps > just slightly ignored the failure. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45706) Makes entire Binder build fails fast during setting up
Hyukjin Kwon created SPARK-45706: Summary: Makes entire Binder build fails fast during setting up Key: SPARK-45706 URL: https://issues.apache.org/jira/browse/SPARK-45706 Project: Spark Issue Type: New Feature Components: Documentation, PySpark Affects Versions: 3.5.0 Reporter: Hyukjin Kwon Binder build is currently broken: https://mybinder.org/v2/gh/apache/spark/ce5ddad9903?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb Seems like we uploaded PySpark late into PyPI, and the installation steps just silently ignored the failure. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45705) Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED
Kent Yao created SPARK-45705: Summary: Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED Key: SPARK-45705 URL: https://issues.apache.org/jira/browse/SPARK-45705 Project: Spark Issue Type: Improvement Components: Tests Affects Versions: 4.0.0 Reporter: Kent Yao -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45701) Clean up the deprecated API usage related to `SetOps`
[ https://issues.apache.org/jira/browse/SPARK-45701?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yang Jie updated SPARK-45701: - Description: * method - in trait SetOps is deprecated (since 2.13.0) * method – in trait SetOps is deprecated (since 2.13.0) * method + in trait SetOps is deprecated (since 2.13.0) * method retain in trait SetOps is deprecated (since 2.13.0) {code:java} [warn] /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala:70:32: method + in trait SetOps is deprecated (since 2.13.0): Consider requiring an immutable Set or fall back to Set.union [warn] Applicable -Wconf / @nowarn filters for this warning: msg=, cat=deprecation, site=org.apache.spark.storage.BlockReplicationUtils.getSampleIds.indices.$anonfun, origin=scala.collection.SetOps.+, version=2.13.0 [warn] if (set.contains(t)) set + i else set + t [warn] ^ {code} was: * method - in trait SetOps is deprecated (since 2.13.0) * method – in trait SetOps is deprecated (since 2.13.0) * method + in trait SetOps is deprecated (since 2.13.0) {code:java} [warn] /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala:70:32: method + in trait SetOps is deprecated (since 2.13.0): Consider requiring an immutable Set or fall back to Set.union [warn] Applicable -Wconf / @nowarn filters for this warning: msg=, cat=deprecation, site=org.apache.spark.storage.BlockReplicationUtils.getSampleIds.indices.$anonfun, origin=scala.collection.SetOps.+, version=2.13.0 [warn] if (set.contains(t)) set + i else set + t [warn] ^ {code} > Clean up the deprecated API usage related to `SetOps` > - > > Key: SPARK-45701 > URL: https://issues.apache.org/jira/browse/SPARK-45701 > Project: Spark > Issue Type: Sub-task > Components: Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Priority: Major > > * method - in trait SetOps is deprecated (since 2.13.0) > * method – 
in trait SetOps is deprecated (since 2.13.0) > * method + in trait SetOps is deprecated (since 2.13.0) > * method retain in trait SetOps is deprecated (since 2.13.0) > > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala:70:32: > method + in trait SetOps is deprecated (since 2.13.0): Consider requiring an > immutable Set or fall back to Set.union > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.storage.BlockReplicationUtils.getSampleIds.indices.$anonfun, > origin=scala.collection.SetOps.+, version=2.13.0 > [warn] if (set.contains(t)) set + i else set + t > [warn] ^ {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45614) Assign name to _LEGACY_ERROR_TEMP_215[6,7,8]
[ https://issues.apache.org/jira/browse/SPARK-45614?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Max Gekk resolved SPARK-45614. -- Resolution: Fixed Issue resolved by pull request 43481 [https://github.com/apache/spark/pull/43481] > Assign name to _LEGACY_ERROR_TEMP_215[6,7,8] > > > Key: SPARK-45614 > URL: https://issues.apache.org/jira/browse/SPARK-45614 > Project: Spark > Issue Type: Sub-task > Components: SQL >Affects Versions: 3.5.0 >Reporter: Deng Ziming >Assignee: Deng Ziming >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Choose a proper name for the error class *_LEGACY_ERROR_TEMP_2153* defined in > {*}core/src/main/resources/error/error-classes.json{*}. The name should be > short but complete (look at the example in error-classes.json). > Add a test which triggers the error from user code if such test still doesn't > exist. Check exception fields by using {*}checkError(){*}. The last function > checks valuable error fields only, and avoids dependencies from error text > message. In this way, tech editors can modify error format in > error-classes.json, and don't worry of Spark's internal tests. Migrate other > tests that might trigger the error onto checkError(). > If you cannot reproduce the error from user space (using SQL query), replace > the error by an internal error, see {*}SparkException.internalError(){*}. > Improve the error message format in error-classes.json if the current is not > clear. Propose a solution to users how to avoid and fix such kind of errors. > Please, look at the PR below as examples: > * [https://github.com/apache/spark/pull/38685] > * [https://github.com/apache/spark/pull/38656] > * [https://github.com/apache/spark/pull/38490] -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-40820) Creating StructType from Json
[ https://issues.apache.org/jira/browse/SPARK-40820?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon reassigned SPARK-40820: Assignee: Anthony Wainer Cachay Guivin > Creating StructType from Json > - > > Key: SPARK-40820 > URL: https://issues.apache.org/jira/browse/SPARK-40820 > Project: Spark > Issue Type: Bug > Components: PySpark, Spark Core >Affects Versions: 3.5.0 >Reporter: Anthony Wainer Cachay Guivin >Assignee: Anthony Wainer Cachay Guivin >Priority: Minor > Labels: pull-request-available > > When create a StructType from a Python dictionary you use > [StructType.fromJson|https://github.com/apache/spark/blob/master/python/pyspark/sql/types.py#L792] > or in scala > [DataType.fromJson|https://github.com/apache/spark/blob/master/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala#L158C7-L158C15] > To create a schema can be created as follows from the code below, but it > requires to put inside the json: Nullable and Metadata, this is inconsistent > because within the DataType class this by default. > {code:python} > schema = { > "name": "name", "type": "string" > } > StructField.fromJson(schema) > {code} > Python Error: > {code:python} > from pyspark.sql.types import StructField > schema = { > "name": "c1", "type": "string" > } > StructField.fromJson(schema) > >> > Traceback (most recent call last): > File "code.py", line 90, in runcode > exec(code, self.locals) > File "", line 1, in > File "pyspark/sql/types.py", line 583, in fromJson > json["nullable"], > KeyError: 'nullable' > {code} > Scala Error: > {code:scala} > val schema = """ > |{ > |"type": "struct", > |"fields": [ > |{ > |"name": "c1", > |"type": "string", > |"nullable": false > |} > |] > |} > |""".stripMargin > DataType.fromJson(schema) > >> > Failed to convert the JSON string '{"name":"c1","type":"string"}' to a field. > java.lang.IllegalArgumentException: Failed to convert the JSON string > '{"name":"c1","type":"string"}' to a field. 
> at org.apache.spark.sql.types.DataType$.parseStructField(DataType.scala:268) > at > org.apache.spark.sql.types.DataType$.$anonfun$parseDataType$1(DataType.scala:225) > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-40820) Creating StructType from Json
[ https://issues.apache.org/jira/browse/SPARK-40820?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon resolved SPARK-40820. -- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43474 [https://github.com/apache/spark/pull/43474] > Creating StructType from Json > - > > Key: SPARK-40820 > URL: https://issues.apache.org/jira/browse/SPARK-40820 > Project: Spark > Issue Type: Bug > Components: PySpark, Spark Core >Affects Versions: 3.5.0 >Reporter: Anthony Wainer Cachay Guivin >Assignee: Anthony Wainer Cachay Guivin >Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > When create a StructType from a Python dictionary you use > [StructType.fromJson|https://github.com/apache/spark/blob/master/python/pyspark/sql/types.py#L792] > or in scala > [DataType.fromJson|https://github.com/apache/spark/blob/master/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala#L158C7-L158C15] > To create a schema can be created as follows from the code below, but it > requires to put inside the json: Nullable and Metadata, this is inconsistent > because within the DataType class this by default. > {code:python} > schema = { > "name": "name", "type": "string" > } > StructField.fromJson(schema) > {code} > Python Error: > {code:python} > from pyspark.sql.types import StructField > schema = { > "name": "c1", "type": "string" > } > StructField.fromJson(schema) > >> > Traceback (most recent call last): > File "code.py", line 90, in runcode > exec(code, self.locals) > File "", line 1, in > File "pyspark/sql/types.py", line 583, in fromJson > json["nullable"], > KeyError: 'nullable' > {code} > Scala Error: > {code:scala} > val schema = """ > |{ > |"type": "struct", > |"fields": [ > |{ > |"name": "c1", > |"type": "string", > |"nullable": false > |} > |] > |} > |""".stripMargin > DataType.fromJson(schema) > >> > Failed to convert the JSON string '{"name":"c1","type":"string"}' to a field. 
> java.lang.IllegalArgumentException: Failed to convert the JSON string > '{"name":"c1","type":"string"}' to a field. > at org.apache.spark.sql.types.DataType$.parseStructField(DataType.scala:268) > at > org.apache.spark.sql.types.DataType$.$anonfun$parseDataType$1(DataType.scala:225) > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45670) SparkSubmit does not support --total-executor-cores when deploying on K8s
[ https://issues.apache.org/jira/browse/SPARK-45670?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon updated SPARK-45670: - Fix Version/s: 3.4.2 3.5.1 > SparkSubmit does not support --total-executor-cores when deploying on K8s > - > > Key: SPARK-45670 > URL: https://issues.apache.org/jira/browse/SPARK-45670 > Project: Spark > Issue Type: Bug > Components: Spark Submit >Affects Versions: 3.3.3, 3.4.1, 3.5.0 >Reporter: Cheng Pan >Assignee: Cheng Pan >Priority: Major > Labels: pull-request-available > Fix For: 3.4.2, 3.5.1, 3.3.4 > > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45670) SparkSubmit does not support --total-executor-cores when deploying on K8s
[ https://issues.apache.org/jira/browse/SPARK-45670?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon resolved SPARK-45670. -- Fix Version/s: 3.3.4 Resolution: Fixed Issue resolved by pull request 43548 [https://github.com/apache/spark/pull/43548] > SparkSubmit does not support --total-executor-cores when deploying on K8s > - > > Key: SPARK-45670 > URL: https://issues.apache.org/jira/browse/SPARK-45670 > Project: Spark > Issue Type: Bug > Components: Spark Submit >Affects Versions: 3.3.3, 3.4.1, 3.5.0 >Reporter: Cheng Pan >Assignee: Cheng Pan >Priority: Major > Labels: pull-request-available > Fix For: 3.3.4 > > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-45670) SparkSubmit does not support --total-executor-cores when deploying on K8s
[ https://issues.apache.org/jira/browse/SPARK-45670?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon reassigned SPARK-45670: Assignee: Cheng Pan > SparkSubmit does not support --total-executor-cores when deploying on K8s > - > > Key: SPARK-45670 > URL: https://issues.apache.org/jira/browse/SPARK-45670 > Project: Spark > Issue Type: Bug > Components: Spark Submit >Affects Versions: 3.3.3, 3.4.1, 3.5.0 >Reporter: Cheng Pan >Assignee: Cheng Pan >Priority: Major > Labels: pull-request-available > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45637) Time window aggregation in separate streams followed by stream-stream join not returning results
[ https://issues.apache.org/jira/browse/SPARK-45637?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Wei Liu updated SPARK-45637: Description: According to documentation update (SPARK-42591) resulting from SPARK-42376, Spark 3.5.0 should support time-window aggregations in two separate streams followed by stream-stream window join: [https://github.com/apache/spark/blob/261b281e6e57be32eb28bf4e50bea24ed22a9f21/docs/structured-streaming-programming-guide.md?plain=1#L1939-L1995] However, I failed to reproduce this example and the query I built doesn't return any results: {code:java} from pyspark.sql.functions import rand from pyspark.sql.functions import expr, window, window_time spark.conf.set("spark.sql.shuffle.partitions", "1") impressions = ( spark .readStream.format("rate").option("rowsPerSecond", "5").option("numPartitions", "1").load() .selectExpr("value AS adId", "timestamp AS impressionTime") ) impressionsWithWatermark = impressions \ .selectExpr("adId AS impressionAdId", "impressionTime") \ .withWatermark("impressionTime", "10 seconds") clicks = ( spark .readStream.format("rate").option("rowsPerSecond", "5").option("numPartitions", "1").load() .where((rand() * 100).cast("integer") < 10) # 10 out of every 100 impressions result in a click .selectExpr("(value - 10) AS adId ", "timestamp AS clickTime") # -10 so that a click with same id as impression is generated later (i.e. delayed data). 
.where("adId > 0") ) clicksWithWatermark = clicks \ .selectExpr("adId AS clickAdId", "clickTime") \ .withWatermark("clickTime", "10 seconds") clicksWindow = clicksWithWatermark.groupBy( window(clicksWithWatermark.clickTime, "1 minute") ).count() impressionsWindow = impressionsWithWatermark.groupBy( window(impressionsWithWatermark.impressionTime, "1 minute") ).count() clicksAndImpressions = clicksWindow.join(impressionsWindow, "window", "inner") clicksAndImpressions.writeStream \ .format("memory") \ .queryName("clicksAndImpressions") \ .outputMode("append") \ .start() {code} My intuition is that I'm getting no results because to output results of the first stateful operator (time window aggregation), a watermark needs to pass the end timestamp of the window. And once the watermark is after the end timestamp of the window, this window is ignored at the second stateful operator (stream-stream) join because it's behind the watermark. Indeed, a small hack done to event time column (adding one minute) between two stateful operators makes it possible to get results: {code:java} clicksWindow2 = clicksWithWatermark.groupBy( window(clicksWithWatermark.clickTime, "1 minute") ).count().withColumn("window_time", window_time("window") + expr('INTERVAL 1 MINUTE')).drop("window") impressionsWindow2 = impressionsWithWatermark.groupBy( window(impressionsWithWatermark.impressionTime, "1 minute") ).count().withColumn("window_time", window_time("window") + expr('INTERVAL 1 MINUTE')).drop("window") clicksAndImpressions2 = clicksWindow2.join(impressionsWindow2, "window_time", "inner") clicksAndImpressions2.writeStream \ .format("memory") \ .queryName("clicksAndImpressions2") \ .outputMode("append") \ .start() {code} was: According to documentation update (SPARK-42591) resulting from SPARK-42376, Spark 3.5.0 should support time-window aggregations in two separate streams followed by stream-stream window join: 
https://github.com/apache/spark/blob/261b281e6e57be32eb28bf4e50bea24ed22a9f21/docs/structured-streaming-programming-guide.md?plain=1#L1939-L1995 However, I failed to reproduce this example and the query I built doesn't return any results: {code:java} from pyspark.sql.functions import rand from pyspark.sql.functions import expr, window, window_time spark.conf.set("spark.sql.shuffle.partitions", "1") impressions = ( spark .readStream.format("rate").option("rowsPerSecond", "5").option("numPartitions", "1").load() .selectExpr("value AS adId", "timestamp AS impressionTime") ) impressionsWithWatermark = impressions \ .selectExpr("adId AS impressionAdId", "impressionTime") \ .withWatermark("impressionTime", "10 seconds") clicks = ( spark .readStream.format("rate").option("rowsPerSecond", "5").option("numPartitions", "1").load() .where((rand() * 100).cast("integer") < 10) # 10 out of every 100 impressions result in a click .selectExpr("(value - 10) AS adId ", "timestamp AS clickTime") # -10 so that a click with same id as impression is generated later (i.e. delayed data). .where("adId > 0") ) clicksWithWatermark = clicks \ .selectExpr("adId AS clickAdId", "clickTime") \ .withWatermark("clickTime", "10 seconds") clicksWindow = clicksWithWatermark.groupBy( window(clicksWithWatermark.clickTime, "1 minute") ).count() impressionsWindow