[jira] [Updated] (SPARK-45714) Spark UI throws 500 error when StructuredStreaming query filter is selected
[ https://issues.apache.org/jira/browse/SPARK-45714?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Satyam Raj updated SPARK-45714: --- Description: h2. When I'm trying to do the following inside the listener class, and go to the UI to check the streaming query tab and click on Sort by Latest Batch, I get the below error. {code:java} class MyListener extends StreamingQueryListener { private val kafkaAdminClient = try { Some(Admin.create(getKafkaProp())) } catch { case _: Throwable => None } private val registeredConsumerGroups = kafkaAdminClient.get.listConsumerGroups().all().get() override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { val eventName = event.progress.name commitConsumerOffset(event.progress.sources.head.endOffset, eventName) } private def getKafkaProp(eventName: String = "dummy-admin"): Properties = { val props: Properties = new Properties() props.put("bootstrap.servers", Configuration.configurationMap("kafka.broker")) props.put("group.id", eventName) props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer") props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer") props.put("security.protocol", Configuration.configurationMap("kafka.security.protocol")) props.put("sasl.mechanism", Configuration.configurationMap("kafka.sasl.mechanism")) props.put("sasl.jaas.config", Configuration.configurationMap("kafka.sasl.jaas.config")) props } private def getTopicPartitionMap(topic: String, jsonOffsets: Map[String, Map[String, Long]]) = { val offsets = jsonOffsets.head._2 val topicPartitionMap = new util.HashMap[TopicPartition, OffsetAndMetadata]() offsets.keys.foreach(partition => { val tp = new TopicPartition(topic, partition.toInt) val oam = new OffsetAndMetadata(offsets(partition).asInstanceOf[Number].longValue()) topicPartitionMap.put(tp, oam) }) topicPartitionMap } private def 
commitConsumerOffset(endOffset: String, eventName: String) = { val jsonOffsets = mapper.readValue(endOffset, classOf[Map[String, Map[String, Long]]]) val topicPartitionMap = getTopicPartitionMap(eventName, jsonOffsets) if (!registeredConsumerGroups.contains(eventName)) { print("topic consumer not created! creating") val kafkaConsumer = new KafkaConsumer[String, String](getKafkaProp(eventName)) kafkaConsumer.commitSync(topicPartitionMap) kafkaConsumer.close() } else { print("committing cg offsets using admin") kafkaAdminClient.get.alterConsumerGroupOffsets(eventName, topicPartitionMap).all().get() } } }{code} h2. HTTP ERROR 500 java.lang.NullPointerException ||URI:|/StreamingQuery/active| ||STATUS:|500| ||MESSAGE:|java.lang.NullPointerException| ||SERVLET:|org.apache.spark.ui.JettyUtils$$anon$1-522a82dd| ||CAUSED BY:|java.lang.NullPointerException| h3. Caused by: {code:java} java.lang.NullPointerException at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9(StreamingQueryPage.scala:258) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9$adapted(StreamingQueryPage.scala:258) at scala.math.Ordering$$anon$5.compare(Ordering.scala:253) at java.base/java.util.TimSort.binarySort(TimSort.java:296) at java.base/java.util.TimSort.sort(TimSort.java:239) at java.base/java.util.Arrays.sort(Arrays.java:1441) at scala.collection.SeqLike.sorted(SeqLike.scala:659) at scala.collection.SeqLike.sorted$(SeqLike.scala:647) at scala.collection.AbstractSeq.sorted(Seq.scala:45) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.(StreamingQueryPage.scala:223) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.dataSource(StreamingQueryPage.scala:149) at org.apache.spark.ui.PagedTable.table(PagedTable.scala:101) at org.apache.spark.ui.PagedTable.table$(PagedTable.scala:100) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.table(StreamingQueryPage.scala:114) at 
org.apache.spark.sql.streaming.ui.StreamingQueryPage.queryTable(StreamingQueryPage.scala:101) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.generateStreamingQueryTable(StreamingQueryPage.scala:60) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.render(StreamingQueryPage.scala:38) at org.apache.spark.ui.WebUI.$anonfun$attachPage$1(WebUI.scala:90) at org.apache.spark.ui.JettyUtils$$anon$1.doGet(JettyUtils.scala:81) at javax.servlet.http.HttpServlet.service(HttpServlet.java:503) at javax.servlet.http.HttpServlet.service(HttpServlet.java:590) at org.sparkproject.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) at org.sparkproject.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626) at
[jira] [Updated] (SPARK-45714) Spark UI throws 500 error when StructuredStreaming query filter is selected
[ https://issues.apache.org/jira/browse/SPARK-45714?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Satyam Raj updated SPARK-45714: --- Description: h2. HTTP ERROR 500 java.lang.NullPointerException ||URI:|/StreamingQuery/active| ||STATUS:|500| ||MESSAGE:|java.lang.NullPointerException| ||SERVLET:|org.apache.spark.ui.JettyUtils$$anon$1-522a82dd| ||CAUSED BY:|java.lang.NullPointerException| h3. Caused by: {code:java} java.lang.NullPointerException at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9(StreamingQueryPage.scala:258) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9$adapted(StreamingQueryPage.scala:258) at scala.math.Ordering$$anon$5.compare(Ordering.scala:253) at java.base/java.util.TimSort.binarySort(TimSort.java:296) at java.base/java.util.TimSort.sort(TimSort.java:239) at java.base/java.util.Arrays.sort(Arrays.java:1441) at scala.collection.SeqLike.sorted(SeqLike.scala:659) at scala.collection.SeqLike.sorted$(SeqLike.scala:647) at scala.collection.AbstractSeq.sorted(Seq.scala:45) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.(StreamingQueryPage.scala:223) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.dataSource(StreamingQueryPage.scala:149) at org.apache.spark.ui.PagedTable.table(PagedTable.scala:101) at org.apache.spark.ui.PagedTable.table$(PagedTable.scala:100) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.table(StreamingQueryPage.scala:114) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.queryTable(StreamingQueryPage.scala:101) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.generateStreamingQueryTable(StreamingQueryPage.scala:60) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.render(StreamingQueryPage.scala:38) at org.apache.spark.ui.WebUI.$anonfun$attachPage$1(WebUI.scala:90) at org.apache.spark.ui.JettyUtils$$anon$1.doGet(JettyUtils.scala:81) at 
javax.servlet.http.HttpServlet.service(HttpServlet.java:503) at javax.servlet.http.HttpServlet.service(HttpServlet.java:590) at org.sparkproject.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) at org.sparkproject.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626) at org.apache.spark.ui.HttpSecurityFilter.doFilter(HttpSecurityFilter.scala:95) at org.sparkproject.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.sparkproject.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601) at org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.doFilter(AmIpFilter.java:185) at org.sparkproject.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.sparkproject.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601) at org.sparkproject.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:548) at org.sparkproject.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) at org.sparkproject.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1434) at org.sparkproject.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) at org.sparkproject.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501) at org.sparkproject.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) at org.sparkproject.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1349) at org.sparkproject.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) at org.sparkproject.jetty.server.handler.gzip.GzipHandler.handle(GzipHandler.java:763) at org.sparkproject.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234) at org.sparkproject.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) at org.sparkproject.jetty.server.Server.handle(Server.java:516) at org.sparkproject.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388) at 
org.sparkproject.jetty.server.HttpChannel.dispatch(HttpChannel.java:633) at org.sparkproject.jetty.server.HttpChannel.handle(HttpChannel.java:380) at org.sparkproject.jetty.server.HttpConnection.onFillable(HttpConnection.java:277) at org.sparkproject.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) at org.sparkproject.jetty.io.FillInterest.fillable(FillInterest.java:105) at org.sparkproject.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131) at org.sparkproject.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:386) at
[jira] [Created] (SPARK-45714) Spark UI throws 500 error when StructuredStreaming query filter is selected
Satyam Raj created SPARK-45714: -- Summary: Spark UI throws 500 error when StructuredStreaming query filter is selected Key: SPARK-45714 URL: https://issues.apache.org/jira/browse/SPARK-45714 Project: Spark Issue Type: Bug Components: Java API Affects Versions: 3.4.0 Reporter: Satyam Raj ``` h2. HTTP ERROR 500 java.lang.NullPointerException ||URI:|/StreamingQuery/active| ||STATUS:|500| ||MESSAGE:|java.lang.NullPointerException| ||SERVLET:|org.apache.spark.ui.JettyUtils$$anon$1-522a82dd| ||CAUSED BY:|java.lang.NullPointerException| h3. Caused by: {code:java} java.lang.NullPointerException at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9(StreamingQueryPage.scala:258) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.$anonfun$ordering$9$adapted(StreamingQueryPage.scala:258) at scala.math.Ordering$$anon$5.compare(Ordering.scala:253) at java.base/java.util.TimSort.binarySort(TimSort.java:296) at java.base/java.util.TimSort.sort(TimSort.java:239) at java.base/java.util.Arrays.sort(Arrays.java:1441) at scala.collection.SeqLike.sorted(SeqLike.scala:659) at scala.collection.SeqLike.sorted$(SeqLike.scala:647) at scala.collection.AbstractSeq.sorted(Seq.scala:45) at org.apache.spark.sql.streaming.ui.StreamingQueryDataSource.(StreamingQueryPage.scala:223) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.dataSource(StreamingQueryPage.scala:149) at org.apache.spark.ui.PagedTable.table(PagedTable.scala:101) at org.apache.spark.ui.PagedTable.table$(PagedTable.scala:100) at org.apache.spark.sql.streaming.ui.StreamingQueryPagedTable.table(StreamingQueryPage.scala:114) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.queryTable(StreamingQueryPage.scala:101) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.generateStreamingQueryTable(StreamingQueryPage.scala:60) at org.apache.spark.sql.streaming.ui.StreamingQueryPage.render(StreamingQueryPage.scala:38) at 
org.apache.spark.ui.WebUI.$anonfun$attachPage$1(WebUI.scala:90) at org.apache.spark.ui.JettyUtils$$anon$1.doGet(JettyUtils.scala:81) at javax.servlet.http.HttpServlet.service(HttpServlet.java:503) at javax.servlet.http.HttpServlet.service(HttpServlet.java:590) at org.sparkproject.jetty.servlet.ServletHolder.handle(ServletHolder.java:799) at org.sparkproject.jetty.servlet.ServletHandler$ChainEnd.doFilter(ServletHandler.java:1626) at org.apache.spark.ui.HttpSecurityFilter.doFilter(HttpSecurityFilter.scala:95) at org.sparkproject.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.sparkproject.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601) at org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.doFilter(AmIpFilter.java:185) at org.sparkproject.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193) at org.sparkproject.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1601) at org.sparkproject.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:548) at org.sparkproject.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) at org.sparkproject.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1434) at org.sparkproject.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) at org.sparkproject.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501) at org.sparkproject.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) at org.sparkproject.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1349) at org.sparkproject.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) at org.sparkproject.jetty.server.handler.gzip.GzipHandler.handle(GzipHandler.java:763) at org.sparkproject.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234) at org.sparkproject.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) at org.sparkproject.jetty.server.Server.handle(Server.java:516) at 
org.sparkproject.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388) at org.sparkproject.jetty.server.HttpChannel.dispatch(HttpChannel.java:633) at org.sparkproject.jetty.server.HttpChannel.handle(HttpChannel.java:380) at org.sparkproject.jetty.server.HttpConnection.onFillable(HttpConnection.java:277) at org.sparkproject.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) at org.sparkproject.jetty.io.FillInterest.fillable(FillInterest.java:105) at org.sparkproject.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315) at org.sparkproject.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173) at
[jira] [Updated] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] L. C. Hsieh updated SPARK-45678: Fix Version/s: 3.4.2 3.5.1 > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Bug > Components: Spark Core >Affects Versions: 3.4.1, 3.5.0, 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > Fix For: 3.4.2, 4.0.0, 3.5.1 > > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] L. C. Hsieh updated SPARK-45678: Issue Type: Bug (was: Improvement) > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Bug > Components: Spark Core >Affects Versions: 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] L. C. Hsieh updated SPARK-45678: Affects Version/s: 3.5.0 3.4.1 > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Bug > Components: Spark Core >Affects Versions: 3.4.1, 3.5.0, 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Chao Sun reassigned SPARK-45678: Assignee: L. C. Hsieh > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Improvement > Components: Spark Core >Affects Versions: 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45678) Cover BufferReleasingInputStream.available under tryOrFetchFailedException
[ https://issues.apache.org/jira/browse/SPARK-45678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Chao Sun resolved SPARK-45678. -- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43543 [https://github.com/apache/spark/pull/43543] > Cover BufferReleasingInputStream.available under tryOrFetchFailedException > -- > > Key: SPARK-45678 > URL: https://issues.apache.org/jira/browse/SPARK-45678 > Project: Spark > Issue Type: Improvement > Components: Spark Core >Affects Versions: 4.0.0 >Reporter: L. C. Hsieh >Assignee: L. C. Hsieh >Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > We have encountered shuffle data corruption issue: > ``` > Caused by: java.io.IOException: FAILED_TO_UNCOMPRESS(5) > at org.xerial.snappy.SnappyNative.throw_error(SnappyNative.java:112) > at org.xerial.snappy.SnappyNative.rawUncompress(Native Method) > at org.xerial.snappy.Snappy.rawUncompress(Snappy.java:504) > at org.xerial.snappy.Snappy.uncompress(Snappy.java:543) > at > org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:450) > at > org.xerial.snappy.SnappyInputStream.available(SnappyInputStream.java:497) > at > org.apache.spark.storage.BufferReleasingInputStream.available(ShuffleBlockFetcherIterator.scala:1356) > ``` > Spark shuffle has capacity to detect corruption for a few stream op like > `read` and `skip`, such `IOException` in the stack trace will be rethrown as > `FetchFailedException` that will re-try the failed shuffle task. But in the > stack trace it is `available` that is not covered by the mechanism. So > no-retry has been happened and the Spark application just failed. > As the `available` op will also involve data decompression, we should be able > to check it like `read` and `skip` do. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45705) Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED
[ https://issues.apache.org/jira/browse/SPARK-45705?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] XiDuo You resolved SPARK-45705. --- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43554 [https://github.com/apache/spark/pull/43554] > Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED > - > > Key: SPARK-45705 > URL: https://issues.apache.org/jira/browse/SPARK-45705 > Project: Spark > Issue Type: Improvement > Components: Tests >Affects Versions: 4.0.0 >Reporter: Kent Yao >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45713) Support registering Python data sources
[ https://issues.apache.org/jira/browse/SPARK-45713?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45713: --- Labels: pull-request-available (was: ) > Support registering Python data sources > --- > > Key: SPARK-45713 > URL: https://issues.apache.org/jira/browse/SPARK-45713 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > Labels: pull-request-available > > Support registering Python data sources. > Users can register a Python data source and later use reference it using its > name. > {code:java} > class MyDataSource(DataSource): > @classmethod > def name(cls): > return "my-data-source" > spark.dataSource.register(MyDataSource){code} > Users can then use the name of the data source as the format (will be > supported in SPARK-45639) > {code:java} > spark.read.format("my-data-source").load(){code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45713) Support registering Python data sources
[ https://issues.apache.org/jira/browse/SPARK-45713?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Allison Wang updated SPARK-45713: - Description: Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format (will be supported in SPARK-45639) {code:java} spark.read.format("my-data-source").load(){code} was: Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format SPARK-45639 {code:java} spark.read.format("my-data-source").load(){code} > Support registering Python data sources > --- > > Key: SPARK-45713 > URL: https://issues.apache.org/jira/browse/SPARK-45713 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > > Support registering Python data sources. > Users can register a Python data source and later use reference it using its > name. > {code:java} > class MyDataSource(DataSource): > @classmethod > def name(cls): > return "my-data-source" > spark.dataSource.register(MyDataSource){code} > Users can then use the name of the data source as the format (will be > supported in SPARK-45639) > {code:java} > spark.read.format("my-data-source").load(){code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45713) Support registering Python data sources
[ https://issues.apache.org/jira/browse/SPARK-45713?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Allison Wang updated SPARK-45713: - Description: Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format SPARK-45639 {code:java} spark.read.format("my-data-source").load(){code} was: Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format {code:java} spark.read.format("my-data-source").load(){code} > Support registering Python data sources > --- > > Key: SPARK-45713 > URL: https://issues.apache.org/jira/browse/SPARK-45713 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > > Support registering Python data sources. > Users can register a Python data source and later use reference it using its > name. > {code:java} > class MyDataSource(DataSource): > @classmethod > def name(cls): > return "my-data-source" > spark.dataSource.register(MyDataSource){code} > Users can then use the name of the data source as the format SPARK-45639 > {code:java} > spark.read.format("my-data-source").load(){code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45713) Support registering Python data sources
Allison Wang created SPARK-45713: Summary: Support registering Python data sources Key: SPARK-45713 URL: https://issues.apache.org/jira/browse/SPARK-45713 Project: Spark Issue Type: Sub-task Components: PySpark Affects Versions: 4.0.0 Reporter: Allison Wang Support registering Python data sources. Users can register a Python data source and later use reference it using its name. {code:java} class MyDataSource(DataSource): @classmethod def name(cls): return "my-data-source" spark.dataSource.register(MyDataSource){code} Users can then use the name of the data source as the format {code:java} spark.read.format("my-data-source").load(){code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45639) Support loading Python data sources in DataFrameReader
[ https://issues.apache.org/jira/browse/SPARK-45639?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Allison Wang updated SPARK-45639: - Description: Allow users to read from a Python data source using `spark.read.format(...).load()` in PySpark. For example Users can extend the DataSource and the DataSourceReader classes to create their own Python data source reader and use them in PySpark: {code:java} class MyReader(DataSourceReader): def read(self, partition): yield (0, 1) class MyDataSource(DataSource): def schema(self): return "id INT, value INT" def reader(self, schema): return MyReader() df = spark.read.format("MyDataSource").load() df.show() +---+-+ | id|value| +---+-+ | 0| 1| +---+-+ {code} was: Allow users to read from a Python data source using `spark.read.format(...).load()` For example Users can extend the DataSource and the DataSourceReader classes to create their own Python data source reader and use them in PySpark: {code:java} class MyReader(DataSourceReader): def read(self, partition): yield (0, 1) class MyDataSource(DataSource): def schema(self): return "id INT, value INT" def reader(self, schema): return MyReader() df = spark.read.format("MyDataSource").load() df.show() +---+-+ | id|value| +---+-+ | 0| 1| +---+-+ {code} > Support loading Python data sources in DataFrameReader > -- > > Key: SPARK-45639 > URL: https://issues.apache.org/jira/browse/SPARK-45639 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > > Allow users to read from a Python data source using > `spark.read.format(...).load()` in PySpark. 
For example > Users can extend the DataSource and the DataSourceReader classes to create > their own Python data source reader and use them in PySpark: > {code:java} > class MyReader(DataSourceReader): > def read(self, partition): > yield (0, 1) > class MyDataSource(DataSource): > def schema(self): > return "id INT, value INT" > > def reader(self, schema): > return MyReader() > df = spark.read.format("MyDataSource").load() > df.show() > +---+-+ > | id|value| > +---+-+ > | 0| 1| > +---+-+ > {code} > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45639) Support loading Python data sources in DataFrameReader
[ https://issues.apache.org/jira/browse/SPARK-45639?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Allison Wang updated SPARK-45639: - Summary: Support loading Python data sources in DataFrameReader (was: Support Python data source in DataFrameReader) > Support loading Python data sources in DataFrameReader > -- > > Key: SPARK-45639 > URL: https://issues.apache.org/jira/browse/SPARK-45639 > Project: Spark > Issue Type: Sub-task > Components: PySpark >Affects Versions: 4.0.0 >Reporter: Allison Wang >Priority: Major > > Allow users to read from a Python data source using > `spark.read.format(...).load()` > For example > Users can extend the DataSource and the DataSourceReader classes to create > their own Python data source reader and use them in PySpark: > {code:java} > class MyReader(DataSourceReader): > def read(self, partition): > yield (0, 1) > class MyDataSource(DataSource): > def schema(self): > return "id INT, value INT" > def reader(self, schema): > return MyReader() > df = spark.read.format("MyDataSource").load() > df.show() > +---+-+ > | id|value| > +---+-+ > | 0| 1| > +---+-+ > {code} > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45712) Provide a command line flag to override the log4j properties file
Holden Karau created SPARK-45712: Summary: Provide a command line flag to override the log4j properties file Key: SPARK-45712 URL: https://issues.apache.org/jira/browse/SPARK-45712 Project: Spark Issue Type: Improvement Components: Spark Core Affects Versions: 4.0.0 Reporter: Holden Karau Overriding log4j properties is kind of annoying and depends on putting a file in the right place; we should let users specify which log4j properties file to use in spark-submit and friends. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45684) Clean up the deprecated API usage related to `SeqOps`
[ https://issues.apache.org/jira/browse/SPARK-45684?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yang Jie updated SPARK-45684: - Description: * method transform in trait SeqOps is deprecated (since 2.13.0) * method reverseMap in trait SeqOps is deprecated (since 2.13.0) * method union in trait SeqOps is deprecated (since 2.13.0) {code:java} [warn] /Users/yangjie01/SourceCode/git/spark-mine-sbt/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala:675:15: method transform in trait SeqOps is deprecated (since 2.13.0): Use `mapInPlace` on an `IndexedSeq` instead [warn] Applicable -Wconf / @nowarn filters for this warning: msg=, cat=deprecation, site=org.apache.spark.ml.classification.LogisticRegression.train.$anonfun, origin=scala.collection.mutable.SeqOps.transform, version=2.13.0 [warn] centers.transform(_ / numCoefficientSets) [warn] ^ {code} was: * method transform in trait SeqOps is deprecated (since 2.13.0) * method reverseMap in trait SeqOps is deprecated (since 2.13.0) * method retain in trait SetOps is deprecated (since 2.13.0) * method union in trait SeqOps is deprecated (since 2.13.0) {code:java} [warn] /Users/yangjie01/SourceCode/git/spark-mine-sbt/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala:675:15: method transform in trait SeqOps is deprecated (since 2.13.0): Use `mapInPlace` on an `IndexedSeq` instead [warn] Applicable -Wconf / @nowarn filters for this warning: msg=, cat=deprecation, site=org.apache.spark.ml.classification.LogisticRegression.train.$anonfun, origin=scala.collection.mutable.SeqOps.transform, version=2.13.0 [warn] centers.transform(_ / numCoefficientSets) [warn] ^ {code} > Clean up the deprecated API usage related to `SeqOps` > - > > Key: SPARK-45684 > URL: https://issues.apache.org/jira/browse/SPARK-45684 > Project: Spark > Issue Type: Sub-task > Components: Build, Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Priority: Major > > * 
method transform in trait SeqOps is deprecated (since 2.13.0) > * method reverseMap in trait SeqOps is deprecated (since 2.13.0) > * method union in trait SeqOps is deprecated (since 2.13.0) > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala:675:15: > method transform in trait SeqOps is deprecated (since 2.13.0): Use > `mapInPlace` on an `IndexedSeq` instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.ml.classification.LogisticRegression.train.$anonfun, > origin=scala.collection.mutable.SeqOps.transform, version=2.13.0 > [warn] centers.transform(_ / numCoefficientSets) > [warn] ^ {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45685) Use `LazyList` instead of `Stream`
[ https://issues.apache.org/jira/browse/SPARK-45685?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45685: --- Labels: pull-request-available (was: ) > Use `LazyList` instead of `Stream` > -- > > Key: SPARK-45685 > URL: https://issues.apache.org/jira/browse/SPARK-45685 > Project: Spark > Issue Type: Sub-task > Components: Build, Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Priority: Major > Labels: pull-request-available > > * class Stream in package immutable is deprecated (since 2.13.0) > * object Stream in package immutable is deprecated (since 2.13.0) > * type Stream in package scala is deprecated (since 2.13.0) > * value Stream in package scala is deprecated (since 2.13.0) > * method append in class Stream is deprecated (since 2.13.0) > * method toStream in trait IterableOnceOps is deprecated (since 2.13.0) > > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/sql/core/src/test/scala/org/apache/spark/sql/GenTPCDSData.scala:49:20: > class Stream in package immutable is deprecated (since 2.13.0): Use LazyList > (which is fully lazy) instead of Stream (which has a lazy tail only) > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.sql.BlockingLineStream.BlockingStreamed.stream, > origin=scala.collection.immutable.Stream, version=2.13.0 > [warn] val stream: () => Stream[T]) > [warn] ^ {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45649) Unify the prepare framework for `OffsetWindowFunctionFrame`
[ https://issues.apache.org/jira/browse/SPARK-45649?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Wenchen Fan resolved SPARK-45649. - Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43507 [https://github.com/apache/spark/pull/43507] > Unify the prepare framework for `OffsetWindowFunctionFrame` > --- > > Key: SPARK-45649 > URL: https://issues.apache.org/jira/browse/SPARK-45649 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Currently, the implementation the `prepare` of all the > `OffsetWindowFunctionFrame` have the same code logic show below. > ``` > override def prepare(rows: ExternalAppendOnlyUnsafeRowArray): Unit = { > if (offset > rows.length) { > fillDefaultValue(EmptyRow) > } else { > resetStates(rows) > if (ignoreNulls) { > ... > } else { > ... > } > } > } > ``` -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45711) Introduce a mapper for avro compression codecs
[ https://issues.apache.org/jira/browse/SPARK-45711?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45711: --- Labels: pull-request-available (was: ) > Introduce a mapper for avro compression codecs > -- > > Key: SPARK-45711 > URL: https://issues.apache.org/jira/browse/SPARK-45711 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > Labels: pull-request-available > > Currently, Spark supported all the avro compression codecs, but the avro > supported compression codecs and spark supported are not completely > one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. > There are a lot of magic strings copy from avro compression codecs. This > issue lead to developers need to manually maintain its consistency. It is > easy to make mistakes and reduce development efficiency. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45711) Introduce a mapper for avro compression codecs
[ https://issues.apache.org/jira/browse/SPARK-45711?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Jiaan Geng updated SPARK-45711: --- Description: Currently, Spark supported all the avro compression codecs, but the avro supported compression codecs and spark supported are not completely one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. There are a lot of magic strings copy from avro compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. was: Currently, Spark supported all the avro compression codecs, but the avro supported compression codecs and spark supported are not completely one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. There are a lot of magic strings copy from orc compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. > Introduce a mapper for avro compression codecs > -- > > Key: SPARK-45711 > URL: https://issues.apache.org/jira/browse/SPARK-45711 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > > Currently, Spark supported all the avro compression codecs, but the avro > supported compression codecs and spark supported are not completely > one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. > There are a lot of magic strings copy from avro compression codecs. This > issue lead to developers need to manually maintain its consistency. It is > easy to make mistakes and reduce development efficiency. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45711) Introduce a mapper for avro compression codecs
[ https://issues.apache.org/jira/browse/SPARK-45711?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Jiaan Geng updated SPARK-45711: --- Description: Currently, Spark supported all the avro compression codecs, but the avro supported compression codecs and spark supported are not completely one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. There are a lot of magic strings copy from orc compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. was: Currently, Spark supported all the orc compression codecs, but the orc supported compression codecs and spark supported are not completely one-on-one due to Spark introduce two compression codecs none and UNCOMPRESSED. There are a lot of magic strings copy from orc compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. > Introduce a mapper for avro compression codecs > -- > > Key: SPARK-45711 > URL: https://issues.apache.org/jira/browse/SPARK-45711 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > > Currently, Spark supported all the avro compression codecs, but the avro > supported compression codecs and spark supported are not completely > one-on-one due to Spark introduce the compression codecs UNCOMPRESSED. > There are a lot of magic strings copy from orc compression codecs. This issue > lead to developers need to manually maintain its consistency. It is easy to > make mistakes and reduce development efficiency. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45711) Introduce a mapper for avro compression codecs
[ https://issues.apache.org/jira/browse/SPARK-45711?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Jiaan Geng updated SPARK-45711: --- Description: Currently, Spark supported all the orc compression codecs, but the orc supported compression codecs and spark supported are not completely one-on-one due to Spark introduce two compression codecs none and UNCOMPRESSED. There are a lot of magic strings copy from orc compression codecs. This issue lead to developers need to manually maintain its consistency. It is easy to make mistakes and reduce development efficiency. > Introduce a mapper for avro compression codecs > -- > > Key: SPARK-45711 > URL: https://issues.apache.org/jira/browse/SPARK-45711 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Jiaan Geng >Assignee: Jiaan Geng >Priority: Major > > Currently, Spark supported all the orc compression codecs, but the orc > supported compression codecs and spark supported are not completely > one-on-one due to Spark introduce two compression codecs none and > UNCOMPRESSED. > There are a lot of magic strings copy from orc compression codecs. This issue > lead to developers need to manually maintain its consistency. It is easy to > make mistakes and reduce development efficiency. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45706) Makes entire Binder build fails fast during setting up
[ https://issues.apache.org/jira/browse/SPARK-45706?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon resolved SPARK-45706. -- Fix Version/s: 4.0.0 3.5.1 Assignee: Hyukjin Kwon Resolution: Fixed Fixed in https://github.com/apache/spark/pull/43553 > Makes entire Binder build fails fast during setting up > -- > > Key: SPARK-45706 > URL: https://issues.apache.org/jira/browse/SPARK-45706 > Project: Spark > Issue Type: New Feature > Components: Documentation, PySpark >Affects Versions: 3.5.0 >Reporter: Hyukjin Kwon >Assignee: Hyukjin Kwon >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0, 3.5.1 > > > Binder build is currently broken: > https://mybinder.org/v2/gh/apache/spark/ce5ddad9903?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb > Seems like we uploaded PySpark late into PyPI, and the installation steps > just slightly ignored the failure. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45710) Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62]
[ https://issues.apache.org/jira/browse/SPARK-45710?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Deng Ziming updated SPARK-45710: Description: Choose a proper name for the error class {*}_LEGACY_ERROR_TEMP_21[59,60,61,62]{*}defined in {*}core/src/main/resources/error/error-classes.json{*}. The name should be short but complete (look at the example in error-classes.json). Add a test which triggers the error from user code if such test still doesn't exist. Check exception fields by using {*}checkError(){*}. The last function checks valuable error fields only, and avoids dependencies from error text message. In this way, tech editors can modify error format in error-classes.json, and don't worry of Spark's internal tests. Migrate other tests that might trigger the error onto checkError(). If you cannot reproduce the error from user space (using SQL query), replace the error by an internal error, see {*}SparkException.internalError(){*}. Improve the error message format in error-classes.json if the current is not clear. Propose a solution to users how to avoid and fix such kind of errors. Please, look at the PR below as examples: * [https://github.com/apache/spark/pull/38685] * [https://github.com/apache/spark/pull/38656] * [https://github.com/apache/spark/pull/38490] was: Choose a proper name for the error class *_LEGACY_ERROR_TEMP_2153* defined in {*}core/src/main/resources/error/error-classes.json{*}. The name should be short but complete (look at the example in error-classes.json). Add a test which triggers the error from user code if such test still doesn't exist. Check exception fields by using {*}checkError(){*}. The last function checks valuable error fields only, and avoids dependencies from error text message. In this way, tech editors can modify error format in error-classes.json, and don't worry of Spark's internal tests. Migrate other tests that might trigger the error onto checkError(). 
If you cannot reproduce the error from user space (using SQL query), replace the error by an internal error, see {*}SparkException.internalError(){*}. Improve the error message format in error-classes.json if the current is not clear. Propose a solution to users how to avoid and fix such kind of errors. Please, look at the PR below as examples: * [https://github.com/apache/spark/pull/38685] * [https://github.com/apache/spark/pull/38656] * [https://github.com/apache/spark/pull/38490] > Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62] > - > > Key: SPARK-45710 > URL: https://issues.apache.org/jira/browse/SPARK-45710 > Project: Spark > Issue Type: Sub-task > Components: SQL >Affects Versions: 3.5.0 >Reporter: Deng Ziming >Assignee: Deng Ziming >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Choose a proper name for the error class > {*}_LEGACY_ERROR_TEMP_21[59,60,61,62]{*}defined in > {*}core/src/main/resources/error/error-classes.json{*}. The name should be > short but complete (look at the example in error-classes.json). > Add a test which triggers the error from user code if such test still doesn't > exist. Check exception fields by using {*}checkError(){*}. The last function > checks valuable error fields only, and avoids dependencies from error text > message. In this way, tech editors can modify error format in > error-classes.json, and don't worry of Spark's internal tests. Migrate other > tests that might trigger the error onto checkError(). > If you cannot reproduce the error from user space (using SQL query), replace > the error by an internal error, see {*}SparkException.internalError(){*}. > Improve the error message format in error-classes.json if the current is not > clear. Propose a solution to users how to avoid and fix such kind of errors. 
> Please, look at the PR below as examples: > * [https://github.com/apache/spark/pull/38685] > * [https://github.com/apache/spark/pull/38656] > * [https://github.com/apache/spark/pull/38490] -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45710) Assign name to _LEGACY_ERROR_TEMP__LEGACY_ERROR_TEMP_21[59,60,61,62]
Deng Ziming created SPARK-45710: --- Summary: Assign name to _LEGACY_ERROR_TEMP__LEGACY_ERROR_TEMP_21[59,60,61,62] Key: SPARK-45710 URL: https://issues.apache.org/jira/browse/SPARK-45710 Project: Spark Issue Type: Sub-task Components: SQL Affects Versions: 3.5.0 Reporter: Deng Ziming Assignee: Deng Ziming Fix For: 4.0.0 Choose a proper name for the error class *_LEGACY_ERROR_TEMP_2153* defined in {*}core/src/main/resources/error/error-classes.json{*}. The name should be short but complete (look at the example in error-classes.json). Add a test which triggers the error from user code if such test still doesn't exist. Check exception fields by using {*}checkError(){*}. The last function checks valuable error fields only, and avoids dependencies from error text message. In this way, tech editors can modify error format in error-classes.json, and don't worry of Spark's internal tests. Migrate other tests that might trigger the error onto checkError(). If you cannot reproduce the error from user space (using SQL query), replace the error by an internal error, see {*}SparkException.internalError(){*}. Improve the error message format in error-classes.json if the current is not clear. Propose a solution to users how to avoid and fix such kind of errors. Please, look at the PR below as examples: * [https://github.com/apache/spark/pull/38685] * [https://github.com/apache/spark/pull/38656] * [https://github.com/apache/spark/pull/38490] -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45710) Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62]
[ https://issues.apache.org/jira/browse/SPARK-45710?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Deng Ziming updated SPARK-45710: Summary: Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62] (was: Assign name to _LEGACY_ERROR_TEMP__LEGACY_ERROR_TEMP_21[59,60,61,62]) > Assign name to _LEGACY_ERROR_TEMP_21[59,60,61,62] > - > > Key: SPARK-45710 > URL: https://issues.apache.org/jira/browse/SPARK-45710 > Project: Spark > Issue Type: Sub-task > Components: SQL >Affects Versions: 3.5.0 >Reporter: Deng Ziming >Assignee: Deng Ziming >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Choose a proper name for the error class *_LEGACY_ERROR_TEMP_2153* defined in > {*}core/src/main/resources/error/error-classes.json{*}. The name should be > short but complete (look at the example in error-classes.json). > Add a test which triggers the error from user code if such test still doesn't > exist. Check exception fields by using {*}checkError(){*}. The last function > checks valuable error fields only, and avoids dependencies from error text > message. In this way, tech editors can modify error format in > error-classes.json, and don't worry of Spark's internal tests. Migrate other > tests that might trigger the error onto checkError(). > If you cannot reproduce the error from user space (using SQL query), replace > the error by an internal error, see {*}SparkException.internalError(){*}. > Improve the error message format in error-classes.json if the current is not > clear. Propose a solution to users how to avoid and fix such kind of errors. > Please, look at the PR below as examples: > * [https://github.com/apache/spark/pull/38685] > * [https://github.com/apache/spark/pull/38656] > * [https://github.com/apache/spark/pull/38490] -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45708) Retry mvn deploy failures
[ https://issues.apache.org/jira/browse/SPARK-45708?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45708: --- Labels: pull-request-available (was: ) > Retry mvn deploy failures > - > > Key: SPARK-45708 > URL: https://issues.apache.org/jira/browse/SPARK-45708 > Project: Spark > Issue Type: Bug > Components: Build >Affects Versions: 4.0.0 >Reporter: Enrico Minack >Priority: Major > Labels: pull-request-available > > It is common to see {{408 Request Timeout}} and {{502 Proxy Error}} when > deploying artifacts to Apache snapshot repository: > https://github.com/apache/spark/actions/runs/6437635317 > {quote} > 2023-10-07T01:09:51.1719360Z [ERROR] Failed to execute goal > org.apache.maven.plugins:maven-deploy-plugin:3.0.0-M1:deploy (default-deploy) > on project spark-streaming_2.13: ArtifactDeployerException: Failed to deploy > artifacts: Could not transfer artifact > org.apache.spark:spark-streaming_2.13:jar:tests:3.3.4-20231007.005815-57 > from/to apache.snapshots.https > (https://repository.apache.org/content/repositories/snapshots): transfer > failed for > https://repository.apache.org/content/repositories/snapshots/org/apache/spark/spark-streaming_2.13/3.3.4-SNAPSHOT/spark-streaming_2.13-3.3.4-20231007.005815-57-tests.jar, > status: 502 Proxy Error -> [Help 1] > {quote} > {quote} > 2023-10-07T01:11:48.5651501Z [ERROR] Failed to execute goal > org.apache.maven.plugins:maven-deploy-plugin:3.0.0:deploy (default-deploy) on > project spark-connect-common_2.12: Failed to deploy artifacts: Could not > transfer artifact > org.apache.spark:spark-connect-common_2.12:xml:cyclonedx:3.4.2-20231007.001102-103 > from/to apache.snapshots.https > (https://repository.apache.org/content/repositories/snapshots): transfer > failed for > https://repository.apache.org/content/repositories/snapshots/org/apache/spark/spark-connect-common_2.12/3.4.2-SNAPSHOT/spark-connect-common_2.12-3.4.2-20231007.001102-103-cyclonedx.xml, > status: 408 
Request Timeout -> [Help 1] > {quote} > Such errors should be retried by `mvn deploy`. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45708) Retry mvn deploy failures
Enrico Minack created SPARK-45708: - Summary: Retry mvn deploy failures Key: SPARK-45708 URL: https://issues.apache.org/jira/browse/SPARK-45708 Project: Spark Issue Type: Bug Components: Build Affects Versions: 4.0.0 Reporter: Enrico Minack It is common to see {{408 Request Timeout}} and {{502 Proxy Error}} when deploying artifacts to Apache snapshot repository: https://github.com/apache/spark/actions/runs/6437635317 {quote} 2023-10-07T01:09:51.1719360Z [ERROR] Failed to execute goal org.apache.maven.plugins:maven-deploy-plugin:3.0.0-M1:deploy (default-deploy) on project spark-streaming_2.13: ArtifactDeployerException: Failed to deploy artifacts: Could not transfer artifact org.apache.spark:spark-streaming_2.13:jar:tests:3.3.4-20231007.005815-57 from/to apache.snapshots.https (https://repository.apache.org/content/repositories/snapshots): transfer failed for https://repository.apache.org/content/repositories/snapshots/org/apache/spark/spark-streaming_2.13/3.3.4-SNAPSHOT/spark-streaming_2.13-3.3.4-20231007.005815-57-tests.jar, status: 502 Proxy Error -> [Help 1] {quote} {quote} 2023-10-07T01:11:48.5651501Z [ERROR] Failed to execute goal org.apache.maven.plugins:maven-deploy-plugin:3.0.0:deploy (default-deploy) on project spark-connect-common_2.12: Failed to deploy artifacts: Could not transfer artifact org.apache.spark:spark-connect-common_2.12:xml:cyclonedx:3.4.2-20231007.001102-103 from/to apache.snapshots.https (https://repository.apache.org/content/repositories/snapshots): transfer failed for https://repository.apache.org/content/repositories/snapshots/org/apache/spark/spark-connect-common_2.12/3.4.2-SNAPSHOT/spark-connect-common_2.12-3.4.2-20231007.001102-103-cyclonedx.xml, status: 408 Request Timeout -> [Help 1] {quote} Such errors should be retried by `mvn deploy`. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45707) Simplify `DataFrameStatFunctions. countMinSketch` with `CountMinSketchAgg `
[ https://issues.apache.org/jira/browse/SPARK-45707?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45707: --- Labels: pull-request-available (was: ) > Simplify `DataFrameStatFunctions. countMinSketch` with `CountMinSketchAgg ` > --- > > Key: SPARK-45707 > URL: https://issues.apache.org/jira/browse/SPARK-45707 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 4.0.0 >Reporter: Ruifeng Zheng >Priority: Minor > Labels: pull-request-available > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45707) Simplify `DataFrameStatFunctions. countMinSketch` with `CountMinSketchAgg `
Ruifeng Zheng created SPARK-45707: - Summary: Simplify `DataFrameStatFunctions. countMinSketch` with `CountMinSketchAgg ` Key: SPARK-45707 URL: https://issues.apache.org/jira/browse/SPARK-45707 Project: Spark Issue Type: Improvement Components: SQL Affects Versions: 4.0.0 Reporter: Ruifeng Zheng -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45696) Fix `method tryCompleteWith in trait Promise is deprecated`
[ https://issues.apache.org/jira/browse/SPARK-45696?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45696: --- Labels: pull-request-available (was: ) > Fix `method tryCompleteWith in trait Promise is deprecated` > --- > > Key: SPARK-45696 > URL: https://issues.apache.org/jira/browse/SPARK-45696 > Project: Spark > Issue Type: Sub-task > Components: Spark Core >Affects Versions: 4.0.0 >Reporter: Yang Jie >Priority: Major > Labels: pull-request-available > > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/FutureAction.scala:190:32: > method tryCompleteWith in trait Promise is deprecated (since 2.13.0): Since > this method is semantically equivalent to `completeWith`, use that instead. > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, site=org.apache.spark.ComplexFutureAction.p, > origin=scala.concurrent.Promise.tryCompleteWith, version=2.13.0 > [warn] private val p = Promise[T]().tryCompleteWith(run(jobSubmitter)) > [warn] ^ {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45698) Clean up the deprecated API usage related to `Buffer`
[ https://issues.apache.org/jira/browse/SPARK-45698?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Max Gekk resolved SPARK-45698. -- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43551 [https://github.com/apache/spark/pull/43551] > Clean up the deprecated API usage related to `Buffer` > - > > Key: SPARK-45698 > URL: https://issues.apache.org/jira/browse/SPARK-45698 > Project: Spark > Issue Type: Sub-task > Components: Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Assignee: Yang Jie >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > * method append in trait Buffer is deprecated (since 2.13.0) > * method prepend in trait Buffer is deprecated (since 2.13.0) > * method trimEnd in trait Buffer is deprecated (since 2.13.4) > * method trimStart in trait Buffer is deprecated (since 2.13.4) > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala:319:18: > method append in trait Buffer is deprecated (since 2.13.0): Use appendAll > instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.deploy.IvyTestUtils.createLocalRepository, > origin=scala.collection.mutable.Buffer.append, version=2.13.0 > [warn] allFiles.append(rFiles: _*) > [warn] ^ > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala:183:13: > method trimEnd in trait Buffer is deprecated (since 2.13.4): use > dropRightInPlace instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.util.SizeEstimator.SearchState.dequeue, > origin=scala.collection.mutable.Buffer.trimEnd, version=2.13.4 > [warn] stack.trimEnd(1) > [warn] ^{code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For 
additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-45698) Clean up the deprecated API usage related to `Buffer`
[ https://issues.apache.org/jira/browse/SPARK-45698?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Max Gekk reassigned SPARK-45698: Assignee: Yang Jie > Clean up the deprecated API usage related to `Buffer` > - > > Key: SPARK-45698 > URL: https://issues.apache.org/jira/browse/SPARK-45698 > Project: Spark > Issue Type: Sub-task > Components: Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Assignee: Yang Jie >Priority: Major > Labels: pull-request-available > > * method append in trait Buffer is deprecated (since 2.13.0) > * method prepend in trait Buffer is deprecated (since 2.13.0) > * method trimEnd in trait Buffer is deprecated (since 2.13.4) > * method trimStart in trait Buffer is deprecated (since 2.13.4) > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala:319:18: > method append in trait Buffer is deprecated (since 2.13.0): Use appendAll > instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.deploy.IvyTestUtils.createLocalRepository, > origin=scala.collection.mutable.Buffer.append, version=2.13.0 > [warn] allFiles.append(rFiles: _*) > [warn] ^ > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala:183:13: > method trimEnd in trait Buffer is deprecated (since 2.13.4): use > dropRightInPlace instead > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.util.SizeEstimator.SearchState.dequeue, > origin=scala.collection.mutable.Buffer.trimEnd, version=2.13.4 > [warn] stack.trimEnd(1) > [warn] ^{code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-45651) Snapshots of some packages are not published any more
[ https://issues.apache.org/jira/browse/SPARK-45651?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yang Jie reassigned SPARK-45651: Assignee: Enrico Minack > Snapshots of some packages are not published any more > - > > Key: SPARK-45651 > URL: https://issues.apache.org/jira/browse/SPARK-45651 > Project: Spark > Issue Type: Bug > Components: Build >Affects Versions: 4.0.0 >Reporter: Enrico Minack >Assignee: Enrico Minack >Priority: Major > Labels: pull-request-available > > Snapshots of some packages have not been published anymore, e.g. > spark-sql_2.13-4.0.0 has not been published since Sep, 13th: > https://repository.apache.org/content/groups/snapshots/org/apache/spark/spark-sql_2.13/4.0.0-SNAPSHOT/ > There have been some attempts to fix CI: SPARK-45535 SPARK-45536 > Assumption is that memory consumption during build exceeds the available > memory of the Github host. > The following could be attempted: > - enable manual trigger of the {{publish_snapshots.yml}} workflow > - enable some memory use logging to prove that exceeded memory is the root > cause > - attempt to reduce memory footprint and see impact in above logging > - revert memory use logging -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45651) Snapshots of some packages are not published any more
[ https://issues.apache.org/jira/browse/SPARK-45651?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yang Jie resolved SPARK-45651. -- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43555 [https://github.com/apache/spark/pull/43555] > Snapshots of some packages are not published any more > - > > Key: SPARK-45651 > URL: https://issues.apache.org/jira/browse/SPARK-45651 > Project: Spark > Issue Type: Bug > Components: Build >Affects Versions: 4.0.0 >Reporter: Enrico Minack >Assignee: Enrico Minack >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Snapshots of some packages have not been published anymore, e.g. > spark-sql_2.13-4.0.0 has not been published since Sep, 13th: > https://repository.apache.org/content/groups/snapshots/org/apache/spark/spark-sql_2.13/4.0.0-SNAPSHOT/ > There have been some attempts to fix CI: SPARK-45535 SPARK-45536 > Assumption is that memory consumption during build exceeds the available > memory of the Github host. > The following could be attempted: > - enable manual trigger of the {{publish_snapshots.yml}} workflow > - enable some memory use logging to prove that exceeded memory is the root > cause > - attempt to reduce memory footprint and see impact in above logging > - revert memory use logging -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45705) Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED
[ https://issues.apache.org/jira/browse/SPARK-45705?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45705: --- Labels: pull-request-available (was: ) > Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED > - > > Key: SPARK-45705 > URL: https://issues.apache.org/jira/browse/SPARK-45705 > Project: Spark > Issue Type: Improvement > Components: Tests >Affects Versions: 4.0.0 >Reporter: Kent Yao >Priority: Major > Labels: pull-request-available > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45706) Makes entire Binder build fails fast during setting up
[ https://issues.apache.org/jira/browse/SPARK-45706?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated SPARK-45706: --- Labels: pull-request-available (was: ) > Makes entire Binder build fails fast during setting up > -- > > Key: SPARK-45706 > URL: https://issues.apache.org/jira/browse/SPARK-45706 > Project: Spark > Issue Type: New Feature > Components: Documentation, PySpark >Affects Versions: 3.5.0 >Reporter: Hyukjin Kwon >Priority: Major > Labels: pull-request-available > > Binder build is currently broken: > https://mybinder.org/v2/gh/apache/spark/ce5ddad9903?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb > Seems like we uploaded PySpark late into PyPI, and the installation steps > just slightly ignored the failure. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45706) Makes entire Binder build fails fast during setting up
Hyukjin Kwon created SPARK-45706: Summary: Makes entire Binder build fails fast during setting up Key: SPARK-45706 URL: https://issues.apache.org/jira/browse/SPARK-45706 Project: Spark Issue Type: New Feature Components: Documentation, PySpark Affects Versions: 3.5.0 Reporter: Hyukjin Kwon Binder build is currently broken: https://mybinder.org/v2/gh/apache/spark/ce5ddad9903?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb Seems like we uploaded PySpark late into PyPI, and the installation steps just silently ignored the failure. -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-45705) Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED
Kent Yao created SPARK-45705: Summary: Fix flaky test: Status of a failed DDL/DML with no jobs should be FAILED Key: SPARK-45705 URL: https://issues.apache.org/jira/browse/SPARK-45705 Project: Spark Issue Type: Improvement Components: Tests Affects Versions: 4.0.0 Reporter: Kent Yao -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45701) Clean up the deprecated API usage related to `SetOps`
[ https://issues.apache.org/jira/browse/SPARK-45701?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yang Jie updated SPARK-45701: - Description: * method - in trait SetOps is deprecated (since 2.13.0) * method – in trait SetOps is deprecated (since 2.13.0) * method + in trait SetOps is deprecated (since 2.13.0) * method retain in trait SetOps is deprecated (since 2.13.0) {code:java} [warn] /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala:70:32: method + in trait SetOps is deprecated (since 2.13.0): Consider requiring an immutable Set or fall back to Set.union [warn] Applicable -Wconf / @nowarn filters for this warning: msg=, cat=deprecation, site=org.apache.spark.storage.BlockReplicationUtils.getSampleIds.indices.$anonfun, origin=scala.collection.SetOps.+, version=2.13.0 [warn] if (set.contains(t)) set + i else set + t [warn] ^ {code} was: * method - in trait SetOps is deprecated (since 2.13.0) * method – in trait SetOps is deprecated (since 2.13.0) * method + in trait SetOps is deprecated (since 2.13.0) {code:java} [warn] /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala:70:32: method + in trait SetOps is deprecated (since 2.13.0): Consider requiring an immutable Set or fall back to Set.union [warn] Applicable -Wconf / @nowarn filters for this warning: msg=, cat=deprecation, site=org.apache.spark.storage.BlockReplicationUtils.getSampleIds.indices.$anonfun, origin=scala.collection.SetOps.+, version=2.13.0 [warn] if (set.contains(t)) set + i else set + t [warn] ^ {code} > Clean up the deprecated API usage related to `SetOps` > - > > Key: SPARK-45701 > URL: https://issues.apache.org/jira/browse/SPARK-45701 > Project: Spark > Issue Type: Sub-task > Components: Spark Core, SQL >Affects Versions: 4.0.0 >Reporter: Yang Jie >Priority: Major > > * method - in trait SetOps is deprecated (since 2.13.0) > * method – 
in trait SetOps is deprecated (since 2.13.0) > * method + in trait SetOps is deprecated (since 2.13.0) > * method retain in trait SetOps is deprecated (since 2.13.0) > > {code:java} > [warn] > /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala:70:32: > method + in trait SetOps is deprecated (since 2.13.0): Consider requiring an > immutable Set or fall back to Set.union > [warn] Applicable -Wconf / @nowarn filters for this warning: msg= message>, cat=deprecation, > site=org.apache.spark.storage.BlockReplicationUtils.getSampleIds.indices.$anonfun, > origin=scala.collection.SetOps.+, version=2.13.0 > [warn] if (set.contains(t)) set + i else set + t > [warn] ^ {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45614) Assign name to _LEGACY_ERROR_TEMP_215[6,7,8]
[ https://issues.apache.org/jira/browse/SPARK-45614?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Max Gekk resolved SPARK-45614. -- Resolution: Fixed Issue resolved by pull request 43481 [https://github.com/apache/spark/pull/43481] > Assign name to _LEGACY_ERROR_TEMP_215[6,7,8] > > > Key: SPARK-45614 > URL: https://issues.apache.org/jira/browse/SPARK-45614 > Project: Spark > Issue Type: Sub-task > Components: SQL >Affects Versions: 3.5.0 >Reporter: Deng Ziming >Assignee: Deng Ziming >Priority: Major > Labels: pull-request-available > Fix For: 4.0.0 > > > Choose a proper name for the error class *_LEGACY_ERROR_TEMP_2153* defined in > {*}core/src/main/resources/error/error-classes.json{*}. The name should be > short but complete (look at the example in error-classes.json). > Add a test which triggers the error from user code if such test still doesn't > exist. Check exception fields by using {*}checkError(){*}. The last function > checks valuable error fields only, and avoids dependencies from error text > message. In this way, tech editors can modify error format in > error-classes.json, and don't worry of Spark's internal tests. Migrate other > tests that might trigger the error onto checkError(). > If you cannot reproduce the error from user space (using SQL query), replace > the error by an internal error, see {*}SparkException.internalError(){*}. > Improve the error message format in error-classes.json if the current is not > clear. Propose a solution to users how to avoid and fix such kind of errors. > Please, look at the PR below as examples: > * [https://github.com/apache/spark/pull/38685] > * [https://github.com/apache/spark/pull/38656] > * [https://github.com/apache/spark/pull/38490] -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-40820) Creating StructType from Json
[ https://issues.apache.org/jira/browse/SPARK-40820?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon reassigned SPARK-40820: Assignee: Anthony Wainer Cachay Guivin > Creating StructType from Json > - > > Key: SPARK-40820 > URL: https://issues.apache.org/jira/browse/SPARK-40820 > Project: Spark > Issue Type: Bug > Components: PySpark, Spark Core >Affects Versions: 3.5.0 >Reporter: Anthony Wainer Cachay Guivin >Assignee: Anthony Wainer Cachay Guivin >Priority: Minor > Labels: pull-request-available > > When create a StructType from a Python dictionary you use > [StructType.fromJson|https://github.com/apache/spark/blob/master/python/pyspark/sql/types.py#L792] > or in scala > [DataType.fromJson|https://github.com/apache/spark/blob/master/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala#L158C7-L158C15] > To create a schema can be created as follows from the code below, but it > requires to put inside the json: Nullable and Metadata, this is inconsistent > because within the DataType class this by default. > {code:python} > schema = { > "name": "name", "type": "string" > } > StructField.fromJson(schema) > {code} > Python Error: > {code:python} > from pyspark.sql.types import StructField > schema = { > "name": "c1", "type": "string" > } > StructField.fromJson(schema) > >> > Traceback (most recent call last): > File "code.py", line 90, in runcode > exec(code, self.locals) > File "", line 1, in > File "pyspark/sql/types.py", line 583, in fromJson > json["nullable"], > KeyError: 'nullable' > {code} > Scala Error: > {code:scala} > val schema = """ > |{ > |"type": "struct", > |"fields": [ > |{ > |"name": "c1", > |"type": "string", > |"nullable": false > |} > |] > |} > |""".stripMargin > DataType.fromJson(schema) > >> > Failed to convert the JSON string '{"name":"c1","type":"string"}' to a field. > java.lang.IllegalArgumentException: Failed to convert the JSON string > '{"name":"c1","type":"string"}' to a field. 
> at org.apache.spark.sql.types.DataType$.parseStructField(DataType.scala:268) > at > org.apache.spark.sql.types.DataType$.$anonfun$parseDataType$1(DataType.scala:225) > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-40820) Creating StructType from Json
[ https://issues.apache.org/jira/browse/SPARK-40820?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon resolved SPARK-40820. -- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 43474 [https://github.com/apache/spark/pull/43474] > Creating StructType from Json > - > > Key: SPARK-40820 > URL: https://issues.apache.org/jira/browse/SPARK-40820 > Project: Spark > Issue Type: Bug > Components: PySpark, Spark Core >Affects Versions: 3.5.0 >Reporter: Anthony Wainer Cachay Guivin >Assignee: Anthony Wainer Cachay Guivin >Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > When create a StructType from a Python dictionary you use > [StructType.fromJson|https://github.com/apache/spark/blob/master/python/pyspark/sql/types.py#L792] > or in scala > [DataType.fromJson|https://github.com/apache/spark/blob/master/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala#L158C7-L158C15] > To create a schema can be created as follows from the code below, but it > requires to put inside the json: Nullable and Metadata, this is inconsistent > because within the DataType class this by default. > {code:python} > schema = { > "name": "name", "type": "string" > } > StructField.fromJson(schema) > {code} > Python Error: > {code:python} > from pyspark.sql.types import StructField > schema = { > "name": "c1", "type": "string" > } > StructField.fromJson(schema) > >> > Traceback (most recent call last): > File "code.py", line 90, in runcode > exec(code, self.locals) > File "", line 1, in > File "pyspark/sql/types.py", line 583, in fromJson > json["nullable"], > KeyError: 'nullable' > {code} > Scala Error: > {code:scala} > val schema = """ > |{ > |"type": "struct", > |"fields": [ > |{ > |"name": "c1", > |"type": "string", > |"nullable": false > |} > |] > |} > |""".stripMargin > DataType.fromJson(schema) > >> > Failed to convert the JSON string '{"name":"c1","type":"string"}' to a field. 
> java.lang.IllegalArgumentException: Failed to convert the JSON string > '{"name":"c1","type":"string"}' to a field. > at org.apache.spark.sql.types.DataType$.parseStructField(DataType.scala:268) > at > org.apache.spark.sql.types.DataType$.$anonfun$parseDataType$1(DataType.scala:225) > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45670) SparkSubmit does not support --total-executor-cores when deploying on K8s
[ https://issues.apache.org/jira/browse/SPARK-45670?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon updated SPARK-45670: - Fix Version/s: 3.4.2 3.5.1 > SparkSubmit does not support --total-executor-cores when deploying on K8s > - > > Key: SPARK-45670 > URL: https://issues.apache.org/jira/browse/SPARK-45670 > Project: Spark > Issue Type: Bug > Components: Spark Submit >Affects Versions: 3.3.3, 3.4.1, 3.5.0 >Reporter: Cheng Pan >Assignee: Cheng Pan >Priority: Major > Labels: pull-request-available > Fix For: 3.4.2, 3.5.1, 3.3.4 > > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Resolved] (SPARK-45670) SparkSubmit does not support --total-executor-cores when deploying on K8s
[ https://issues.apache.org/jira/browse/SPARK-45670?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon resolved SPARK-45670. -- Fix Version/s: 3.3.4 Resolution: Fixed Issue resolved by pull request 43548 [https://github.com/apache/spark/pull/43548] > SparkSubmit does not support --total-executor-cores when deploying on K8s > - > > Key: SPARK-45670 > URL: https://issues.apache.org/jira/browse/SPARK-45670 > Project: Spark > Issue Type: Bug > Components: Spark Submit >Affects Versions: 3.3.3, 3.4.1, 3.5.0 >Reporter: Cheng Pan >Assignee: Cheng Pan >Priority: Major > Labels: pull-request-available > Fix For: 3.3.4 > > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Assigned] (SPARK-45670) SparkSubmit does not support --total-executor-cores when deploying on K8s
[ https://issues.apache.org/jira/browse/SPARK-45670?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Hyukjin Kwon reassigned SPARK-45670: Assignee: Cheng Pan > SparkSubmit does not support --total-executor-cores when deploying on K8s > - > > Key: SPARK-45670 > URL: https://issues.apache.org/jira/browse/SPARK-45670 > Project: Spark > Issue Type: Bug > Components: Spark Submit >Affects Versions: 3.3.3, 3.4.1, 3.5.0 >Reporter: Cheng Pan >Assignee: Cheng Pan >Priority: Major > Labels: pull-request-available > -- This message was sent by Atlassian Jira (v8.20.10#820010) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-45637) Time window aggregation in separate streams followed by stream-stream join not returning results
[ https://issues.apache.org/jira/browse/SPARK-45637?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Wei Liu updated SPARK-45637: Description: According to documentation update (SPARK-42591) resulting from SPARK-42376, Spark 3.5.0 should support time-window aggregations in two separate streams followed by stream-stream window join: [https://github.com/apache/spark/blob/261b281e6e57be32eb28bf4e50bea24ed22a9f21/docs/structured-streaming-programming-guide.md?plain=1#L1939-L1995] However, I failed to reproduce this example and the query I built doesn't return any results: {code:java} from pyspark.sql.functions import rand from pyspark.sql.functions import expr, window, window_time spark.conf.set("spark.sql.shuffle.partitions", "1") impressions = ( spark .readStream.format("rate").option("rowsPerSecond", "5").option("numPartitions", "1").load() .selectExpr("value AS adId", "timestamp AS impressionTime") ) impressionsWithWatermark = impressions \ .selectExpr("adId AS impressionAdId", "impressionTime") \ .withWatermark("impressionTime", "10 seconds") clicks = ( spark .readStream.format("rate").option("rowsPerSecond", "5").option("numPartitions", "1").load() .where((rand() * 100).cast("integer") < 10) # 10 out of every 100 impressions result in a click .selectExpr("(value - 10) AS adId ", "timestamp AS clickTime") # -10 so that a click with same id as impression is generated later (i.e. delayed data). 
.where("adId > 0") ) clicksWithWatermark = clicks \ .selectExpr("adId AS clickAdId", "clickTime") \ .withWatermark("clickTime", "10 seconds") clicksWindow = clicksWithWatermark.groupBy( window(clicksWithWatermark.clickTime, "1 minute") ).count() impressionsWindow = impressionsWithWatermark.groupBy( window(impressionsWithWatermark.impressionTime, "1 minute") ).count() clicksAndImpressions = clicksWindow.join(impressionsWindow, "window", "inner") clicksAndImpressions.writeStream \ .format("memory") \ .queryName("clicksAndImpressions") \ .outputMode("append") \ .start() {code} My intuition is that I'm getting no results because to output results of the first stateful operator (time window aggregation), a watermark needs to pass the end timestamp of the window. And once the watermark is after the end timestamp of the window, this window is ignored at the second stateful operator (stream-stream) join because it's behind the watermark. Indeed, a small hack done to event time column (adding one minute) between two stateful operators makes it possible to get results: {code:java} clicksWindow2 = clicksWithWatermark.groupBy( window(clicksWithWatermark.clickTime, "1 minute") ).count().withColumn("window_time", window_time("window") + expr('INTERVAL 1 MINUTE')).drop("window") impressionsWindow2 = impressionsWithWatermark.groupBy( window(impressionsWithWatermark.impressionTime, "1 minute") ).count().withColumn("window_time", window_time("window") + expr('INTERVAL 1 MINUTE')).drop("window") clicksAndImpressions2 = clicksWindow2.join(impressionsWindow2, "window_time", "inner") clicksAndImpressions2.writeStream \ .format("memory") \ .queryName("clicksAndImpressions2") \ .outputMode("append") \ .start() {code} was: According to documentation update (SPARK-42591) resulting from SPARK-42376, Spark 3.5.0 should support time-window aggregations in two separate streams followed by stream-stream window join: 
https://github.com/apache/spark/blob/261b281e6e57be32eb28bf4e50bea24ed22a9f21/docs/structured-streaming-programming-guide.md?plain=1#L1939-L1995 However, I failed to reproduce this example and the query I built doesn't return any results: {code:java} from pyspark.sql.functions import rand from pyspark.sql.functions import expr, window, window_time spark.conf.set("spark.sql.shuffle.partitions", "1") impressions = ( spark .readStream.format("rate").option("rowsPerSecond", "5").option("numPartitions", "1").load() .selectExpr("value AS adId", "timestamp AS impressionTime") ) impressionsWithWatermark = impressions \ .selectExpr("adId AS impressionAdId", "impressionTime") \ .withWatermark("impressionTime", "10 seconds") clicks = ( spark .readStream.format("rate").option("rowsPerSecond", "5").option("numPartitions", "1").load() .where((rand() * 100).cast("integer") < 10) # 10 out of every 100 impressions result in a click .selectExpr("(value - 10) AS adId ", "timestamp AS clickTime") # -10 so that a click with same id as impression is generated later (i.e. delayed data). .where("adId > 0") ) clicksWithWatermark = clicks \ .selectExpr("adId AS clickAdId", "clickTime") \ .withWatermark("clickTime", "10 seconds") clicksWindow = clicksWithWatermark.groupBy( window(clicksWithWatermark.clickTime, "1 minute") ).count() impressionsWindow