[ https://issues.apache.org/jira/browse/SPARK-31500?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Eric Wasserman updated SPARK-31500:
-----------------------------------
    Description: 
The collect_set() aggregate function should produce a set of distinct elements. When the column argument's type is BinaryType this is not the case.

Example:

{{import org.apache.spark.sql.functions._}}
{{import org.apache.spark.sql.expressions.Window}}
{{import spark.implicits._  // for toDF() and the 'symbol column syntax (already in scope in spark-shell)}}

{{case class R(id: String, value: String, bytes: Array[Byte])}}
{{def makeR(id: String, value: String) = R(id, value, value.getBytes)}}
{{val df = Seq(makeR("a", "dog"), makeR("a", "cat"), makeR("a", "cat"), makeR("b", "fish")).toDF()}}

{{// In the example below "bytesSet" erroneously has duplicates but "stringSet" does not (as expected).}}
{{df.agg(collect_set('value) as "stringSet", collect_set('bytes) as "bytesSet").show(truncate = false)}}

 

{{// The same problem occurs when using window functions.}}
{{val win = Window.partitionBy('id).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)}}
{{val result = df.select(}}
{{    collect_set('value).over(win) as "stringSet",}}
{{    collect_set('bytes).over(win) as "bytesSet"}}
{{  )}}
{{  .select('stringSet, 'bytesSet, size('stringSet) as "stringSetSize", size('bytesSet) as "bytesSetSize")}}
{{result.show()}}
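Until this is fixed, one possible workaround (a sketch, not a confirmed recommendation; the "b64Set" alias is just illustrative) is to collect a canonical string encoding of the bytes, such as base64, which deduplicates correctly as a StringType, and decode back to binary afterwards:

{{// Possible workaround sketch: collect base64-encoded strings (StringType}}
{{// deduplicates correctly), then decode back to binary with the SQL}}
{{// higher-order transform() function (available via expr in Spark 2.4+).}}
{{df.agg(collect_set(base64('bytes)) as "b64Set")}}
{{  .select(expr("transform(b64Set, x -> unbase64(x))") as "bytesSet")}}
{{  .show(truncate = false)}}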



> collect_set() of BinaryType returns duplicate elements
> ------------------------------------------------------
>
>                 Key: SPARK-31500
>                 URL: https://issues.apache.org/jira/browse/SPARK-31500
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.4.4
>            Reporter: Eric Wasserman
>            Priority: Major
>


