Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21061#discussion_r192546226
  
    --- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
 ---
    @@ -2189,3 +2189,302 @@ case class ArrayRemove(left: Expression, right: 
Expression)
     
       override def prettyName: String = "array_remove"
     }
    +
    +object ArraySetLike {
    +  private val MAX_ARRAY_LENGTH: Int = 
ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH
    +
    +  def toArrayDataInt(hs: OpenHashSet[Int]): ArrayData = {
    +    val array = new Array[Int](hs.size)
    +    var pos = hs.nextPos(0)
    +    var i = 0
    +    while (pos != OpenHashSet.INVALID_POS) {
    +      array(i) = hs.getValue(pos)
    +      pos = hs.nextPos(pos + 1)
    +      i += 1
    +    }
    +
    +    if (useGenericArrayData(LongType.defaultSize, array.length)) {
    +      new GenericArrayData(array)
    +    } else {
    +      UnsafeArrayData.fromPrimitiveArray(array)
    +    }
    +  }
    +
    +  def toArrayDataLong(hs: OpenHashSet[Long]): ArrayData = {
    +    val array = new Array[Long](hs.size)
    +    var pos = hs.nextPos(0)
    +    var i = 0
    +    while (pos != OpenHashSet.INVALID_POS) {
    +      array(i) = hs.getValue(pos)
    +      pos = hs.nextPos(pos + 1)
    +      i += 1
    +    }
    +
    +    if (useGenericArrayData(LongType.defaultSize, array.length)) {
    +      new GenericArrayData(array)
    +    } else {
    +      UnsafeArrayData.fromPrimitiveArray(array)
    +    }
    +  }
    +
    +  def useGenericArrayData(elementSize: Int, length: Int): Boolean = {
    --- End diff --
    
    I tried to reuse it, but I gave up. This is because 
`UnsafeArrayData.fromPrimitiveArray()` also uses values (e.g. 
`headerInBytes` and `valueRegionInBytes`) that are calculated in this method.
    I don't think there is a standard way to return multiple values from a 
function.
    
    Thus, we could move this to `UnsafeArrayData`, but it would still not be 
easy to reuse it. What do you think?
    
    ```
      private static UnsafeArrayData fromPrimitiveArray(
           Object arr, int offset, int length, int elementSize) {
        final long headerInBytes = calculateHeaderPortionInBytes(length);
        final long valueRegionInBytes = elementSize * length;
        final long totalSizeInLongs = (headerInBytes + valueRegionInBytes + 7) 
/ 8;
        if (totalSizeInLongs > Integer.MAX_VALUE / 8) {
          throw new UnsupportedOperationException("Cannot convert this array to 
unsafe format as " +
            "it's too big.");
        }
    
        final long[] data = new long[(int)totalSizeInLongs];
    
        Platform.putLong(data, Platform.LONG_ARRAY_OFFSET, length);
        Platform.copyMemory(arr, offset, data,
          Platform.LONG_ARRAY_OFFSET + headerInBytes, valueRegionInBytes);
    
        UnsafeArrayData result = new UnsafeArrayData();
        result.pointTo(data, Platform.LONG_ARRAY_OFFSET, (int)totalSizeInLongs 
* 8);
        return result;
      }
    ```


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to