GitHub user rdblue commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21623#discussion_r198230713
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala ---
    @@ -270,6 +277,29 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean) {
           case sources.Not(pred) =>
             createFilter(schema, pred).map(FilterApi.not)
     
    +      case sources.StringStartsWith(name, prefix) if pushDownStartWith && canMakeFilterOn(name) =>
    +        Option(prefix).map { v =>
    +          FilterApi.userDefined(binaryColumn(name),
    +            new UserDefinedPredicate[Binary] with Serializable {
    +              private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
    +              private val size = strToBinary.length
    +
    +              override def canDrop(statistics: Statistics[Binary]): Boolean = {
    +                val comparator = PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR
    +                val max = statistics.getMax
    +                val min = statistics.getMin
    +                comparator.compare(max.slice(0, math.min(size, max.length)), strToBinary) < 0 ||
    +                  comparator.compare(min.slice(0, math.min(size, min.length)), strToBinary) > 0
    +              }
    +
    +              override def inverseCanDrop(statistics: Statistics[Binary]): Boolean = false
    --- End diff --
    
    Why can't this evaluate the inverse of `StartsWith`? If the min and max 
values exclude the prefix, then this should be able to filter.
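    For reference, here is a minimal sketch of one way `inverseCanDrop` could use the same min/max statistics, reusing the `strToBinary`, `size`, and unsigned lexicographic comparator from the diff above. The standalone class name `StartsWithPredicate` and the `keep` check are illustrative only, not code from this PR: if both min and max begin with the prefix, every value in the row group satisfies `StartsWith`, so a `Not(StartsWith)` filter selects nothing and the group can be dropped.
    
    ```scala
    // Illustrative sketch only, not the PR's implementation.
    import org.apache.parquet.filter2.predicate.{Statistics, UserDefinedPredicate}
    import org.apache.parquet.io.api.Binary
    import org.apache.parquet.schema.PrimitiveComparator
    
    class StartsWithPredicate(prefix: String) extends UserDefinedPredicate[Binary] with Serializable {
      private val strToBinary = Binary.fromReusedByteArray(prefix.getBytes)
      private val size = strToBinary.length
    
      // Looked up per call so no non-serializable field is captured.
      private def comparator = PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR
    
      // Keep a value only if its first `size` bytes equal the prefix.
      override def keep(value: Binary): Boolean = {
        value != null &&
          comparator.compare(value.slice(0, math.min(size, value.length)), strToBinary) == 0
      }
    
      // Same as the diff: drop the group when its whole range sorts before or after the prefix.
      override def canDrop(statistics: Statistics[Binary]): Boolean = {
        val max = statistics.getMax
        val min = statistics.getMin
        comparator.compare(max.slice(0, math.min(size, max.length)), strToBinary) < 0 ||
          comparator.compare(min.slice(0, math.min(size, min.length)), strToBinary) > 0
      }
    
      // Sketch of the inverse: if both min and max start with the prefix, every value in the
      // row group matches StartsWith, so Not(StartsWith) selects nothing and the group can drop.
      override def inverseCanDrop(statistics: Statistics[Binary]): Boolean = {
        val max = statistics.getMax
        val min = statistics.getMin
        comparator.compare(max.slice(0, math.min(size, max.length)), strToBinary) == 0 &&
          comparator.compare(min.slice(0, math.min(size, min.length)), strToBinary) == 0
      }
    }
    ```
    
    The invariant this relies on is that, under unsigned lexicographic byte ordering, any value lying between two values that share the prefix also shares that prefix.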

