Github user smurching commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19753#discussion_r151298765
  
    --- Diff: python/pyspark/ml/feature.py ---
    @@ -2565,22 +2575,28 @@ class VectorIndexer(JavaEstimator, HasInputCol, 
HasOutputCol, JavaMLReadable, Ja
                               "(>= 2). If a feature is found to have > 
maxCategories values, then " +
                               "it is declared continuous.", 
typeConverter=TypeConverters.toInt)
     
    +    handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle 
invalid data " +
    +                          "(unseen labels or NULL values). Options are 
'skip' (filter out " +
    +                          "rows with invalid data), 'error' (throw an 
error), or 'keep' (put " +
    +                          "invalid data in a special additional bucket, at 
index numCategories).",
    +                          typeConverter=TypeConverters.toString)
    +
         @keyword_only
    -    def __init__(self, maxCategories=20, inputCol=None, outputCol=None):
    +    def __init__(self, maxCategories=20, inputCol=None, outputCol=None, 
handleInvalid="error"):
             """
    -        __init__(self, maxCategories=20, inputCol=None, outputCol=None)
    +        __init__(self, maxCategories=20, inputCol=None, outputCol=None, 
handleInvalid="error")
             """
             super(VectorIndexer, self).__init__()
             self._java_obj = 
self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid)
    -        self._setDefault(maxCategories=20)
    +        self._setDefault(maxCategories=20, handleInvalid="error")
             kwargs = self._input_kwargs
             self.setParams(**kwargs)
     
         @keyword_only
         @since("1.4.0")
    -    def setParams(self, maxCategories=20, inputCol=None, outputCol=None):
    +    def setParams(self, maxCategories=20, inputCol=None, outputCol=None, 
handleInvalid="error"):
    --- End diff ---
    
    The same goes for the constructor (IMO we should default to 
`handleInvalid=None` there too), but open to hearing your thoughts.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org

Reply via email to