Github user aborsu985 commented on a diff in the pull request:
    --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala 
    @@ -39,3 +39,66 @@ class Tokenizer extends UnaryTransformer[String, 
Seq[String], Tokenizer] {
       override protected def outputDataType: DataType = new 
ArrayType(StringType, false)
    + * :: AlphaComponent ::
    + * A regex based tokenizer that extracts tokens using a regex.
    + * Optional additional parameters include enabling lowercase 
standardization, a minimum character
    + * size for tokens as well as an array of stop words to remove from the 
    + */
    +class RegexTokenizer extends UnaryTransformer[String, Seq[String], 
RegexTokenizer] {
    +  val lowerCase = new BooleanParam(this, 
    +      "lowerCase", 
    +      "enable case folding to lower case", 
    +      Some(true))
    +  def setLowercase(value: Boolean) = set(lowerCase, value)
    +  def getLowercase: Boolean = get(lowerCase)
    +  val minLength = new IntParam(this, 
    --- End diff --
    I removed excluded, as it is indeed unusual, and set the default value to 1, 
which is standard.

