Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/20229#discussion_r160878716 --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala --- @@ -228,22 +229,33 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) // Then we handle one-hot encoding and interactions between terms. var keepReferenceCategory = false val encodedTerms = resolvedFormula.terms.map { - case Seq(term) if dataset.schema(term).dataType == StringType => - val encodedCol = tmpColumn("onehot") - var encoder = new OneHotEncoder() - .setInputCol(indexed(term)) - .setOutputCol(encodedCol) - // Formula w/o intercept, one of the categories in the first category feature is - // being used as reference category, we will not drop any category for that feature. - if (!hasIntercept && !keepReferenceCategory) { - encoder = encoder.setDropLast(false) - keepReferenceCategory = true - } - encoderStages += encoder - prefixesToRewrite(encodedCol + "_") = term + "_" - encodedCol case Seq(term) => - term + dataset.schema(term).dataType match { + case _: StringType => + val encodedCol = tmpColumn("onehot") + // Formula w/o intercept, one of the categories in the first category feature is + // being used as reference category, we will not drop any category for that feature. + if (!hasIntercept && !keepReferenceCategory) { + keepReferenceCategory = true + encoderStages += new OneHotEncoderEstimator(uid) --- End diff -- Oh, I see. There is one `OneHotEncoderEstimator` for `dropLast=false`.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org