GitHub user smurching commented on a diff in the pull request: https://github.com/apache/spark/pull/19666#discussion_r149237212 --- Diff: mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala --- @@ -741,17 +678,43 @@ private[spark] object RandomForest extends Logging { (splits(featureIndex)(bestFeatureSplitIndex), bestFeatureGainStats) } else if (binAggregates.metadata.isUnordered(featureIndex)) { // Unordered categorical feature - val leftChildOffset = binAggregates.getFeatureOffset(featureIndexIdx) - val (bestFeatureSplitIndex, bestFeatureGainStats) = - Range(0, numSplits).map { splitIndex => - val leftChildStats = binAggregates.getImpurityCalculator(leftChildOffset, splitIndex) - val rightChildStats = binAggregates.getParentImpurityCalculator() - .subtract(leftChildStats) + val numBins = binAggregates.metadata.numBins(featureIndex) + val featureOffset = binAggregates.getFeatureOffset(featureIndexIdx) + + val binStatsArray = Array.tabulate(numBins) { binIndex => --- End diff -- Could you please add a comment explaining what this is? E.g.: `// Each element of binStatsArray stores pre-computed label statistics for a single bin of the current feature`
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org