Github user srowen commented on a diff in the pull request: https://github.com/apache/spark/pull/13440#discussion_r218209381 --- Diff: mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala --- @@ -52,6 +52,49 @@ trait Impurity extends Serializable { @Since("1.0.0") @DeveloperApi def calculate(count: Double, sum: Double, sumSquares: Double): Double + + /** + * :: DeveloperApi :: + * Compute a test-statistic p-value quality measure from left and right split populations + * @param calcL impurity calculator for the left split population + * @param calcR impurity calculator for the right split population + * @return The p-value for the null hypothesis; that left and right split populations + * represent the same distribution + * @note Unless overridden this method will fail with an exception, for backward compatibility + */ + @Since("2.2.0") + @DeveloperApi + def calculate(calcL: ImpurityCalculator, calcR: ImpurityCalculator): Double + + /** + * :: DeveloperApi :: + * Determine if this impurity measure is a test-statistic measure + * @return True if this is a split quality measure based on a test statistic (i.e. returns a + * p-value) or false otherwise. + * @note Unless overridden this method returns false by default, for backward compatibility + */ + @Since("2.2.0") + @DeveloperApi + def isTestStatistic: Boolean +} + +/** + * :: DeveloperApi :: + * Utility functions for Impurity measures + */ +@Since("2.0.0") +@DeveloperApi +object Impurity { + /** + * :: DeveloperApi :: + * Convert a test-statistic p-value into a "larger-is-better" gain value. + * @param pval The test statistic p-value + * @return The negative logarithm of the p-value. Any p-values smaller than 10^-20 are clipped + * to 10^-20 to prevent arithmetic errors + */ + @Since("2.0.0") + @DeveloperApi + def pValToGain(pval: Double): Double = -math.log(math.max(1e-20, pval)) --- End diff -- private to spark?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org