Github user iyerr3 commented on a diff in the pull request:

    https://github.com/apache/incubator-madlib/pull/42#discussion_r65259215
  
    --- Diff: src/ports/postgres/modules/stats/pred_metrics.py_in ---
    @@ -0,0 +1,562 @@
    +# coding=utf-8
    +#
    +# Licensed to the Apache Software Foundation (ASF) under one
    +# or more contributor license agreements.  See the NOTICE file
    +# distributed with this work for additional information
    +# regarding copyright ownership.  The ASF licenses this file
    +# to you under the Apache License, Version 2.0 (the
    +# "License"); you may not use this file except in compliance
    +# with the License.  You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing,
    +# software distributed under the License is distributed on an
    +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    +# KIND, either express or implied.  See the License for the
    +# specific language governing permissions and limitations
    +# under the License.
    +
    +# Prediction Metrics
    +# This module provides a set of prediction accuracy metrics. It is a support
    +# module for several machine learning algorithms that require metrics to
    +# validate their models. A typical function will take a set of "prediction" and
    +# "observation" values to calculate the desired metric, unless noted otherwise.
    +# Grouping is supported by all of these functions (except confusion matrix).
    +
    +# Please refer to the pred_metrics.sql_in file for the documentation
    +
    +import plpy
    +from utilities.validate_args import input_tbl_valid, output_tbl_valid, is_var_valid
    +from utilities.utilities import _assert
    +from utilities.utilities import split_quoted_delimited_str
    +
    +
    +def _validate_args(table_in, table_out, validate_cols):
    +    input_tbl_valid(table_in, "Prediction Metrics")
    +    output_tbl_valid(table_out, "Prediction Metrics")
    +    is_var_valid(table_in, ', '.join(validate_cols))
    +
    +
    +def _parse_grp_col_str(grp_col_str):
    +    group_set = set(split_quoted_delimited_str(grp_col_str))
    +    return list(group_set)
    +# ----------------------------------------------------------------------
    +
    +
    +def _create_output_table(table_in, table_out, agg_fun, agg_name, grp_col_str=None):
    +    """ Create an output table with optional groups
    +
    +    General template function that builds an output table with grouping while
    +    applying an aggregate function.
    +
    +    Args:
    +        @param agg_fun: str, SQL aggregate to be executed
    +        @param grp_col_str: str, Comma-separated list of column names to group by
    +    """
    +    grp_cols = _parse_grp_col_str(grp_col_str)
    +    _validate_args(table_in, table_out, grp_cols)
    +    if not grp_cols:
    +        grp_by_str = grp_out_str = ""
    +    else:
    +        grp_by_str = "GROUP BY " + grp_col_str
    +        grp_out_str = grp_col_str + ", "
    +    plpy.execute("""
    +                 CREATE TABLE {table_out} AS
    +                 SELECT
    +                    {grp_out_str}
    +                    {agg_fun} AS {agg_name}
    +                 FROM {table_in}
    +                 {grp_by_str}
    +                 """.format(**locals()))
    +
    +
    +# Mean Absolute Error.
    +def mean_abs_error(table_in, table_out, pred_col, obs_col, grp_cols=None):
    +    mean_abs_agg = "AVG(ABS({0} - {1}))".format(pred_col, obs_col)
    +    _create_output_table(table_in, table_out, mean_abs_agg, "mean_abs_error", grp_cols)
    +
    +
    +# Mean Absolute Percentage Error.
    +def mean_abs_perc_error(table_in, table_out, pred_col, obs_col, grp_cols=None):
    +    mean_abs_perc_agg = "AVG(ABS({0} - {1})/NULLIF({1}, 0))".format(pred_col, obs_col)
    +    _create_output_table(table_in, table_out, mean_abs_perc_agg, "mean_abs_perc_error", grp_cols)
    +
    +
    +# Mean Percentage Error.
    +def mean_perc_error(table_in, table_out, pred_col, obs_col, grp_cols=None):
    +    mean_perc_agg = "AVG(({0} - {1})/NULLIF({1}, 0))".format(pred_col, obs_col)
    +    _create_output_table(table_in, table_out, mean_perc_agg, "mean_perc_error", grp_cols)
    +
    +
    +# Mean Squared Error.
    +def mean_squared_error(table_in, table_out, pred_col, obs_col, grp_cols=None):
    +    mean_sq_agg = "AVG(({0} - {1})^2)".format(pred_col, obs_col)
    +    _create_output_table(table_in, table_out, mean_sq_agg, "mean_squared_error", grp_cols)
    +
    +
    +def metric_agg_help_msg(schema_madlib, message, agg_name, **kwargs):
    +
    +    if not message:
    +        help_string = """
    +------------------------------------------------------------
    +                        SUMMARY
    +------------------------------------------------------------
    +Functionality: Evaluate prediction results using metric functions.
    +
    +This module provides a set of prediction accuracy metrics. It is a support
    +module for several machine learning algorithms that require metrics to validate
    +their models. The function will take "prediction" and "observation" values to
    +calculate the desired metric. Grouping is supported by all of these functions.
    +    """
    +    elif message.lower().strip() in ['usage', 'help', '?']:
    +        help_string = """
    +------------------------------------------------------------
    +                        USAGE
    +------------------------------------------------------------
    +SELECT {schema_madlib}.{agg_name}(
    +    'table_in',     -- Name of the input table
    +    'table_out',    -- Table name to store the metric results
    +    'pred_col',     -- Column name containing prediction results
    +    'obs_col',      -- Column name containing observed (actual) values
    +    'grouping_cols' -- Comma-separated list of columns to use as group-by
    +);
    +    """
    +    else:
    +        help_string = "No such option. Use 
{schema_madlib}.{agg_name}('usage')"
    +    return help_string.format(**locals())
    +
    +
    +def _get_r2_score_sql(table_in, pred_col, obs_col, grp_col_str=None):
    +    """ Generate the SQL query to compute r2 score.
    +
    +    This function abstracts the SQL to calculate r2 score from actually building
    +    the output table. This allows reusing the query for the adjusted r2 function.
    +
    +    Args:
    +        @param table_in: str, Input table name containing the data
    +        @param pred_col: str, Column name containing the predictions
    +        @param obs_col: str, Column name containing the actual observed class
    +        @param grp_col_str: str, Comma-separated list of columns to group by
    +
    +    Definition:
    +        r2 = 1 - SS_res / SS_tot
    +        where SS_res = sum (pred - obs)^2
    +              SS_tot = sum (obs - mean)^2
    +
    +    """
    +    if grp_col_str:
    +        grp_out_str = grp_col_str + ","
    +        grp_by_str = "GROUP BY " + grp_col_str
    +        partition_str = "PARTITION BY " + grp_col_str
    +    else:
    +        grp_out_str = grp_by_str = partition_str = ""
    +    return """
    +            SELECT {grp_out_str}
    +                1 - ssres/sstot AS r2_score
    +            FROM (
    +                SELECT {grp_out_str}
    +                       sum(({obs_col} - mean)^2) as sstot,
    +                       sum(({pred_col} - {obs_col})^2) AS ssres
    +                FROM(
    +                    SELECT {grp_out_str}
    +                           {pred_col}, {obs_col},
    +                           avg({obs_col}) OVER ({partition_str}) as mean
    +                    FROM {table_in}
    +                ) x {grp_by_str}
    +            ) y
    +        """.format(**locals())
    +
    +
    +def r2_score(table_in, table_out, pred_col, obs_col, grp_col_str=None):
    +    grp_cols = _parse_grp_col_str(grp_col_str)
    +    _validate_args(table_in, table_out, grp_cols + [pred_col, obs_col])
    +    plpy.execute("""
    +        CREATE TABLE {table_out} AS
    +        {r2_score_sql}
    +        """.format(table_out=table_out,
    +                   r2_score_sql=_get_r2_score_sql(table_in, pred_col, obs_col,
    +                                                  grp_col_str)))
    +
    +
    +def r2_score_help(schema_madlib, message, **kwargs):
    +    if not message:
    +        help_string = """
    +------------------------------------------------------------
    +                        SUMMARY
    +------------------------------------------------------------
    +Functionality: Compute coefficient of determination
    +
    +This function returns the coefficient of determination (R2) between the
    +predicted and observed values. An R2 of 1 indicates that the regression line
    +perfectly fits the data, while an R2 of 0 indicates that the line does not fit
    +the data at all. Negative values of R2 may occur when fitting non-linear
    +functions to data.
    +    """
    +    elif message.lower().strip() in ['usage', 'help', '?']:
    +        help_string = """
    +------------------------------------------------------------
    +                        USAGE
    +------------------------------------------------------------
    +SELECT {schema_madlib}.r2_score(
    +    'table_in',     -- Name of the input table
    +    'table_out',    -- Table name to store the metric results
    +    'pred_col',     -- Column name containing prediction results
    +    'obs_col',      -- Column name containing observed (actual) values
    +    'grouping_cols' -- Comma-separated list of columns to use as group-by
    +);
    +    """
    +    else:
    +        help_string = "No such option. Use 
{schema_madlib}.r2_score('usage')"
    +    return help_string.format(**locals())
    +
    +
    +def adjusted_r2_score(table_in, table_out, pred_col, obs_col,
    +                      n_predictors, train_size, grp_col_str):
    +    """ Compute the adjusted r2 score
    +
    +    Args:
    +        @param table_in: str, Input table name containing the data
    +        @param pred_col: str, Column name containing the predictions
    +        @param obs_col: str, Column name containing the actual observed class
    +        @param grp_col_str: str, Comma-separated list of columns to group by
    +
    +    Definition:
    +        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    +        where n = size of the training set
    +              p = number of explanatory variables (predictors)
    +    """
    +    grp_cols = _parse_grp_col_str(grp_col_str)
    +    _validate_args(table_in, table_out, grp_cols + [pred_col, obs_col])
    +    plpy.execute("""
    +        CREATE TABLE {table_out} AS
    +        SELECT *,
    +               1 - ( ((1- r2_score) * ({train_size} -1)) /
    +                     ({train_size} - {n_predictors} - 1)
    +                    ) AS adjusted_r2_score
    +        FROM (
    +            {r2_score_sql}
    +        )z
    +        """.format(r2_score_sql=_get_r2_score_sql(table_in, pred_col, 
obs_col, grp_col_str),
    +                   **locals()))
    +
    +
    +def adjusted_r2_score_help(schema_madlib, message, **kwargs):
    +    if not message:
    +        help_string = """
    +------------------------------------------------------------
    +                        SUMMARY
    +------------------------------------------------------------
    +Functionality: Compute coefficient of determination (with adjustment for number of predictors)
    +
    +This function returns the adjusted R2 score. The adjusted R2 score is used to
    +counter the problem of R2 automatically increasing when extra explanatory
    +variables are added to the model. It takes two additional integers describing
    +the number of predictors in the model and the size of the training set over
    +which it was developed, and returns the adjusted R-squared prediction accuracy
    +metric.
    +    """
    +    elif message.lower().strip() in ['usage', 'help', '?']:
    +        help_string = """
    +------------------------------------------------------------
    +                        USAGE
    +------------------------------------------------------------
    +SELECT {schema_madlib}.adjusted_r2_score(
    +    'table_in',        -- Name of the input table
    +    'table_out',       -- Table name to store the metric results
    +    'pred_col',        -- Column name containing prediction results
    +    'obs_col',         -- Column name containing observed (actual) values
    +    'num_predictors',  -- Number of predictors (features) used in the model
    +    'training_size',   -- Size of the training data
    +    'grouping_cols'    -- Comma-separated list of columns to use as group-by
    +);
    +    """
    +    else:
    +        help_string = "No such option. Use 
{schema_madlib}.adjusted_r2_score('usage')"
    +    return help_string.format(**locals())
    +
    +
    +def binary_classifier(table_in, table_out, pred_col, obs_col, grp_col_str=None):
    +    """ Get multiple metrics useful for evaluating binary classifiers.
    +
    +    The 'obs_col' column in 'table_in' is assumed to be an int/float column with
    +    two levels: 0 and 1, where 1 is considered positive and 0 is negative.
    +
    +    The 'pred_col' is assumed to be a float column that gives the probability of
    +    the prediction being positive.
    +
    +    """
    +    grp_cols = _parse_grp_col_str(grp_col_str)
    +    _validate_args(table_in, table_out, grp_cols + [pred_col, obs_col])
    +    obs_levels = [i['c'] for i in
    +                  plpy.execute("SELECT {0} as c FROM {1} GROUP BY {0} ORDER BY {0}".
    +                               format(obs_col, table_in))
    +                  ]
    +    _assert(obs_levels == [0, 1], "Prediction metrics: Observed levels should be 0 and 1")
    +
    +    if grp_col_str:
    +        grp_str = grp_col_str + ","
    +        partition_str = "PARTITION BY " + grp_col_str
    +    else:
    +        grp_str = partition_str = ""
    +
    +    sql_st = """
    +        CREATE TABLE {table_out} AS
    +        SELECT *,
    +            tp::float8 / NULLIF(tp + fn, 0) AS tpr,
    +            tn::float8 / NULLIF(fp + tn, 0) AS tnr,
    +            tp::float8 / NULLIF(tp + fp, 0) AS ppv,
    +            tn::float8 / NULLIF(tn + fn, 0) AS npv,
    +            fp::float8 / NULLIF(fp + tn, 0) AS fpr,
    +            fp::float8 / NULLIF(fp + tp, 0) AS fdr,
    +            fn::float8 / NULLIF(fn + tp, 0) AS fnr,
    +            (tp + tn)::float8 / NULLIF(tp + tn + fp + fn, 0) AS acc,
    +            tp * 2.0 / NULLIF(2.0 * tp + fp + fn, 0) AS f1
    +        FROM (
    +            SELECT
    +                {grp_str}
    +                threshold,
    +                sum(t) OVER ({partition_str} ORDER BY threshold DESC) AS tp,
    +                sum(f) OVER ({partition_str} ORDER BY threshold DESC) AS fp,
    +                sum(t) OVER ({partition_str}) -
    +                    sum(t) OVER ({partition_str} ORDER BY threshold DESC) AS fn,
    +                sum(f) OVER ({partition_str}) -
    +                    sum(f) OVER ({partition_str} ORDER BY threshold DESC) AS tn
    +            FROM (
    +                SELECT {grp_str}
    +                       {pred_col} AS threshold,
    +                       sum({obs_col}) AS t,
    +                       count(*) - sum({obs_col}) AS f
    +                FROM {table_in}
    +                GROUP BY {grp_str}
    +                         threshold
    +            ) x
    +        ) y
    +        """.format(**locals())
    +    plpy.execute(sql_st)
    +
    +
    +def binary_classifier_help(schema_madlib, message, **kwargs):
    +    if not message:
    +        help_string = """
    +------------------------------------------------------------
    +                        SUMMARY
    +------------------------------------------------------------
    +Functionality: Metrics for binary classification
    +
    +This function returns an output table with a number of metrics commonly used to
    +evaluate binary classification.
    +
    +List of the various metrics output by the function:
    +
    +- tp is the count of correctly-classified positives
    +- tn is the count of correctly-classified negatives
    +- fp is the count of misclassified negatives
    +- fn is the count of misclassified positives
    +- tpr = tp / (tp + fn)
    +- tnr = tn / (fp + tn)
    +- ppv = tp / (tp + fp)
    +- npv = tn / (tn + fn)
    +- fpr = fp / (fp + tn)
    +- fdr = 1 - ppv
    +- fnr = fn / (fn + tp)
    +- acc = (tp + tn) / (tp + tn + fp + fn)
    +- f1 = 2 * tp / (2 * tp + fp + fn)
    +    """
    +    elif message.lower().strip() in ['usage', 'help', '?']:
    +        help_string = """
    +------------------------------------------------------------
    +                        USAGE
    +------------------------------------------------------------
    +SELECT {schema_madlib}.binary_classifier(
    +    'table_in',        -- Name of the input table
    +    'table_out',       -- Table name to store the metric results
    +    'pred_col',        -- Column name containing prediction results
    +    'obs_col',         -- Column name containing observed (actual) values
    +    'grouping_cols'    -- Comma-separated list of columns to use as group-by
    +);
    +    """
    +    else:
    +        help_string = "No such option. Use 
{schema_madlib}.binary_classifier('usage')"
    +    return help_string.format(**locals())
    +
    +
    +def area_under_roc(table_in, table_out, pred_col, obs_col, grp_col_str=None):
    +    """ Get area under ROC curve for a binary classifier.
    +
    +    The 'obs_col' column in 'table_in' is assumed to be an int/float column with
    +    two levels: 0 and 1, where 1 is considered positive and 0 is negative.
    +
    +    The 'pred_col' is assumed to be a float column that gives the probability of
    +    the prediction being positive.
    +
    +    """
    +    grp_cols = _parse_grp_col_str(grp_col_str)
    +    _validate_args(table_in, table_out, grp_cols + [pred_col, obs_col])
    +    if grp_col_str:
    +        grp_str = grp_col_str + ","
    +        grp_by_str = "GROUP BY " + grp_col_str
    +        partition_str = "PARTITION BY " + grp_col_str
    +    else:
    +        grp_str = grp_by_str = partition_str = ""
    +
    +    sql_st = """
    +    CREATE TABLE {table_out} AS
    +    SELECT {grp_str}
    +           sum((tpr + prev_tpr) * (fpr - prev_fpr) * 0.5) AS area_under_roc
    +    FROM (
    +      SELECT {grp_str}
    +             tpr, fpr,
    +             coalesce(lag(tpr) OVER ({partition_str} ORDER BY threshold DESC),
    +                      0) AS prev_tpr,
    +             coalesce(lag(fpr) OVER ({partition_str} ORDER BY threshold DESC),
    +                      0) AS prev_fpr
    +      FROM(
    +        SELECT {grp_str}
    +               threshold,
    +               sum(t) OVER ({partition_str} ORDER BY threshold DESC) *
    +                    1.0/ NULLIF(sum(t) OVER ({partition_str}), 0) AS tpr,
    +               sum(f) OVER ({partition_str} ORDER BY threshold DESC) *
    +                    1.0/NULLIF(sum(f) OVER ({partition_str}),0) AS fpr
    +        FROM (
    +          SELECT {grp_str}
    +                 {pred_col} AS threshold,
    +                 sum({obs_col}) AS t,
    --- End diff ---
    
    Let's add a `::int` inside the sum. That way we can also support `obs_col` as a boolean (which would be nice considering this is binary classification).
