Repository: incubator-madlib Updated Branches: refs/heads/master f1aa9af62 -> 4264fa9d3
DT: Correctly encode unseen categorical features Changes applied in commit a2f4740 added an option to treat NULL values as a new category. This was implemented by changing the encoding process of categorical features to add a new value at the end of the list of values. The intention with the commit was to treat new unseen, non-null values equivalent to NULL. The process, however, still encoded the unseen categorical value as -1, which is interpreted as NULL in underlying functions. This commit updates this process to correctly use the last index as the encoding for the unseen/NULL value. Closes #171 Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/4264fa9d Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/4264fa9d Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/4264fa9d Branch: refs/heads/master Commit: 4264fa9d3bbfdb616364339c127373d7f0568e30 Parents: f1aa9af Author: Rahul Iyer <ri...@apache.org> Authored: Fri Aug 18 16:59:44 2017 -0700 Committer: Rahul Iyer <ri...@apache.org> Committed: Fri Aug 18 16:59:44 2017 -0700 ---------------------------------------------------------------------- .../recursive_partitioning/feature_encoding.cpp | 22 ++++-- .../recursive_partitioning/decision_tree.py_in | 7 +- .../recursive_partitioning/decision_tree.sql_in | 73 ++++++++++---------- .../test/decision_tree.sql_in | 2 +- 4 files changed, 59 insertions(+), 45 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/modules/recursive_partitioning/feature_encoding.cpp ---------------------------------------------------------------------- diff --git a/src/modules/recursive_partitioning/feature_encoding.cpp b/src/modules/recursive_partitioning/feature_encoding.cpp index 3b0a452..a5f131b 100644 --- a/src/modules/recursive_partitioning/feature_encoding.cpp +++ b/src/modules/recursive_partitioning/feature_encoding.cpp @@ -184,21 +184,31 @@ map_catlevel_to_int::run(AnyType &args){ ArrayHandle<text*> cat_values = args[0].getAs<ArrayHandle<text*> >(); ArrayHandle<text*> cat_levels = args[1].getAs<ArrayHandle<text*> >(); ArrayHandle<int> n_levels = args[2].getAs<ArrayHandle<int> >(); + bool null_as_category = args[3].getAs<bool>(); MutableArrayHandle<int> cat_int = allocateArray<int>(n_levels.size()); int pos = 0; for (size_t i = 0; i < n_levels.size(); i++) { // linear search to find a match - int match = -1; // if cat_values contains any not present in cat_levels, - // then the mapped integer is -1. If cat_values contains - // a known cat_level, then the mapped integer is - // the index of that value in cat_levels - for (int j = 0; j < n_levels[i]; j++) + + // if cat_values contains any not present in cat_levels, then the mapped + // integer is -1. If cat_values contains a known cat_level, then the + // mapped integer is the index of that value in cat_levels. + int match = -1; + for (int j = 0; j < n_levels[i]; j++){ if (cmp_text(cat_values[i], cat_levels[pos + j])) { match = j; break; } - cat_int[i] = match; + } + + // If null_as_category is True, then match is set to the last index + // instead of -1 since the last index is expected to represent NULL. + if (match == -1 and null_as_category){ + cat_int[i] = n_levels[i] - 1; + } else { + cat_int[i] = match; + } pos += static_cast<int>(n_levels[i]); } return cat_int; http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 35fc3f9..ce56ab0 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -1179,8 +1179,11 @@ def get_feature_str(schema_madlib, boolean_cats, "(coalesce({0}::text, '{1}'))::text".format(col, null_val)) cat_features_str = ("{0}._map_catlevel_to_int(array[" + - ", ".join(cat_features_cast) + "], {1}, {2})" - ).format(schema_madlib, levels_str, n_levels_str) + ", ".join(cat_features_cast) + "], {1}, {2}, {3})" + ).format(schema_madlib, + levels_str, + n_levels_str, + null_proxy is not None) else: cat_features_str = "NULL" http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in index 8290dcf..3632f2e 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in @@ -912,20 +912,20 @@ Result: (32 rows) </pre> -<h4>NULL handling example</h4> +<h4>NULL Handling Example</h4> -- Create toy example to illustrate null-as-category handling +-# Create toy example to illustrate null-as-category handling <pre class='example'> -drop table null_handling_example; -create table null_handling_example ( -id integer, -country text, -city text, -weather text, -response text +DROP TABLE IF EXISTS null_handling_example; +CREATE TABLE null_handling_example ( + id integer, + country text, + city text, + weather text, + response text ); -insert into null_handling_example values +INSERT INTO null_handling_example VALUES (1,null,null,null,'a'), (2,'US',null,null,'b'), (3,'US','NY',null,'c'), @@ -933,22 +933,22 @@ insert into null_handling_example values DROP TABLE IF EXISTS train_output, train_output_summary; SELECT madlib.tree_train('null_handling_example', -- source table - 'train_output', -- output model table - 'id', -- id column - 'response', -- dependent variable - 'country, city, weather', -- features - NULL, - 'gini', -- split criterion - NULL::text, -- no grouping - NULL::text, -- no weights - 4, -- max depth - 1, -- min split - 1, -- number of bins per continuous variable - 10, -- number of splits - NULL, -- pruning parameters - 'null_as_category=true' -- null handling + 'train_output', -- output model table + 'id', -- id column + 'response', -- dependent variable + 'country, weather, city', -- features + NULL, -- features to exclude + 'gini', -- split criterion + NULL::text, -- no grouping + NULL::text, -- no weights + 4, -- max depth + 1, -- min split + 1, -- number of bins per continuous variable + 10, -- number of splits + NULL, -- pruning parameters + 'null_as_category=true' -- null handling ); -select cat_levels_in_text, cat_n_levels from train_output; +SELECT cat_levels_in_text, cat_n_levels FROM train_output; </pre> <pre class='result'> cat_levels_in_text | cat_n_levels @@ -956,17 +956,17 @@ select cat_levels_in_text, cat_n_levels from train_output; {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2} </pre> -- Predict for previously not seen data by assuming NULL value as the default +-# Predict for previously not seen data by assuming NULL value as the default <pre class='example'> -drop table if exists table_test; -create table table_test ( -id integer, -country text, -city text, -weather text, -expected_response text +DROP TABLE IF EXISTS table_test; +CREATE TABLE table_test ( + id integer, + country text, + city text, + weather text, + expected_response text ); -insert into table_test values +INSERT INTO table_test VALUES (1,'IN','MUM','cloudy','a'), (2,'US','HOU','humid','b'), (3,'US','NY','sunny','c'), @@ -979,7 +979,7 @@ SELECT madlib.tree_predict('train_output', 'response'); SELECT s.id, expected_response, estimated_response FROM prediction_results p, table_test s -where s.id = p.id ORDER BY id; +WHERE s.id = p.id ORDER BY id; </pre> <pre class='result'> id | expected_response | estimated_response @@ -1229,7 +1229,8 @@ CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_entropy( CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._map_catlevel_to_int( cat_values_in_text TEXT[], -- categorical variable value from each row cat_levels_in_text TEXT[], -- all levels in text - cat_n_levels INTEGER[] -- number of levels for each categorical variable + cat_n_levels INTEGER[], -- number of levels for each categorical variable + null_as_category BOOLEAN -- flag to check if NULL is treated as a separate category ) RETURNS INTEGER[] AS 'MODULE_PATHNAME', 'map_catlevel_to_int' LANGUAGE c IMMUTABLE http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in index 93b9a52..e585379 100644 --- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in @@ -43,7 +43,7 @@ SELECT * FROM dummy_splits; -- cat encoding SELECT assert( - relative_error(_map_catlevel_to_int('{B}', '{A,B}', ARRAY[2]), ARRAY[1]) < 1e-6, + relative_error(_map_catlevel_to_int('{B}', '{A,B}', ARRAY[2], TRUE), ARRAY[1]) < 1e-6, 'wrong results in _map_catlevel_to_int()') ;