[
https://issues.apache.org/jira/browse/MADLIB-1254?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Frank McQuillan updated MADLIB-1254:
------------------------------------
Comment: was deleted
(was: For RF:
{code}
-[ RECORD 1
]-----------+-----------------------------------------------------------------------------------------------
gid | 1
class | Don't Play
success | t
cat_n_levels | {2,2}
cat_levels_in_text | {c,a,True,False}
oob_error | 72.8680555555556
oob_var_importance | {0,0,0,11.28,0}
impurity_var_importance |
{13.3333332067,0,23.33333295978,26.66666536564,16.66666634414}
-[ RECORD 2
]-----------+-----------------------------------------------------------------------------------------------
gid | 2
class | Play
success | t
cat_n_levels | {2,2,2}
cat_levels_in_text | {c,a,b,d,False,True}
oob_error | 27.6126543209877
oob_var_importance |
{0,0.682,0.682,11.618,14.3408287037037,0.485124999999998}
impurity_var_importance |
{1.75749418342542,6.49425472528,6.49425472528,17.0265458048514,40.89746752486,27.329981237604}
{code}
)
> RF/DT: Grouping might give incorrect results if 1 group eliminates a
> categorical variable
> -----------------------------------------------------------------------------------------
>
> Key: MADLIB-1254
> URL: https://issues.apache.org/jira/browse/MADLIB-1254
> Project: Apache MADlib
> Issue Type: Bug
> Components: Module: Decision Tree
> Reporter: Rahul Iyer
> Priority: Major
> Fix For: v1.15
>
>
> If {{forest_train}} is run with grouping enabled and one of the groups has
> a categorical feature with just a single level, then that categorical feature
> is eliminated for that group. If other groups retain the feature, then the
> {{impurity_var_importance}} output is incorrect for the group in question.
> There could be other ramifications related to this as well.
> {code:java}
> DROP TABLE IF EXISTS dt_golf CASCADE;
> CREATE TABLE dt_golf (
> id integer NOT NULL,
> "OUTLOOK" text,
> temperature double precision,
> humidity double precision,
> "Cont_features" double precision[],
> cat_features text[],
> windy boolean,
> class text
> ) ;
> INSERT INTO dt_golf
> (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy,class)
> VALUES
> (1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
> (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
> (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
> (4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
> (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
> (6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
> (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
> (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
> (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
> (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
> (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
> (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
> (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
> (15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
> (16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
> (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
> DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group,
> train_output_poisson_count;
> SELECT forest_train(
> 'dt_golf', -- source table
> 'train_output', -- output model table
> 'id', -- id column
> 'temperature::double precision', -- response
> 'humidity, cat_features, windy, "Cont_features"', -- features
> NULL, -- exclude columns
> 'class', -- grouping
> 5, -- num of trees
> NULL, -- num of random features
> TRUE, -- importance
> 20, -- num_permutations
> 10, -- max depth
> 1, -- min split
> 1, -- min bucket
> 3, -- number of bins per continuous variable
> 'max_surrogates = 2 ',
> FALSE
> );
> \x on
> SELECT * from train_output_summary;
> SELECT * from train_output_group;
> {code}
> Results:
> {code:java}
> SELECT * from train_output_group;
> -[ RECORD 1 ]-----------+-----------------------------------------------------------------------------
> gid | 1
> class | Don't Play
> success | t
> cat_n_levels | {2,2,2}
> cat_levels_in_text | {c,a,True,False,c,a}
> oob_error | 92.5335905349795
> oob_var_importance | {10.725,10.725,10.725,7.605,10.725,0}
> impurity_var_importance | {8.33148348160485,0,0,19.9999998625892,19.9999998625892,11.6685163809844}
> -[ RECORD 2 ]-----------+-----------------------------------------------------------------------------
> gid | 2
> class | Play
> success | t
> cat_n_levels | {2,2}
> cat_levels_in_text | {b,d,False,True}
> oob_error | 43.0244073645405
> oob_var_importance | {1.06581410364015e-15,1.06581410364015e-15,2.1326171875,16.019375,10.570875}
> impurity_var_importance | {0,0,0,37.8304000437732,38.4881698525677,23.6814277291654}
> {code}
> Note that the {{impurity_var_importance}} for {{gid=2}} has length 6 while
> the {{oob_var_importance}} correctly has length 5.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)