[
https://issues.apache.org/jira/browse/MADLIB-1218?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Frank McQuillan updated MADLIB-1218:
------------------------------------
Description:
{code}
DROP TABLE IF EXISTS mt_cars;
CREATE TABLE mt_cars (
id integer NOT NULL,
mpg double precision,
cyl integer,
disp double precision,
hp integer,
drat double precision,
wt double precision,
qsec double precision,
vs integer,
am integer,
gear integer,
carb integer
);
INSERT INTO mt_cars VALUES
(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
(9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
(18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
{code}
RF:
{code}
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group,
mt_cars_output_summary;
SELECT madlib.forest_train('mt_cars',
'mt_cars_output',
'id',
'mpg',
'*',
'id, hp, drat, am, gear, carb', -- exclude columns
'am', -- grouping
10::integer,
2::integer,
TRUE::boolean,
1,
10,
8,
3,
10
);
SELECT * FROM mt_cars_output_summary;
{code}
produces
{code}
-[ RECORD 1
]---------+-----------------------------------------------------------------------
method | forest_train
is_classification | f
source_table | mt_cars
model_table | mt_cars_output
id_col_name | id
dependent_varname | mpg
independent_varnames | vs,cyl,disp,qsec,wt
cat_features | vs,cyl
con_features | disp,qsec,wt
grouping_cols | am
num_trees | 10
num_random_features | 2
max_tree_depth | 10
min_split | 8
min_bucket | 3
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 2
num_failed_groups | 0
total_rows_processed | 32
total_rows_skipped | 0
dependent_var_levels |
dependent_var_type | double precision
independent_var_types | integer, integer, double precision, double precision,
double precision
null_proxy | None
{code}
{code}
SELECT * FROM mt_cars_output_group;
{code}
produces
{code}
-[ RECORD 1 ]------+--------------------------------------
gid | 1
am | 0
success | t
cat_n_levels | {3,3}
cat_levels_in_text | {4,6,8,4,6,8}
oob_error | 10.4384609425616
cat_var_importance | {1.77200744569228,3.23893136930339}
con_var_importance | {6.52806304913008,0,2.71255870662522}
-[ RECORD 2 ]------+--------------------------------------
gid | 2
am | 1
success | t
cat_n_levels | {2,2}
cat_levels_in_text | {0,1,0,1}
oob_error | 22.3349978333876
cat_var_importance | {4.59597959183674,0}
con_var_importance | {5.715875,0,18.4067344897959}
{code}
In the above, cat_n_levels and cat_levels_in_text are wrong. I would expect
{0,1,4,6,8} to be reported for both groups.
Compare the above with the case of no groups:
{code}
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group,
mt_cars_output_summary;
SELECT madlib.forest_train('mt_cars',
'mt_cars_output',
'id',
'mpg',
'*',
'id, hp, drat, am, gear, carb', -- exclude columns
NULL, -- no groups
10::integer,
2::integer,
TRUE::boolean,
1,
10,
8,
3,
10
);
SELECT * FROM mt_cars_output_summary;
{code}
produces
{code}
[ RECORD 1
]---------+-----------------------------------------------------------------------
method | forest_train
is_classification | f
source_table | mt_cars
model_table | mt_cars_output
id_col_name | id
dependent_varname | mpg
independent_varnames | vs,cyl,disp,qsec,wt
cat_features | vs,cyl
con_features | disp,qsec,wt
grouping_cols |
num_trees | 10
num_random_features | 2
max_tree_depth | 10
min_split | 8
min_bucket | 3
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 32
total_rows_skipped | 0
dependent_var_levels |
dependent_var_type | double precision
independent_var_types | integer, integer, double precision, double precision,
double precision
null_proxy | None
{code}
{code}
SELECT * FROM mt_cars_output_group;
{code}
produces
{code}
gid | 1
success | t
cat_n_levels | {2,3}
cat_levels_in_text | {0,1,4,6,8}
oob_error | 9.90200731761036
cat_var_importance | {0.390353827160495,12.8836927689546}
con_var_importance | {0.118665925451092,0.947886524555693,19.910665401879}
{code}
which seems OK.
was:
{code}
DROP TABLE IF EXISTS mt_cars;
CREATE TABLE mt_cars (
id integer NOT NULL,
mpg double precision,
cyl integer,
disp double precision,
hp integer,
drat double precision,
wt double precision,
qsec double precision,
vs integer,
am integer,
gear integer,
carb integer
);
INSERT INTO mt_cars VALUES
(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
(9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
(18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
{code}
RF:
{code}
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group,
mt_cars_output_summary;
SELECT madlib.forest_train('mt_cars',
'mt_cars_output',
'id',
'mpg',
'*',
'id, hp, drat, am, gear, carb', -- exclude columns
'am', -- grouping
10::integer,
2::integer,
TRUE::boolean,
1,
10,
8,
3,
10
);
SELECT * FROM mt_cars_output_summary;
{code}
produces
{code}
-[ RECORD 1
]---------+-----------------------------------------------------------------------
method | forest_train
is_classification | f
source_table | mt_cars
model_table | mt_cars_output
id_col_name | id
dependent_varname | mpg
independent_varnames | vs,cyl,disp,qsec,wt
cat_features | vs,cyl
con_features | disp,qsec,wt
grouping_cols | am
num_trees | 10
num_random_features | 2
max_tree_depth | 10
min_split | 8
min_bucket | 3
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 2
num_failed_groups | 0
total_rows_processed | 32
total_rows_skipped | 0
dependent_var_levels |
dependent_var_type | double precision
independent_var_types | integer, integer, double precision, double precision,
double precision
null_proxy | None
{code}
{code}
SELECT * FROM mt_cars_output_group;
{code}
produces
{code}
-[ RECORD 1 ]------+--------------------------------------
gid | 1
am | 0
success | t
cat_n_levels | {3,3}
cat_levels_in_text | {4,6,8,4,6,8}
oob_error | 10.4384609425616
cat_var_importance | {1.77200744569228,3.23893136930339}
con_var_importance | {6.52806304913008,0,2.71255870662522}
-[ RECORD 2 ]------+--------------------------------------
gid | 2
am | 1
success | t
cat_n_levels | {2,2}
cat_levels_in_text | {0,1,0,1}
oob_error | 22.3349978333876
cat_var_importance | {4.59597959183674,0}
con_var_importance | {5.715875,0,18.4067344897959}
{code}
In the above, cat_n_levels and cat_levels_in_text are wrong. I would expect
{0,1,4,6,8} to be reported for both groups.
Compare the above with the case of no groups:
{code}
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group,
mt_cars_output_summary;
SELECT madlib.forest_train('mt_cars',
'mt_cars_output',
'id',
'mpg',
'*',
'id, hp, drat, am, gear, carb', -- exclude columns
NULL, -- no groups
10::integer,
2::integer,
TRUE::boolean,
1,
10,
8,
3,
10
);
SELECT * FROM mt_cars_output_summary;
{code}
produces
{code}
[ RECORD 1
]---------+-----------------------------------------------------------------------
method | forest_train
is_classification | f
source_table | mt_cars
model_table | mt_cars_output
id_col_name | id
dependent_varname | mpg
independent_varnames | vs,cyl,disp,qsec,wt
cat_features | vs,cyl
con_features | disp,qsec,wt
grouping_cols |
num_trees | 10
num_random_features | 2
max_tree_depth | 10
min_split | 8
min_bucket | 3
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 32
total_rows_skipped | 0
dependent_var_levels |
dependent_var_type | double precision
independent_var_types | integer, integer, double precision, double precision,
double precision
null_proxy | None
{code}
{code}
SELECT * FROM mt_cars_output_group;
{code}
produces
{code}
[ RECORD 1 ]------+------------------------------------------------------
gid | 1
success | t
cat_n_levels | {2,3}
cat_levels_in_text | {0,1,4,6,8}
oob_error | 9.90200731761036
cat_var_importance | {0.390353827160495,12.8836927689546}
con_var_importance | {0.118665925451092,0.947886524555693,19.910665401879}
{code}
which seems OK.
> RF: categorical levels reported incorrectly when grouping is used
> ------------------------------------------------------------------
>
> Key: MADLIB-1218
> URL: https://issues.apache.org/jira/browse/MADLIB-1218
> Project: Apache MADlib
> Issue Type: Bug
> Components: Module: Random Forest
> Reporter: Frank McQuillan
> Assignee: Rahul Iyer
> Priority: Minor
> Fix For: v1.14
>
>
> {code}
> DROP TABLE IF EXISTS mt_cars;
> CREATE TABLE mt_cars (
> id integer NOT NULL,
> mpg double precision,
> cyl integer,
> disp double precision,
> hp integer,
> drat double precision,
> wt double precision,
> qsec double precision,
> vs integer,
> am integer,
> gear integer,
> carb integer
> );
> INSERT INTO mt_cars VALUES
> (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
> (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
> (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
> (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
> (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
> (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
> (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
> (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
> (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
> (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
> (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
> (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
> (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
> (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
> (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
> (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
> (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
> (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
> (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
> (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
> (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
> (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
> (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
> (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
> (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
> (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
> (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
> (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
> (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
> (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
> (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
> (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
> {code}
> RF:
> {code}
> DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group,
> mt_cars_output_summary;
> SELECT madlib.forest_train('mt_cars',
> 'mt_cars_output',
> 'id',
> 'mpg',
> '*',
> 'id, hp, drat, am, gear, carb', -- exclude columns
> 'am', -- grouping
> 10::integer,
> 2::integer,
> TRUE::boolean,
> 1,
> 10,
> 8,
> 3,
> 10
> );
> SELECT * FROM mt_cars_output_summary;
> {code}
> produces
> {code}
> -[ RECORD 1
> ]---------+-----------------------------------------------------------------------
> method | forest_train
> is_classification | f
> source_table | mt_cars
> model_table | mt_cars_output
> id_col_name | id
> dependent_varname | mpg
> independent_varnames | vs,cyl,disp,qsec,wt
> cat_features | vs,cyl
> con_features | disp,qsec,wt
> grouping_cols | am
> num_trees | 10
> num_random_features | 2
> max_tree_depth | 10
> min_split | 8
> min_bucket | 3
> num_splits | 10
> verbose | f
> importance | t
> num_permutations | 1
> num_all_groups | 2
> num_failed_groups | 0
> total_rows_processed | 32
> total_rows_skipped | 0
> dependent_var_levels |
> dependent_var_type | double precision
> independent_var_types | integer, integer, double precision, double precision,
> double precision
> null_proxy | None
> {code}
> {code}
> SELECT * FROM mt_cars_output_group;
> {code}
> produces
> {code}
> -[ RECORD 1 ]------+--------------------------------------
> gid | 1
> am | 0
> success | t
> cat_n_levels | {3,3}
> cat_levels_in_text | {4,6,8,4,6,8}
> oob_error | 10.4384609425616
> cat_var_importance | {1.77200744569228,3.23893136930339}
> con_var_importance | {6.52806304913008,0,2.71255870662522}
> -[ RECORD 2 ]------+--------------------------------------
> gid | 2
> am | 1
> success | t
> cat_n_levels | {2,2}
> cat_levels_in_text | {0,1,0,1}
> oob_error | 22.3349978333876
> cat_var_importance | {4.59597959183674,0}
> con_var_importance | {5.715875,0,18.4067344897959}
> {code}
> In the above, cat_n_levels and cat_levels_in_text are wrong. I would expect
> {0,1,4,6,8} to be reported for both groups.
> Compare the above with the case of no groups:
> {code}
> DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group,
> mt_cars_output_summary;
> SELECT madlib.forest_train('mt_cars',
> 'mt_cars_output',
> 'id',
> 'mpg',
> '*',
> 'id, hp, drat, am, gear, carb', -- exclude columns
> NULL, -- no groups
> 10::integer,
> 2::integer,
> TRUE::boolean,
> 1,
> 10,
> 8,
> 3,
> 10
> );
> SELECT * FROM mt_cars_output_summary;
> {code}
> produces
> {code}
> [ RECORD 1
> ]---------+-----------------------------------------------------------------------
> method | forest_train
> is_classification | f
> source_table | mt_cars
> model_table | mt_cars_output
> id_col_name | id
> dependent_varname | mpg
> independent_varnames | vs,cyl,disp,qsec,wt
> cat_features | vs,cyl
> con_features | disp,qsec,wt
> grouping_cols |
> num_trees | 10
> num_random_features | 2
> max_tree_depth | 10
> min_split | 8
> min_bucket | 3
> num_splits | 10
> verbose | f
> importance | t
> num_permutations | 1
> num_all_groups | 1
> num_failed_groups | 0
> total_rows_processed | 32
> total_rows_skipped | 0
> dependent_var_levels |
> dependent_var_type | double precision
> independent_var_types | integer, integer, double precision, double precision,
> double precision
> null_proxy | None
> {code}
> {code}
> SELECT * FROM mt_cars_output_group;
> {code}
> produces
> {code}
> gid | 1
> success | t
> cat_n_levels | {2,3}
> cat_levels_in_text | {0,1,4,6,8}
> oob_error | 9.90200731761036
> cat_var_importance | {0.390353827160495,12.8836927689546}
> con_var_importance | {0.118665925451092,0.947886524555693,19.910665401879}
> {code}
> which seems OK.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)