This is an automated email from the ASF dual-hosted git repository. nkak pushed a commit to branch madlib2-master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit fa57c4fb40ce71f104449ec8356a3e0d53f97426 Author: Nikhil Kak <n...@vmware.com> AuthorDate: Tue Feb 20 17:10:46 2024 -0800 PMML: Improve dev-check tests for random forest JIRA: MADLIB-1517 This commit adds a few more random forest pmml tests that compare forest_predict's output with pypmml's output --- .../postgres/modules/pmml/test/pmml_rf.sql_in | 72 ++++++++++++++++++++-- 1 file changed, 68 insertions(+), 4 deletions(-) diff --git a/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in b/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in index b91db5d4..805b977a 100644 --- a/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in +++ b/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in @@ -1,3 +1,10 @@ +\i m4_regexp(MADLIB_LIBRARY_PATH, + `\(.*\)/lib', + `\1/../modules/pmml/test/pmml.setup.sql_in' +) + +m4_changequote(`<!'', `!>'') + DROP TABLE IF EXISTS dt_golf; CREATE TABLE dt_golf ( id integer NOT NULL, @@ -6,7 +13,7 @@ CREATE TABLE dt_golf ( humidity double precision, windy text, class text -) ; +); INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES (1, 'sunny', 85, 85, 'false', 'Don''t Play'), @@ -33,7 +40,7 @@ SELECT forest_train('dt_golf'::text, -- source table 'humidity, windy'::text, -- features NULL::text, -- exclude columns 'class'::text, -- grouping - 5::integer, -- num_trees + 3::integer, -- num_trees 1::integer, -- num_random_features FALSE, -- importance 1::integer, -- num_permutations @@ -43,9 +50,51 @@ SELECT forest_train('dt_golf'::text, -- source table 3::integer -- number of bins per continuous variable ); -SELECT pmml('train_output'); +DROP TABLE IF EXISTS forest_predict_output; +SELECT forest_predict('train_output', + 'dt_golf', + 'forest_predict_output', + 'response'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_temperature::double precision','predicted_(temperature::double precision)_pmml_prediction'); + ------------------------------------------------------------------------- +-- classification, no grouping +DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group; +SELECT forest_train('dt_golf'::text, -- source table + 'train_output'::text, -- output model table + 'id'::text, -- id column + '"OUTLOOK"'::text, -- response + 'humidity, windy'::text, -- features + NULL::text, -- exclude columns + NULL, -- grouping + 5::integer, -- num_trees + 1::integer, -- num_random_features + FALSE, -- importance + 1::integer, -- num_permutations + 5::integer, -- max depth + 1::integer, -- min split + 1::integer, -- min bucket + 3::integer -- number of bins per continuous variable + ); + +DROP TABLE IF EXISTS forest_predict_output; +SELECT forest_predict('train_output', + 'dt_golf', + 'forest_predict_output', + 'response'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_OUTLOOK','predicted_OUTLOOK_pmml_prediction'); + +DROP TABLE IF EXISTS forest_predict_output; +SELECT forest_predict('train_output', + 'dt_golf', + 'forest_predict_output', + 'prob'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_overcast','probability_overcast'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_rain','probability_rain'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_sunny','probability_sunny'); + + -- classification, grouping DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group; SELECT forest_train('dt_golf'::text, -- source table @@ -65,6 +114,21 @@ SELECT forest_train('dt_golf'::text, -- source table 3::integer -- number of bins per continuous variable ); -SELECT pmml('train_output'); +DROP TABLE IF EXISTS forest_predict_output; +SELECT forest_predict('train_output', + 'dt_golf', + 'forest_predict_output', + 'response'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_OUTLOOK','predicted_OUTLOOK_pmml_prediction'); + +DROP TABLE IF EXISTS forest_predict_output; +SELECT forest_predict('train_output', + 'dt_golf', + 'forest_predict_output', + 'prob'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_overcast','probability_overcast'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_rain','probability_rain'); +SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_sunny','probability_sunny'); + -------------------------------------------------------------------------