This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit fa57c4fb40ce71f104449ec8356a3e0d53f97426
Author: Nikhil Kak <n...@vmware.com>
AuthorDate: Tue Feb 20 17:10:46 2024 -0800

    PMML: Improve dev-check tests for random forest
    
    JIRA: MADLIB-1517
    
    This commit adds a few more random forest pmml tests that compare
    forest_predict's output with pypmml's output
---
 .../postgres/modules/pmml/test/pmml_rf.sql_in      | 72 ++++++++++++++++++++--
 1 file changed, 68 insertions(+), 4 deletions(-)

diff --git a/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in b/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in
index b91db5d4..805b977a 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in
@@ -1,3 +1,10 @@
+\i m4_regexp(MADLIB_LIBRARY_PATH,
+             `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.sql_in'
+)
+
+m4_changequote(`<!'', `!>'')
+
 DROP TABLE IF EXISTS dt_golf;
 CREATE TABLE dt_golf (
     id integer NOT NULL,
@@ -6,7 +13,7 @@ CREATE TABLE dt_golf (
     humidity double precision,
     windy text,
     class text
-) ;
+);
 
 INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
 (1, 'sunny', 85, 85, 'false', 'Don''t Play'),
@@ -33,7 +40,7 @@ SELECT forest_train('dt_golf'::text,         -- source table
                          'humidity, windy'::text,   -- features
                          NULL::text,        -- exclude columns
                          'class'::text,      -- grouping
-                         5::integer,     -- num_trees
+                         3::integer,     -- num_trees
                          1::integer,        -- num_random_features
                          FALSE,       -- importance
                          1::integer,        -- num_permutations
@@ -43,9 +50,51 @@ SELECT forest_train('dt_golf'::text,         -- source table
                          3::integer        -- number of bins per continuous variable
                          );
 
-SELECT pmml('train_output');
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_temperature::double precision','predicted_(temperature::double precision)_pmml_prediction');
+
 -------------------------------------------------------------------------
 
+-- classification, no grouping
+DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
+SELECT forest_train('dt_golf'::text,         -- source table
+                         'train_output'::text,    -- output model table
+                         'id'::text,              -- id column
+                         '"OUTLOOK"'::text,           -- response
+                         'humidity, windy'::text,   -- features
+                         NULL::text,        -- exclude columns
+                         NULL,      -- grouping
+                         5::integer,     -- num_trees
+                         1::integer,        -- num_random_features
+                         FALSE,       -- importance
+                         1::integer,        -- num_permutations
+                         5::integer,        -- max depth
+                         1::integer,        -- min split
+                         1::integer,         -- min bucket
+                         3::integer        -- number of bins per continuous variable
+                         );
+
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_OUTLOOK','predicted_OUTLOOK_pmml_prediction');
+
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'prob');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_overcast','probability_overcast');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_rain','probability_rain');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_sunny','probability_sunny');
+
+
 -- classification, grouping
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
 SELECT forest_train('dt_golf'::text,         -- source table
@@ -65,6 +114,21 @@ SELECT forest_train('dt_golf'::text,         -- source table
                          3::integer        -- number of bins per continuous variable
                          );
 
-SELECT pmml('train_output');
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_OUTLOOK','predicted_OUTLOOK_pmml_prediction');
+
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'prob');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_overcast','probability_overcast');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_rain','probability_rain');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_sunny','probability_sunny');
+
 -------------------------------------------------------------------------
 

Reply via email to