This is an automated email from the ASF dual-hosted git repository. nkak pushed a commit to branch madlib2-master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 0c1cd4ff9387eb563fecacb0a5d4bb2fdc96ce16
Author: Nikhil Kak <n...@vmware.com>
AuthorDate: Thu Feb 22 16:05:43 2024 -0800

    PMML: Add tests for intercept acting as a predictor

    JIRA: MADLIB-1517

    A previous commit 0cd28f9733927d63beaefc9488db7f8bfdb3bd80 made changes to
    the pmml code so that the intercept won't be used as a predictor. However,
    it's still possible that this assumption may not be true in some scenarios
    and the intercept might still be treated as a predictor in the pmml.

    For example, consider this scenario: while using any of the regression
    algorithms, the user passes the independent variable as "ARRAY[x1,1,x2]"
    or "ARRAY[x1,x2,1]" instead of "ARRAY[1,x1,x2]". In this scenario, the
    pmml code will assume that there isn't an intercept in this expression and
    will treat "1" as a predictor. When predicting using this pmml, users will
    need to create a column/field named "1" which has the value 1 for each
    data row.

    The test added in this commit mimics this scenario.
---
 .../pmml/test/pmml_intercept_as_predictor.sql_in | 84 ++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in b/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in
new file mode 100644
index 00000000..dc0e28bd
--- /dev/null
+++ b/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in
@@ -0,0 +1,84 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+ \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+ `\1/../modules/pmml/test/pmml.setup.sql_in'
+)
+
+m4_changequote(`<!'', `!>'')
+
+-- This file covers scenarios in which the intercept may still be exported
+-- to PMML as an ordinary predictor: the constant 1 is part of the
+-- independent-variable array but is not its first element. Scoring such a
+-- model needs an input field literally named "1" that holds 1 for every
+-- row, so the tables handed to test_pmml_output carry that extra column.
+
+-------------------------------- logistic ---------------------------------
+
+-- "Patients" plus a constant column named "1". Shared by both logistic
+-- cases below. Dropped first so the script can be rerun in one database.
+DROP TABLE IF EXISTS patients_with_1;
+CREATE TABLE patients_with_1 AS
+SELECT
+    1 AS "1",
+    *
+FROM "Patients";
+
+-- Case 1: the constant 1 sits in the middle of the array.
+DROP TABLE IF EXISTS logregr_model, logregr_model_summary;
+SELECT logregr_train(
+    '"Patients"',
+    'logregr_model',
+    '"Second_attack"',
+    'ARRAY["Treatment", 1, trait_anxiety]',
+    NULL,
+    20,
+    'irls'
+);
+
+-- Predicted class computed in-database; the model table is cross joined
+-- with every patient row.
+DROP TABLE IF EXISTS logregr_predict_output;
+CREATE TABLE logregr_predict_output AS
+SELECT
+    id,
+    logregr_predict(coef, ARRAY["Treatment", 1, trait_anxiety]) AS logregr_predict
+FROM logregr_model
+CROSS JOIN "Patients";
+SELECT test_pmml_output(
+    'patients_with_1',
+    'logregr_model',
+    'logregr_predict_output',
+    'id',
+    'logregr_predict',
+    'predicted_Second_attack_pmml_prediction'
+);
+
+-- Same model, this time checking the predicted probability.
+DROP TABLE IF EXISTS logregr_predict_output;
+CREATE TABLE logregr_predict_output AS
+SELECT
+    id,
+    logregr_predict_prob(coef, ARRAY["Treatment", 1, trait_anxiety]) AS logregr_predict_prob
+FROM logregr_model
+CROSS JOIN "Patients";
+SELECT test_pmml_output(
+    'patients_with_1',
+    'logregr_model',
+    'logregr_predict_output',
+    'id',
+    'logregr_predict_prob',
+    'probability_true'
+);
+
+-- Case 2: the constant 1 is the last element of the array.
+DROP TABLE IF EXISTS logregr_model, logregr_model_summary;
+SELECT logregr_train(
+    '"Patients"',
+    'logregr_model',
+    '"Second_attack"',
+    'ARRAY["Treatment", trait_anxiety, 1]',
+    NULL,
+    20,
+    'irls'
+);
+
+DROP TABLE IF EXISTS logregr_predict_output;
+CREATE TABLE logregr_predict_output AS
+SELECT
+    id,
+    logregr_predict(coef, ARRAY["Treatment", trait_anxiety, 1]) AS logregr_predict
+FROM logregr_model
+CROSS JOIN "Patients";
+SELECT test_pmml_output(
+    'patients_with_1',
+    'logregr_model',
+    'logregr_predict_output',
+    'id',
+    'logregr_predict',
+    'predicted_Second_attack_pmml_prediction'
+);
+
+DROP TABLE IF EXISTS logregr_predict_output;
+CREATE TABLE logregr_predict_output AS
+SELECT
+    id,
+    logregr_predict_prob(coef, ARRAY["Treatment", trait_anxiety, 1]) AS logregr_predict_prob
+FROM logregr_model
+CROSS JOIN "Patients";
+SELECT test_pmml_output(
+    'patients_with_1',
+    'logregr_model',
+    'logregr_predict_output',
+    'id',
+    'logregr_predict_prob',
+    'probability_true'
+);
+
+----------------------------------- glm -----------------------------------
+
+-- NOTE(review): unlike the logistic cases, the constant here is written as
+-- 1.0 and is the first array element. Confirm that this is the intended
+-- intercept placement for this scenario.
+DROP TABLE IF EXISTS glm_model, glm_model_summary;
+SELECT glm(
+    'abalone',
+    'glm_model',
+    'rings',
+    'ARRAY[1.0, length, diameter, height, whole, shucked, viscera, shell]',
+    'family=gaussian, link=identity',
+    NULL,
+    'max_iter=1000, tolerance=1e-16'
+);
+
+-- abalone plus a constant column. The column is named "1" explicitly; an
+-- unaliased literal would otherwise be called "?column?" by PostgreSQL.
+DROP TABLE IF EXISTS abalone_with_1;
+CREATE TABLE abalone_with_1 AS
+SELECT
+    1 AS "1",
+    *
+FROM abalone;
+
+DROP TABLE IF EXISTS glm_predict_out;
+CREATE TABLE glm_predict_out AS
+SELECT
+    id,
+    glm_predict(
+        coef,
+        ARRAY[1, length, diameter, height, whole, shucked, viscera, shell],
+        'identity'
+    ) AS glm_predict
+FROM glm_model
+CROSS JOIN abalone_with_1;
+SELECT test_pmml_output(
+    'abalone_with_1',
+    'glm_model',
+    'glm_predict_out',
+    'id',
+    'glm_predict',
+    'predicted_rings_pmml_prediction'
+);