Rahul Iyer created MADLIB-1236:
----------------------------------

             Summary: DT: tree_predict fails if a categorical variable has been 
discarded
                 Key: MADLIB-1236
                 URL: https://issues.apache.org/jira/browse/MADLIB-1236
             Project: Apache MADlib
          Issue Type: Task
          Components: Module: Decision Tree
            Reporter: Rahul Iyer
             Fix For: v1.15


{{tree_predict}} fails if {{tree_train}} removed a categorical variable 
(possibly due to the presence of only a single level). This is because the 
summary table incorrectly still lists the discarded categorical variable, 
leading {{tree_predict}} to map the levels of that variable using a pre-built 
map. This "mapping" fails because {{tree_train}} does not include the variable 
in this pre-built map. 

Repro steps with output given below. 

{code}

-- Repro script for MADLIB-1236: train a decision tree on dt_golf, then call
-- tree_predict on dt_golf2 (the training rows plus one extra row).

-- Remove objects left over from an earlier run so the script is re-runnable.
DROP TABLE IF EXISTS dt_golf CASCADE;
DROP TABLE IF EXISTS dt_golf2;
-- NOTE(review): tree_train is documented to create <output>_summary next to
-- the model table; confirm the names against the installed MADlib version.
DROP TABLE IF EXISTS train_output, train_output_summary;
DROP TABLE IF EXISTS predict_output;

CREATE TABLE dt_golf (
    id integer NOT NULL,
    "OUTLOOK" text,
    temperature double precision,
    humidity double precision,
    "Cont_features" double precision[],
    cat_features text[],
    windy boolean,
    class text
) ;

-- Training data. The first element of cat_features is 'a' in every row, i.e.
-- it has a single level (presumably the categorical variable that tree_train
-- discards -- confirm against the generated summary table).
INSERT INTO dt_golf
(id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy,class)
VALUES
(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
(16, 'overcast', 80, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
(17, 'overcast', 60, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
(18, 'overcast', 70, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play');

-- Each trailing comment stays on the same line as its argument: a wrapped
-- comment tail would otherwise be parsed as SQL and break the call.
SELECT tree_train('dt_golf'::text,         -- source table
                  'train_output'::text,    -- output model table
                  'id'::text,              -- id column
                  'temperature::double precision'::text,  -- response
                  '"OUTLOOK", humidity, windy, cat_features'::text,  -- features
                  NULL::text,        -- exclude columns
                  'gini'::text,      -- split criterion
                  'class'::text,     -- grouping
                  NULL::text,        -- no weights
                  10::integer,       -- max depth
                  6::integer,        -- min split
                  2::integer,        -- min bucket
                  3::integer,        -- number of bins per continuous variable
                  'cp=0.01'          -- cost-complexity pruning parameter
                  );

-- Prediction input: every training row plus one new row (id 15) whose
-- "OUTLOOK" and cat_features values do not occur in dt_golf.
CREATE TABLE dt_golf2 as
SELECT * FROM dt_golf
UNION
SELECT 15 as id, 'humid' as "OUTLOOK", 71 as temperature, 80 as humidity,
       ARRAY[90, 90] as "Cont_features", ARRAY['b', 'c'] as cat_features,
       true as windy, 'Don''t Play' as class;

-- This call raises the error reported in the issue.
SELECT tree_predict('train_output', 'dt_golf2', 'predict_output');
{code}

Error message: 
{code}
psql:/tmp/madlib.88brFX/recursive_partitioning/test/decision_tree.sql_in.tmp:327:
 ERROR:  plpy.SPIError:           Function 
"_map_catlevel_to_int(text[],text[],integer[],boolean)": Invalid type 
conversion. Null where not expected.  (seg0 slice2 127.0.0.1:25432 pid=88213)
CONTEXT:  Traceback (most recent call last):
  PL/Python function "tree_predict", line 23, in <module>
    return decision_tree.tree_predict(**globals())
  PL/Python function "tree_predict", line 1690, in tree_predict
PL/Python function "tree_predict"
{code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to