[ https://issues.apache.org/jira/browse/MADLIB-1290?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16767640#comment-16767640 ]
Frank McQuillan commented on MADLIB-1290:
-----------------------------------------

In my testing this seems mostly OK; the only issue I found is that NULL is not being handled properly for the input parameter "normalizing_const":

{code}
DROP TABLE IF EXISTS image_data;
CREATE TABLE image_data AS (
    SELECT ARRAY[
        (random() * 256)::integer, -- R values
        (random() * 256)::integer,
        (random() * 256)::integer,
        (random() * 256)::integer,
        (random() * 256)::integer, -- G values
        (random() * 256)::integer,
        (random() * 256)::integer,
        (random() * 256)::integer,
        (random() * 256)::integer, -- B values
        (random() * 256)::integer,
        (random() * 256)::integer,
        (random() * 256)::integer
    ] as rgb,
    ('{cat,dog,bird}'::text[])[ceil(random()*3)] as species
    FROM generate_series(1, 52)
);
{code}

{code}
DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;
SELECT madlib.minibatch_preprocessor_dl('image_data',         -- Source table
                                        'image_data_packed',  -- Output table
                                        'species',            -- Dependent variable
                                        'rgb',                -- Independent variable
                                        NULL,                 -- Buffer size
                                        NULL,                 -- Normalizing constant
                                        -1                    -- Dependent variable offset
                                        );
SELECT * FROM image_data_packed ORDER BY buffer_id;
{code}

produces the error:

{code}
ERROR:  plpy.SPIError: column "none" does not exist
LINE 4:     rgb::REAL[], (1/None)::REAL) AS x_norm,
                            ^
QUERY:
    CREATE TEMP TABLE __madlib_temp_normalized79323135_1550096173_57808848__ AS
    SELECT madlib.array_scalar_mult(
    rgb::REAL[], (1/None)::REAL) AS x_norm,
    species + -1 AS y,
    row_number() over() AS row_id
    FROM image_data
CONTEXT:  Traceback (most recent call last):
  PL/Python function "minibatch_preprocessor_dl", line 24, in <module>
    minibatch_preprocessor_obj.minibatch_preprocessor_dl()
  PL/Python function "minibatch_preprocessor_dl", line 93, in minibatch_preprocessor_dl
PL/Python function "minibatch_preprocessor_dl"
{code}

> Minibatch pre-processor for deep learning
> -----------------------------------------
>
> Key: MADLIB-1290
> URL: https://issues.apache.org/jira/browse/MADLIB-1290
> Project: Apache MADlib
> Issue Type: New Feature
> Components: Deep Learning
> Reporter: Frank McQuillan
> Assignee: Nandish Jayaram
> Priority: Major
> Fix For: v1.16
>
>
> The minibatch preprocessor we currently have in MADlib is bloated for DL
> tasks. This feature adds a simplified version of creating buffers, and
> divides each element of the independent array by a normalizing constant
> for standardization (which is 255.0 by default). This is standard practice
> with image data.

--
This message was sent by Atlassian JIRA (v7.6.3#76005)