Hi, I want to execute a pig command in embedded java program. For moment, I try Pig in local mode. My data file size is around 15MB but the execution of this command is very long so I think my script need optimizations...
My script : A = LOAD 'data' USING MyUDFLoader('data.xml'); > filter_response_time_less_than_1_s = FILTER A BY (response_time < 1000.0); > filter_response_time_between_1_s_and_2_s = FILTER A BY (response_time >= > 1000.0 AND response_time < 1999.0); > filter_response_time_between_greater_than_2_s = FILTER A BY (response_time >= > 2000.0); > star__zne_asfo_access_log = FOREACH ( COGROUP A BY > (date_day,url,date_minute,ret_code,serveur), > filter_response_time_between_greater_than_2_s BY > (date_day,url,date_minute,ret_code,serveur), > filter_response_time_less_than_1_s BY > (date_day,url,date_minute,ret_code,serveur), > filter_response_time_between_1_s_and_2_s BY > (date_day,url,date_minute,ret_code,serveur) ) > { > GENERATE > FLATTEN(group) AS > (date_day,zne_asfo_url,date_minute,zne_http_code,zne_asfo_server), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > agg__zne_asfo_access_log_ymd = FOREACH ( COGROUP A BY > (date_day,date_year,date_month), > filter_response_time_between_greater_than_2_s BY > (date_day,date_year,date_month), filter_response_time_less_than_1_s BY > (date_day,date_year,date_month), filter_response_time_between_1_s_and_2_s BY > (date_day,date_year,date_month) ) > { > GENERATE > FLATTEN(group) AS (date_day,date_year,date_month), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > agg__zne_asfo_access_log_ymd_ret_url = FOREACH ( COGROUP A BY > (date_day,url,date_year,date_month), > filter_response_time_between_greater_than_2_s BY > (date_day,url,date_year,date_month), filter_response_time_less_than_1_s BY > (date_day,url,date_year,date_month), filter_response_time_between_1_s_and_2_s > BY (date_day,url,date_year,date_month) ) > { > GENERATE > FLATTEN(group) AS > (date_day,zne_asfo_url,date_year,date_month), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > agg__zne_asfo_access_log_ymd_ret_code = FOREACH ( COGROUP A BY > (date_day,ret_code,date_year,date_month), > filter_response_time_between_greater_than_2_s BY > (date_day,ret_code,date_year,date_month), filter_response_time_less_than_1_s > BY (date_day,ret_code,date_year,date_month), > filter_response_time_between_1_s_and_2_s BY > (date_day,ret_code,date_year,date_month) ) > { > GENERATE > FLATTEN(group) AS > (date_day,zne_http_code,date_year,date_month), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > agg__zne_asfo_access_log_ymd_ret_url_server = FOREACH ( COGROUP A BY > (date_day,url,date_year,date_month,serveur), > filter_response_time_between_greater_than_2_s BY > (date_day,url,date_year,date_month,serveur), > filter_response_time_less_than_1_s BY > (date_day,url,date_year,date_month,serveur), > filter_response_time_between_1_s_and_2_s BY > (date_day,url,date_year,date_month,serveur) ) > { > GENERATE > FLATTEN(group) AS > (date_day,zne_asfo_url,date_year,date_month,zne_asfo_server), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > agg__zne_asfo_access_log_ymd_ret_code_server = FOREACH ( COGROUP A BY > (date_day,ret_code,date_year,date_month,serveur), > filter_response_time_between_greater_than_2_s BY > (date_day,ret_code,date_year,date_month,serveur), > filter_response_time_less_than_1_s BY > (date_day,ret_code,date_year,date_month,serveur), > filter_response_time_between_1_s_and_2_s BY > (date_day,ret_code,date_year,date_month,serveur) ) > { > GENERATE > FLATTEN(group) AS > (date_day,zne_http_code,date_year,date_month,zne_asfo_server), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > agg__zne_asfo_access_log_ymdi_server = FOREACH ( COGROUP A BY > (date_day,date_minute,date_year,date_month,serveur), > filter_response_time_between_greater_than_2_s BY > (date_day,date_minute,date_year,date_month,serveur), > filter_response_time_less_than_1_s BY > (date_day,date_minute,date_year,date_month,serveur), > filter_response_time_between_1_s_and_2_s BY > (date_day,date_minute,date_year,date_month,serveur) ) > { > GENERATE > FLATTEN(group) AS > (date_day,date_minute,date_year,date_month,zne_asfo_server), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > agg__zne_asfo_access_log_ymdhi_url = FOREACH ( COGROUP A BY > (date_day,url,date_minute,date_year,date_month), > filter_response_time_between_greater_than_2_s BY > (date_day,url,date_minute,date_year,date_month), > filter_response_time_less_than_1_s BY > (date_day,url,date_minute,date_year,date_month), > filter_response_time_between_1_s_and_2_s BY > (date_day,url,date_minute,date_year,date_month) ) > { > GENERATE > FLATTEN(group) AS > (date_day,zne_asfo_url,date_minute,date_year,date_month), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > agg__zne_asfo_access_log_ymdhi = FOREACH ( COGROUP A BY > (date_day,date_minute,date_year,date_month), > filter_response_time_between_greater_than_2_s BY > (date_day,date_minute,date_year,date_month), > filter_response_time_less_than_1_s BY > (date_day,date_minute,date_year,date_month), > filter_response_time_between_1_s_and_2_s BY > (date_day,date_minute,date_year,date_month) ) > { > GENERATE > FLATTEN(group) AS (date_day,date_minute,date_year,date_month), > (long)SUM((bag{tuple(long)})A.response_time) AS response_time, > COUNT(filter_response_time_less_than_1_s) AS > response_time_less_than_1_s, > COUNT(filter_response_time_between_1_s_and_2_s) AS > response_time_between_1_s_and_2_s, > COUNT(filter_response_time_between_greater_than_2_s) AS > response_time_between_greater_than_2_s, > COUNT(A) AS nb_hit; > }; > STORE star__zne_asfo_access_log INTO 'star__zne_asfo_access_log' USING > PigStorage('\t', '-schema'); > STORE agg__zne_asfo_access_log_ymd INTO 'agg__zne_asfo_access_log_ymd' USING > PigStorage('\t', '-schema'); > STORE agg__zne_asfo_access_log_ymd_ret_url INTO > 'agg__zne_asfo_access_log_ymd_ret_url' USING PigStorage('\t', '-schema'); > STORE agg__zne_asfo_access_log_ymd_ret_code INTO > 'agg__zne_asfo_access_log_ymd_ret_code' USING PigStorage('\t', '-schema'); > STORE agg__zne_asfo_access_log_ymd_ret_url_server INTO > 'agg__zne_asfo_access_log_ymd_ret_url_server' USING PigStorage('\t', > '-schema'); > STORE agg__zne_asfo_access_log_ymd_ret_code_server INTO > 'agg__zne_asfo_access_log_ymd_ret_code_server' USING PigStorage('\t', > '-schema'); > STORE agg__zne_asfo_access_log_ymdi_server INTO > 'agg__zne_asfo_access_log_ymdi_server' USING PigStorage('\t', '-schema'); > STORE agg__zne_asfo_access_log_ymdhi_url INTO > 'agg__zne_asfo_access_log_ymdhi_url' USING PigStorage('\t', '-schema'); > STORE agg__zne_asfo_access_log_ymdhi INTO 'agg__zne_asfo_access_log_ymdhi' > USING PigStorage('\t', '-schema'); > > Any ideas ?