[jira] [Updated] (SPARK-37581) SQL hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ocean updated SPARK-37581: -- Description: when exec a sql, this sql hang at planning stage. when disable DPP, sql can finish very quickly. we can reproduce this problem through example below: create table test.test_a ( day string, week int, weekday int) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_a partition (dt=20211126) values('1',1,2); create table test.test_b ( session_id string, device_id string, brand string, model string, wx_version string, os string, net_work_type string, app_id string, app_name string, col_z string, page_url string, page_title string, olabel string, otitle string, source string, send_dt string, recv_dt string, request_time string, write_time string, client_ip string, col_a string, dt_hour varchar(12), product string, channelfrom string, customer_um string, kb_code string, col_b string, rectype string, errcode string, col_c string, pageid_merge string) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_b partition(dt=20211126) values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); drop table if exists test.test_c;create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' group by dt) t2 on calendar.dt = t2.dt left join (select dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from 
test.test_b where dt = '20211126' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' group by dt) t5 on calendar.dt = t5.dt left join (select dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' group by dt) t8 on calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = '20211126' group by dt) t9 on calendar.dt = t9.dt left join (select dt,count(distinct kb_code) as j_kbs from test.test_b where dt = '20211126' group by dt) t10 on calendar.dt = t10.dt left join (select dt,count(distinct kb_code) as k_kbs from test.test_b where dt = '20211126' group by dt) t11 on calendar.dt = t11.dt was: when exec a sql, this sql hang at planning stage. when disable DPP, sql can finish very quickly. 
we can reproduce this problem through example below: create table test.test_a ( day string, week int, weekday int) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_a partition (dt=20211126) values('1',1,2); create table test.test_b ( session_id string, device_id string, brand string, model string, wx_version string, os string, net_work_type string, app_id string, app_name string, col_z string, page_url string, page_title string, olabel string, otitle string, source string, send_dt string, recv_dt string, request_time string, write_time string, client_ip string, col_a string, dt_hour varchar(12), product string, channelfrom string, customer_um string, kb_code string, col_b string, rectype string, errcode string, col_c string, pageid_merge string) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_b partition(dt=20211126) values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); > sql hang at planning stage > -- > > Key: SPARK-37581 > URL: https://issues.apache.org/jira/browse/SPARK-37581 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 3.1.1, 3.2.0 >Reporter: ocean >Priority: Major > > when exec a sql, this sql hang at planning stage. > when disable DPP, sql can finish very quickly. > we can reproduce this problem through example below: > create table test.test_a ( > day string, > week int, > weekday int) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_a partition (dt=20211126) values('1',1,2); > create table test.test_b ( > session_id string, > device_id string, > brand string, > model string, > wx_version
[jira] [Updated] (SPARK-37581) SQL hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ocean updated SPARK-37581: -- Description: when exec a sql, this sql hang at planning stage. when disable DPP, sql can finish very quickly. we can reproduce this problem through example below: create table test.test_a ( day string, week int, weekday int) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_a partition (dt=20211126) values('1',1,2); create table test.test_b ( session_id string, device_id string, brand string, model string, wx_version string, os string, net_work_type string, app_id string, app_name string, col_z string, page_url string, page_title string, olabel string, otitle string, source string, send_dt string, recv_dt string, request_time string, write_time string, client_ip string, col_a string, dt_hour varchar(12), product string, channelfrom string, customer_um string, kb_code string, col_b string, rectype string, errcode string, col_c string, pageid_merge string) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_b partition(dt=20211126) values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); was: when exec a sql, this sql hang at planning stage. when disable DPP, sql can finish very quickly. 
we can reproduce this problem through example below: create table test.test_a ( day string, week int, weekday int) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_a partition (dt=20211126) values('1',1,2); create table test.test_b ( session_id string, device_id string, brand string, model string, wx_version string, os string, net_work_type string, app_id string, app_name string, col_z string, page_url string, page_title string, olabel string, otitle string, source string, send_dt string, recv_dt string, request_time string, write_time string, client_ip string, col_a string, dt_hour varchar(12), product string, channelfrom string, customer_um string, kb_code string, col_b string, rectype string, errcode string, col_c string, pageid_merge string) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_b partition(dt=20211126) values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs, j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t2 on calendar.dt = t2.dt left join (select dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from test.test_b 
where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t5 on calendar.dt = t5.dt left join (select dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t8 on calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t9 on calendar.dt = t9.dt left join (select dt,count(distinct kb_code) as j_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t10 on calendar.dt = t10.dt left join (select dt,count(distinct kb_code) as k_kbs from test.test_b where dt =
[jira] [Commented] (SPARK-37581) SQL hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17456193#comment-17456193 ] ocean commented on SPARK-37581: --- I have narrowed down in the comment.please have a look. > sql hang at planning stage > -- > > Key: SPARK-37581 > URL: https://issues.apache.org/jira/browse/SPARK-37581 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 3.1.1, 3.2.0 >Reporter: ocean >Priority: Major > > when exec a sql, this sql hang at planning stage. > when disable DPP, sql can finish very quickly. > we can reproduce this problem through example below: > create table test.test_a ( > day string, > week int, > weekday int) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_a partition (dt=20211126) values('1',1,2); > create table test.test_b ( > session_id string, > device_id string, > brand string, > model string, > wx_version string, > os string, > net_work_type string, > app_id string, > app_name string, > col_z string, > page_url string, > page_title string, > olabel string, > otitle string, > source string, > send_dt string, > recv_dt string, > request_time string, > write_time string, > client_ip string, > col_a string, > dt_hour varchar(12), > product string, > channelfrom string, > customer_um string, > kb_code string, > col_b string, > rectype string, > errcode string, > col_c string, > pageid_merge string) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_b partition(dt=20211126) > values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); > create table if not exists test.test_c stored as ORCFILE as > select calendar.day,calendar.week,calendar.weekday, a_kbs, > b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs, > j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs > from (select * from test.test_a where dt = '20211126') calendar > left join > (select 
dt,count(distinct kb_code) as a_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t1 > on calendar.dt = t1.dt > left join > (select dt,count(distinct kb_code) as b_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t2 > on calendar.dt = t2.dt > left join > (select dt,count(distinct kb_code) as c_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t3 > on calendar.dt = t3.dt > left join > (select dt,count(distinct kb_code) as d_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t4 > on calendar.dt = t4.dt > left join > (select dt,count(distinct kb_code) as e_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t5 > on calendar.dt = t5.dt > left join > (select dt,count(distinct kb_code) as f_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t6 > on calendar.dt = t6.dt > left join > (select dt,count(distinct kb_code) as g_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t7 > on calendar.dt = t7.dt > left join > (select dt,count(distinct kb_code) as h_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t8 > on calendar.dt = t8.dt > left join > (select dt,count(distinct kb_code) as i_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by 
dt) t9 > on calendar.dt = t9.dt > left join > (select dt,count(distinct kb_code) as j_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t10 > on calendar.dt = t10.dt > left join > (select dt,count(distinct kb_code) as k_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t11 > on calendar.dt = t11.dt > left join > (select dt,count(distinct kb_code) as l_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t12 > on calendar.dt = t12.dt > left join > (select
[jira] (SPARK-37581) SQL hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581 ] ocean deleted comment on SPARK-37581: --- was (Author: oceaneast): I have narrowed down in the comment.please have a look. > sql hang at planning stage > -- > > Key: SPARK-37581 > URL: https://issues.apache.org/jira/browse/SPARK-37581 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 3.1.1, 3.2.0 >Reporter: ocean >Priority: Major > > when exec a sql, this sql hang at planning stage. > when disable DPP, sql can finish very quickly. > we can reproduce this problem through example below: > create table test.test_a ( > day string, > week int, > weekday int) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_a partition (dt=20211126) values('1',1,2); > create table test.test_b ( > session_id string, > device_id string, > brand string, > model string, > wx_version string, > os string, > net_work_type string, > app_id string, > app_name string, > col_z string, > page_url string, > page_title string, > olabel string, > otitle string, > source string, > send_dt string, > recv_dt string, > request_time string, > write_time string, > client_ip string, > col_a string, > dt_hour varchar(12), > product string, > channelfrom string, > customer_um string, > kb_code string, > col_b string, > rectype string, > errcode string, > col_c string, > pageid_merge string) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_b partition(dt=20211126) > values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); > create table if not exists test.test_c stored as ORCFILE as > select calendar.day,calendar.week,calendar.weekday, a_kbs, > b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs, > j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs > from (select * from test.test_a where dt = '20211126') calendar > left join > (select dt,count(distinct kb_code) as a_kbs > from test.test_b > where dt = 
'20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t1 > on calendar.dt = t1.dt > left join > (select dt,count(distinct kb_code) as b_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t2 > on calendar.dt = t2.dt > left join > (select dt,count(distinct kb_code) as c_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t3 > on calendar.dt = t3.dt > left join > (select dt,count(distinct kb_code) as d_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t4 > on calendar.dt = t4.dt > left join > (select dt,count(distinct kb_code) as e_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t5 > on calendar.dt = t5.dt > left join > (select dt,count(distinct kb_code) as f_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t6 > on calendar.dt = t6.dt > left join > (select dt,count(distinct kb_code) as g_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t7 > on calendar.dt = t7.dt > left join > (select dt,count(distinct kb_code) as h_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t8 > on calendar.dt = t8.dt > left join > (select dt,count(distinct kb_code) as i_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t9 > on calendar.dt = t9.dt > left join > (select 
dt,count(distinct kb_code) as j_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t10 > on calendar.dt = t10.dt > left join > (select dt,count(distinct kb_code) as k_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t11 > on calendar.dt = t11.dt > left join > (select dt,count(distinct kb_code) as l_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t12 > on calendar.dt = t12.dt > left join > (select dt,count(distinct kb_code) as m_kbs > from test.test_b > where dt =
[jira] [Updated] (SPARK-37581) SQL hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ocean updated SPARK-37581: -- Description: when exec a sql, this sql hang at planning stage. when disable DPP, sql can finish very quickly. we can reproduce this problem through example below: create table test.test_a ( day string, week int, weekday int) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_a partition (dt=20211126) values('1',1,2); create table test.test_b ( session_id string, device_id string, brand string, model string, wx_version string, os string, net_work_type string, app_id string, app_name string, col_z string, page_url string, page_title string, olabel string, otitle string, source string, send_dt string, recv_dt string, request_time string, write_time string, client_ip string, col_a string, dt_hour varchar(12), product string, channelfrom string, customer_um string, kb_code string, col_b string, rectype string, errcode string, col_c string, pageid_merge string) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_b partition(dt=20211126) values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs, j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t2 on calendar.dt = t2.dt left join (select 
dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t5 on calendar.dt = t5.dt left join (select dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t8 on calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t9 on calendar.dt = t9.dt left join (select dt,count(distinct kb_code) as j_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t10 on calendar.dt = t10.dt left join (select dt,count(distinct kb_code) as k_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t11 on calendar.dt = t11.dt left join (select dt,count(distinct kb_code) as l_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and 
substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t12 on calendar.dt = t12.dt left join (select dt,count(distinct kb_code) as m_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t13 on calendar.dt = t13.dt left join (select dt,count(distinct kb_code) as n_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t14 on calendar.dt = t14.dt left join (select dt,count(distinct kb_code) as o_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t15 on calendar.dt = t15.dt left join (select dt,count(distinct kb_code) as p_kbs from test.test_b where dt = '20211126' and
[jira] [Comment Edited] (SPARK-37581) SQL hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17456154#comment-17456154 ] ocean edited comment on SPARK-37581 at 12/9/21, 6:12 AM: - Hi [~hyukjin.kwon]. This sql have 19 join operators.But these join have the same pattern. I found that ,when have 10 join operators, it costs 17s. when 11 join operators, costs 39s. when 12 join operators, costs 120s. when 13 join operators , it can not finish. I think, we can debug it at 11 join operators, to find why it is so slow. I had narrowed down as below: === drop table if exists test.test_c;create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' group by dt) t2 on calendar.dt = t2.dt left join (select dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from test.test_b where dt = '20211126' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' group by dt) t5 on calendar.dt = t5.dt left join (select dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' group by dt) t8 on calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = 
'20211126' group by dt) t9 on calendar.dt = t9.dt left join (select dt,count(distinct kb_code) as j_kbs from test.test_b where dt = '20211126' group by dt) t10 on calendar.dt = t10.dt left join (select dt,count(distinct kb_code) as k_kbs from test.test_b where dt = '20211126' group by dt) t11 on calendar.dt = t11.dt was (Author: oceaneast): Hi [~hyukjin.kwon]. This sql have 19 join operators.But these join have the same pattern. I found that ,when have 10 join operators, it costs 17s. when 11 join operators, costs 39s. when 12 join operators, costs 120s. when 13 join operators , it can not finish. I think, we can debug it at 11 join operators, to find why it is so slow. === drop table if exists test.test_c;create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' group by dt) t2 on calendar.dt = t2.dt left join (select dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from test.test_b where dt = '20211126' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' group by dt) t5 on calendar.dt = t5.dt left join (select dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' group by dt) t8 on 
calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = '20211126' group by dt) t9 on calendar.dt = t9.dt left join (select dt,count(distinct kb_code) as j_kbs from test.test_b where dt = '20211126' group by dt) t10 on calendar.dt = t10.dt left join (select dt,count(distinct kb_code) as k_kbs from test.test_b where dt = '20211126' group by dt) t11 on calendar.dt = t11.dt > sql hang at planning stage > -- > > Key: SPARK-37581 > URL: https://issues.apache.org/jira/browse/SPARK-37581 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 3.1.1, 3.2.0 >Reporter: ocean >Priority: Major > > when exec a sql, this sql hang at planning stage. > when disable DPP, sql can finish normally. > we
[jira] [Comment Edited] (SPARK-37581) SQL hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17456154#comment-17456154 ] ocean edited comment on SPARK-37581 at 12/9/21, 6:06 AM: - Hi [~hyukjin.kwon]. This sql have 19 join operators.But these join have the same pattern. I found that ,when have 10 join operators, it costs 17s. when 11 join operators, costs 39s. when 12 join operators, costs 120s. when 13 join operators , it can not finish. I think, we can debug it at 11 join operators, to find why it is so slow. === drop table if exists test.test_c;create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' group by dt) t2 on calendar.dt = t2.dt left join (select dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from test.test_b where dt = '20211126' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' group by dt) t5 on calendar.dt = t5.dt left join (select dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' group by dt) t8 on calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = '20211126' group by dt) t9 on 
calendar.dt = t9.dt left join (select dt,count(distinct kb_code) as j_kbs from test.test_b where dt = '20211126' group by dt) t10 on calendar.dt = t10.dt left join (select dt,count(distinct kb_code) as k_kbs from test.test_b where dt = '20211126' group by dt) t11 on calendar.dt = t11.dt was (Author: oceaneast): Hi [~hyukjin.kwon]. This sql have 19 join operators.But these join have the same pattern. I found that ,when have 10 join operators, it costs 17s. when 11 join operators, costs 39s. when 12 join operators, costs 120s. when 13 join operators , it can not finish. I think, we can debug it at 11 join operators, to find why it is so slow. === drop table if exists test.test_c;create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t2 on calendar.dt = t2.dt left join (select dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t5 on calendar.dt = t5.dt left join (select 
dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t8 on calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t9 on
[jira] [Commented] (SPARK-37581) sql hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17456154#comment-17456154 ] ocean commented on SPARK-37581: --- Hi [~hyukjin.kwon]. This sql have 19 join operators.But these join have the same pattern. I found that ,when have 10 join operators, it costs 17s. when 11 join operators, costs 39s. when 12 join operators, costs 120s. when 13 join operators , it can not finish. I think, we can debug it at 11 join operators, to find why it is so slow. === drop table if exists test.test_c;create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t2 on calendar.dt = t2.dt left join (select dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t5 on calendar.dt = t5.dt left join (select dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = 
'66' and pageid_merge = 'a' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t8 on calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t9 on calendar.dt = t9.dt left join (select dt,count(distinct kb_code) as j_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t10 on calendar.dt = t10.dt left join (select dt,count(distinct kb_code) as k_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t11 on calendar.dt = t11.dt > sql hang at planning stage > -- > > Key: SPARK-37581 > URL: https://issues.apache.org/jira/browse/SPARK-37581 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 3.1.1, 3.2.0 >Reporter: ocean >Priority: Major > > when exec a sql, this sql hang at planning stage. > when disable DPP, sql can finish normally. 
> we can reproduce this problem through example below: > create table test.test_a ( > day string, > week int, > weekday int) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_a partition (dt=20211126) values('1',1,2); > create table test.test_b ( > session_id string, > device_id string, > brand string, > model string, > wx_version string, > os string, > net_work_type string, > app_id string, > app_name string, > col_z string, > page_url string, > page_title string, > olabel string, > otitle string, > source string, > send_dt string, > recv_dt string, > request_time string, > write_time string, > client_ip string, > col_a string, > dt_hour varchar(12), > product string, > channelfrom string, > customer_um string, > kb_code string, > col_b string, > rectype string, > errcode string, > col_c string, > pageid_merge string) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_b partition(dt=20211126) > values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); > create table if not exists test.test_c stored as ORCFILE as > select calendar.day,calendar.week,calendar.weekday, a_kbs, > b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs, >
[jira] [Updated] (SPARK-37581) sql hangs at planning stage
[ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ocean updated SPARK-37581: -- Priority: Critical (was: Major) > sql hang at planning stage > -- > > Key: SPARK-37581 > URL: https://issues.apache.org/jira/browse/SPARK-37581 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 3.1.1, 3.2.0 >Reporter: ocean >Priority: Critical > > when exec a sql, this sql hang at planning stage. > when disable DPP, sql can finish normally. > we can reproduce this problem through example below: > create table test.test_a ( > day string, > week int, > weekday int) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_a partition (dt=20211126) values('1',1,2); > create table test.test_b ( > session_id string, > device_id string, > brand string, > model string, > wx_version string, > os string, > net_work_type string, > app_id string, > app_name string, > col_z string, > page_url string, > page_title string, > olabel string, > otitle string, > source string, > send_dt string, > recv_dt string, > request_time string, > write_time string, > client_ip string, > col_a string, > dt_hour varchar(12), > product string, > channelfrom string, > customer_um string, > kb_code string, > col_b string, > rectype string, > errcode string, > col_c string, > pageid_merge string) > partitioned by ( > dt varchar(8)) > stored as orc; > insert into test.test_b partition(dt=20211126) > values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); > create table if not exists test.test_c stored as ORCFILE as > select calendar.day,calendar.week,calendar.weekday, a_kbs, > b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs, > j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs > from (select * from test.test_a where dt = '20211126') calendar > left join > (select dt,count(distinct kb_code) as a_kbs > from test.test_b > where 
dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t1 > on calendar.dt = t1.dt > left join > (select dt,count(distinct kb_code) as b_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t2 > on calendar.dt = t2.dt > left join > (select dt,count(distinct kb_code) as c_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t3 > on calendar.dt = t3.dt > left join > (select dt,count(distinct kb_code) as d_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t4 > on calendar.dt = t4.dt > left join > (select dt,count(distinct kb_code) as e_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t5 > on calendar.dt = t5.dt > left join > (select dt,count(distinct kb_code) as f_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t6 > on calendar.dt = t6.dt > left join > (select dt,count(distinct kb_code) as g_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t7 > on calendar.dt = t7.dt > left join > (select dt,count(distinct kb_code) as h_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t8 > on calendar.dt = t8.dt > left join > (select dt,count(distinct kb_code) as i_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t9 > on calendar.dt = t9.dt > left join > (select 
dt,count(distinct kb_code) as j_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t10 > on calendar.dt = t10.dt > left join > (select dt,count(distinct kb_code) as k_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t11 > on calendar.dt = t11.dt > left join > (select dt,count(distinct kb_code) as l_kbs > from test.test_b > where dt = '20211126' > and app_id in ('1','2') > and substr(kb_code,1,6) = '66' > and pageid_merge = 'a' > group by dt) t12 > on calendar.dt = t12.dt > left join > (select dt,count(distinct kb_code) as m_kbs > from test.test_b > where dt =
[jira] [Created] (SPARK-37581) sql hangs at planning stage
ocean created SPARK-37581: - Summary: sql hang at planning stage Key: SPARK-37581 URL: https://issues.apache.org/jira/browse/SPARK-37581 Project: Spark Issue Type: Bug Components: SQL Affects Versions: 3.2.0, 3.1.1 Reporter: ocean when exec a sql, this sql hang at planning stage. when disable DPP, sql can finish normally. we can reproduce this problem through example below: create table test.test_a ( day string, week int, weekday int) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_a partition (dt=20211126) values('1',1,2); create table test.test_b ( session_id string, device_id string, brand string, model string, wx_version string, os string, net_work_type string, app_id string, app_name string, col_z string, page_url string, page_title string, olabel string, otitle string, source string, send_dt string, recv_dt string, request_time string, write_time string, client_ip string, col_a string, dt_hour varchar(12), product string, channelfrom string, customer_um string, kb_code string, col_b string, rectype string, errcode string, col_c string, pageid_merge string) partitioned by ( dt varchar(8)) stored as orc; insert into test.test_b partition(dt=20211126) values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2'); create table if not exists test.test_c stored as ORCFILE as select calendar.day,calendar.week,calendar.weekday, a_kbs, b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs, j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs from (select * from test.test_a where dt = '20211126') calendar left join (select dt,count(distinct kb_code) as a_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t1 on calendar.dt = t1.dt left join (select dt,count(distinct kb_code) as b_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 
'a' group by dt) t2 on calendar.dt = t2.dt left join (select dt,count(distinct kb_code) as c_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t3 on calendar.dt = t3.dt left join (select dt,count(distinct kb_code) as d_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t4 on calendar.dt = t4.dt left join (select dt,count(distinct kb_code) as e_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t5 on calendar.dt = t5.dt left join (select dt,count(distinct kb_code) as f_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t6 on calendar.dt = t6.dt left join (select dt,count(distinct kb_code) as g_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t7 on calendar.dt = t7.dt left join (select dt,count(distinct kb_code) as h_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t8 on calendar.dt = t8.dt left join (select dt,count(distinct kb_code) as i_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t9 on calendar.dt = t9.dt left join (select dt,count(distinct kb_code) as j_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t10 on calendar.dt = t10.dt left join (select dt,count(distinct kb_code) as k_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t11 on calendar.dt = t11.dt left join (select dt,count(distinct kb_code) as l_kbs from 
test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t12 on calendar.dt = t12.dt left join (select dt,count(distinct kb_code) as m_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t13 on calendar.dt = t13.dt left join (select dt,count(distinct kb_code) as n_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge = 'a' group by dt) t14 on calendar.dt = t14.dt left join (select dt,count(distinct kb_code) as o_kbs from test.test_b where dt = '20211126' and app_id in ('1','2') and substr(kb_code,1,6) = '66' and pageid_merge
[jira] [Updated] (SPARK-37344) split function behaves differently between spark 2.3 and spark 3.2
[ https://issues.apache.org/jira/browse/SPARK-37344?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ocean updated SPARK-37344: -- Labels: incorrect (was: ) > split function behave differently between spark 2.3 and spark 3.2 > - > > Key: SPARK-37344 > URL: https://issues.apache.org/jira/browse/SPARK-37344 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 3.1.1, 3.1.2, 3.2.0 >Reporter: ocean >Priority: Major > Labels: incorrect > > while use split function in sql, it behave differently between 2.3 and 3.2, > which cause incorrect problem. > we can use this sql to reproduce this problem: > > create table split_test ( id int,name string) > insert into split_test values(1,"abc;def") > explain extended select split(name,';') from split_test > > spark3: > spark-sql> Explain extended select split(name,';') from split_test; > == Parsed Logical Plan == > 'Project [unresolvedalias('split('name, \\;), None)] > +- 'UnresolvedRelation [split_test], [], false > > spark2: > > spark-sql> Explain extended select split(name,';') from split_test; > == Parsed Logical Plan == > 'Project [unresolvedalias('split('name, \;), None)] > +- 'UnresolvedRelation split_test > > It looks like the deal of escape is different -- This message was sent by Atlassian Jira (v8.20.1#820001) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-37344) split function behaves differently between spark 2.3 and spark 3.2
ocean created SPARK-37344: - Summary: split function behave differently between spark 2.3 and spark 3.2 Key: SPARK-37344 URL: https://issues.apache.org/jira/browse/SPARK-37344 Project: Spark Issue Type: Bug Components: SQL Affects Versions: 3.2.0, 3.1.2, 3.1.1 Reporter: ocean while use split function in sql, it behave differently between 2.3 and 3.2, which cause incorrect problem. we can use this sql to reproduce this problem: create table split_test ( id int,name string) insert into split_test values(1,"abc;def") explain extended select split(name,';') from split_test spark3: spark-sql> Explain extended select split(name,';') from split_test; == Parsed Logical Plan == 'Project [unresolvedalias('split('name, \\;), None)] +- 'UnresolvedRelation [split_test], [], false spark2: spark-sql> Explain extended select split(name,';') from split_test; == Parsed Logical Plan == 'Project [unresolvedalias('split('name, \;), None)] +- 'UnresolvedRelation split_test It looks like the deal of escape is different -- This message was sent by Atlassian Jira (v8.20.1#820001) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Created] (SPARK-36552) varchar datatype behaves differently on hive table and datasource table
ocean created SPARK-36552: - Summary: varchar datatype behave differently on hive table and datasource table Key: SPARK-36552 URL: https://issues.apache.org/jira/browse/SPARK-36552 Project: Spark Issue Type: Bug Components: SQL Affects Versions: 3.1.1, 3.1.0, 2.3.1 Reporter: ocean in spark 3.1.X, when set spark.sql.hive.convertMetastoreOrc=false,and spark.sql.legacy.charVarcharAsString=true. Execute the following sql: CREATE TABLE t (col varchar(2)) stored as orc; INSERT INTO t SELECT 'aaa'; select * from t; result is aa But when set spark.sql.hive.convertMetastoreOrc=true,and spark.sql.legacy.charVarcharAsString=true alse execute the sql, the result is "aaa" -- This message was sent by Atlassian Jira (v8.3.4#803005) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-36552) varchar datatype behaves differently on hive table and datasource table
[ https://issues.apache.org/jira/browse/SPARK-36552?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ocean updated SPARK-36552: -- Affects Version/s: (was: 3.1.0) 3.1.2 > varchar datatype behave differently on hive table and datasource table > > > Key: SPARK-36552 > URL: https://issues.apache.org/jira/browse/SPARK-36552 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 2.3.1, 3.1.1, 3.1.2 >Reporter: ocean >Priority: Major > > in spark 3.1.X, when set spark.sql.hive.convertMetastoreOrc=false,and > spark.sql.legacy.charVarcharAsString=true. > Execute the following sql: > CREATE TABLE t (col varchar(2)) stored as orc; > INSERT INTO t SELECT 'aaa'; > select * from t; > result is aa > > But when set spark.sql.hive.convertMetastoreOrc=true,and > spark.sql.legacy.charVarcharAsString=true > alse execute the sql, the result is "aaa" > > > > -- This message was sent by Atlassian Jira (v8.3.4#803005) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-29015) Cannot support "add jar" on JDK 11
[ https://issues.apache.org/jira/browse/SPARK-29015?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16924882#comment-16924882 ] ocean commented on SPARK-29015: --- I try to fix it > Can not support "add jar" on JDK 11 > --- > > Key: SPARK-29015 > URL: https://issues.apache.org/jira/browse/SPARK-29015 > Project: Spark > Issue Type: Sub-task > Components: SQL >Affects Versions: 3.0.0 >Reporter: Yuming Wang >Priority: Major > > How to reproduce: > Case 1: > {code:bash} > export JAVA_HOME=/usr/lib/jdk-11.0.3 > export PATH=$JAVA_HOME/bin:$PATH > build/sbt clean package -Phive -Phadoop-3.2 -Phive-thriftserver > export SPARK_PREPEND_CLASSES=true > sbin/start-thriftserver.sh > bin/beeline -u jdbc:hive2://localhost:1 > {code} > {noformat} > 0: jdbc:hive2://localhost:1> add jar > /root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar; > INFO : Added > [/root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar] > to class path > INFO : Added resources: > [/root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar] > +-+ > | result | > +-+ > +-+ > No rows selected (0.381 seconds) > 0: jdbc:hive2://localhost:1> CREATE TABLE addJar(key string) ROW FORMAT > SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; > +-+ > | Result | > +-+ > +-+ > No rows selected (0.613 seconds) > 0: jdbc:hive2://localhost:1> select * from addJar; > Error: Error running query: java.lang.RuntimeException: > java.lang.ClassNotFoundException: org.apache.hive.hcatalog.data.JsonSerDe > (state=,code=0) > {noformat} > Case 2: > {noformat} > spark-sql> add jar > /root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar; > ADD JAR > /root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar > spark-sql> CREATE TABLE addJar(key string) ROW FORMAT SERDE > 
'org.apache.hive.hcatalog.data.JsonSerDe'; > spark-sql> select * from addJar; > 19/09/07 03:06:54 ERROR SparkSQLDriver: Failed in [select * from addJar] > java.lang.RuntimeException: java.lang.ClassNotFoundException: > org.apache.hive.hcatalog.data.JsonSerDe > at > org.apache.hadoop.hive.ql.plan.TableDesc.getDeserializerClass(TableDesc.java:79) > at > org.apache.spark.sql.hive.execution.HiveTableScanExec.addColumnMetadataToConf(HiveTableScanExec.scala:123) > at > org.apache.spark.sql.hive.execution.HiveTableScanExec.hadoopConf$lzycompute(HiveTableScanExec.scala:101) > at > org.apache.spark.sql.hive.execution.HiveTableScanExec.hadoopConf(HiveTableScanExec.scala:98) > at > org.apache.spark.sql.hive.execution.HiveTableScanExec.hadoopReader$lzycompute(HiveTableScanExec.scala:110) > at > org.apache.spark.sql.hive.execution.HiveTableScanExec.hadoopReader(HiveTableScanExec.scala:105) > at > org.apache.spark.sql.hive.execution.HiveTableScanExec.$anonfun$doExecute$1(HiveTableScanExec.scala:188) > at org.apache.spark.util.Utils$.withDummyCallSite(Utils.scala:2488) > at > org.apache.spark.sql.hive.execution.HiveTableScanExec.doExecute(HiveTableScanExec.scala:188) > at > org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:189) > at > org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:227) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:224) > at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:185) > at > org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:329) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:378) > at > org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:408) > at > org.apache.spark.sql.execution.HiveResult$.hiveResultString(HiveResult.scala:52) > at > 
org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.$anonfun$run$1(SparkSQLDriver.scala:65) > at > org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$4(SQLExecution.scala:100) > at > org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:87) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:65) > at > org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:367) > at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:403) >
[jira] [Commented] (SPARK-9976) create function does not work
[ https://issues.apache.org/jira/browse/SPARK-9976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15106517#comment-15106517 ] ocean commented on SPARK-9976: -- the second problem I just found that only function can not describe, but still can use. Just a little problem > create function do not work > --- > > Key: SPARK-9976 > URL: https://issues.apache.org/jira/browse/SPARK-9976 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 1.4.0, 1.4.1, 1.5.0 > Environment: spark 1.4.1 yarn 2.2.0 >Reporter: cen yuhai > > I use beeline to connect to ThriftServer, but add jar can not work, so I use > create function , see the link below. > http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cm_mc_hive_udf.html > I do as blow: > {code} > create function gdecodeorder as 'com.hive.udf.GOrderDecode' USING JAR > 'hdfs://mycluster/user/spark/lib/gorderdecode.jar'; > {code} > It returns Ok, and I connect to the metastore, I see records in table FUNCS. 
> {code} > select gdecodeorder(t1) from tableX limit 1; > {code} > It returns error 'Couldn't find function default.gdecodeorder' > This is the Exception > {code} > 15/08/14 14:53:51 ERROR UserGroupInformation: PriviledgedActionException > as:xiaoju (auth:SIMPLE) cause:org.apache.hive.service.cli.HiveSQLException: > java.lang.RuntimeException: Couldn't find function default.gdecodeorder > 15/08/14 15:04:47 ERROR RetryingHMSHandler: > MetaException(message:NoSuchObjectException(message:Function > default.t_gdecodeorder does not exist)) > at > org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newMetaException(HiveMetaStore.java:4613) > at > org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_function(HiveMetaStore.java:4740) > at sun.reflect.GeneratedMethodAccessor57.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105) > at com.sun.proxy.$Proxy21.get_function(Unknown Source) > at > org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getFunction(HiveMetaStoreClient.java:1721) > at sun.reflect.GeneratedMethodAccessor56.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:89) > at com.sun.proxy.$Proxy22.getFunction(Unknown Source) > at org.apache.hadoop.hive.ql.metadata.Hive.getFunction(Hive.java:2662) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfoFromMetastore(FunctionRegistry.java:546) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getQualifiedFunctionInfo(FunctionRegistry.java:579) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:645) > at > 
org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:652) > at > org.apache.spark.sql.hive.HiveFunctionRegistry.lookupFunction(hiveUdfs.scala:54) > at > org.apache.spark.sql.hive.HiveContext$$anon$3.org$apache$spark$sql$catalyst$analysis$OverrideFunctionRegistry$$super$lookupFunction(HiveContext.scala:376) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44) > at scala.Option.getOrElse(Option.scala:120) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$class.lookupFunction(FunctionRegistry.scala:44) > at > org.apache.spark.sql.hive.HiveContext$$anon$3.lookupFunction(HiveContext.scala:376) > at > org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:465) > at > org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:463) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:51) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:221) > at >
[jira] [Commented] (SPARK-9976) create function does not work
[ https://issues.apache.org/jira/browse/SPARK-9976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15105168#comment-15105168 ] ocean commented on SPARK-9976: -- Does sparksql support temporary udf function which extends udf now? Now I found that udf temporary function which extends GenericUDF run ok, but udf temporary function which extends UDF can not work. > create function do not work > --- > > Key: SPARK-9976 > URL: https://issues.apache.org/jira/browse/SPARK-9976 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 1.4.0, 1.4.1, 1.5.0 > Environment: spark 1.4.1 yarn 2.2.0 >Reporter: cen yuhai > > I use beeline to connect to ThriftServer, but add jar can not work, so I use > create function , see the link below. > http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cm_mc_hive_udf.html > I do as blow: > {code} > create function gdecodeorder as 'com.hive.udf.GOrderDecode' USING JAR > 'hdfs://mycluster/user/spark/lib/gorderdecode.jar'; > {code} > It returns Ok, and I connect to the metastore, I see records in table FUNCS. 
> {code} > select gdecodeorder(t1) from tableX limit 1; > {code} > It returns error 'Couldn't find function default.gdecodeorder' > This is the Exception > {code} > 15/08/14 14:53:51 ERROR UserGroupInformation: PriviledgedActionException > as:xiaoju (auth:SIMPLE) cause:org.apache.hive.service.cli.HiveSQLException: > java.lang.RuntimeException: Couldn't find function default.gdecodeorder > 15/08/14 15:04:47 ERROR RetryingHMSHandler: > MetaException(message:NoSuchObjectException(message:Function > default.t_gdecodeorder does not exist)) > at > org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newMetaException(HiveMetaStore.java:4613) > at > org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_function(HiveMetaStore.java:4740) > at sun.reflect.GeneratedMethodAccessor57.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105) > at com.sun.proxy.$Proxy21.get_function(Unknown Source) > at > org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getFunction(HiveMetaStoreClient.java:1721) > at sun.reflect.GeneratedMethodAccessor56.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:89) > at com.sun.proxy.$Proxy22.getFunction(Unknown Source) > at org.apache.hadoop.hive.ql.metadata.Hive.getFunction(Hive.java:2662) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfoFromMetastore(FunctionRegistry.java:546) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getQualifiedFunctionInfo(FunctionRegistry.java:579) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:645) > at > 
org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:652) > at > org.apache.spark.sql.hive.HiveFunctionRegistry.lookupFunction(hiveUdfs.scala:54) > at > org.apache.spark.sql.hive.HiveContext$$anon$3.org$apache$spark$sql$catalyst$analysis$OverrideFunctionRegistry$$super$lookupFunction(HiveContext.scala:376) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44) > at scala.Option.getOrElse(Option.scala:120) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$class.lookupFunction(FunctionRegistry.scala:44) > at > org.apache.spark.sql.hive.HiveContext$$anon$3.lookupFunction(HiveContext.scala:376) > at > org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:465) > at > org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:463) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:51) > at >
[jira] [Commented] (SPARK-11948) Permanent UDF does not work
[ https://issues.apache.org/jira/browse/SPARK-11948?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15105170#comment-15105170 ] ocean commented on SPARK-11948: --- Does sparksql support temporary udf function which extends udf now? Now I found that udf temporary function which extends GenericUDF run ok, but udf temporary function which extends UDF can not work. my spark version is 1.5.2 > Permanent UDF not work > -- > > Key: SPARK-11948 > URL: https://issues.apache.org/jira/browse/SPARK-11948 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 1.5.1 >Reporter: Weizhong >Priority: Minor > > We create a function, > {noformat} > add jar /home/test/smartcare-udf-0.0.1-SNAPSHOT.jar; > create function arr_greater_equal as > 'smartcare.dac.hive.udf.UDFArrayGreaterEqual'; > {noformat} > but "show functions" don't display, and when we create the same function > again, it throw exception as below: > {noformat} > Error: org.apache.spark.sql.execution.QueryExecutionException: FAILED: > Execution Error, return code 1 from > org.apache.hadoop.hive.ql.exec.FunctionTask. > AlreadyExistsException(message:Function arr_greater_equal already exists) > (state=,code=0) > {noformat} > But if we use this function, it throw exception as below: > {noformat} > Error: org.apache.spark.sql.AnalysisException: undefined function > arr_greater_equal; line 1 pos 119 (state=,code=0) > {noformat} -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-9976) create function does not work
[ https://issues.apache.org/jira/browse/SPARK-9976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15106385#comment-15106385 ] ocean commented on SPARK-9976: -- The second problem is: some temporary udf function which extends UDF can create, but when you describe it, it says that it is not found. spark-sql> create temporary FUNCTION dict AS 'td.enterprise.hive.udfs.ConvertDict'; OK Time taken: 0.04 seconds spark-sql> describe function dict; Function: dict is not found. Time taken: 0.033 seconds, Fetched 1 row(s) > create function do not work > --- > > Key: SPARK-9976 > URL: https://issues.apache.org/jira/browse/SPARK-9976 > Project: Spark > Issue Type: Bug > Components: SQL >Affects Versions: 1.4.0, 1.4.1, 1.5.0 > Environment: spark 1.4.1 yarn 2.2.0 >Reporter: cen yuhai > > I use beeline to connect to ThriftServer, but add jar can not work, so I use > create function , see the link below. > http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cm_mc_hive_udf.html > I do as blow: > {code} > create function gdecodeorder as 'com.hive.udf.GOrderDecode' USING JAR > 'hdfs://mycluster/user/spark/lib/gorderdecode.jar'; > {code} > It returns Ok, and I connect to the metastore, I see records in table FUNCS. 
> {code} > select gdecodeorder(t1) from tableX limit 1; > {code} > It returns error 'Couldn't find function default.gdecodeorder' > This is the Exception > {code} > 15/08/14 14:53:51 ERROR UserGroupInformation: PriviledgedActionException > as:xiaoju (auth:SIMPLE) cause:org.apache.hive.service.cli.HiveSQLException: > java.lang.RuntimeException: Couldn't find function default.gdecodeorder > 15/08/14 15:04:47 ERROR RetryingHMSHandler: > MetaException(message:NoSuchObjectException(message:Function > default.t_gdecodeorder does not exist)) > at > org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newMetaException(HiveMetaStore.java:4613) > at > org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_function(HiveMetaStore.java:4740) > at sun.reflect.GeneratedMethodAccessor57.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105) > at com.sun.proxy.$Proxy21.get_function(Unknown Source) > at > org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getFunction(HiveMetaStoreClient.java:1721) > at sun.reflect.GeneratedMethodAccessor56.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:89) > at com.sun.proxy.$Proxy22.getFunction(Unknown Source) > at org.apache.hadoop.hive.ql.metadata.Hive.getFunction(Hive.java:2662) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfoFromMetastore(FunctionRegistry.java:546) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getQualifiedFunctionInfo(FunctionRegistry.java:579) > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:645) > at > 
org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:652) > at > org.apache.spark.sql.hive.HiveFunctionRegistry.lookupFunction(hiveUdfs.scala:54) > at > org.apache.spark.sql.hive.HiveContext$$anon$3.org$apache$spark$sql$catalyst$analysis$OverrideFunctionRegistry$$super$lookupFunction(HiveContext.scala:376) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44) > at scala.Option.getOrElse(Option.scala:120) > at > org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$class.lookupFunction(FunctionRegistry.scala:44) > at > org.apache.spark.sql.hive.HiveContext$$anon$3.lookupFunction(HiveContext.scala:376) > at > org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:465) > at > org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:463) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222) >
[jira] [Commented] (SPARK-9976) create function does not work
[ https://issues.apache.org/jira/browse/SPARK-9976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15106372#comment-15106372 ] ocean commented on SPARK-9976: -- Now I am migrating from HIVE to Sparksql. Now I encountered two problem. first one: the temporary udf function which extends GenericUDF can create, and you can describe this udf function. but when I use this function, some exception occurs. I don't know where the problem is. spark version: apache 1.5.1 /standalone mode/CLI spark-sql> describe function devicemodel; Function: deviceModel Class: td.enterprise.hive.udfs.ConvertDeviceModel Usage: deviceModel(expr) - return other information Time taken: 0.041 seconds, Fetched 3 row(s) spark-sql> > select devicemodel(p.mobile_id, 'type', 'MOBILE_TYPE') as mobile_type from analytics_device_profile_zh3 p limit 1; 16/01/19 15:19:46 ERROR Executor: Exception in task 0.0 in stage 14.0 (TID 15) com.esotericsoftware.kryo.KryoException: Buffer underflow. Serialization trace: mobileAttributeCahceMap (td.enterprise.hive.udfs.ConvertDeviceModel) at com.esotericsoftware.kryo.io.Input.require(Input.java:156) at com.esotericsoftware.kryo.io.Input.readAscii_slow(Input.java:580) at com.esotericsoftware.kryo.io.Input.readAscii(Input.java:558) at com.esotericsoftware.kryo.io.Input.readString(Input.java:436) at com.esotericsoftware.kryo.serializers.DefaultSerializers$StringSerializer.read(DefaultSerializers.java:157) at com.esotericsoftware.kryo.serializers.DefaultSerializers$StringSerializer.read(DefaultSerializers.java:146) at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:729) at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:134) at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:17) at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:729) at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:134) at 
com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:17) at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:648) at com.esotericsoftware.kryo.serializers.FieldSerializer$ObjectField.read(FieldSerializer.java:605) at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:221) at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:626) at org.apache.spark.sql.hive.HiveShim$HiveFunctionWrapper.deserializeObjectByKryo(HiveShim.scala:134) at org.apache.spark.sql.hive.HiveShim$HiveFunctionWrapper.deserializePlan(HiveShim.scala:150) at org.apache.spark.sql.hive.HiveShim$HiveFunctionWrapper.readExternal(HiveShim.scala:191) at java.io.ObjectInputStream.readExternalData(ObjectInputStream.java:1837) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1796) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990) at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990) at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1706) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1344) at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990) at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990) 
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990) at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915) at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798) at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350) at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990) at