[jira] [Updated] (SPARK-37581) sql hang at planning stage

2021-12-09 Thread ocean (Jira)


 [ 
https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ocean updated SPARK-37581:
--
Description: 
When executing a SQL query, it hangs at the planning stage.

When DPP is disabled, the SQL query can finish very quickly.

We can reproduce this problem through the example below:

create table test.test_a (
day string,
week int,
weekday int)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_a partition (dt=20211126) values('1',1,2);

create table test.test_b (
session_id string,
device_id string,
brand string,
model string,
wx_version string,
os string,
net_work_type string,
app_id string,
app_name string,
col_z string,
page_url string,
page_title string,
olabel string,
otitle string,
source string,
send_dt string,
recv_dt string,
request_time string,
write_time string,
client_ip string,
col_a string,
dt_hour varchar(12),
product string,
channelfrom string,
customer_um string,
kb_code string,
col_b string,
rectype string,
errcode string,
col_c string,
pageid_merge string)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_b partition(dt=20211126)
values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');

 

 

drop table if exists test.test_c;create table if not exists test.test_c stored 
as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
group by dt) t2
on calendar.dt = t2.dt

left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
group by dt) t4
on calendar.dt = t4.dt

left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
group by dt) t5
on calendar.dt = t5.dt

left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
group by dt) t11
on calendar.dt = t11.dt

 

  was:
When executing a SQL query, it hangs at the planning stage.

When DPP is disabled, the SQL query can finish very quickly.

We can reproduce this problem through the example below:

create table test.test_a (
day string,
week int,
weekday int)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_a partition (dt=20211126) values('1',1,2);

create table test.test_b (
session_id string,
device_id string,
brand string,
model string,
wx_version string,
os string,
net_work_type string,
app_id string,
app_name string,
col_z string,
page_url string,
page_title string,
olabel string,
otitle string,
source string,
send_dt string,
recv_dt string,
request_time string,
write_time string,
client_ip string,
col_a string,
dt_hour varchar(12),
product string,
channelfrom string,
customer_um string,
kb_code string,
col_b string,
rectype string,
errcode string,
col_c string,
pageid_merge string)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_b partition(dt=20211126)
values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');

 


> sql hang at planning stage
> --
>
> Key: SPARK-37581
> URL: https://issues.apache.org/jira/browse/SPARK-37581
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 3.1.1, 3.2.0
>Reporter: ocean
>Priority: Major
>
> when exec a sql, this sql hang at planning stage.
> when disable DPP, sql can finish very quickly.
> we can reproduce this  problem through example below:
> create table test.test_a (
> day string,
> week int,
> weekday int)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_a partition (dt=20211126) values('1',1,2);
> create table test.test_b (
> session_id string,
> device_id string,
> brand string,
> model string,
> wx_version 

[jira] [Updated] (SPARK-37581) sql hang at planning stage

2021-12-09 Thread ocean (Jira)


 [ 
https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ocean updated SPARK-37581:
--
Description: 
When executing a SQL query, it hangs at the planning stage.

When DPP is disabled, the SQL query can finish very quickly.

We can reproduce this problem through the example below:

create table test.test_a (
day string,
week int,
weekday int)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_a partition (dt=20211126) values('1',1,2);

create table test.test_b (
session_id string,
device_id string,
brand string,
model string,
wx_version string,
os string,
net_work_type string,
app_id string,
app_name string,
col_z string,
page_url string,
page_title string,
olabel string,
otitle string,
source string,
send_dt string,
recv_dt string,
request_time string,
write_time string,
client_ip string,
col_a string,
dt_hour varchar(12),
product string,
channelfrom string,
customer_um string,
kb_code string,
col_b string,
rectype string,
errcode string,
col_c string,
pageid_merge string)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_b partition(dt=20211126)
values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');

 

  was:
When executing a SQL query, it hangs at the planning stage.

When DPP is disabled, the SQL query can finish very quickly.

We can reproduce this problem through the example below:

create table test.test_a (
day string,
week int,
weekday int)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_a partition (dt=20211126) values('1',1,2);

create table test.test_b (
session_id string,
device_id string,
brand string,
model string,
wx_version string,
os string,
net_work_type string,
app_id string,
app_name string,
col_z string,
page_url string,
page_title string,
olabel string,
otitle string,
source string,
send_dt string,
recv_dt string,
request_time string,
write_time string,
client_ip string,
col_a string,
dt_hour varchar(12),
product string,
channelfrom string,
customer_um string,
kb_code string,
col_b string,
rectype string,
errcode string,
col_c string,
pageid_merge string)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_b partition(dt=20211126)
values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');

create table if not exists test.test_c stored as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,
j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t2
on calendar.dt = t2.dt

left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t4
on calendar.dt = t4.dt
left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t5
on calendar.dt = t5.dt
left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = 

[jira] [Commented] (SPARK-37581) sql hang at planning stage

2021-12-08 Thread ocean (Jira)


[ 
https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17456193#comment-17456193
 ] 

ocean commented on SPARK-37581:
---

I have narrowed it down in the comment. Please have a look.

> sql hang at planning stage
> --
>
> Key: SPARK-37581
> URL: https://issues.apache.org/jira/browse/SPARK-37581
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 3.1.1, 3.2.0
>Reporter: ocean
>Priority: Major
>
> when exec a sql, this sql hang at planning stage.
> when disable DPP, sql can finish very quickly.
> we can reproduce this  problem through example below:
> create table test.test_a (
> day string,
> week int,
> weekday int)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_a partition (dt=20211126) values('1',1,2);
> create table test.test_b (
> session_id string,
> device_id string,
> brand string,
> model string,
> wx_version string,
> os string,
> net_work_type string,
> app_id string,
> app_name string,
> col_z string,
> page_url string,
> page_title string,
> olabel string,
> otitle string,
> source string,
> send_dt string,
> recv_dt string,
> request_time string,
> write_time string,
> client_ip string,
> col_a string,
> dt_hour varchar(12),
> product string,
> channelfrom string,
> customer_um string,
> kb_code string,
> col_b string,
> rectype string,
> errcode string,
> col_c string,
> pageid_merge string)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_b partition(dt=20211126)
> values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');
> create table if not exists test.test_c stored as ORCFILE as
> select calendar.day,calendar.week,calendar.weekday, a_kbs,
> b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,
> j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs
> from (select * from test.test_a where dt = '20211126') calendar
> left join
> (select dt,count(distinct kb_code) as a_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t1
> on calendar.dt = t1.dt
> left join
> (select dt,count(distinct kb_code) as b_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t2
> on calendar.dt = t2.dt
> left join
> (select dt,count(distinct kb_code) as c_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t3
> on calendar.dt = t3.dt
> left join
> (select dt,count(distinct kb_code) as d_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t4
> on calendar.dt = t4.dt
> left join
> (select dt,count(distinct kb_code) as e_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t5
> on calendar.dt = t5.dt
> left join
> (select dt,count(distinct kb_code) as f_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t6
> on calendar.dt = t6.dt
> left join
> (select dt,count(distinct kb_code) as g_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t7
> on calendar.dt = t7.dt
> left join
> (select dt,count(distinct kb_code) as h_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t8
> on calendar.dt = t8.dt
> left join
> (select dt,count(distinct kb_code) as i_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t9
> on calendar.dt = t9.dt
> left join
> (select dt,count(distinct kb_code) as j_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t10
> on calendar.dt = t10.dt
> left join
> (select dt,count(distinct kb_code) as k_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t11
> on calendar.dt = t11.dt
> left join
> (select dt,count(distinct kb_code) as l_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t12
> on calendar.dt = t12.dt
> left join
> (select 

[jira] (SPARK-37581) sql hang at planning stage

2021-12-08 Thread ocean (Jira)


[ https://issues.apache.org/jira/browse/SPARK-37581 ]


ocean deleted comment on SPARK-37581:
---

was (Author: oceaneast):
I have narrowed it down in the comment. Please have a look.

> sql hang at planning stage
> --
>
> Key: SPARK-37581
> URL: https://issues.apache.org/jira/browse/SPARK-37581
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 3.1.1, 3.2.0
>Reporter: ocean
>Priority: Major
>
> when exec a sql, this sql hang at planning stage.
> when disable DPP, sql can finish very quickly.
> we can reproduce this  problem through example below:
> create table test.test_a (
> day string,
> week int,
> weekday int)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_a partition (dt=20211126) values('1',1,2);
> create table test.test_b (
> session_id string,
> device_id string,
> brand string,
> model string,
> wx_version string,
> os string,
> net_work_type string,
> app_id string,
> app_name string,
> col_z string,
> page_url string,
> page_title string,
> olabel string,
> otitle string,
> source string,
> send_dt string,
> recv_dt string,
> request_time string,
> write_time string,
> client_ip string,
> col_a string,
> dt_hour varchar(12),
> product string,
> channelfrom string,
> customer_um string,
> kb_code string,
> col_b string,
> rectype string,
> errcode string,
> col_c string,
> pageid_merge string)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_b partition(dt=20211126)
> values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');
> create table if not exists test.test_c stored as ORCFILE as
> select calendar.day,calendar.week,calendar.weekday, a_kbs,
> b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,
> j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs
> from (select * from test.test_a where dt = '20211126') calendar
> left join
> (select dt,count(distinct kb_code) as a_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t1
> on calendar.dt = t1.dt
> left join
> (select dt,count(distinct kb_code) as b_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t2
> on calendar.dt = t2.dt
> left join
> (select dt,count(distinct kb_code) as c_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t3
> on calendar.dt = t3.dt
> left join
> (select dt,count(distinct kb_code) as d_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t4
> on calendar.dt = t4.dt
> left join
> (select dt,count(distinct kb_code) as e_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t5
> on calendar.dt = t5.dt
> left join
> (select dt,count(distinct kb_code) as f_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t6
> on calendar.dt = t6.dt
> left join
> (select dt,count(distinct kb_code) as g_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t7
> on calendar.dt = t7.dt
> left join
> (select dt,count(distinct kb_code) as h_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t8
> on calendar.dt = t8.dt
> left join
> (select dt,count(distinct kb_code) as i_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t9
> on calendar.dt = t9.dt
> left join
> (select dt,count(distinct kb_code) as j_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t10
> on calendar.dt = t10.dt
> left join
> (select dt,count(distinct kb_code) as k_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t11
> on calendar.dt = t11.dt
> left join
> (select dt,count(distinct kb_code) as l_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t12
> on calendar.dt = t12.dt
> left join
> (select dt,count(distinct kb_code) as m_kbs
> from test.test_b
> where dt = 

[jira] [Updated] (SPARK-37581) sql hang at planning stage

2021-12-08 Thread ocean (Jira)


 [ 
https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ocean updated SPARK-37581:
--
Description: 
When executing a SQL query, it hangs at the planning stage.

When DPP is disabled, the SQL query can finish very quickly.

We can reproduce this problem through the example below:

create table test.test_a (
day string,
week int,
weekday int)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_a partition (dt=20211126) values('1',1,2);

create table test.test_b (
session_id string,
device_id string,
brand string,
model string,
wx_version string,
os string,
net_work_type string,
app_id string,
app_name string,
col_z string,
page_url string,
page_title string,
olabel string,
otitle string,
source string,
send_dt string,
recv_dt string,
request_time string,
write_time string,
client_ip string,
col_a string,
dt_hour varchar(12),
product string,
channelfrom string,
customer_um string,
kb_code string,
col_b string,
rectype string,
errcode string,
col_c string,
pageid_merge string)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_b partition(dt=20211126)
values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');

create table if not exists test.test_c stored as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,
j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t2
on calendar.dt = t2.dt

left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t4
on calendar.dt = t4.dt
left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t5
on calendar.dt = t5.dt
left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t11
on calendar.dt = t11.dt

left join
(select dt,count(distinct kb_code) as l_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t12
on calendar.dt = t12.dt

left join
(select dt,count(distinct kb_code) as m_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t13
on calendar.dt = t13.dt

left join
(select dt,count(distinct kb_code) as n_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t14
on calendar.dt = t14.dt

left join
(select dt,count(distinct kb_code) as o_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t15
on calendar.dt = t15.dt

left join
(select dt,count(distinct kb_code) as p_kbs
from test.test_b
where dt = '20211126'
and 

[jira] [Comment Edited] (SPARK-37581) sql hang at planning stage

2021-12-08 Thread ocean (Jira)


[ 
https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17456154#comment-17456154
 ] 

ocean edited comment on SPARK-37581 at 12/9/21, 6:12 AM:
-

Hi [~hyukjin.kwon]. This SQL query has 19 join operators, but these joins all have
the same pattern. I found that with 10 join operators it costs 17s; with 11
join operators, 39s; with 12 join operators, 120s; and with 13 join
operators it cannot finish.

 

I think we can debug it at 11 join operators to find out why it is so slow. I have
narrowed it down

as below:

===

drop table if exists test.test_c;create table if not exists test.test_c stored 
as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
group by dt) t2
on calendar.dt = t2.dt

left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
group by dt) t4
on calendar.dt = t4.dt

left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
group by dt) t5
on calendar.dt = t5.dt

left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
group by dt) t11
on calendar.dt = t11.dt

 


was (Author: oceaneast):
Hi [~hyukjin.kwon]. This SQL query has 19 join operators, but these joins all have
the same pattern. I found that with 10 join operators it costs 17s; with 11
join operators, 39s; with 12 join operators, 120s; and with 13 join
operators it cannot finish.

 

I think we can debug it at 11 join operators to find out why it is so slow.

===

drop table if exists test.test_c;create table if not exists test.test_c stored 
as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
group by dt) t2
on calendar.dt = t2.dt


left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
group by dt) t4
on calendar.dt = t4.dt

left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
group by dt) t5
on calendar.dt = t5.dt

left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
group by dt) t11
on calendar.dt = t11.dt







 

> sql hang at planning stage
> --
>
> Key: SPARK-37581
> URL: https://issues.apache.org/jira/browse/SPARK-37581
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 3.1.1, 3.2.0
>Reporter: ocean
>Priority: Major
>
> when exec a sql, this sql hang at planning stage.
> when disable DPP, sql can finish normally.
> we 

[jira] [Comment Edited] (SPARK-37581) sql hang at planning stage

2021-12-08 Thread ocean (Jira)


[ 
https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17456154#comment-17456154
 ] 

ocean edited comment on SPARK-37581 at 12/9/21, 6:06 AM:
-

Hi [~hyukjin.kwon]. This SQL query has 19 join operators, but these joins all have
the same pattern. I found that with 10 join operators it costs 17s; with 11
join operators, 39s; with 12 join operators, 120s; and with 13 join
operators it cannot finish.

 

I think we can debug it at 11 join operators to find out why it is so slow.

===

drop table if exists test.test_c;create table if not exists test.test_c stored 
as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
group by dt) t2
on calendar.dt = t2.dt


left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
group by dt) t4
on calendar.dt = t4.dt

left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
group by dt) t5
on calendar.dt = t5.dt

left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
group by dt) t11
on calendar.dt = t11.dt







 


was (Author: oceaneast):
Hi [~hyukjin.kwon]. This SQL query has 19 join operators, but these joins all have
the same pattern. I found that with 10 join operators it costs 17s; with 11
join operators, 39s; with 12 join operators, 120s; and with 13 join
operators it cannot finish.

 

I think we can debug it at 11 join operators to find out why it is so slow.

===

drop table if exists test.test_c;create table if not exists test.test_c stored 
as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t2
on calendar.dt = t2.dt


left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t4
on calendar.dt = t4.dt

left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t5
on calendar.dt = t5.dt

left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t9
on 

[jira] [Commented] (SPARK-37581) sql hang at planning stage

2021-12-08 Thread ocean (Jira)


[ 
https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17456154#comment-17456154
 ] 

ocean commented on SPARK-37581:
---

Hi [~hyukjin.kwon]. This SQL has 19 join operators, but these joins have the 
same pattern. I found that with 10 join operators it costs 17s; with 11 
join operators, 39s; with 12 join operators, 120s; and with 13 join 
operators, it cannot finish.  

 

I think we can debug it with 11 join operators to find why it is so slow.

===

drop table if exists test.test_c;create table if not exists test.test_c stored 
as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t2
on calendar.dt = t2.dt


left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t4
on calendar.dt = t4.dt

left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t5
on calendar.dt = t5.dt

left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t11
on calendar.dt = t11.dt







 

> sql hang at planning stage
> --
>
> Key: SPARK-37581
> URL: https://issues.apache.org/jira/browse/SPARK-37581
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 3.1.1, 3.2.0
>Reporter: ocean
>Priority: Major
>
> when exec a sql, this sql hang at planning stage.
> when disable DPP, sql can finish normally.
> we can reproduce this  problem through example below:
> create table test.test_a (
> day string,
> week int,
> weekday int)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_a partition (dt=20211126) values('1',1,2);
> create table test.test_b (
> session_id string,
> device_id string,
> brand string,
> model string,
> wx_version string,
> os string,
> net_work_type string,
> app_id string,
> app_name string,
> col_z string,
> page_url string,
> page_title string,
> olabel string,
> otitle string,
> source string,
> send_dt string,
> recv_dt string,
> request_time string,
> write_time string,
> client_ip string,
> col_a string,
> dt_hour varchar(12),
> product string,
> channelfrom string,
> customer_um string,
> kb_code string,
> col_b string,
> rectype string,
> errcode string,
> col_c string,
> pageid_merge string)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_b partition(dt=20211126)
> values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');
> create table if not exists test.test_c stored as ORCFILE as
> select calendar.day,calendar.week,calendar.weekday, a_kbs,
> b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,
> 

[jira] [Updated] (SPARK-37581) sql hang at planning stage

2021-12-08 Thread ocean (Jira)


 [ 
https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ocean updated SPARK-37581:
--
Priority: Critical  (was: Major)

> sql hang at planning stage
> --
>
> Key: SPARK-37581
> URL: https://issues.apache.org/jira/browse/SPARK-37581
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 3.1.1, 3.2.0
>Reporter: ocean
>Priority: Critical
>
> when exec a sql, this sql hang at planning stage.
> when disable DPP, sql can finish normally.
> we can reproduce this  problem through example below:
> create table test.test_a (
> day string,
> week int,
> weekday int)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_a partition (dt=20211126) values('1',1,2);
> create table test.test_b (
> session_id string,
> device_id string,
> brand string,
> model string,
> wx_version string,
> os string,
> net_work_type string,
> app_id string,
> app_name string,
> col_z string,
> page_url string,
> page_title string,
> olabel string,
> otitle string,
> source string,
> send_dt string,
> recv_dt string,
> request_time string,
> write_time string,
> client_ip string,
> col_a string,
> dt_hour varchar(12),
> product string,
> channelfrom string,
> customer_um string,
> kb_code string,
> col_b string,
> rectype string,
> errcode string,
> col_c string,
> pageid_merge string)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_b partition(dt=20211126)
> values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');
> create table if not exists test.test_c stored as ORCFILE as
> select calendar.day,calendar.week,calendar.weekday, a_kbs,
> b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,
> j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs
> from (select * from test.test_a where dt = '20211126') calendar
> left join
> (select dt,count(distinct kb_code) as a_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t1
> on calendar.dt = t1.dt
> left join
> (select dt,count(distinct kb_code) as b_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t2
> on calendar.dt = t2.dt
> left join
> (select dt,count(distinct kb_code) as c_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t3
> on calendar.dt = t3.dt
> left join
> (select dt,count(distinct kb_code) as d_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t4
> on calendar.dt = t4.dt
> left join
> (select dt,count(distinct kb_code) as e_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t5
> on calendar.dt = t5.dt
> left join
> (select dt,count(distinct kb_code) as f_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t6
> on calendar.dt = t6.dt
> left join
> (select dt,count(distinct kb_code) as g_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t7
> on calendar.dt = t7.dt
> left join
> (select dt,count(distinct kb_code) as h_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t8
> on calendar.dt = t8.dt
> left join
> (select dt,count(distinct kb_code) as i_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t9
> on calendar.dt = t9.dt
> left join
> (select dt,count(distinct kb_code) as j_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t10
> on calendar.dt = t10.dt
> left join
> (select dt,count(distinct kb_code) as k_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t11
> on calendar.dt = t11.dt
> left join
> (select dt,count(distinct kb_code) as l_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '66'
> and pageid_merge = 'a'
> group by dt) t12
> on calendar.dt = t12.dt
> left join
> (select dt,count(distinct kb_code) as m_kbs
> from test.test_b
> where dt = 

[jira] [Created] (SPARK-37581) sql hang at planning stage

2021-12-08 Thread ocean (Jira)
ocean created SPARK-37581:
-

 Summary: sql hang at planning stage
 Key: SPARK-37581
 URL: https://issues.apache.org/jira/browse/SPARK-37581
 Project: Spark
  Issue Type: Bug
  Components: SQL
Affects Versions: 3.2.0, 3.1.1
Reporter: ocean


When executing a SQL query, it hangs at the planning stage.

When DPP is disabled, the SQL query can finish normally.

we can reproduce this  problem through example below:

create table test.test_a (
day string,
week int,
weekday int)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_a partition (dt=20211126) values('1',1,2);

create table test.test_b (
session_id string,
device_id string,
brand string,
model string,
wx_version string,
os string,
net_work_type string,
app_id string,
app_name string,
col_z string,
page_url string,
page_title string,
olabel string,
otitle string,
source string,
send_dt string,
recv_dt string,
request_time string,
write_time string,
client_ip string,
col_a string,
dt_hour varchar(12),
product string,
channelfrom string,
customer_um string,
kb_code string,
col_b string,
rectype string,
errcode string,
col_c string,
pageid_merge string)
partitioned by (
dt varchar(8))
stored as orc;

insert into test.test_b partition(dt=20211126)
values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');

create table if not exists test.test_c stored as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,
j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t2
on calendar.dt = t2.dt


left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t4
on calendar.dt = t4.dt
left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t5
on calendar.dt = t5.dt
left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t11
on calendar.dt = t11.dt

left join
(select dt,count(distinct kb_code) as l_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t12
on calendar.dt = t12.dt

left join
(select dt,count(distinct kb_code) as m_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t13
on calendar.dt = t13.dt

left join
(select dt,count(distinct kb_code) as n_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge = 'a'
group by dt) t14
on calendar.dt = t14.dt

left join
(select dt,count(distinct kb_code) as o_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '66'
and pageid_merge 

[jira] [Updated] (SPARK-37344) split function behave differently between spark 2.3 and spark 3.2

2021-11-15 Thread ocean (Jira)


 [ 
https://issues.apache.org/jira/browse/SPARK-37344?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ocean updated SPARK-37344:
--
Labels: incorrect  (was: )

> split function behave differently between spark 2.3 and spark 3.2
> -
>
> Key: SPARK-37344
> URL: https://issues.apache.org/jira/browse/SPARK-37344
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 3.1.1, 3.1.2, 3.2.0
>Reporter: ocean
>Priority: Major
>  Labels: incorrect
>
> While using the split function in SQL, it behaves differently between 2.3 and 
> 3.2, which causes incorrect results.
> we can use this sql to reproduce this problem:
>  
> create table split_test ( id int,name string)
> insert into split_test values(1,"abc;def")
> explain extended select split(name,';') from split_test
>  
> spark3:
> spark-sql> Explain extended select split(name,';') from split_test;
> == Parsed Logical Plan ==
> 'Project [unresolvedalias('split('name, \\;), None)]
> +- 'UnresolvedRelation [split_test], [], false
>  
> spark2:
>  
> spark-sql> Explain extended select split(name,';') from split_test;
> == Parsed Logical Plan ==
> 'Project [unresolvedalias('split('name, \;), None)]
> +- 'UnresolvedRelation split_test
>  
> It looks like the deal of escape is different



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

-
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org



[jira] [Created] (SPARK-37344) split function behave differently between spark 2.3 and spark 3.2

2021-11-15 Thread ocean (Jira)
ocean created SPARK-37344:
-

 Summary: split function behave differently between spark 2.3 and 
spark 3.2
 Key: SPARK-37344
 URL: https://issues.apache.org/jira/browse/SPARK-37344
 Project: Spark
  Issue Type: Bug
  Components: SQL
Affects Versions: 3.2.0, 3.1.2, 3.1.1
Reporter: ocean


While using the split function in SQL, it behaves differently between 2.3 and 
3.2, which causes incorrect results.

we can use this sql to reproduce this problem:

 

create table split_test ( id int,name string)

insert into split_test values(1,"abc;def")

explain extended select split(name,';') from split_test

 

spark3:

spark-sql> Explain extended select split(name,';') from split_test;

== Parsed Logical Plan ==

'Project [unresolvedalias('split('name, \\;), None)]

+- 'UnresolvedRelation [split_test], [], false

 

spark2:

 

spark-sql> Explain extended select split(name,';') from split_test;

== Parsed Logical Plan ==

'Project [unresolvedalias('split('name, \;), None)]

+- 'UnresolvedRelation split_test

 

It looks like the deal of escape is different



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

-
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org



[jira] [Created] (SPARK-36552) varchar datatype behave differently on hive table and datasource table

2021-08-20 Thread ocean (Jira)
ocean created SPARK-36552:
-

 Summary: varchar datatype behave differently on  hive table  and 
datasource table
 Key: SPARK-36552
 URL: https://issues.apache.org/jira/browse/SPARK-36552
 Project: Spark
  Issue Type: Bug
  Components: SQL
Affects Versions: 3.1.1, 3.1.0, 2.3.1
Reporter: ocean


in spark 3.1.X, when set spark.sql.hive.convertMetastoreOrc=false,and 

spark.sql.legacy.charVarcharAsString=true.

Execute the following sql:

CREATE TABLE t (col varchar(2)) stored as orc;
INSERT INTO t SELECT 'aaa';
select * from t;

result is aa

 

But when set spark.sql.hive.convertMetastoreOrc=true,and 

spark.sql.legacy.charVarcharAsString=true

also execute the sql, the result is "aaa"

 

 

 

 



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

-
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org



[jira] [Updated] (SPARK-36552) varchar datatype behave differently on hive table and datasource table

2021-08-20 Thread ocean (Jira)


 [ 
https://issues.apache.org/jira/browse/SPARK-36552?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ocean updated SPARK-36552:
--
Affects Version/s: (was: 3.1.0)
   3.1.2

> varchar datatype behave differently on  hive table  and datasource table
> 
>
> Key: SPARK-36552
> URL: https://issues.apache.org/jira/browse/SPARK-36552
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 2.3.1, 3.1.1, 3.1.2
>Reporter: ocean
>Priority: Major
>
> in spark 3.1.X, when set spark.sql.hive.convertMetastoreOrc=false,and 
> spark.sql.legacy.charVarcharAsString=true.
> Execute the following sql:
> CREATE TABLE t (col varchar(2)) stored as orc;
> INSERT INTO t SELECT 'aaa';
> select * from t;
> result is aa
>  
> But when set spark.sql.hive.convertMetastoreOrc=true,and 
> spark.sql.legacy.charVarcharAsString=true
> also execute the sql, the result is "aaa"
>  
>  
>  
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

-
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org



[jira] [Commented] (SPARK-29015) Can not support "add jar" on JDK 11

2019-09-07 Thread ocean (Jira)


[ 
https://issues.apache.org/jira/browse/SPARK-29015?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16924882#comment-16924882
 ] 

ocean commented on SPARK-29015:
---

I try to fix it

> Can not support "add jar" on JDK 11
> ---
>
> Key: SPARK-29015
> URL: https://issues.apache.org/jira/browse/SPARK-29015
> Project: Spark
>  Issue Type: Sub-task
>  Components: SQL
>Affects Versions: 3.0.0
>Reporter: Yuming Wang
>Priority: Major
>
> How to reproduce:
> Case 1:
> {code:bash}
> export JAVA_HOME=/usr/lib/jdk-11.0.3
> export PATH=$JAVA_HOME/bin:$PATH
> build/sbt clean package -Phive -Phadoop-3.2 -Phive-thriftserver
> export SPARK_PREPEND_CLASSES=true
> sbin/start-thriftserver.sh
> bin/beeline -u jdbc:hive2://localhost:1
> {code}
> {noformat}
> 0: jdbc:hive2://localhost:1> add jar 
> /root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar;
> INFO  : Added 
> [/root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar]
>  to class path
> INFO  : Added resources: 
> [/root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar]
> +-+
> | result  |
> +-+
> +-+
> No rows selected (0.381 seconds)
> 0: jdbc:hive2://localhost:1> CREATE TABLE addJar(key string) ROW FORMAT 
> SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
> +-+
> | Result  |
> +-+
> +-+
> No rows selected (0.613 seconds)
> 0: jdbc:hive2://localhost:1> select * from addJar;
> Error: Error running query: java.lang.RuntimeException: 
> java.lang.ClassNotFoundException: org.apache.hive.hcatalog.data.JsonSerDe 
> (state=,code=0)
> {noformat}
> Case 2:
> {noformat}
> spark-sql> add jar 
> /root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar;
> ADD JAR 
> /root/.m2/repository/org/apache/hive/hcatalog/hive-hcatalog-core/2.3.6/hive-hcatalog-core-2.3.6.jar
> spark-sql> CREATE TABLE addJar(key string) ROW FORMAT SERDE 
> 'org.apache.hive.hcatalog.data.JsonSerDe';
> spark-sql> select * from addJar;
> 19/09/07 03:06:54 ERROR SparkSQLDriver: Failed in [select * from addJar]
> java.lang.RuntimeException: java.lang.ClassNotFoundException: 
> org.apache.hive.hcatalog.data.JsonSerDe
>   at 
> org.apache.hadoop.hive.ql.plan.TableDesc.getDeserializerClass(TableDesc.java:79)
>   at 
> org.apache.spark.sql.hive.execution.HiveTableScanExec.addColumnMetadataToConf(HiveTableScanExec.scala:123)
>   at 
> org.apache.spark.sql.hive.execution.HiveTableScanExec.hadoopConf$lzycompute(HiveTableScanExec.scala:101)
>   at 
> org.apache.spark.sql.hive.execution.HiveTableScanExec.hadoopConf(HiveTableScanExec.scala:98)
>   at 
> org.apache.spark.sql.hive.execution.HiveTableScanExec.hadoopReader$lzycompute(HiveTableScanExec.scala:110)
>   at 
> org.apache.spark.sql.hive.execution.HiveTableScanExec.hadoopReader(HiveTableScanExec.scala:105)
>   at 
> org.apache.spark.sql.hive.execution.HiveTableScanExec.$anonfun$doExecute$1(HiveTableScanExec.scala:188)
>   at org.apache.spark.util.Utils$.withDummyCallSite(Utils.scala:2488)
>   at 
> org.apache.spark.sql.hive.execution.HiveTableScanExec.doExecute(HiveTableScanExec.scala:188)
>   at 
> org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:189)
>   at 
> org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:227)
>   at 
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at 
> org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:224)
>   at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:185)
>   at 
> org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:329)
>   at 
> org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:378)
>   at 
> org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:408)
>   at 
> org.apache.spark.sql.execution.HiveResult$.hiveResultString(HiveResult.scala:52)
>   at 
> org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.$anonfun$run$1(SparkSQLDriver.scala:65)
>   at 
> org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$4(SQLExecution.scala:100)
>   at 
> org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
>   at 
> org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:87)
>   at 
> org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:65)
>   at 
> org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:367)
>   at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:403)
>   

[jira] [Commented] (SPARK-9976) create function do not work

2016-01-19 Thread ocean (JIRA)

[ 
https://issues.apache.org/jira/browse/SPARK-9976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15106517#comment-15106517
 ] 

ocean commented on SPARK-9976:
--

Regarding the second problem: I just found that the function only cannot be 
described, but it can still be used. Just a little problem.

> create function do not work
> ---
>
> Key: SPARK-9976
> URL: https://issues.apache.org/jira/browse/SPARK-9976
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 1.4.0, 1.4.1, 1.5.0
> Environment: spark 1.4.1 yarn 2.2.0
>Reporter: cen yuhai
>
> I use beeline to connect to ThriftServer, but add jar can not work, so I use 
> create function , see the link below.
> http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cm_mc_hive_udf.html
> I do as blow:
> {code}
> create function gdecodeorder as 'com.hive.udf.GOrderDecode' USING JAR 
> 'hdfs://mycluster/user/spark/lib/gorderdecode.jar'; 
> {code}
> It returns Ok, and I connect to the metastore, I see records in table FUNCS.
> {code}
> select gdecodeorder(t1)  from tableX  limit 1;
> {code}
> It returns error 'Couldn't find function default.gdecodeorder'
> This is the Exception
> {code}
> 15/08/14 14:53:51 ERROR UserGroupInformation: PriviledgedActionException 
> as:xiaoju (auth:SIMPLE) cause:org.apache.hive.service.cli.HiveSQLException: 
> java.lang.RuntimeException: Couldn't find function default.gdecodeorder
> 15/08/14 15:04:47 ERROR RetryingHMSHandler: 
> MetaException(message:NoSuchObjectException(message:Function 
> default.t_gdecodeorder does not exist))
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newMetaException(HiveMetaStore.java:4613)
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_function(HiveMetaStore.java:4740)
> at sun.reflect.GeneratedMethodAccessor57.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at 
> org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105)
> at com.sun.proxy.$Proxy21.get_function(Unknown Source)
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getFunction(HiveMetaStoreClient.java:1721)
> at sun.reflect.GeneratedMethodAccessor56.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at 
> org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:89)
> at com.sun.proxy.$Proxy22.getFunction(Unknown Source)
> at org.apache.hadoop.hive.ql.metadata.Hive.getFunction(Hive.java:2662)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfoFromMetastore(FunctionRegistry.java:546)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getQualifiedFunctionInfo(FunctionRegistry.java:579)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:645)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:652)
> at 
> org.apache.spark.sql.hive.HiveFunctionRegistry.lookupFunction(hiveUdfs.scala:54)
> at 
> org.apache.spark.sql.hive.HiveContext$$anon$3.org$apache$spark$sql$catalyst$analysis$OverrideFunctionRegistry$$super$lookupFunction(HiveContext.scala:376)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44)
> at scala.Option.getOrElse(Option.scala:120)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$class.lookupFunction(FunctionRegistry.scala:44)
> at 
> org.apache.spark.sql.hive.HiveContext$$anon$3.lookupFunction(HiveContext.scala:376)
> at 
> org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:465)
> at 
> org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:463)
> at 
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222)
> at 
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222)
> at 
> org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:51)
> at 
> org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:221)
> at 
> 

[jira] [Commented] (SPARK-9976) create function do not work

2016-01-18 Thread ocean (JIRA)

[ 
https://issues.apache.org/jira/browse/SPARK-9976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15105168#comment-15105168
 ] 

ocean commented on SPARK-9976:
--

Does Spark SQL currently support a temporary UDF function which extends UDF?
I found that a temporary function which extends GenericUDF runs OK,
but a temporary function which extends UDF does not work.

> create function do not work
> ---
>
> Key: SPARK-9976
> URL: https://issues.apache.org/jira/browse/SPARK-9976
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 1.4.0, 1.4.1, 1.5.0
> Environment: spark 1.4.1 yarn 2.2.0
>Reporter: cen yuhai
>
> I use beeline to connect to ThriftServer, but add jar can not work, so I use 
> create function , see the link below.
> http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cm_mc_hive_udf.html
> I do as below:
> {code}
> create function gdecodeorder as 'com.hive.udf.GOrderDecode' USING JAR 
> 'hdfs://mycluster/user/spark/lib/gorderdecode.jar'; 
> {code}
> It returns Ok, and I connect to the metastore, I see records in table FUNCS.
> {code}
> select gdecodeorder(t1)  from tableX  limit 1;
> {code}
> It returns error 'Couldn't find function default.gdecodeorder'
> This is the Exception
> {code}
> 15/08/14 14:53:51 ERROR UserGroupInformation: PriviledgedActionException 
> as:xiaoju (auth:SIMPLE) cause:org.apache.hive.service.cli.HiveSQLException: 
> java.lang.RuntimeException: Couldn't find function default.gdecodeorder
> 15/08/14 15:04:47 ERROR RetryingHMSHandler: 
> MetaException(message:NoSuchObjectException(message:Function 
> default.t_gdecodeorder does not exist))
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newMetaException(HiveMetaStore.java:4613)
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_function(HiveMetaStore.java:4740)
> at sun.reflect.GeneratedMethodAccessor57.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at 
> org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105)
> at com.sun.proxy.$Proxy21.get_function(Unknown Source)
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getFunction(HiveMetaStoreClient.java:1721)
> at sun.reflect.GeneratedMethodAccessor56.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at 
> org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:89)
> at com.sun.proxy.$Proxy22.getFunction(Unknown Source)
> at org.apache.hadoop.hive.ql.metadata.Hive.getFunction(Hive.java:2662)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfoFromMetastore(FunctionRegistry.java:546)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getQualifiedFunctionInfo(FunctionRegistry.java:579)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:645)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:652)
> at 
> org.apache.spark.sql.hive.HiveFunctionRegistry.lookupFunction(hiveUdfs.scala:54)
> at 
> org.apache.spark.sql.hive.HiveContext$$anon$3.org$apache$spark$sql$catalyst$analysis$OverrideFunctionRegistry$$super$lookupFunction(HiveContext.scala:376)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44)
> at scala.Option.getOrElse(Option.scala:120)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$class.lookupFunction(FunctionRegistry.scala:44)
> at 
> org.apache.spark.sql.hive.HiveContext$$anon$3.lookupFunction(HiveContext.scala:376)
> at 
> org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:465)
> at 
> org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:463)
> at 
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222)
> at 
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222)
> at 
> org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:51)
> at 
> 

[jira] [Commented] (SPARK-11948) Permanent UDF not work

2016-01-18 Thread ocean (JIRA)

[ 
https://issues.apache.org/jira/browse/SPARK-11948?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15105170#comment-15105170
 ] 

ocean commented on SPARK-11948:
---

Does Spark SQL now support temporary UDF functions which extend UDF?
I found that a temporary UDF function which extends GenericUDF runs OK,
but a temporary UDF function which extends UDF does not work.

my spark version is 1.5.2

> Permanent UDF not work
> --
>
> Key: SPARK-11948
> URL: https://issues.apache.org/jira/browse/SPARK-11948
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 1.5.1
>Reporter: Weizhong
>Priority: Minor
>
> We create a function,
> {noformat}
> add jar /home/test/smartcare-udf-0.0.1-SNAPSHOT.jar;
> create function arr_greater_equal as 
> 'smartcare.dac.hive.udf.UDFArrayGreaterEqual';
> {noformat}
>  but "show functions" don't display, and when we create the same function 
> again, it throw exception as below:
> {noformat}
> Error: org.apache.spark.sql.execution.QueryExecutionException: FAILED: 
> Execution Error, return code 1 from 
> org.apache.hadoop.hive.ql.exec.FunctionTask. 
> AlreadyExistsException(message:Function arr_greater_equal already exists) 
> (state=,code=0)
> {noformat}
> But if we use this function, it throw exception as below:
> {noformat}
> Error: org.apache.spark.sql.AnalysisException: undefined function 
> arr_greater_equal; line 1 pos 119 (state=,code=0)
> {noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

-
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org



[jira] [Commented] (SPARK-9976) create function do not work

2016-01-18 Thread ocean (JIRA)

[ 
https://issues.apache.org/jira/browse/SPARK-9976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15106385#comment-15106385
 ] 

ocean commented on SPARK-9976:
--

The second problem is:
some temporary UDF functions which extend UDF can be created, but when you
describe one,
it says that the function is not found.

spark-sql> create temporary FUNCTION dict AS 
'td.enterprise.hive.udfs.ConvertDict';
OK
Time taken: 0.04 seconds
spark-sql> describe function dict;
Function: dict is not found.
Time taken: 0.033 seconds, Fetched 1 row(s)

> create function do not work
> ---
>
> Key: SPARK-9976
> URL: https://issues.apache.org/jira/browse/SPARK-9976
> Project: Spark
>  Issue Type: Bug
>  Components: SQL
>Affects Versions: 1.4.0, 1.4.1, 1.5.0
> Environment: spark 1.4.1 yarn 2.2.0
>Reporter: cen yuhai
>
> I use beeline to connect to ThriftServer, but add jar can not work, so I use 
> create function , see the link below.
> http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cm_mc_hive_udf.html
> I do as below:
> {code}
> create function gdecodeorder as 'com.hive.udf.GOrderDecode' USING JAR 
> 'hdfs://mycluster/user/spark/lib/gorderdecode.jar'; 
> {code}
> It returns Ok, and I connect to the metastore, I see records in table FUNCS.
> {code}
> select gdecodeorder(t1)  from tableX  limit 1;
> {code}
> It returns error 'Couldn't find function default.gdecodeorder'
> This is the Exception
> {code}
> 15/08/14 14:53:51 ERROR UserGroupInformation: PriviledgedActionException 
> as:xiaoju (auth:SIMPLE) cause:org.apache.hive.service.cli.HiveSQLException: 
> java.lang.RuntimeException: Couldn't find function default.gdecodeorder
> 15/08/14 15:04:47 ERROR RetryingHMSHandler: 
> MetaException(message:NoSuchObjectException(message:Function 
> default.t_gdecodeorder does not exist))
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newMetaException(HiveMetaStore.java:4613)
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.get_function(HiveMetaStore.java:4740)
> at sun.reflect.GeneratedMethodAccessor57.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at 
> org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105)
> at com.sun.proxy.$Proxy21.get_function(Unknown Source)
> at 
> org.apache.hadoop.hive.metastore.HiveMetaStoreClient.getFunction(HiveMetaStoreClient.java:1721)
> at sun.reflect.GeneratedMethodAccessor56.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at 
> org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:89)
> at com.sun.proxy.$Proxy22.getFunction(Unknown Source)
> at org.apache.hadoop.hive.ql.metadata.Hive.getFunction(Hive.java:2662)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfoFromMetastore(FunctionRegistry.java:546)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getQualifiedFunctionInfo(FunctionRegistry.java:579)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:645)
> at 
> org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionInfo(FunctionRegistry.java:652)
> at 
> org.apache.spark.sql.hive.HiveFunctionRegistry.lookupFunction(hiveUdfs.scala:54)
> at 
> org.apache.spark.sql.hive.HiveContext$$anon$3.org$apache$spark$sql$catalyst$analysis$OverrideFunctionRegistry$$super$lookupFunction(HiveContext.scala:376)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$$anonfun$lookupFunction$2.apply(FunctionRegistry.scala:44)
> at scala.Option.getOrElse(Option.scala:120)
> at 
> org.apache.spark.sql.catalyst.analysis.OverrideFunctionRegistry$class.lookupFunction(FunctionRegistry.scala:44)
> at 
> org.apache.spark.sql.hive.HiveContext$$anon$3.lookupFunction(HiveContext.scala:376)
> at 
> org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:465)
> at 
> org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions$$anonfun$apply$13$$anonfun$applyOrElse$5.applyOrElse(Analyzer.scala:463)
> at 
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222)
> at 
> org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:222)
>   

[jira] [Commented] (SPARK-9976) create function do not work

2016-01-18 Thread ocean (JIRA)

[ 
https://issues.apache.org/jira/browse/SPARK-9976?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15106372#comment-15106372
 ] 

ocean commented on SPARK-9976:
--

Now I am migrating from Hive to Spark SQL, and I have encountered two problems.

first one:
A temporary UDF function which extends GenericUDF can be created, and you can
describe this UDF function,
but when I use this function, an exception occurs.
I don't know where the problem is.
spark version: apache 1.5.1 /standalone mode/CLI

spark-sql> describe function devicemodel;
Function: deviceModel
Class: td.enterprise.hive.udfs.ConvertDeviceModel
Usage: deviceModel(expr) - return other information
Time taken: 0.041 seconds, Fetched 3 row(s)

spark-sql>  
 >  select devicemodel(p.mobile_id, 'type', 'MOBILE_TYPE') as 
mobile_type from analytics_device_profile_zh3 p limit 1;
16/01/19 15:19:46 ERROR Executor: Exception in task 0.0 in stage 14.0 (TID 15)
com.esotericsoftware.kryo.KryoException: Buffer underflow.
Serialization trace:
mobileAttributeCahceMap (td.enterprise.hive.udfs.ConvertDeviceModel)
at com.esotericsoftware.kryo.io.Input.require(Input.java:156)
at com.esotericsoftware.kryo.io.Input.readAscii_slow(Input.java:580)
at com.esotericsoftware.kryo.io.Input.readAscii(Input.java:558)
at com.esotericsoftware.kryo.io.Input.readString(Input.java:436)
at 
com.esotericsoftware.kryo.serializers.DefaultSerializers$StringSerializer.read(DefaultSerializers.java:157)
at 
com.esotericsoftware.kryo.serializers.DefaultSerializers$StringSerializer.read(DefaultSerializers.java:146)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:729)
at 
com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:134)
at 
com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:17)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:729)
at 
com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:134)
at 
com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:17)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:648)
at 
com.esotericsoftware.kryo.serializers.FieldSerializer$ObjectField.read(FieldSerializer.java:605)
at 
com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:221)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:626)
at 
org.apache.spark.sql.hive.HiveShim$HiveFunctionWrapper.deserializeObjectByKryo(HiveShim.scala:134)
at 
org.apache.spark.sql.hive.HiveShim$HiveFunctionWrapper.deserializePlan(HiveShim.scala:150)
at 
org.apache.spark.sql.hive.HiveShim$HiveFunctionWrapper.readExternal(HiveShim.scala:191)
at 
java.io.ObjectInputStream.readExternalData(ObjectInputStream.java:1837)
at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1796)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at 
java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915)
at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at 
java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915)
at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1706)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1344)
at 
java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915)
at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at 
java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915)
at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at 
java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915)
at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at 
java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
at