Robert,
You do not need to use qgen.exe to generate queries as you are not
running the TPC-H benchmark test. Attached is an example of the 22
sample TPC-H queries according to the benchmark.
We have not tested using the TPC-H queries for this particular patch and
only use the TPC-H database as a large, skewed data set. The simpler
queries we test involve joins of Part-Lineitem or Supplier-Lineitem such
as:
Select * from part, lineitem where p_partkey = l_partkey
OR
Select count(*) from part, lineitem where p_partkey = l_partkey
The count(*) version is usually more useful for comparisons as the
generation of output tuples on the client side (say with pgadmin)
dominates the actual time to complete the query.
To isolate query costs, we also test using a simple server-side
function. The setup description I have also attached.
I would be happy to help in any way I can.
Bryce is currently working on an updated patch according to your
suggestions.
--
Dr. Ramon Lawrence
Assistant Professor, Department of Computer Science, University of
British Columbia Okanagan
E-mail: ramon.lawre...@ubc.ca
> -----Original Message-----
> From: pgsql-hackers-ow...@postgresql.org [mailto:pgsql-hackers-
> ow...@postgresql.org] On Behalf Of Robert Haas
> Sent: December 17, 2008 7:54 PM
> To: Lawrence, Ramon
> Cc: Tom Lane; pgsql-hackers@postgresql.org; Bryce Cutt
> Subject: Re: [HACKERS] Proposed Patch to Improve Performance of Multi-
> Batch Hash Join for Skewed Data Sets
>
> Dr. Lawrence:
>
> I'm still working on reviewing this patch. I've managed to load the
> sample TPCH data from tpch1g1z.zip after changing the line endings to
> UNIX-style and chopping off the trailing vertical bars. (If anyone is
> interested, I have the results of pg_dump | bzip2 -9 on the resulting
> database, which I would be happy to upload if someone has server
> space. It is about 250MB.)
>
> But, I'm not sure quite what to do in terms of generating queries.
> TPCHSkew contains QGEN.EXE, but that seems to require that you provide
> template queries as input, and I'm not sure where to get the
> templates.
>
> Any suggestions?
>
> Thanks,
>
> ...Robert
>
> --
> Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
> To make changes to your subscription:
> http://www.postgresql.org/mailpref/pgsql-hackers
-- using 101000000 as a seed to the RNG
-- QUERY_1
select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
lineitem
where
l_shipdate <= date '1998-09-01'
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus;
-- QUERY_2
select
s_acctbal,
s_name,
n_name,
p_partkey,
p_mfgr,
s_address,
s_phone,
s_comment
from
part,
supplier,
partsupp,
nation,
region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and p_size = 28
and p_type like '%STEEL'
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'MIDDLE EAST'
and ps_supplycost = (
select
min(ps_supplycost)
from
partsupp,
supplier,
nation,
region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'MIDDLE EAST'
)
order by
s_acctbal desc,
n_name,
s_name,
p_partkey
limit 100;
-- QUERY_3
select
l_orderkey,
sum(l_extendedprice * (1 - l_discount)) as revenue,
o_orderdate,
o_shippriority
from
customer,
orders,
lineitem
where
c_mktsegment = 'BUILDING'
and c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate < date '1995-03-31'
and l_shipdate > date '1995-03-31'
group by
l_orderkey,
o_orderdate,
o_shippriority
order by
revenue desc,
o_orderdate
limit 10;
-- QUERY_4
select
o_orderpriority,
count(*) as order_count
from
orders
where
o_orderdate >= date '1997-10-01'
and o_orderdate < date '1998-02-01'
and exists (
select
*
from
lineitem
where
l_orderkey = o_orderkey
and l_commitdate < l_receiptdate
)
group by
o_orderpriority
order by
o_orderpriority;
-- QUERY_5
select
n_name,
sum(l_extendedprice * (1 - l_discount)) as revenue
from
customer,
orders,
lineitem,
supplier,
nation,
region
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and l_suppkey = s_suppkey
and c_nationkey = s_nationkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'AMERICA'
and o_orderdate >= date '1994-01-01'
and o_orderdate < date '1995-01-01'
group by
n_name
order by
revenue desc;
-- QUERY_6
select
sum(l_extendedprice * l_discount) as revenue
from
lineitem
where
l_shipdate >= date '1994-01-01'
and l_shipdate < date '1995-01-01'
and l_discount between 0.05 - 0.01 and 0.05 + 0.01
and l_quantity < 25;
-- QUERY_7
select
supp_nation,
cust_nation,
l_year,
sum(volume) as revenue
from
(
select
n1.n_name as supp_nation,
n2.n_name as cust_nation,
extract(year from l_shipdate) as l_year,
l_extendedprice * (1 - l_discount) as volume
from
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2
where
s_suppkey = l_suppkey
and o_orderkey = l_orderkey
and c_custkey = o_custkey
and s_nationkey = n1.n_nationkey
and c_nationkey = n2.n_nationkey
and (
(n1.n_name = 'IRAN' and n2.n_name = 'UNITED
STATES')
or (n1.n_name = 'UNITED STATES' and n2.n_name =
'IRAN')
)
and l_shipdate between date '1995-01-01' and date
'1996-12-31'
) as shipping
group by
supp_nation,
cust_nation,
l_year
order by
supp_nation,
cust_nation,
l_year;
-- QUERY_8
select
o_year,
sum(case
when nation = 'UNITED STATES' then volume
else 0
end) / sum(volume) as mkt_share
from
(
select
extract(year from o_orderdate) as o_year,
l_extendedprice * (1 - l_discount) as volume,
n2.n_name as nation
from
part,
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2,
region
where
p_partkey = l_partkey
and s_suppkey = l_suppkey
and l_orderkey = o_orderkey
and o_custkey = c_custkey
and c_nationkey = n1.n_nationkey
and n1.n_regionkey = r_regionkey
and r_name = 'AMERICA'
and s_nationkey = n2.n_nationkey
and o_orderdate between date '1995-01-01' and date
'1996-12-31'
and p_type = 'MEDIUM BRUSHED STEEL'
) as all_nations
group by
o_year
order by
o_year;
-- QUERY_9
select
nation,
o_year,
sum(amount) as sum_profit
from
(
select
n_name as nation,
extract(year from o_orderdate) as o_year,
l_extendedprice * (1 - l_discount) - ps_supplycost *
l_quantity as amount
from
part,
supplier,
lineitem,
partsupp,
orders,
nation
where
s_suppkey = l_suppkey
and ps_suppkey = l_suppkey
and ps_partkey = l_partkey
and p_partkey = l_partkey
and o_orderkey = l_orderkey
and s_nationkey = n_nationkey
and p_name like '%lime%'
) as profit
group by
nation,
o_year
order by
nation,
o_year desc;
-- QUERY_10
select
c_custkey,
c_name,
sum(l_extendedprice * (1 - l_discount)) as revenue,
c_acctbal,
n_name,
c_address,
c_phone,
c_comment
from
customer,
orders,
lineitem,
nation
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate >= date '1994-09-01'
and o_orderdate < date '1994-12-01'
and l_returnflag = 'R'
and c_nationkey = n_nationkey
group by
c_custkey,
c_name,
c_acctbal,
c_phone,
n_name,
c_address,
c_comment
order by
revenue desc
limit 20;
-- QUERY_11
select
ps_partkey,
sum(ps_supplycost * ps_availqty) as value
from
partsupp,
supplier,
nation
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'KENYA'
group by
ps_partkey having
sum(ps_supplycost * ps_availqty) > (
select
sum(ps_supplycost * ps_availqty) * 0.0001000000
from
partsupp,
supplier,
nation
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'KENYA'
)
order by
value desc;
-- QUERY_12
select
l_shipmode,
sum(case
when o_orderpriority = '1-URGENT'
or o_orderpriority = '2-HIGH'
then 1
else 0
end) as high_line_count,
sum(case
when o_orderpriority <> '1-URGENT'
and o_orderpriority <> '2-HIGH'
then 1
else 0
end) as low_line_count
from
orders,
lineitem
where
o_orderkey = l_orderkey
and l_shipmode in ('SHIP', 'AIR')
and l_commitdate < l_receiptdate
and l_shipdate < l_commitdate
and l_receiptdate >= date '1993-01-01'
and l_receiptdate < date '1994-01-01'
group by
l_shipmode
order by
l_shipmode;
-- QUERY_13
select
c_count,
count(*) as custdist
from
(
select
c_custkey,
count(o_orderkey)
from
customer left outer join orders on
c_custkey = o_custkey
and o_comment not like '%special%packages%'
group by
c_custkey
) as c_orders (c_custkey, c_count)
group by
c_count
order by
custdist desc,
c_count desc;
-- QUERY_14
select
100.00 * sum(case
when p_type like 'PROMO%'
then l_extendedprice * (1 - l_discount)
else 0
end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
from
lineitem,
part
where
l_partkey = p_partkey
and l_shipdate >= date '1993-09-01'
and l_shipdate < date '1993-10-01';
-- QUERY_15
create view revenue0 (supplier_no, total_revenue) as
select
l_suppkey,
sum(l_extendedprice * (1 - l_discount))
from
lineitem
where
l_shipdate >= date '1995-07-01'
and l_shipdate < date '1995-10-01'
group by
l_suppkey;
select
s_suppkey,
s_name,
s_address,
s_phone,
total_revenue
from
supplier,
revenue0
where
s_suppkey = supplier_no
and total_revenue = (
select
max(total_revenue)
from
revenue0
)
order by
s_suppkey;
drop view revenue0;
-- QUERY_16
select
p_brand,
p_type,
p_size,
count(distinct ps_suppkey) as supplier_cnt
from
partsupp,
part
where
p_partkey = ps_partkey
and p_brand <> 'Brand#45'
and p_type not like 'ECONOMY BURNISHED%'
and p_size in (13, 11, 15, 47, 34, 3, 27, 5)
and ps_suppkey not in (
select
s_suppkey
from
supplier
where
s_comment like '%Customer%Complaints%'
)
group by
p_brand,
p_type,
p_size
order by
supplier_cnt desc,
p_brand,
p_type,
p_size;
-- QUERY_17
select
sum(l_extendedprice) / 7.0 as avg_yearly
from
lineitem,
part
where
p_partkey = l_partkey
and p_brand = 'Brand#54'
and p_container = 'WRAP DRUM'
and l_quantity < (
select
0.2 * avg(l_quantity)
from
lineitem
where
l_partkey = p_partkey
);
A more efficient version of the query above:
select
sum(l_extendedprice) / 7.0 as avg_yearly
from
( select
l_partkey, 0.2 * avg(l_quantity) as qty
from
lineitem
group by l_partkey
) as avgtmp,
lineitem,
part
where
p_partkey = lineitem.l_partkey and avgtmp.l_partkey = lineitem.l_partkey
and p_brand = 'Brand#54'
and p_container = 'WRAP DRUM'
and l_quantity < avgtmp.qty
-- QUERY_18
select
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice,
sum(l_quantity)
from
customer,
orders,
lineitem
where
o_orderkey in (
select
l_orderkey
from
lineitem
group by
l_orderkey having
sum(l_quantity) > 314
)
and c_custkey = o_custkey
and o_orderkey = l_orderkey
group by
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice
order by
o_totalprice desc,
o_orderdate
limit 100;
-- QUERY_19
select
sum(l_extendedprice* (1 - l_discount)) as revenue
from
lineitem,
part
where
(
p_partkey = l_partkey
and p_brand = 'Brand#53'
and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
and l_quantity >= 6 and l_quantity <= 6 + 10
and p_size between 1 and 5
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#55'
and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
and l_quantity >= 20 and l_quantity <= 20 + 10
and p_size between 1 and 10
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#55'
and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
and l_quantity >= 22 and l_quantity <= 22 + 10
and p_size between 1 and 15
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
);
-- QUERY_20
select
s_name,
s_address
from
supplier,
nation
where
s_suppkey in (
select
ps_suppkey
from
partsupp
where
ps_partkey in (
select
p_partkey
from
part
where
p_name like 'ivory%'
)
and ps_availqty > (
select
0.5 * sum(l_quantity)
from
lineitem
where
l_partkey = ps_partkey
and l_suppkey = ps_suppkey
and l_shipdate >= date '1997-01-01'
and l_shipdate < date '1998-01-01'
)
)
and s_nationkey = n_nationkey
and n_name = 'UNITED KINGDOM'
order by
s_name;
-- Query 20 modified
select
s_name,
s_address
from
supplier,
nation,
part,
partsupp,
(select
l_partkey, l_suppkey, 0.5 * sum(l_quantity)
from
lineitem
where
l_shipdate >= date '1997-01-01' and l_shipdate < date
'1998-01-01'
group by l_partkey, l_suppkey
) as l1
where
s_suppkey = ps_suppkey and ps_partkey = p_partkey and l1.l_partkey =
ps_partkey and l1.l_suppkey = ps_suppkey
and p_name like 'ivory%'
and s_nationkey = n_nationkey
and n_name = 'UNITED KINGDOM'
order by
s_name;
-- QUERY_21
select
s_name,
count(*) as numwait
from
supplier,
lineitem l1,
orders,
nation
where
s_suppkey = l1.l_suppkey
and o_orderkey = l1.l_orderkey
and o_orderstatus = 'F'
and l1.l_receiptdate > l1.l_commitdate
and exists (
select
*
from
lineitem l2
where
l2.l_orderkey = l1.l_orderkey
and l2.l_suppkey <> l1.l_suppkey
)
and not exists (
select
*
from
lineitem l3
where
l3.l_orderkey = l1.l_orderkey
and l3.l_suppkey <> l1.l_suppkey
and l3.l_receiptdate > l3.l_commitdate
)
and s_nationkey = n_nationkey
and n_name = 'UNITED KINGDOM'
group by
s_name
order by
numwait desc,
s_name
limit 100;
-- QUERY_22
select
cntrycode,
count(*) as numcust,
sum(c_acctbal) as totacctbal
from
(
select
substring(c_phone from 1 for 2) as cntrycode,
c_acctbal
from
customer
where
substring(c_phone from 1 for 2) in
('19', '29', '25', '23', '26', '11', '34')
and c_acctbal > (
select
avg(c_acctbal)
from
customer
where
c_acctbal > 0.00
and substring(c_phone from 1 for 2) in
('19', '29', '25', '23', '26',
'11', '34')
)
and not exists (
select
*
from
orders
where
o_custkey = c_custkey
)
) as custsale
group by
cntrycode
order by
cntrycode;
-- Does not finish as sequential scans orders 23,000 times.
-- Should re-write to do left outer join with orders and then check for
o_custkey is NULL.
http://www.postgresql.org/docs/8.3/interactive/xplang-install.html
http://www.postgresql.org/docs/8.3/interactive/plpgsql-control-structures.html
http://www.postgresql.org/docs/8.3/interactive/plpgsql-declarations.html
CREATE FUNCTION plpgsql_call_handler() RETURNS language_handler AS
'$libdir/plpgsql' LANGUAGE C;
CREATE FUNCTION plpgsql_validator(oid) RETURNS void AS
'$libdir/plpgsql' LANGUAGE C;
CREATE TRUSTED PROCEDURAL LANGUAGE plpgsql
HANDLER plpgsql_call_handler
VALIDATOR plpgsql_validator;
CREATE VIEW benchview AS
SELECT * FROM lineitem, part where l_partkey = p_partkey;
CREATE OR REPLACE FUNCTION bench() RETURNS integer AS $$
DECLARE
rr RECORD;
cnt integer;
BEGIN
cnt = 0;
FOR rr IN SELECT * FROM benchview LOOP
cnt = cnt + 1;
END LOOP;
RETURN cnt;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION benchtime()
RETURNS integer AS
$BODY$
DECLARE
rr RECORD;
cnt integer;
starttime timestamp;
curtime timestamp;
timediff interval;
BEGIN
cnt = 0;
starttime := timeofday()::timestamp;
RAISE NOTICE 'START: %',starttime;
FOR rr IN SELECT * FROM benchview LOOP
cnt = cnt + 1;
IF cnt = 100 THEN
curtime := timeofday()::timestamp;
RAISE NOTICE '100 results: %',curtime;
ELSIF cnt = 1000 THEN
curtime := timeofday()::timestamp;
RAISE NOTICE '1000 results: %',curtime;
END IF;
END LOOP;
curtime := timeofday()::timestamp;
timediff = curtime - starttime;
RAISE NOTICE 'DONE: %',curtime;
RAISE NOTICE 'RUNTIME: %',timediff;
--RAISE NOTICE 'RUNTIME: %',to_char(timediff, 'SSSS');
RAISE NOTICE 'Generated results: %',cnt;
RETURN cnt;
END;
$BODY$
LANGUAGE 'plpgsql';
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers