Hitoshi Harada wrote:
> I found how to do it, though only for the case you gave. Thinking
> about planner optimization of the Window nodes (and their attached
> Sort nodes), we must consider the execution order of more than one
> node. The test case involves only one window, but there may be more
> window/sort node sets, and it is too difficult to choose the best
> execution order across those plus the underlying indexscan, mergejoin
> in a subquery, and sort-based GROUP BY. So I didn't touch the
> complicated planner jungle. I rewrote the patch so that only the
> given bottom window's sort can consider an indexscan. Deeper
> optimizations are beyond my capability.

Sorry for the delay on this. I've updated the benchmark results using the
new patch you sent today. I did a dump and reload of the tables; since some
of the numbers are randomly generated, I didn't want to compare against the
old results for any of the tests. This is a completely new set of results
against CVS HEAD as of this morning.

Test     Sub Query   Self Join   Vladimir   Windowing   UOM   Window over best alt
Test 1   504         N/A         N/A        568         TPS    12.7%
Test 2   340.9       425         182        450.38      TPS     6.0%
Test 3   1.304       8.12        1.963      7.52        TPS    -7.4%
Test 4   422         365         195        630         TPS    49.3%
Test 5   8.874       N/A         5825       31203       TPH   435.6%
Test 6   251         N/A         N/A        300         TPS    19.5%

(UOM = unit of measure; TPS = transactions per second, TPH = transactions per hour.)

Only tests 3 and 5 made use of the index scan. Performance dropped slightly
on test 3, but there's not much point in paying attention to that, since
we're probably close to the crossover point between a seqscan and an
indexscan, where the planner's decision is not as critical.

Some of the self join methods I used don't implement the exact requirements
stated at the top of the test. For example, the self join for the meter
reading test requires that no days be missed.

Maybe multi-window optimisation is one for the 8.5 TODO list.
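
For example (a hypothetical query of mine, not one of the tests below), two
windows that need different sort orders can satisfy at most one of those
orders from an index, so the planner would also have to cost the ordering of
the Window/Sort nodes themselves:

SELECT salary - AVG(salary) OVER (PARTITION BY department) AS diff,
       RANK() OVER (ORDER BY salary DESC) AS pos
FROM employees;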

I've attached the test scripts.

David.
Test Queries:


Added self join test cases.
Also added Vladimir's method, which requires the following code to be run first:

begin work;

-- var_set stores a value under a per-backend custom GUC variable and returns it.
create function var_set (text,text) returns text as $$
  select set_config ('public.' || $2 || pg_backend_pid(), $1, false);
$$ LANGUAGE 'sql';

-- var_get reads the stored value back.
create function var_get (text) returns text as $$
  select current_setting('public.' || $1 || pg_backend_pid());
$$ LANGUAGE 'sql';

-- Operator shorthand: value >>> name stores, <<< name retrieves.
create operator >>> (procedure = var_set, leftarg = text, rightarg = text);
create operator <<< (procedure = var_get, rightarg = text);

commit;

The above code exploits custom GUC variables to carry the lagged value from
one row to the next. Clever but dirty :-)
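
A quick illustration of how the operators behave (my example, not one of the
timed tests); custom_variable_classes = 'public' must be set in
postgresql.conf first, as noted before each of Vladimir's queries below:

select 'hello' >>> 'myvar';  -- stores 'hello' in public.myvar<backend pid>
select <<< 'myvar';          -- reads it back: 'hello'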

----------------------------------------------------------------------------
-- Test 1 -- 
--
-- Notes: Get the name, department and salary of the highest paid member
--        of each department. If the two highest paid employees in a
--        department have the same salary, show the one with the lower id.
----------------------------------------------------------------------------

create table employees (
  id INT primary key,
  name varchar(30) not null,
  department varchar(30) not null,
  salary int not null,
  check (salary >= 0)
);


insert into employees values(1,'Jeff','IT',10000);
insert into employees values(2,'Sam','IT',12000);

insert into employees values(3,'Richard','Manager',30000);
insert into employees values(4,'Ian','Manager',20000);

insert into employees values(5,'John','IT',60000);
insert into employees values(6,'Matthew','Director',60000);

VACUUM ANALYZE employees;


-- Normal way.

SELECT name,department,salary
FROM employees AS e
WHERE id = (SELECT id FROM employees WHERE department = e.department
            ORDER BY salary DESC,id ASC LIMIT 1);

-- above query = 498 tps


-- With windowing functions.

SELECT name,department,salary
FROM (SELECT name,
             department,
             salary,
             row_number() OVER (PARTITION BY department
                                ORDER BY salary DESC,id ASC) AS num
      FROM employees
) AS t WHERE num = 1;
 
-- above query = 578 tps

----------------------------------------------------------------------------
-- Test 2 -- Split times problem
--
-- Notes: Find the interval of time between each timestamp and the previous
--        one, "previous" being the row with the next lowest id value.
--        If there is no previous timestamp, show NULL.

CREATE TABLE tstest (
  id SERIAL NOT NULL PRIMARY KEY,
  timestamp TIMESTAMP NOT NULL
);

INSERT INTO tstest (timestamp) VALUES(NOW());

-- Add 30 more readings, each 0-100 seconds after the previous maximum,
-- by running the following insert 30 times:

INSERT INTO tstest (timestamp)
SELECT MAX(timestamp) + ((random() * 100)::TEXT || ' sec ')::interval
FROM tstest;

VACUUM ANALYZE tstest;


-- Sub query
SELECT timestamp - (SELECT timestamp FROM tstest WHERE id < t.id
                    ORDER BY id DESC LIMIT 1)
FROM tstest t;

-- above query = 336 tps


-- Self join (breaks requirements! -> requires a gapless sequence of IDs)

SELECT t1.timestamp - t2.timestamp
FROM tstest t1
LEFT OUTER JOIN tstest t2 ON t2.id + 1 = t1.id;

-- above query = 424 tps

-- Vladimir's method.
-- edit postgresql.conf -> custom_variable_classes = 'public'

select '' >>> 'lag';

select timestamp::timestamp - (case lag when '' then null else lag end)::timestamp
from (select <<<'lag' as lag,
             timestamp::text >>>'lag' as timestamp
      from tstest
      order by id
) as t;
-- above query = 182.78 tps

-- With windowing functions.

SELECT timestamp - LAG(timestamp,1) OVER(ORDER BY id)
FROM tstest;

-- above query = 481 tps


------------------------------------------------------------------------------------------
-- Test 3 -- Meter reading problem
--
-- Notes: Some testing with slightly larger tables.
--        Find the day where the fewest units were used. This will be the
--        smallest difference between that day's reading and the previous
--        meter reading. Note that the previous reading does not have to be
--        from the previous day; it's possible the meter was not read on
--        some days.


CREATE TABLE meter_readings (
 date DATE NOT NULL PRIMARY KEY,
 reading DOUBLE PRECISION NOT NULL
);


-- Create initial meter reading of 0 for 1st Jan 2000.

INSERT INTO meter_readings (date,reading) VALUES('2000-01-01',0);

-- Create 10,000 further readings, one for each day after 2000-01-01.

INSERT INTO meter_readings (date,reading)
SELECT '2000-01-01'::date + i.a, i.a * 10000 + RANDOM() * 10000
FROM generate_series(1,10000) AS i(a);

VACUUM ANALYZE meter_readings;


-- Sub query test

SELECT date,reading - (SELECT reading FROM meter_readings WHERE date < p.date
                       ORDER BY date DESC LIMIT 1) AS used
FROM meter_readings p
ORDER BY used ASC NULLS LAST LIMIT 1;

-- above query = 1.3 tps.

-- Self join (breaks requirements! requires meter to be read daily!!)

SELECT t1.date,t1.reading - t2.reading AS used
FROM meter_readings t1
LEFT OUTER JOIN meter_readings t2 ON t2.date + 1 = t1.date
ORDER BY used ASC NULLS LAST LIMIT 1;

-- above query = 7.59 tps


-- Vladimir's method.
-- edit postgresql.conf -> custom_variable_classes = 'public' 

select '' >>> 'lag';

select date,
       reading::numeric - (case lag when '' then null else lag end)::numeric as used
from (select date,
             <<<'lag' as lag,
             reading::text >>>'lag' as reading
      from meter_readings
      order by date
) as t order by used asc nulls last limit 1;

-- above query = 1.9 tps


-- Windowing functions

SELECT date,reading - LAG(reading,1) OVER (ORDER BY date) AS used
FROM meter_readings
ORDER BY used ASC NULLS LAST LIMIT 1;


-- above query = 8.45 tps.
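
-- To check whether the patched planner chose an index scan for the window's
-- sort here, one can prepend EXPLAIN to the query above (plan output not
-- reproduced here):

EXPLAIN
SELECT date,reading - LAG(reading,1) OVER (ORDER BY date) AS used
FROM meter_readings
ORDER BY used ASC NULLS LAST LIMIT 1;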

------------------------------------------------------------------------------------------
-- Test 4 -- Marathon Results Problem
--
-- Notes: Show the runnerid, name, time taken and the finishing position of
--        each marathon runner.

CREATE TABLE marathon_results (
  runnerid SERIAL NOT NULL PRIMARY KEY,
  name VARCHAR(30) NOT NULL,
  time INTERVAL NOT NULL
);

INSERT INTO marathon_results (name,time) VALUES('Samuel','02:05:24');
INSERT INTO marathon_results (name,time) VALUES('Paul','02:04:55');
INSERT INTO marathon_results (name,time) VALUES('Haile','02:03:59');
INSERT INTO marathon_results (name,time) VALUES('Martin','02:05:15');
INSERT INTO marathon_results (name,time) VALUES('Sammy','02:04:56');
INSERT INTO marathon_results (name,time) VALUES('David','03:15:56');
INSERT INTO marathon_results (name,time) VALUES('Eric','03:15:56');
INSERT INTO marathon_results (name,time) VALUES('Alan','03:24:56');


VACUUM ANALYZE marathon_results;

-- sub query

SELECT runnerid,name,time,(SELECT COUNT(DISTINCT time)+1 FROM marathon_results
                           WHERE time < r.time) AS position
FROM marathon_results r
ORDER BY position;

-- above query = 424 tps


-- Self join

SELECT t1.runnerid,t1.name,t1.time,COUNT(DISTINCT t2.time) + 1 AS position
FROM marathon_results t1
LEFT OUTER JOIN marathon_results t2 ON t2.time < t1.time
GROUP BY t1.runnerid,t1.name,t1.time
ORDER BY position;

-- above query = 361 tps

-- Vladimir's method.
-- edit postgresql.conf -> custom_variable_classes = 'public' 

-- init values
select ''>>>'prev_time', '0'>>>'dense_rank';

select runnerid,name,time,position from (
    select (((case when time::text = <<<'prev_time' then 0 else 1 end)
             + (<<<'dense_rank')::int4)::text >>>'dense_rank')::int4 as position,
           runnerid,
           time,
           time::text >>>'prev_time',
           name
    from (SELECT * FROM marathon_results ORDER BY time) m
) results
ORDER BY position;

-- above query = 182 tps



-- With windowing functions.

SELECT runnerid,name,time,DENSE_RANK() OVER(ORDER BY time ASC) AS position
FROM marathon_results
ORDER BY position;

-- above query = 629 tps
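
-- For reference, the expected result (worked out by hand from the rows
-- inserted above; note David and Eric tie at position 6, and their relative
-- order is arbitrary since we only sort on position):
--
--  runnerid |  name   |   time   | position
-- ----------+---------+----------+----------
--         3 | Haile   | 02:03:59 |        1
--         2 | Paul    | 02:04:55 |        2
--         5 | Sammy   | 02:04:56 |        3
--         4 | Martin  | 02:05:15 |        4
--         1 | Samuel  | 02:05:24 |        5
--         6 | David   | 03:15:56 |        6
--         7 | Eric    | 03:15:56 |        6
--         8 | Alan    | 03:24:56 |        7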


------------------------------------------------------------------------------------------
-- Test 5 -- Big Marathon Problem
--
-- Notes: As above, but with a bigger table.

CREATE TABLE big_marathon (
  runnerid SERIAL NOT NULL PRIMARY KEY,
  time INTERVAL NOT NULL
);

-- Generate 10,000 random times between 2 hrs and 4 hrs.
INSERT INTO big_marathon (time)
SELECT '03:00:00'::interval + (CAST(RANDOM() * 3600 AS INT) || ' secs')::INTERVAL
                            - (CAST(RANDOM() * 3600 AS INT) || ' secs')::INTERVAL
FROM generate_series(1,10000);


-- Give the non-windowing one a chance.
CREATE INDEX big_marathon_time_idx ON big_marathon (time);

VACUUM ANALYZE big_marathon;

-- Who was in Nth place? (Using 2nd place here, but it could be anything.)

-- Sub query

SELECT runnerid,time,position
FROM (SELECT runnerid,time,(SELECT COUNT(DISTINCT time)+1 FROM big_marathon
                            WHERE time < r.time) AS position
      FROM big_marathon r
) results

-- above query = 404851.763 ms (6 mins 44 secs)

-- Self join

SELECT t.runnerid,t.time,t.position
FROM (SELECT t1.runnerid,t1.time,COUNT(DISTINCT t2.time) + 1 AS position
      FROM big_marathon t1
      LEFT OUTER JOIN big_marathon t2 ON t2.time < t1.time
      GROUP BY t1.runnerid,t1.time
) t WHERE t.position = 2;

/*
ERROR:  could not write block 645447 of temporary file: No space left on device
HINT:  Perhaps out of disk space?

Perhaps not, but it was REALLY slow anyway.
*/

-- Vladimir's method.
-- edit postgresql.conf -> custom_variable_classes = 'public' 

-- init values
select ''>>>'prev_time', '0'>>>'dense_rank';

select runnerid,time,position from (
    select (((case when time::text = <<<'prev_time' then 0 else 1 end)
             + (<<<'dense_rank')::int4)::text >>>'dense_rank')::int4 as position,
           runnerid,
           time,
           time::text >>>'prev_time'
    from big_marathon
    order by time
) results where position = 2;


-- above query = 617 ms

-- With windowing functions.

SELECT runnerid,time,position
FROM (SELECT runnerid,time,DENSE_RANK() OVER(ORDER BY time ASC) AS position
      FROM big_marathon 
) results
WHERE position = 2;

-- above query = 115.932 ms

------------------------------------------------------------------------------------------
-- Test 6 -- Stock Problem
--
-- Notes: We have a stock table holding the stock level for each product,
--        and an orders table containing orders for products and their due
--        dates. We need to know how much stock we have for each order,
--        allocating stock to the order with the earliest due date first.
--        If two orders have the same due date, the one with the lower
--        orderid takes the stock first.
--        We want a list of orders showing the orderid, partnum, duedate,
--        orderqty and the remaining stock after the order is filled. If the
--        remaining stock is less than zero we show that rather than 0, to
--        indicate how much stock needs to be bought or made.

CREATE TABLE stock (
  partnum VARCHAR(16) NOT NULL PRIMARY KEY,
  stockqty INT NOT NULL,
  CHECK (stockqty > 0)
);

CREATE TABLE orders (
  orderid SERIAL NOT NULL PRIMARY KEY,
  partnum VARCHAR(16) NOT NULL,
  duedate DATE NOT NULL,
  orderqty INT NOT NULL,
  CHECK (orderqty > 0)
);

INSERT INTO stock VALUES('CPU',1000);
INSERT INTO stock VALUES('HARDDRIVE',2000);
INSERT INTO stock VALUES('KEYBOARD',4000);
INSERT INTO stock VALUES('MOUSE',9000);
INSERT INTO stock VALUES('MONITOR',100);

INSERT INTO orders (partnum,duedate,orderqty) VALUES('CPU','2008-12-22', 700);
INSERT INTO orders (partnum,duedate,orderqty) VALUES('CPU','2008-11-15', 600);

INSERT INTO orders (partnum,duedate,orderqty) VALUES('HARDDRIVE','2008-11-05',1200);
INSERT INTO orders (partnum,duedate,orderqty) VALUES('HARDDRIVE','2008-11-17',200);

INSERT INTO orders (partnum,duedate,orderqty) VALUES('KEYBOARD','2008-11-17',1000);
INSERT INTO orders (partnum,duedate,orderqty) VALUES('KEYBOARD','2008-12-03',3000);

INSERT INTO orders (partnum,duedate,orderqty) VALUES('MONITOR','2008-12-03',50);
INSERT INTO orders (partnum,duedate,orderqty) VALUES('MONITOR','2008-12-03',60);

VACUUM ANALYZE stock;
VACUUM ANALYZE orders;



-- Sub query

SELECT t.orderid,
       t.partnum,
       t.duedate,
       t.orderqty,
       st.stockqty - t.orderqty - COALESCE((SELECT SUM(orderqty)
                                            FROM orders
                                            WHERE partnum = t.partnum
                                              AND (duedate < t.duedate
                                                   OR (duedate = t.duedate
                                                       AND orderid < t.orderid))
                                           ),0) AS remaining
FROM orders t
INNER JOIN stock st ON t.partnum = st.partnum
ORDER BY t.partnum ASC,t.duedate ASC,t.orderid ASC;

-- above query = 253 tps.




-- With windowing functions

SELECT o.orderid,
       o.partnum,
       o.duedate,
       o.orderqty,
       s.stockqty - SUM(o.orderqty) OVER (PARTITION BY o.partnum
                                          ORDER BY o.duedate ASC,o.orderid ASC) AS remaining
FROM orders o
INNER JOIN stock s ON o.partnum = s.partnum
ORDER BY o.partnum ASC,o.duedate ASC,o.orderid ASC;

-- above query = 305 tps.
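
-- For reference, both queries above should return the following (worked out
-- by hand from the data inserted earlier; MOUSE has no orders, so the inner
-- join drops it):
--
--  orderid |  partnum  |  duedate   | orderqty | remaining
-- ---------+-----------+------------+----------+-----------
--        2 | CPU       | 2008-11-15 |      600 |       400
--        1 | CPU       | 2008-12-22 |      700 |      -300
--        3 | HARDDRIVE | 2008-11-05 |     1200 |       800
--        4 | HARDDRIVE | 2008-11-17 |      200 |       600
--        5 | KEYBOARD  | 2008-11-17 |     1000 |      3000
--        6 | KEYBOARD  | 2008-12-03 |     3000 |         0
--        7 | MONITOR   | 2008-12-03 |       50 |        50
--        8 | MONITOR   | 2008-12-03 |       60 |       -10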