Hitoshi Harada Wrote:
> Thanks for your test. Though I didn't post publicly, I've also tested
> real problems and it performed better than I thought. If you can afford
> it, could you add self-join cases? Something like:

Ok, I added self joins for some of the tests. I don't know if it's possible for all of them.

Test    Sub query  Self join  Vladimir  Windowing  UOM         Increase %
Test 1  498.00     N/A        N/A       578.00     Trans/Sec    16.06%
Test 2  336.00     424.00     182.78    481.00     Trans/Sec    13.44%
Test 3  1.30       7.59       1.90      8.45       Trans/Sec    11.33%
Test 4  424.00     361.00     182.00    629.00     Trans/Sec    48.35%
Test 5  8.89       N/A        5844.16   31052.69   Trans/Hour  431.35%
Test 6  253.00     N/A        N/A       305.00     Trans/Sec    20.55%

See attached for details.
The Increase % column is now calculated as:

Windowing / max(Sub query, Self join, Vladimir) - 1
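
For example, for Test 2: 481 / max(336, 424, 182.78) - 1 = 481 / 424 - 1 = 13.44%.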

Vladimir, I've included your tests too. I understand that people will
probably use this method, as sometimes there is little other choice to get
the performance that is required.

Hitoshi Harada Wrote:
> 2008/11/2 David Rowley <[EMAIL PROTECTED]>:
> > Observations:
> >
> > Tests 3 and 5 did not seem to make use of an index to get a sorted list
> > of results. I disabled enable_seqscan but the planner still failed to
> > choose index_scan. Is there any reason for this? Perhaps I'm missing
> > something. Hitoshi, can you take a look at this?

> Ah, good point. Maybe it's because I haven't paid attention to choosing
> index_scan for the upper sort node. I just add the sort node regardless
> of what the node below it is, so the information that a sort node sits
> above may need to be pushed down into the scan-choice process. Could
> someone point out how to do this, or which part of the existing code
> would be a good guide?

I know you'll need to wait for an answer on this, so I'd like to delay any
further performance tests until it's sorted out, as it should affect the
performance of larger tables quite a bit.

David.

Test Queries:


Added self-join test cases.
Also added Vladimir's method, which requires the following code to be run first:

begin work;

-- Stores $1 in a custom GUC variable named 'public.<name><backend pid>'.
create function var_set (text,text) returns text as $$
  select set_config ('public.' || $2 || pg_backend_pid(), $1, false);
$$ LANGUAGE 'sql';

-- Reads back the value stored by var_set for this backend.
create function var_get (text) returns text as $$
  select current_setting('public.' || $1 || pg_backend_pid());
$$ LANGUAGE 'sql';

-- value >>> 'name' stores a value; <<< 'name' reads it back.
create operator >>> (procedure = var_set, leftarg = text, rightarg = text);
create operator <<< (procedure = var_get, rightarg = text);

commit;

The above code exploits custom GUC variables to carry the lagged value from
row to row. Clever, but dirty :-)
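
A minimal usage sketch (note that custom_variable_classes = 'public' must be
set in postgresql.conf first, as mentioned in the tests below):

select 'hello' >>> 'x';  -- stores 'hello' in the GUC public.x<pid>, returns it
select <<< 'x';          -- reads it back: 'hello'

The lag trick below also relies on the subquery's rows being produced in
ORDER BY order while the outer expressions are evaluated; that happens to
hold here, but it is not something PostgreSQL guarantees.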

----------------------------------------------------------------------------
-- Test 1 -- 
--
-- Notes: Get the name, department and salary of the highest paid member
--        of each department. If the two highest paid employees in a
--        department get the same salary, show the one with the lowest id.
----------------------------------------------------------------------------

create table employees (
  id INT primary key,
  name varchar(30) not null,
  department varchar(30) not null,
  salary int not null,
  check (salary >= 0)
);


insert into employees values(1,'Jeff','IT',10000);
insert into employees values(2,'Sam','IT',12000);

insert into employees values(3,'Richard','Manager',30000);
insert into employees values(4,'Ian','Manager',20000);

insert into employees values(5,'John','IT',60000);
insert into employees values(6,'Matthew','Director',60000);

VACUUM ANALYZE employees;


-- Normal way (sub query).

SELECT name,department,salary
FROM employees AS e
WHERE id = (SELECT id FROM employees WHERE department = e.department
            ORDER BY salary DESC,id ASC LIMIT 1);

-- above query = 498 tps


-- With windowing functions.

SELECT name,department,salary
FROM (SELECT name,
             department,
             salary,
             row_number() OVER (PARTITION BY department
                                ORDER BY salary DESC,id ASC) AS num
      FROM employees
) AS t WHERE num = 1;
 
-- above query = 578 tps
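
-- With the sample data both queries return one row per department:
-- John (IT, 60000), Richard (Manager, 30000) and Matthew (Director, 60000),
-- in no guaranteed order since neither query has an outer ORDER BY.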

----------------------------------------------------------------------------
-- Test 2 -- Split times problem
-- 
-- Notes: Find the interval of time between one timestamp and the previous
--        timestamp, the previous timestamp being the one with the next
--        lowest id value. If there is no previous timestamp, show NULL.

CREATE TABLE tstest (
  id SERIAL NOT NULL PRIMARY KEY,
  timestamp TIMESTAMP NOT NULL
);

INSERT INTO tstest (timestamp) VALUES(NOW());

-- Insert 30 more rows, each 0-100 seconds after the current maximum timestamp.
DO $$
BEGIN
  FOR i IN 1..30 LOOP
    INSERT INTO tstest (timestamp)
    SELECT MAX(timestamp) + ((random() * 100)::TEXT || ' sec ')::interval
    FROM tstest;
  END LOOP;
END;
$$;

VACUUM ANALYZE tstest;


-- Sub query
SELECT timestamp - (SELECT timestamp FROM tstest WHERE id < t.id
                    ORDER BY id DESC LIMIT 1)
FROM tstest t;

-- above query = 336 tps


-- Self Join, (breaks requirements! -> requires gapless sequence of IDs)

SELECT t1.timestamp - t2.timestamp
FROM tstest t1
LEFT OUTER JOIN tstest t2 ON t2.id + 1 = t1.id;

-- above query = 424 tps

-- Vladimir's method.

select '' >>> 'lag';
select timestamp::timestamp - (case lag when '' then null else lag end)::timestamp
from (select <<<'lag' as lag,
             timestamp::text >>>'lag' as timestamp
      from tstest
      order by id
) as t;

-- above query = 182.78 tps

-- Windowing functions.

SELECT timestamp - LAG(timestamp,1) OVER(ORDER BY id)
FROM tstest;

-- above query = 481 tps
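
-- Note: LAG() returns NULL for the first row (there is no previous
-- timestamp), which meets the show-NULL requirement without any extra work.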


------------------------------------------------------------------------------------------
-- Test 3 -- Meter reading problem
--
-- Notes: Some testing with slightly larger tables.
--        Find the day where the least units were used. This will be the
--        smallest difference between that day's reading and the previous
--        meter reading. Note, this does not have to be the previous day;
--        it's possible that the meter was not read on some days.


CREATE TABLE meter_readings (
 date DATE NOT NULL PRIMARY KEY,
 reading DOUBLE PRECISION NOT NULL
);


-- Create initial meter reading of 0 for 1st Jan 2000.

INSERT INTO meter_readings (date,reading) VALUES('2000-01-01',0);

-- Create one reading per day for the 10,000 days after 2000-01-01.

INSERT INTO meter_readings (date,reading)
SELECT '2000-01-01'::date + i.a, i.a * 10000 + RANDOM() * 10000
FROM generate_series(1,10000) AS i(a);

VACUUM ANALYZE meter_readings;


-- Sub query test

SELECT date,reading - (SELECT reading FROM meter_readings WHERE date < p.date
                       ORDER BY date DESC LIMIT 1) AS used
FROM meter_readings p
ORDER BY used ASC NULLS LAST LIMIT 1;

-- above query = 1.3 tps.

-- Self join (breaks requirements! requires meter to be read daily!!)

SELECT t1.date,t1.reading - t2.reading AS used
FROM meter_readings t1
LEFT OUTER JOIN meter_readings t2 ON t2.date + 1 = t1.date
ORDER BY used ASC NULLS LAST LIMIT 1;

-- above query = 7.59 tps


-- Vladimir's method.
-- edit postgresql.conf -> custom_variable_classes = 'public' 

select '' >>> 'lag';
select date,
       reading::numeric - (case lag when '' then null else lag end)::numeric as used
from (select date,
             <<<'lag' as lag,
             reading::text >>>'lag' as reading
      from meter_readings
      order by date
) as t order by used asc nulls last limit 1;

-- above query = 1.9 tps


-- Windowing functions

SELECT date,reading - LAG(reading,1) OVER (ORDER BY date) AS used
FROM meter_readings
ORDER BY used ASC NULLS LAST LIMIT 1;


-- above query = 8.45 tps.

/* Notes:

There seems to be a small issue with the above query.

david=# explain SELECT date,reading - LAG(reading,1) OVER (ORDER BY date) AS used
david-# FROM meter_Readings
david-# ORDER BY used ASC NULLS LAST LIMIT 1;
                                         QUERY PLAN
---------------------------------------------------------------------------------------------
 Limit  (cost=989.49..989.49 rows=1 width=12)
   ->  Sort  (cost=989.49..1014.49 rows=10001 width=12)
         Sort Key: ((reading - lag(reading, (1))))
         ->  Window  (cost=814.47..939.48 rows=10001 width=12)
               ->  Sort  (cost=814.47..839.47 rows=10001 width=12)
                     Sort Key: date
                     ->  Seq Scan on meter_readings  (cost=0.00..150.01 rows=10001 width=12)


The planner uses a seq scan on meter_readings then performs a sort on date.
Even if I do SET enable_seqscan = off; the planner still performs a seq scan.
Why can't the executor use an index scan on the primary key?

*/
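
-- A quick sanity check (not part of the benchmark): with enable_seqscan off,
-- a plain ordered scan should be able to use the primary key index:

SET enable_seqscan = off;
EXPLAIN SELECT date, reading FROM meter_readings ORDER BY date;
SET enable_seqscan = on;

-- One would expect an Index Scan using meter_readings_pkey for the above,
-- so it seems to be specifically the Window node's sorted input path that
-- never considers the index.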



------------------------------------------------------------------------------------------
-- Test 4 -- Marathon Results Problem
--
-- Notes: Show the runnerid, name, time taken and the position of each
--        marathon runner.

CREATE TABLE marathon_results (
  runnerid SERIAL NOT NULL PRIMARY KEY,
  name VARCHAR(30) NOT NULL,
  time INTERVAL NOT NULL
);

INSERT INTO marathon_results (name,time) VALUES('Samuel','02:05:24');
INSERT INTO marathon_results (name,time) VALUES('Paul','02:04:55');
INSERT INTO marathon_results (name,time) VALUES('Haile','02:03:59');
INSERT INTO marathon_results (name,time) VALUES('Martin','02:05:15');
INSERT INTO marathon_results (name,time) VALUES('Sammy','02:04:56');
INSERT INTO marathon_results (name,time) VALUES('David','03:15:56');
INSERT INTO marathon_results (name,time) VALUES('Eric','03:15:56');
INSERT INTO marathon_results (name,time) VALUES('Alan','03:24:56');


VACUUM ANALYZE marathon_results;

-- sub query

SELECT runnerid,name,time,
       (SELECT COUNT(DISTINCT time)+1 FROM marathon_results
        WHERE time < r.time) AS position
FROM marathon_results r
ORDER BY position;

-- above query = 424 tps


-- Self join

SELECT t1.runnerid,t1.name,t1.time,COUNT(DISTINCT t2.time) + 1 AS position
FROM marathon_results t1
LEFT OUTER JOIN marathon_results t2 ON t2.time < t1.time
GROUP BY t1.runnerid,t1.name,t1.time
ORDER BY position;

-- above query = 361 tps

-- Vladimir's method.
-- edit postgresql.conf -> custom_variable_classes = 'public' 

-- init values
select ''>>>'prev_time', '0'>>>'dense_rank';

select runnerid,name,time,position from (
    -- if this row's time equals the previous row's, add 0 to the running
    -- dense_rank, otherwise add 1; then store the result back in 'dense_rank'
    select (((case when time::text = <<<'prev_time' then 0 else 1 end)
             + (<<<'dense_rank')::int4)::text >>> 'dense_rank')::int4 as position,
           runnerid,
           time,
           time::text >>> 'prev_time',
           name
    from (SELECT * FROM marathon_results ORDER BY time) m
) results
ORDER BY position;

-- above query = 182 tps



-- With windowing functions.

SELECT runnerid,name,time,DENSE_RANK() OVER(ORDER BY time ASC) AS position
FROM marathon_results
ORDER BY position;

-- above query = 629 tps
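
-- Note: with the sample data, David and Eric tie on 03:15:56 and both get
-- position 6; DENSE_RANK() leaves no gap, so Alan follows at position 7.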


------------------------------------------------------------------------------------------
-- Test 5 -- Big Marathon Problem
--
-- Notes: As above but bigger table.

CREATE TABLE big_marathon (
  runnerid SERIAL NOT NULL PRIMARY KEY,
  time INTERVAL NOT NULL
);

-- Generate 10,000 random times from 2 hrs to 4 hrs
INSERT INTO big_marathon (time)
SELECT '03:00:00'::interval + (CAST(RANDOM() * 3600 AS INT) || ' secs')::INTERVAL
                            - (CAST(RANDOM() * 3600 AS INT) || ' secs')::INTERVAL
FROM generate_series(1,10000);
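
-- (Adding one random offset and subtracting another yields a triangular
-- distribution centred on 03:00:00 rather than a uniform one, which is
-- fine for this test.)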


-- Give the non-windowing one a chance.
CREATE INDEX big_marathon_time_idx ON big_marathon (time);

VACUUM ANALYZE big_marathon;

-- Who was in Nth place (using 2nd place here but could be anything)

-- Sub query

SELECT runnerid,time,position
FROM (SELECT runnerid,time,
             (SELECT COUNT(DISTINCT time)+1 FROM big_marathon
              WHERE time < r.time) AS position
      FROM big_marathon r
) results
WHERE position = 2;

-- above query = 404851.763 ms (6 mins 44 secs)

-- Self join

SELECT t.runnerid,t.time,t.position
FROM (SELECT t1.runnerid,t1.time,COUNT(DISTINCT t2.time) + 1 AS position
      FROM big_marathon t1
      LEFT OUTER JOIN big_marathon t2 ON t2.time < t1.time
      GROUP BY t1.runnerid,t1.time
) t WHERE t.position = 2;

/*
ERROR:  could not write block 645447 of temporary file: No space left on device
HINT:  Perhaps out of disk space?

Perhaps not, but it was REALLY slow anyway.
*/

-- Vladimir's method.
-- edit postgresql.conf -> custom_variable_classes = 'public' 

-- init values
select ''>>>'prev_time', '0'>>>'dense_rank';

select runnerid,time,position from (
    select (((case when time::text = <<<'prev_time' then 0 else 1 end)
             + (<<<'dense_rank')::int4)::text >>> 'dense_rank')::int4 as position,
           runnerid,
           time,
           time::text >>> 'prev_time'
    from big_marathon
    order by time
) results where position=2;


-- above query = 617 ms

-- With windowing functions.

SELECT runnerid,time,position
FROM (SELECT runnerid,time,DENSE_RANK() OVER(ORDER BY time ASC) AS position
      FROM big_marathon 
) results
WHERE position = 2;

-- above query = 115.932 ms

-- Notes: The windowing function method did not use the index on the time column.


------------------------------------------------------------------------------------------
-- Test 6 -- Stock Problem
--
-- Notes: We have a stock table which holds the stock level for each
--        product, and an orders table that contains orders for products
--        and their due dates. We need to know how much stock we have for
--        each order. Stock must be allocated to the order with the
--        earliest due date first; if two orders have the same due date,
--        the one with the lowest order id number takes the stock first.
--        We need to see a list of orders with the orderid, partnum,
--        duedate and orderqty, and the remaining stock after the order is
--        filled. If the remaining stock is less than zero we show that
--        rather than 0, to indicate how much stock needs to be bought or
--        made.

CREATE TABLE stock (
  partnum VARCHAR(16) NOT NULL PRIMARY KEY,
  stockqty INT NOT NULL,
  CHECK (stockqty > 0)
);

CREATE TABLE orders (
  orderid SERIAL NOT NULL PRIMARY KEY,
  partnum VARCHAR(16) NOT NULL,
  duedate DATE NOT NULL,
  orderqty INT NOT NULL,
  CHECK (orderqty > 0)
);

INSERT INTO stock VALUES('CPU',1000);
INSERT INTO stock VALUES('HARDDRIVE',2000);
INSERT INTO stock VALUES('KEYBOARD',4000);
INSERT INTO stock VALUES('MOUSE',9000);
INSERT INTO stock VALUES('MONITOR',100);

INSERT INTO orders (partnum,duedate,orderqty) VALUES('CPU','2008-12-22', 700);
INSERT INTO orders (partnum,duedate,orderqty) VALUES('CPU','2008-11-15', 600);

INSERT INTO orders (partnum,duedate,orderqty) VALUES('HARDDRIVE','2008-11-05', 1200);
INSERT INTO orders (partnum,duedate,orderqty) VALUES('HARDDRIVE','2008-11-17', 200);

INSERT INTO orders (partnum,duedate,orderqty) VALUES('KEYBOARD','2008-11-17', 1000);
INSERT INTO orders (partnum,duedate,orderqty) VALUES('KEYBOARD','2008-12-03', 3000);

INSERT INTO orders (partnum,duedate,orderqty) VALUES('MONITOR','2008-12-03', 50);
INSERT INTO orders (partnum,duedate,orderqty) VALUES('MONITOR','2008-12-03', 60);

VACUUM ANALYZE stock;
VACUUM ANALYZE orders;



-- Sub query

SELECT t.orderid,
       t.partnum,
       t.duedate,
       t.orderqty,
       st.stockqty - t.orderqty - COALESCE((SELECT SUM(orderqty)
                                            FROM orders
                                            WHERE partnum = t.partnum
                                              AND (duedate < t.duedate
                                                   OR (duedate = t.duedate
                                                       AND orderid < t.orderid))
                                           ),0) AS remaining
FROM orders t
INNER JOIN stock st ON t.partnum = st.partnum
ORDER BY t.partnum ASC,t.duedate ASC,t.orderid ASC;

-- above query = 253 tps.




-- With windowing functions

SELECT o.orderid,
       o.partnum,
       o.duedate,
       o.orderqty,
       s.stockqty - SUM(o.orderqty) OVER (PARTITION BY o.partnum
                                          ORDER BY o.duedate ASC,o.orderid ASC) AS remaining
FROM orders o
INNER JOIN stock s ON o.partnum = s.partnum
ORDER BY o.partnum ASC,o.duedate ASC,o.orderid ASC;

-- above query = 305 tps.
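
-- Worked example for CPU (stockqty 1000): the 2008-11-15 order for 600 is
-- allocated first, leaving remaining = 400; the 2008-12-22 order for 700
-- then leaves remaining = -300, i.e. 300 units short.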

