Re: MERGE tables considered harmful for data warehouse fact tables

2006-01-13 Thread sheeri kritzer
(still catching up!)

Tom,

We have similar queries where I work, and a union is the best way to
go.  Leaving the tables large eats up valuable buffer space for us,
putting the logic in the client app is a bad idea (since you'd have to
do it for all apps).

In particular, our sessions table is very large (our record so far is
18k people logged in at once) and we're splitting it up currently
because we've gotten to that point.

-Sheeri

On 12/22/05, Tom Cunningham [EMAIL PROTECTED] wrote:
 The script to prove it follows.

 Splitting a million-row fact table into a 5-part merge table makes
 3-second queries take 9  seconds.

 The basic problem is this: fact tables are generally referenced by
 unique combinations of dimensions, and though often only one
 underlying table needs to be referenced, mysql doesn't know this, so
 every single underlying table is polled for each
 dimension-combination.

 Practical alternatives: (1) leave your fact table as a big one; (2)
 split the table up, but put logic in the client-app so it knows which
 table to address; (3) use a *union* of queries instead of a merge
 table, then mysql could look at each underlying table one at a time.

 If anyone has ideas for other alternatives, or improvements on the
 script, please tell me.

 Thanks.

 Tom.



 #

 # MERGE FACT TABLE TEST -
 [EMAIL PROTECTED]


USE sandbox;

 CREATE TABLE big_table LIKE mysql.help_keyword;
  ALTER TABLE big_table ENGINE=MERGE UNION=(mysql.help_keyword);

   DROP TABLE IF EXISTS dimension_1;
 CREATE TABLE dimension_1 (
key_1   INT PRIMARY KEY NOT NULL,
attribute_1 VARCHAR(255) NOT NULL,
INDEX attribute_1 (attribute_1(10))
 );
SET @A:=1;
 INSERT INTO dimension_1
 SELECT @A:[EMAIL PROTECTED],
SHA(RAND())
   FROM big_table b1, big_table b2, big_table b3
  LIMIT 10;

   DROP TABLE IF EXISTS dimension_2;
 CREATE TABLE dimension_2 (
key_2   INT PRIMARY KEY NOT NULL,
attribute_2 VARCHAR(255) NOT NULL,
INDEX attribute_1 (attribute_2(10))
 );
SET @A:=1;
 INSERT INTO dimension_2
 SELECT @A:[EMAIL PROTECTED],
SHA(RAND())
   FROM big_table b1, big_table b2, big_table b3
  LIMIT 10;


   DROP TABLE IF EXISTS facts;
 CREATE TABLE facts (
key_1 INT UNSIGNED,
key_2 INT UNSIGNED,
fact_1 INT UNSIGNED,
fact_2 INT UNSIGNED,
PRIMARY KEY key_1_key_2 (key_1,key_2)
 );
 INSERT IGNORE INTO facts
 SELECT CEIL(RAND()*10),
CEIL(RAND()*10),
CEIL(RAND()*100),
CEIL(RAND()*100)
   FROM big_table b1, big_table b2, big_table b3
  LIMIT 100;



 ## Typical query: (takes about 1.5seconds)
 SELECT SQL_NO_CACHE attribute_1, sum(fact_1), avg(fact_2)
   FROM facts
   JOIN dimension_1 ON dimension_1.key_1=facts.key_1
   JOIN dimension_2 ON dimension_2.key_2=facts.key_2
  WHERE attribute_1 LIKE 'ff%'
AND attribute_2 LIKE 'a3%'
  GROUP BY attribute_1
  LIMIT 1;


 ## Variation 1: Unionised fact table;

 DROP TABLES f1, f2, f3, f4, f5;
 CREATE TABLE f1 LIKE facts; INSERT INTO f1 SELECT * FROM facts WHERE
 key_1 BETWEEN 0 AND 2;
 CREATE TABLE f2 LIKE facts; INSERT INTO f2 SELECT * FROM facts WHERE
 key_1 BETWEEN 20001 AND 4;
 CREATE TABLE f3 LIKE facts; INSERT INTO f3 SELECT * FROM facts WHERE
 key_1 BETWEEN 40001 AND 6;
 CREATE TABLE f4 LIKE facts; INSERT INTO f4 SELECT * FROM facts WHERE
 key_1 BETWEEN 60001 AND 8;
 CREATE TABLE f5 LIKE facts; INSERT INTO f5 SELECT * FROM facts WHERE
 key_1 BETWEEN 80001 AND 10;
 RENAME TABLE facts TO facts_old;
 CREATE TABLE facts LIKE facts_old;
 ALTER TABLE facts ENGINE=MERGE UNION=(f1,f2,f3,f4,f5);

 # (now try the above same query again)

 --
 MySQL General Mailing List
 For list archives: http://lists.mysql.com/mysql
 To unsubscribe:http://lists.mysql.com/[EMAIL PROTECTED]



--
MySQL General Mailing List
For list archives: http://lists.mysql.com/mysql
To unsubscribe:http://lists.mysql.com/[EMAIL PROTECTED]



MERGE tables considered harmful for data warehouse fact tables

2005-12-22 Thread Tom Cunningham
The script to prove it follows.

Splitting a million-row fact table into a 5-part merge table makes
3-second queries take 9  seconds.

The basic problem is this: fact tables are generally referenced by
unique combinations of dimensions, and though often only one
underlying table needs to be referenced, mysql doesn't know this, so
every single underlying table is polled for each
dimension-combination.

Practical alternatives: (1) leave your fact table as a big one; (2)
split the table up, but put logic in the client-app so it knows which
table to address; (3) use a *union* of queries instead of a merge
table, then mysql could look at each underlying table one at a time.

If anyone has ideas for other alternatives, or improvements on the
script, please tell me.

Thanks.

Tom.



#

# MERGE FACT TABLE TEST -
[EMAIL PROTECTED]


   USE sandbox;

CREATE TABLE big_table LIKE mysql.help_keyword;
 ALTER TABLE big_table ENGINE=MERGE UNION=(mysql.help_keyword);

  DROP TABLE IF EXISTS dimension_1;
CREATE TABLE dimension_1 (
   key_1   INT PRIMARY KEY NOT NULL,
   attribute_1 VARCHAR(255) NOT NULL,
   INDEX attribute_1 (attribute_1(10))
);
   SET @A:=1;
INSERT INTO dimension_1
SELECT @A:[EMAIL PROTECTED],
   SHA(RAND())
  FROM big_table b1, big_table b2, big_table b3
 LIMIT 10;

  DROP TABLE IF EXISTS dimension_2;
CREATE TABLE dimension_2 (
   key_2   INT PRIMARY KEY NOT NULL,
   attribute_2 VARCHAR(255) NOT NULL,
   INDEX attribute_1 (attribute_2(10))
);
   SET @A:=1;
INSERT INTO dimension_2
SELECT @A:[EMAIL PROTECTED],
   SHA(RAND())
  FROM big_table b1, big_table b2, big_table b3
 LIMIT 10;


  DROP TABLE IF EXISTS facts;
CREATE TABLE facts (
   key_1 INT UNSIGNED,
   key_2 INT UNSIGNED,
   fact_1 INT UNSIGNED,
   fact_2 INT UNSIGNED,
   PRIMARY KEY key_1_key_2 (key_1,key_2)
);
INSERT IGNORE INTO facts
SELECT CEIL(RAND()*10),
   CEIL(RAND()*10),
   CEIL(RAND()*100),
   CEIL(RAND()*100)
  FROM big_table b1, big_table b2, big_table b3
 LIMIT 100;



## Typical query: (takes about 1.5seconds)
SELECT SQL_NO_CACHE attribute_1, sum(fact_1), avg(fact_2)
  FROM facts
  JOIN dimension_1 ON dimension_1.key_1=facts.key_1
  JOIN dimension_2 ON dimension_2.key_2=facts.key_2
 WHERE attribute_1 LIKE 'ff%'
   AND attribute_2 LIKE 'a3%'
 GROUP BY attribute_1
 LIMIT 1;


## Variation 1: Unionised fact table;

DROP TABLES f1, f2, f3, f4, f5;
CREATE TABLE f1 LIKE facts; INSERT INTO f1 SELECT * FROM facts WHERE
key_1 BETWEEN 0 AND 2;
CREATE TABLE f2 LIKE facts; INSERT INTO f2 SELECT * FROM facts WHERE
key_1 BETWEEN 20001 AND 4;
CREATE TABLE f3 LIKE facts; INSERT INTO f3 SELECT * FROM facts WHERE
key_1 BETWEEN 40001 AND 6;
CREATE TABLE f4 LIKE facts; INSERT INTO f4 SELECT * FROM facts WHERE
key_1 BETWEEN 60001 AND 8;
CREATE TABLE f5 LIKE facts; INSERT INTO f5 SELECT * FROM facts WHERE
key_1 BETWEEN 80001 AND 10;
RENAME TABLE facts TO facts_old;
CREATE TABLE facts LIKE facts_old;
ALTER TABLE facts ENGINE=MERGE UNION=(f1,f2,f3,f4,f5);

# (now try the above same query again)

--
MySQL General Mailing List
For list archives: http://lists.mysql.com/mysql
To unsubscribe:http://lists.mysql.com/[EMAIL PROTECTED]