[sqlite] First-N query time scaling with table size

Keith Medcalf Fri, 24 Apr 2015 21:22:51 -0400

Your query has to visit every row of table d and execute the correlated 
subquery multiple times.


You need to devise a way to do this only once for each d.m and then join that 
table back into your query.

>sqlite < demo.sql
.eqp on
.timer on

CREATE TABLE d
(
    m INT NOT NULL,
    t INT NOT NULL,
    v REAL
);
--EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
Run Time: real 0.007 user 0.000000 sys 0.000000
CREATE UNIQUE INDEX d_ind_1 ON d (m, t);
Run Time: real 0.002 user 0.000000 sys 0.000000

.timer on
INSERT INTO d (m, t, v)
  WITH RECURSIVE
    cnt(x) AS (          VALUES(0)
               UNION ALL
                         SELECT x+1
                           FROM cnt
                          WHERE x<200000
              )
  SELECT (x % 10) + 1 AS m,
         x+1 AS t,
         1.0 AS v
    FROM cnt;
--EQP-- 3,0,0,SCAN TABLE cnt
--EQP-- 1,0,0,COMPOUND SUBQUERIES 0 AND 0 (UNION ALL)
--EQP-- 0,0,0,SCAN SUBQUERY 1
Run Time: real 1.064 user 1.046875 sys 0.000000

-- arrange for one rare value of m, which should not appear in results
DELETE FROM d
      WHERE m = 5;
--EQP-- 0,0,0,SEARCH TABLE d USING COVERING INDEX d_ind_1 (m=?)
Run Time: real 0.060 user 0.046875 sys 0.000000
INSERT INTO d (m, t, v)
  WITH RECURSIVE
    cnt(x) AS (          VALUES(0)
               UNION ALL
                         SELECT x+1
                           FROM cnt
                          WHERE x<10)
  SELECT 5 AS m,
         x + 1230000 AS t,
         1.0 AS v
    FROM cnt;
--EQP-- 3,0,0,SCAN TABLE cnt
--EQP-- 1,0,0,COMPOUND SUBQUERIES 0 AND 0 (UNION ALL)
--EQP-- 0,0,0,SCAN SUBQUERY 1
Run Time: real 0.013 user 0.000000 sys 0.000000

-- now, for each value of m, look for exactly N rows, ordered by t
CREATE TEMP TABLE tt (m INTEGER NOT NULL UNIQUE, t integer not null);
--EQP-- 0,0,0,SEARCH TABLE sqlite_temp_master USING INTEGER PRIMARY KEY 
(rowid=?)
Run Time: real 0.011 user 0.000000 sys 0.000000
INSERT INTO tt (m, t) SELECT DISTINCT m, 0 from d;
--EQP-- 0,0,0,SCAN TABLE d USING COVERING INDEX d_ind_1
--EQP-- 0,0,0,USE TEMP B-TREE FOR DISTINCT
Run Time: real 0.076 user 0.046875 sys 0.000000
update tt
   set t = (SELECT MAX(d3.t)
              FROM (SELECT d2.t
                      FROM d AS d2
                     WHERE tt.m = d2.m
                  ORDER BY d2.t
                     LIMIT 20
                   ) AS d3
           );
--EQP-- 0,0,0,SCAN TABLE tt
--EQP-- 0,0,0,EXECUTE CORRELATED SCALAR SUBQUERY 0
--EQP-- 1,0,0,SEARCH TABLE d AS d2 USING COVERING INDEX d_ind_1 (m=?)
--EQP-- 0,0,0,SEARCH SUBQUERY 1 AS d3
Run Time: real 0.058 user 0.000000 sys 0.000000
select * from tt;
--EQP-- 0,0,0,SCAN TABLE tt
1|191
2|192
3|193
4|194
5|1230010
6|196
7|197
8|198
9|199
10|200
Run Time: real 0.053 user 0.000000 sys 0.000000

ANALYZE;
--EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
--EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
Run Time: real 0.173 user 0.156250 sys 0.000000


SELECT m, n_points, t_csv, v_csv, max_rowid
  FROM (SELECT d.m AS "m",
               count(v) AS "n_points",
               substr(group_concat(d.t), 1, 16) || '...' AS "t_csv",
               substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
               max(d.rowid) AS "max_rowid"
          FROM d, tt
         WHERE d.t <= tt.t
           and d.m = tt.m
      GROUP BY d.m
      ORDER BY d.m, d.t
       )
 WHERE n_points = 20;
--EQP-- 1,0,0,SCAN TABLE d USING INDEX d_ind_1
--EQP-- 1,1,1,SEARCH TABLE tt USING INDEX sqlite_autoindex_tt_1 (m=?)
--EQP-- 1,0,0,USE TEMP B-TREE FOR ORDER BY
--EQP-- 0,0,0,SCAN SUBQUERY 1
1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
Run Time: real 0.308 user 0.203125 sys 0.000000

SELECT m, n_points, t_csv, v_csv, max_rowid
  FROM (SELECT d.m AS "m",
               count(v) AS "n_points",
               substr(group_concat(t), 1, 16) || '...' AS "t_csv",
               substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
               max(d.rowid) AS "max_rowid"
          FROM d
         WHERE d.t <= (SELECT MAX(d3.t)
                         FROM (SELECT d2.t
                                 FROM d AS d2
                                WHERE d.m = d2.m
                             ORDER BY d2.t
                                LIMIT 20
                              ) AS d3
                      )
      GROUP BY d.m
      ORDER BY d.m, d.t
       )
 WHERE n_points = 20;
--EQP-- 1,0,0,SCAN TABLE d USING INDEX d_ind_1
--EQP-- 1,0,0,EXECUTE CORRELATED SCALAR SUBQUERY 2
--EQP-- 3,0,0,SEARCH TABLE d AS d2 USING COVERING INDEX d_ind_1 (m=?)
--EQP-- 2,0,0,SEARCH SUBQUERY 3 AS d3
--EQP-- 1,0,0,USE TEMP B-TREE FOR ORDER BY
--EQP-- 0,0,0,SCAN SUBQUERY 1
1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
Run Time: real 2.421 user 2.281250 sys 0.000000



2015-04-24 21:17:08 [D:\Temp]
>


In case you missed it the answer is:

CREATE TEMP TABLE tt (m INTEGER NOT NULL UNIQUE, t integer not null);

INSERT INTO tt (m, t) SELECT DISTINCT m, 0 from d;

update tt
   set t = (SELECT MAX(d3.t)
              FROM (SELECT d2.t
                      FROM d AS d2
                     WHERE tt.m = d2.m
                  ORDER BY d2.t
                     LIMIT 20
                   ) AS d3
           );

SELECT m, n_points, t_csv, v_csv, max_rowid
  FROM (SELECT d.m AS "m",
               count(v) AS "n_points",
               substr(group_concat(d.t), 1, 16) || '...' AS "t_csv",
               substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
               max(d.rowid) AS "max_rowid"
          FROM d, tt
         WHERE d.t <= tt.t
           and d.m = tt.m
      GROUP BY d.m
      ORDER BY d.m, d.t
       )
 WHERE n_points = 20;

This is now linear on the number of distinct m rather than the number of rows 
in table d.


> -----Original Message-----
> From: sqlite-users-bounces at mailinglists.sqlite.org [mailto:sqlite-users-
> bounces at mailinglists.sqlite.org] On Behalf Of John Pitney
> Sent: Friday, 24 April, 2015 20:13
> To: sqlite-users
> Subject: [sqlite] First-N query time scaling with table size
> 
> In my application, values v associated with metrics m and times t are
> inserted into one big table d. Another process opens the same database
> and looks in d for metrics with at least N entries, sorted by t.  I'm
> finding that the time to complete the query grows linearly with the
> row count M of table d, even though, in the example here, the query
> results are identical for any M >= 200.
> 
> I've made a self-contained example, where N = 20 and M = 200.  Is
> there a way to write the final select statement so that its completion
> time does not grow with M?
> 
> -- demo.sql
> -- create big table of data with M = 200
> CREATE TABLE d (m INT NOT NULL, t INT NOT NULL, v REAL);
> CREATE UNIQUE INDEX d_ind_1 ON d (m, t);
> .timer on
> INSERT INTO d (m, t, v)
>   WITH RECURSIVE
>     cnt(x) AS (VALUES(0) UNION ALL SELECT x+1 FROM cnt WHERE x<200)
>   SELECT (x % 10) + 1 AS m, x+1 AS t, 1.0 AS v
>   FROM cnt;
> .timer off
> 
> -- arrange for one rare value of m, which should not appear in results
> DELETE FROM d WHERE m = 5;
> INSERT INTO d (m, t, v)
>   WITH RECURSIVE
>     cnt(x) AS (VALUES(0) UNION ALL SELECT x+1 FROM cnt WHERE x<10)
>   SELECT 5 AS m, x + 1230000 AS t, 1.0 AS v
>   FROM cnt;
> 
> -- now, for each value of m, look for exactly N rows, ordered by t
> CREATE TEMP TABLE tt (m INT NOT NULL);
> INSERT INTO tt (m) SELECT DISTINCT m from d;
> .timer on
> SELECT m, n_points, t_csv, v_csv, max_rowid FROM
> (SELECT tt.m AS "m",
>   count(v) AS "n_points",
>   substr(group_concat(t), 1, 16) || '...' AS "t_csv",
>   substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
>   max(d.rowid) AS "max_rowid"
>   FROM tt, d
>   WHERE tt.m = d.m
>     AND d.t <= (SELECT MAX(d3.t) FROM (SELECT d2.t FROM d AS d2
>         WHERE d.m = d2.m ORDER BY d2.t LIMIT 20) AS d3)
>   GROUP BY tt.m ORDER BY tt.m, d.t)
> WHERE n_points = 20;
> -- end of demo.sql
> 
> The results are the following, on a Windows 7 64-bit platform:
> 
> $ sqlite3 -version
> 3.8.4.3 2014-04-03 16:53:12 a611fa96c4a848614efe899130359c9f6fb889c3
> 
> $ sqlite3 < demo.sql
> Run Time: real 0.003 user 0.000000 sys 0.000000
> 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
> 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
> 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
> 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
> 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
> 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
> 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
> 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
> 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
> Run Time: real 0.012 user 0.000000 sys 0.000000
> 
> $ sed -e 's/200/200000/' < demo.sql | sqlite3
> Run Time: real 0.608 user 0.592804 sys 0.000000
> 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
> 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
> 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
> 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
> 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
> 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
> 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
> 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
> 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
> Run Time: real 1.063 user 1.045207 sys 0.000000
> 
> $ sed -e 's/200/400000/' < demo.sql | sqlite3
> Run Time: real 1.255 user 1.248008 sys 0.000000
> 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
> 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
> 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
> 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
> 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
> 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
> 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
> 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
> 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
> Run Time: real 2.090 user 2.074813 sys 0.000000
> 
> --
> John Pitney
> john at pitney.org
> _______________________________________________
> sqlite-users mailing list
> sqlite-users at mailinglists.sqlite.org
> http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users

[sqlite] First-N query time scaling with table size

Reply via email to