Your query has to visit every row of table d and execute the correlated
subquery multiple times.
You need to devise a way to do this only once for each d.m and then join that
table back into your query.
>sqlite < demo.sql
.eqp on
.timer on
CREATE TABLE d
(
m INT NOT NULL,
t INT NOT NULL,
v REAL
);
--EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
Run Time: real 0.007 user 0.000000 sys 0.000000
CREATE UNIQUE INDEX d_ind_1 ON d (m, t);
Run Time: real 0.002 user 0.000000 sys 0.000000
.timer on
INSERT INTO d (m, t, v)
WITH RECURSIVE
cnt(x) AS ( VALUES(0)
UNION ALL
SELECT x+1
FROM cnt
WHERE x<200000
)
SELECT (x % 10) + 1 AS m,
x+1 AS t,
1.0 AS v
FROM cnt;
--EQP-- 3,0,0,SCAN TABLE cnt
--EQP-- 1,0,0,COMPOUND SUBQUERIES 0 AND 0 (UNION ALL)
--EQP-- 0,0,0,SCAN SUBQUERY 1
Run Time: real 1.064 user 1.046875 sys 0.000000
-- arrange for one rare value of m, which should not appear in results
DELETE FROM d
WHERE m = 5;
--EQP-- 0,0,0,SEARCH TABLE d USING COVERING INDEX d_ind_1 (m=?)
Run Time: real 0.060 user 0.046875 sys 0.000000
INSERT INTO d (m, t, v)
WITH RECURSIVE
cnt(x) AS ( VALUES(0)
UNION ALL
SELECT x+1
FROM cnt
WHERE x<10)
SELECT 5 AS m,
x + 1230000 AS t,
1.0 AS v
FROM cnt;
--EQP-- 3,0,0,SCAN TABLE cnt
--EQP-- 1,0,0,COMPOUND SUBQUERIES 0 AND 0 (UNION ALL)
--EQP-- 0,0,0,SCAN SUBQUERY 1
Run Time: real 0.013 user 0.000000 sys 0.000000
-- now, for each value of m, look for exactly N rows, ordered by t
CREATE TEMP TABLE tt (m INTEGER NOT NULL UNIQUE, t integer not null);
--EQP-- 0,0,0,SEARCH TABLE sqlite_temp_master USING INTEGER PRIMARY KEY
(rowid=?)
Run Time: real 0.011 user 0.000000 sys 0.000000
INSERT INTO tt (m, t) SELECT DISTINCT m, 0 from d;
--EQP-- 0,0,0,SCAN TABLE d USING COVERING INDEX d_ind_1
--EQP-- 0,0,0,USE TEMP B-TREE FOR DISTINCT
Run Time: real 0.076 user 0.046875 sys 0.000000
update tt
set t = (SELECT MAX(d3.t)
FROM (SELECT d2.t
FROM d AS d2
WHERE tt.m = d2.m
ORDER BY d2.t
LIMIT 20
) AS d3
);
--EQP-- 0,0,0,SCAN TABLE tt
--EQP-- 0,0,0,EXECUTE CORRELATED SCALAR SUBQUERY 0
--EQP-- 1,0,0,SEARCH TABLE d AS d2 USING COVERING INDEX d_ind_1 (m=?)
--EQP-- 0,0,0,SEARCH SUBQUERY 1 AS d3
Run Time: real 0.058 user 0.000000 sys 0.000000
select * from tt;
--EQP-- 0,0,0,SCAN TABLE tt
1|191
2|192
3|193
4|194
5|1230010
6|196
7|197
8|198
9|199
10|200
Run Time: real 0.053 user 0.000000 sys 0.000000
ANALYZE;
--EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
--EQP-- 0,0,0,SEARCH TABLE sqlite_master USING INTEGER PRIMARY KEY (rowid=?)
Run Time: real 0.173 user 0.156250 sys 0.000000
SELECT m, n_points, t_csv, v_csv, max_rowid
FROM (SELECT d.m AS "m",
count(v) AS "n_points",
substr(group_concat(d.t), 1, 16) || '...' AS "t_csv",
substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
max(d.rowid) AS "max_rowid"
FROM d, tt
WHERE d.t <= tt.t
and d.m = tt.m
GROUP BY d.m
ORDER BY d.m, d.t
)
WHERE n_points = 20;
--EQP-- 1,0,0,SCAN TABLE d USING INDEX d_ind_1
--EQP-- 1,1,1,SEARCH TABLE tt USING INDEX sqlite_autoindex_tt_1 (m=?)
--EQP-- 1,0,0,USE TEMP B-TREE FOR ORDER BY
--EQP-- 0,0,0,SCAN SUBQUERY 1
1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
Run Time: real 0.308 user 0.203125 sys 0.000000
SELECT m, n_points, t_csv, v_csv, max_rowid
FROM (SELECT d.m AS "m",
count(v) AS "n_points",
substr(group_concat(t), 1, 16) || '...' AS "t_csv",
substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
max(d.rowid) AS "max_rowid"
FROM d
WHERE d.t <= (SELECT MAX(d3.t)
FROM (SELECT d2.t
FROM d AS d2
WHERE d.m = d2.m
ORDER BY d2.t
LIMIT 20
) AS d3
)
GROUP BY d.m
ORDER BY d.m, d.t
)
WHERE n_points = 20;
--EQP-- 1,0,0,SCAN TABLE d USING INDEX d_ind_1
--EQP-- 1,0,0,EXECUTE CORRELATED SCALAR SUBQUERY 2
--EQP-- 3,0,0,SEARCH TABLE d AS d2 USING COVERING INDEX d_ind_1 (m=?)
--EQP-- 2,0,0,SEARCH SUBQUERY 3 AS d3
--EQP-- 1,0,0,USE TEMP B-TREE FOR ORDER BY
--EQP-- 0,0,0,SCAN SUBQUERY 1
1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
Run Time: real 2.421 user 2.281250 sys 0.000000
2015-04-24 21:17:08 [D:\Temp]
>
In case you missed it the answer is:
CREATE TEMP TABLE tt (m INTEGER NOT NULL UNIQUE, t integer not null);
INSERT INTO tt (m, t) SELECT DISTINCT m, 0 from d;
update tt
set t = (SELECT MAX(d3.t)
FROM (SELECT d2.t
FROM d AS d2
WHERE tt.m = d2.m
ORDER BY d2.t
LIMIT 20
) AS d3
);
SELECT m, n_points, t_csv, v_csv, max_rowid
FROM (SELECT d.m AS "m",
count(v) AS "n_points",
substr(group_concat(d.t), 1, 16) || '...' AS "t_csv",
substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
max(d.rowid) AS "max_rowid"
FROM d, tt
WHERE d.t <= tt.t
and d.m = tt.m
GROUP BY d.m
ORDER BY d.m, d.t
)
WHERE n_points = 20;
This is now linear on the number of distinct m rather than the number of rows
in table d.
> -----Original Message-----
> From: sqlite-users-bounces at mailinglists.sqlite.org [mailto:sqlite-users-
> bounces at mailinglists.sqlite.org] On Behalf Of John Pitney
> Sent: Friday, 24 April, 2015 20:13
> To: sqlite-users
> Subject: [sqlite] First-N query time scaling with table size
>
> In my application, values v associated with metrics m and times t are
> inserted into one big table d. Another process opens the same database
> and looks in d for metrics with at least N entries, sorted by t. I'm
> finding that the time to complete the query grows linearly with the
> row count M of table d, even though, in the example here, the query
> results are identical for any M >= 200.
>
> I've made a self-contained example, where N = 20 and M = 200. Is
> there a way to write the final select statement so that its completion
> time does not grow with M?
>
> -- demo.sql
> -- create big table of data with M = 200
> CREATE TABLE d (m INT NOT NULL, t INT NOT NULL, v REAL);
> CREATE UNIQUE INDEX d_ind_1 ON d (m, t);
> .timer on
> INSERT INTO d (m, t, v)
> WITH RECURSIVE
> cnt(x) AS (VALUES(0) UNION ALL SELECT x+1 FROM cnt WHERE x<200)
> SELECT (x % 10) + 1 AS m, x+1 AS t, 1.0 AS v
> FROM cnt;
> .timer off
>
> -- arrange for one rare value of m, which should not appear in results
> DELETE FROM d WHERE m = 5;
> INSERT INTO d (m, t, v)
> WITH RECURSIVE
> cnt(x) AS (VALUES(0) UNION ALL SELECT x+1 FROM cnt WHERE x<10)
> SELECT 5 AS m, x + 1230000 AS t, 1.0 AS v
> FROM cnt;
>
> -- now, for each value of m, look for exactly N rows, ordered by t
> CREATE TEMP TABLE tt (m INT NOT NULL);
> INSERT INTO tt (m) SELECT DISTINCT m from d;
> .timer on
> SELECT m, n_points, t_csv, v_csv, max_rowid FROM
> (SELECT tt.m AS "m",
> count(v) AS "n_points",
> substr(group_concat(t), 1, 16) || '...' AS "t_csv",
> substr(group_concat(round(v, 2)), 1, 16) || '...' AS "v_csv",
> max(d.rowid) AS "max_rowid"
> FROM tt, d
> WHERE tt.m = d.m
> AND d.t <= (SELECT MAX(d3.t) FROM (SELECT d2.t FROM d AS d2
> WHERE d.m = d2.m ORDER BY d2.t LIMIT 20) AS d3)
> GROUP BY tt.m ORDER BY tt.m, d.t)
> WHERE n_points = 20;
> -- end of demo.sql
>
> The results are the following, on a Windows 7 64-bit platform:
>
> $ sqlite3 -version
> 3.8.4.3 2014-04-03 16:53:12 a611fa96c4a848614efe899130359c9f6fb889c3
>
> $ sqlite3 < demo.sql
> Run Time: real 0.003 user 0.000000 sys 0.000000
> 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
> 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
> 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
> 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
> 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
> 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
> 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
> 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
> 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
> Run Time: real 0.012 user 0.000000 sys 0.000000
>
> $ sed -e 's/200/200000/' < demo.sql | sqlite3
> Run Time: real 0.608 user 0.592804 sys 0.000000
> 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
> 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
> 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
> 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
> 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
> 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
> 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
> 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
> 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
> Run Time: real 1.063 user 1.045207 sys 0.000000
>
> $ sed -e 's/200/400000/' < demo.sql | sqlite3
> Run Time: real 1.255 user 1.248008 sys 0.000000
> 1|20|1,11,21,31,41,51...|1.0,1.0,1.0,1.0,...|191
> 2|20|2,12,22,32,42,52...|1.0,1.0,1.0,1.0,...|192
> 3|20|3,13,23,33,43,53...|1.0,1.0,1.0,1.0,...|193
> 4|20|4,14,24,34,44,54...|1.0,1.0,1.0,1.0,...|194
> 6|20|6,16,26,36,46,56...|1.0,1.0,1.0,1.0,...|196
> 7|20|7,17,27,37,47,57...|1.0,1.0,1.0,1.0,...|197
> 8|20|8,18,28,38,48,58...|1.0,1.0,1.0,1.0,...|198
> 9|20|9,19,29,39,49,59...|1.0,1.0,1.0,1.0,...|199
> 10|20|10,20,30,40,50,6...|1.0,1.0,1.0,1.0,...|200
> Run Time: real 2.090 user 2.074813 sys 0.000000
>
> --
> John Pitney
> john at pitney.org
> _______________________________________________
> sqlite-users mailing list
> sqlite-users at mailinglists.sqlite.org
> http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users