[sqlite] Suggestions for Fast Set Logic?

Keith Medcalf Thu, 09 Jul 2015 18:44:40 -0400

create table data
( 
     data_id integer primary key,
     ...
);

create table keywords
(
     keyword_id integer primary key,
     keyword text collate nocase unique
);



create table n2mKeywords
(
     data_id integer references data,
     keyword_id integer references keywords,
     primary key (data_id, keyword_id),
     unique (keyword_id, data_id)
) WITHOUT ROWID;


select * form data where id in
(select data_id from keywords natural join n2mkeywords where keyword = 'onca' 
or keyword = 'bonca'
INTERSECT
select data_id from keywords natural join n2mkeyworks where keyword = 
'chocolate');

will return all the data tuples associated with the keyword ('once' or 'bonca') 
and 'chocolate'


> -----Original Message-----
> From: sqlite-users-bounces at mailinglists.sqlite.org [mailto:sqlite-users-
> bounces at mailinglists.sqlite.org] On Behalf Of Andy Rahn
> Sent: Thursday, 9 July, 2015 11:26
> To: General
> Subject: [sqlite] Suggestions for Fast Set Logic?
> 
> I want to build an application that can search for specific documents
> based
> on a lot of criteria from a user (e.g. matching keywords, text, etc...).
> And then to combine these results using boolean logic ... For example,
> keyword 'animal' AND rating > 3
> 
> Each document has an integer id.
> 
> My strategy so far is to gather the id for each subquery into a temporary
> table.  Then I can combine the tables using UNION or INTERSECT keywords.
> 
> However, when I try a sample that does this on 250,000 rows, I find the
> performance is rather slow: 2 seconds.  (I've built sqlite3 myself on
> MacOS)
> 
> sqlite3 --version
> > 3.8.10.2 2015-05-20 18:17:19 2ef4f3a5b1d1d0c4338f8243d40a2452cc1f7fe4
> 
> 
> 
> > cat config.sql intersection.sql | sqlite3
> > 250001
> > Run Time: real 2.000 user 1.956734 sys 0.044040
> >
> 
> (samples files included below.)
> 
> I wonder if anyone has suggestions for how to improve this approach.  For
> example, I've considered trying to build a monolithic query which does the
> subqueries and UNION/INTERSECTION logic all at once, but my previous
> experience has shown that the resulting queries are very complex and hard
> for me to reason about if/when they get slow.  This approach lets me
> profile each subquery easily.  It also lets me tackle sorting the results
> as a separate step.
> 
> Another idea I've toyed with is building a custom implementation of UNION
> /
> INTERSECTION for result sets that are just sets of integers.  I could do
> this as a virtual table in sqlite.
> 
> Thanks for your thoughts on this problem.
> 
> Andy
> 
> 
> config.sql contains:
> 
> CREATE TABLE config( numAssets );
> 
> INSERT INTO config VALUES( 250000 );
> 
> intersection.sql contains:
> 
> CREATE TEMPORARY TABLE idset1 ( id PRIMARY KEY );
> CREATE TEMPORARY TABLE idset2 ( id PRIMARY KEY );
> 
> CREATE INDEX idset1_id ON idset1 ( id );
> CREATE INDEX idset2_id ON idset2 ( id );
> 
> BEGIN TRANSACTION;
> 
> INSERT INTO idset1
>     SELECT id FROM ( WITH RECURSIVE
>       cnt( id ) AS (
>       VALUES( 0 ) UNION ALL
>       SELECT id+1 FROM cnt WHERE id < ( SELECT MAX( numAssets ) FROM
> config
> ))
>     select * from cnt ), config;
> 
> COMMIT TRANSACTION;
> 
> 
> BEGIN TRANSACTION;
> 
> INSERT INTO idset2
>     SELECT id + 1000000 FROM ( WITH RECURSIVE
>       cnt( id ) AS (
>       VALUES( 0 ) UNION ALL
>       SELECT id+1 FROM cnt WHERE id < ( SELECT MAX( numAssets ) FROM
> config
> ))
>     select * from cnt ), config;
> 
> COMMIT TRANSACTION;
> 
> .timer ON
> SELECT count(id) FROM idset1 WHERE id IN ( SELECT id FROM idset1 UNION
> SELECT id FROM idset2 );
> .timer OFF
> _______________________________________________
> sqlite-users mailing list
> sqlite-users at mailinglists.sqlite.org
> http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users

[sqlite] Suggestions for Fast Set Logic?

Reply via email to