This new attached patch corrects a slight inefficiency in my 
previous patch. It now uses UNION ALL (TK_ALL) instead of UNION
(TK_UNION) for the subselects, as was my initial intention. 
OR queries on dis-similar columns with large intermediate result
sets are now even faster.

Given the same sample database below and running this query:

 select count(*) from stuff 
 where b=1 or b=5 or c=2 or d=4 or b=9;

will yield these timings:

 sqlite 3.5.4 unpatched: CPU Time: user 3.560223 sys 0.156010
 3.5.4 + 20071221 patch: CPU Time: user 1.252079 sys 0.072004
 3.5.4 + 20071222 patch: CPU Time: user 0.776048 sys 0.116007

I also changed the OR cost estimator to reduce the cost for each 
additional OR expression pair, as it better reflects the actual
cost.

Here's the meta-diff between yesterday's patch and the new one:

-+        return (costL+costR);
++        /* Reduce the cost for each OR expression pair by a few percent due
++        ** to the fact that if the transform was performed then there would
++        ** be fewer expressions to evaluate in each compound subselect's
++        ** WHERE clause.
++        */
++        return (costL+costR)*31/32;
@@ -323 +328 @@
-+      p->op = TK_UNION;
++      p->op = TK_ALL;
@@ -329 +334 @@
-+        p->op = TK_UNION;
++        p->op = TK_ALL;
@@ -339 +344 @@
-+        p->op = TK_UNION;
++        p->op = TK_ALL;


--- Joe Wilson <[EMAIL PROTECTED]> wrote:
> The attached patch implements the WHERE clause "OR to UNION" 
> optimization as described in this post:
> 
>  http://www.mail-archive.com/sqlite-users@sqlite.org/msg09004.html
> 
> If the computed cost of the rewritten WHERE clause is lower than 
> the original query when indexes are taken into account, then it 
> will perform the optimization. If the cost is estimated to be 
> higher then the query will not be rewritten.
> 
> Given the database formed by running these statements:
> 
>   create table stuff(a,b,c,d);
>   insert into stuff values(1,2,3,4);
>   create temp view v1 as select random()%100,
>     random()%100, random()%1000, random()%10000
>      from stuff x, stuff y;
>   insert into stuff select * from v1;
>   insert into stuff select * from v1;
>   insert into stuff select * from v1;
>   insert into stuff select * from v1;
>   insert into stuff select * from v1;
>   create index stuff_b on stuff(b);
>   create index stuff_c on stuff(c);
>   create index stuff_d on stuff(d);
>   analyze;
> 
> The patched version of sqlite 3.5.4 will run the following query 
> many times faster than an unpatched sqlite 3.5.4:
> 
>   select b, count(*) from stuff 
>   where c=2 or b=23 or c=17 or c=493 or d=7 or c=111 and a=14 
>   group by 1 order by 2 DESC, 1 limit 10;
> 
> On my machine, the patched version produces these query timings:
> 
>   CPU Time: user 0.724045 sys 0.092005
> 
> with the EXPLAIN QUERY PLAN:
> 
>   0|0|TABLE stuff USING PRIMARY KEY
>   0|0|TABLE stuff WITH INDEX stuff_c
>   0|0|TABLE stuff WITH INDEX stuff_d
>   0|0|TABLE stuff WITH INDEX stuff_b
>   0|0|TABLE stuff WITH INDEX stuff_c
> 
> For the same query the unpatched sqlite 3.5.4 produces:
> 
>   CPU Time: user 20.869304 sys 8.912557
> 
>   0|0|TABLE stuff WITH INDEX stuff_b ORDER BY
> 
> Only single table queries are supported by this OR optimization. 
> For this optimization to be considered, the WHERE clause may only 
> consist of column equality comparisons to constants, ORs and ANDs.
> 
> The optimization only looks at the top-level WHERE clause ORs. It 
> will not work with "IN" expressions. Nor will it will not expand 
> expressions like "a=1 AND (b=2 or c=3)" into "a=1 AND b=2 OR a=1 
> AND c=3" - although if manually expanded, the latter form could 
> potentially be optimized.
> 
> It passes "make test" without regressions, but more testing is needed.




      
____________________________________________________________________________________
Never miss a thing.  Make Yahoo your home page. 
http://www.yahoo.com/r/hs
Index: src/select.c
===================================================================
RCS file: /sqlite/sqlite/src/select.c,v
retrieving revision 1.372
diff -u -3 -p -r1.372 select.c
--- src/select.c        14 Dec 2007 17:24:40 -0000      1.372
+++ src/select.c        22 Dec 2007 17:37:13 -0000
@@ -12,7 +12,7 @@
 ** This file contains C code routines that are called by the parser
 ** to handle SELECT statements in SQLite.
 **
-** $Id: select.c,v 1.372 2007/12/14 17:24:40 drh Exp $
+** $Id: select.c,v 1.370 2007/12/13 21:54:11 drh Exp $
 */
 #include "sqliteInt.h"
 
@@ -2961,6 +2961,445 @@ static void updateAccumulator(Parse *pPa
   pAggInfo->directMode = 0;
 }
 
+#ifndef SQLITE_OMIT_OR_UNION_TRANSFORM
+
+/* The function prefix "o2u" stands for "OR to UNION TRANSFORM" */
+static double o2uAndCost(Expr *p, int iTable, Bitmask *bm);
+static double o2uEqCost(Expr *p, int iTable, Bitmask *bm);
+
+/*
+** Count the number of bits in Bitmask. Each bit represents the existance
+** of a column in an expression. The zeroth bit represents the use of the
+** rowid column in the WHERE clause, which is different from non-o2u
+** uses of Bitmask in the code.
+*/
+static int o2uBitCount(Bitmask x){
+  int n = 0;
+  while( x ){
+    x &= x-1;
+    ++n;
+  }
+  return n;
+}
+
+/*
+** Full table scan cost is simply the average number rows for each index
+** for the specified table.
+*/
+static double o2uFullTableScanCost(Table* pTab){
+  if( pTab ){
+    double sum = 0;
+    int n = 0;
+    Index *pIndex;
+    for( pIndex = pTab->pIndex; pIndex; pIndex = pIndex->pNext, n++ ){
+      if( pIndex->nColumn>0 ){
+        sum += pIndex->aiRowEst[0];
+      }
+    }
+    if( n && sum>n ){
+      return sum/n;
+    }
+  }
+  return 1000000;
+}
+
+/*
+** Estimate a WHERE clause column's cost taking indexes into account.
+** Record any columns encountered in Bitmask.
+*/
+static double o2uColumnCost(Expr *p, int iTable, Bitmask *bm){
+  if( p && p->op==TK_COLUMN && p->pSelect==0 && p->iTable==iTable && p->pTab ){
+    int iColumn = p->iColumn;
+    Index* pIndex;
+    if( iColumn>=-1 && iColumn<=(int)(sizeof(Bitmask)*8-2) ){
+      *bm |= 1<<(iColumn+1); /* rowid column -1 is the 0th bit */
+    }else{
+      *bm |= 0x3; /* iTable beyond range: disqualify single column OR opt */
+    }
+    if( iColumn==-1 ){
+      return 10;                          /* Match: rowid */
+    }
+    for( pIndex = p->pTab->pIndex; pIndex; pIndex = pIndex->pNext ){
+      if( pIndex->nColumn>0 && pIndex->aiColumn[0]==iColumn ){
+        return pIndex->aiRowEst[1];       /* Match: first column of index */
+      }
+    }
+    return o2uFullTableScanCost(p->pTab); /* No index on column */
+  }
+  return 0;                               /* Not a valid column */
+}
+
+/*
+** Estimate the cost of this WHERE clause TK_OR node.
+*/
+static double o2uOrCost(Expr *p, int iTable, Bitmask *bm){
+  if( p && p->op==TK_OR ){
+    Expr *pL = p->pLeft;
+    Expr *pR = p->pRight;
+    double costL = 0;
+    double costR = 0;
+    if(
+      (
+        /* Only one of the following will be valid (non-zero) */
+           (costL = o2uEqCost(pL, iTable, bm))
+        || (costL = o2uAndCost(pL, iTable, bm))
+        || (costL = o2uOrCost(pL, iTable, bm))
+      ) && (
+        /* Only one of the following will be valid (non-zero) */
+           (costR = o2uEqCost(pR, iTable, bm))
+        || (costR = o2uAndCost(pR, iTable, bm))
+        || (costR = o2uOrCost(pR, iTable, bm))
+      )
+    ){
+      /* Add the costs of the left and right expressions if both are valid,
+      ** otherwise disqualify the optimization by returning zero.
+      */
+      if( costL&&costR ){
+        /* Reduce the cost for each OR expression pair by a few percent due
+        ** to the fact that if the transform was performed then there would
+        ** be fewer expressions to evaluate in each compound subselect's
+        ** WHERE clause.
+        */
+        return (costL+costR)*31/32;
+      }
+    }
+  }
+  return 0;
+}
+
+/*
+** Estimate the cost of this WHERE clause TK_EQ node.
+*/
+static double o2uEqCost(Expr *p, int iTable, Bitmask *bm){
+  if( p && p->op==TK_EQ ){
+    double cost;
+    Expr *pL = p->pLeft;
+    Expr *pR = p->pRight;
+    cost = o2uColumnCost(pL, iTable, bm);
+    if( cost && sqlite3ExprIsConstant(pR) ){
+      return cost;
+    }
+    cost = o2uColumnCost(pR, iTable, bm);
+    if( cost && sqlite3ExprIsConstant(pL) ){
+      return cost;
+    }
+  }
+  return 0;
+}
+
+/*
+** Estimate the cost of this WHERE clause AND expression.
+*/
+static double o2uAndCost(Expr *p, int iTable, Bitmask *bm){
+  if( p && p->op==TK_AND ){
+    Expr *pL = p->pLeft;
+    double L;
+    *bm |= 0x3; /* Disqualify single OR column optimization */
+    if( pL->op==TK_EQ ){
+      L = o2uEqCost(pL, iTable, bm);
+    }else{
+      L = o2uAndCost(pL, iTable, bm);
+    }
+    if( L ){
+      Expr *pR = p->pRight;
+      double R;
+      if( pR->op==TK_EQ ){
+        R = o2uEqCost(pR, iTable, bm);
+      }else{
+        R = o2uAndCost(pR, iTable, bm);
+      }
+      /* If both the left and right AND expressions have a valid cost,
+      ** then return the lower cost of the two sides.
+      */
+      if( R ){
+        return L<R ? L : R;
+      }
+    }
+  }
+  return 0;
+}
+
+static double o2uEstLog(double N){
+  double logN = 1;
+  double x = 2;
+  while( N>x ){
+    logN += 1;
+    x *= 2;
+  }
+  return logN;
+}
+
+/*
+** Change all TK_COLUMN iTable values matching iTableFrom to iTableTo
+** in expression `p'.
+*/
+static void o2uChangeCursorOfColumn(Expr *p, int iTableFrom, int iTableTo){
+  assert(p);
+  assert(iTableFrom>=0);
+  assert(iTableTo>=0);
+  assert(iTableFrom!=iTableTo);
+  switch( p->op ){
+    case TK_COLUMN: {
+      if( p->iTable==iTableFrom ){
+        p->iTable = iTableTo;
+      }
+      break;
+    }
+    case TK_EQ:
+    case TK_OR:
+    case TK_AND: {
+      assert(p->pLeft);
+      assert(p->pRight);
+      o2uChangeCursorOfColumn(p->pLeft, iTableFrom, iTableTo);
+      o2uChangeCursorOfColumn(p->pRight, iTableFrom, iTableTo);
+      break;
+    }
+  }
+}
+
+/*
+** Create a raw subselect of the form:
+**
+**   SELECT rowid from TABLE
+*/
+static Select *o2uCreateSubselect(
+  Parse *pParse,
+  SrcList *pSrcParent,
+  Expr *rowid
+){
+  Select *p = sqlite3SelectNew(pParse,
+    sqlite3ExprListAppend(pParse,0,sqlite3ExprDup(pParse->db,rowid),0),
+    sqlite3SrcListDup(pParse->db,pSrcParent), 0,0,0,0,0,0,0);
+  if( p ){
+    int iCursor = pParse->nTab++;
+    p->pSrc->a[0].iCursor = iCursor;
+    p->pEList->a[0].pExpr->iTable = iCursor;
+  }
+  return p;
+}
+
+/*
+** Determine the iColumn value of the expression, or return -2 if 
+** not applicable. -1 is the rowid column, as usual.
+*/
+static int o2uExprColumnNo(Expr *p){
+  assert(p);
+  switch( p->op ){
+    case TK_EQ: {
+      if( sqlite3ExprIsConstant(p->pRight) && p->pLeft->op==TK_COLUMN ){
+        return p->pLeft->iColumn;
+      }
+      if( sqlite3ExprIsConstant(p->pLeft) && p->pRight->op==TK_COLUMN ){
+        return p->pRight->iColumn;
+      }
+    }
+    case TK_OR: {
+      int iColumnL = o2uExprColumnNo(p->pLeft);
+      int iColumnR = o2uExprColumnNo(p->pRight);
+      if( iColumnL>=-1 && iColumnL==iColumnR ){
+        return iColumnL;
+      }
+    }
+  }
+  return -2;   /* Unknown or no single iTable value. */
+}
+
+/*
+** Can this expression be combined with this compound select's WHERE clause?
+** Return true if their iColumns are valid and match.
+*/
+static int o2uIsWhereExprCompatible(Expr *pWhere, Expr *pExpr){
+  int iColumnWhere;
+  assert(pWhere);
+  assert(pExpr);
+  iColumnWhere = o2uExprColumnNo(pWhere);
+  if( iColumnWhere>=-1 ){
+    int iColumnExpr = o2uExprColumnNo(pExpr);
+    if( iColumnWhere==iColumnExpr ){
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/*
+** Traverse the original WHERE clause and transform it into the new
+** WHERE clause with a compound SELECT.
+**
+** Extract each WHERE clause OR term and create subselects for
+** the new WHERE clause's "rowid IN (compound select)" TK_IN node.
+** Expr pE's ownership is transferred to the newly created compound select
+** statements, and any unused OR nodes are deleted. Try to combine OR
+** expressions related to the same column into the same compound subselect.
+*/
+static int o2uTransferOrTerms(
+  Parse *pParse,
+  Select **ppSelectIn,
+  Expr *pE,
+  SrcList *pSrcParent,
+  Expr *rowid
+){
+  int rc = SQLITE_OK;
+  Select *p;
+  assert(pE);
+  assert(ppSelectIn);
+  assert(pSrcParent);
+  assert(rowid);
+  assert(rowid->op==TK_COLUMN);
+  if( pE->op==TK_OR ){
+    rc = o2uTransferOrTerms(pParse,ppSelectIn,pE->pLeft,pSrcParent,rowid);
+    if( rc==SQLITE_OK ){
+      rc = o2uTransferOrTerms(pParse,ppSelectIn,pE->pRight,pSrcParent,rowid);
+    }
+    pE->pLeft = pE->pRight = 0;
+    sqlite3ExprDelete(pE);
+  }else{
+    /* TK_COLUMN, TK_AND, TK_EQ expressions will make it here.
+    */
+    if( *ppSelectIn==0 ){
+      /* Create the SELECT of the "rowid IN (SELECT ...)" */
+      p = *ppSelectIn = o2uCreateSubselect(pParse, pSrcParent, rowid);
+    }else{
+      /* Find a matching UNION SELECT statement or create one.
+      */
+      Select *pSelectFound = 0;
+      p = *ppSelectIn;
+      assert(p);
+      if( o2uIsWhereExprCompatible(p->pWhere, pE) ){
+        pSelectFound = p;
+      }
+      p->op = TK_ALL;
+      while( p->pPrior ){
+        p = p->pPrior;
+        if( !pSelectFound && o2uIsWhereExprCompatible(p->pWhere, pE) ){
+          pSelectFound = p;
+        }
+        p->op = TK_ALL;
+      }
+      p->op = TK_SELECT;
+      if( pSelectFound ){
+        /* A compatible UNION SELECT was found. */
+        p = pSelectFound;
+      }else{
+        /* No compatible UNION SELECT found, so we must create one
+        ** and append to the the compound SELECT chain
+        */
+        p->op = TK_ALL;
+        p = p->pPrior = o2uCreateSubselect(pParse, pSrcParent, rowid);
+        p->op = TK_SELECT;
+      }
+    }
+    if( p==0 ){
+      sqlite3ExprDelete(pE);
+      return SQLITE_ERROR;
+    }
+    assert(p->pEList);
+    assert(p->pEList->nExpr==1);
+    assert(p->pEList->a);
+    assert(p->pSrc);
+    assert(p->pSrc->nSrc==1);
+    assert(p->pSrc->a);
+    if( p->pWhere ){
+      /* Append expression into existing WHERE clause using OR. */
+      Expr *pWhere = p->pWhere;
+      assert(o2uIsWhereExprCompatible(p->pWhere, pE));
+      p->pWhere = sqlite3PExpr(pParse,TK_OR,0,0,0);
+      if( p->pWhere==0 ){
+        p->pWhere = pWhere;
+        sqlite3ExprDelete(pE);
+        return SQLITE_ERROR;
+      }
+      p->pWhere->pLeft = pWhere;
+      p->pWhere->pRight = pE;
+    }else{
+      /* There is no existing WHERE clause, so use expression for WHERE. */
+      p->pWhere = pE;
+    }
+    p->isResolved = 2; /* Differentiate SELECT to not optimize again. */
+
+    /* The iTable of each TK_COLUMN node of the expression must match its
+    ** SELECT's iCursor.
+    */
+    o2uChangeCursorOfColumn(pE, rowid->iTable, p->pSrc->a[0].iCursor);
+  }
+  return rc;
+}
+
+/*
+** Perform the OR to UNION transformation.
+**
+** This routine is only called once you've already determined that
+** the OR to UNION transform is of lower cost than the original WHERE
+** clause.
+**
+** This routine will try to put like OR terms into the same SELECT
+** to take advantage of the already existing WHERE OR optimization.
+**
+** For example, this statement:
+**
+**   SELECT c, count(*) AS total FROM foo
+**   WHERE a=1 OR b=2 OR 3=a OR c=2 AND b=5
+**   GROUP BY c
+**   ORDER BY total, c;
+**
+** will be converted to:
+**
+**   SELECT c, count(*) AS total FROM foo
+**   WHERE rowid IN (
+**     SELECT rowid FROM foo WHERE a=1 OR 3=a  UNION ALL
+**     SELECT rowid FROM foo WHERE b=2         UNION ALL
+**     SELECT rowid FROM foo WHERE c=2 AND b=5
+**   )
+**   GROUP BY c
+**   ORDER BY total, c;
+*/
+static int o2uUnionize(Parse *pParse, Select *p){
+  Expr *pWhere = p->pWhere;
+  struct SrcList_item *pFrom = p->pSrc->a;
+  Expr *pIn = p->pWhere = sqlite3PExpr(pParse,TK_IN,0,0,0);
+  Expr *rowid = pIn->pLeft = sqlite3PExpr(pParse,TK_COLUMN,0,0,0);
+  rowid->token.z = "rowid";
+  rowid->token.n = 5;
+  rowid->token.dyn = 0;
+  rowid->affinity = SQLITE_AFF_NONE;
+  rowid->iColumn = -1;
+  rowid->iTable = p->pSrc->a[0].iCursor;
+  rowid->pTab = sqlite3LocateTable(pParse,pFrom->zName,pFrom->zDatabase);
+  return o2uTransferOrTerms(pParse, &pIn->pSelect, pWhere, p->pSrc, rowid);
+}
+
+/*
+** Determine whether the OR to UNION transform is of lower cost than a
+** full table scan once individual column indexes are taken into account.
+** If it is determined that it is likely a lower cost, then perform the
+** tranform.
+**
+** Note: This routine will not perform the tranform if ANY WHERE clause
+** column name has a '+' prepended to it, as in "+a=3 OR b=3".
+** Nor will this routine perform the transform if the WHERE contains only
+** like columns, as in "a=1 OR a=3 OR a=7", as this is more efficiently
+** performed by the existing OR optimization.
+*/
+static int o2uTransform(Parse *pParse, Select *p){
+  int rc = SQLITE_OK;
+  if( p && p->pWhere && p->isResolved==1 && p->pSrc
+  && p->pSrc->nSrc==1 && p->pSrc->a[0].pTab && p->pSrc->a[0].pSelect==0
+  && p->pSrc->a[0].iCursor>=0 ){
+    Bitmask bm = 0;
+    double orOptRowEst = o2uOrCost(p->pWhere, p->pSrc->a[0].iCursor, &bm);
+    int columnsUsed = o2uBitCount(bm);
+    if( orOptRowEst>0 && columnsUsed>1 ){
+      double orOptCost = orOptRowEst * o2uEstLog(orOptRowEst);
+      double fullScanCost = o2uFullTableScanCost(p->pSrc->a[0].pTab);
+      if( orOptCost<fullScanCost ){
+        rc = o2uUnionize(pParse, p);
+      }
+    }
+  }
+  return rc;
+}
+
+#endif /* SQLITE_OMIT_OR_UNION_TRANSFORM */
 
 /*
 ** Generate code for the given SELECT statement.
@@ -3111,6 +3550,16 @@ int sqlite3Select(
     pOrderBy = 0;
   }
 
+#ifndef SQLITE_OMIT_OR_UNION_TRANSFORM
+  /*
+  ** Perform the WHERE clause OR to UNION transformation if there is a
+  ** cost advantage.
+  */
+  rc = o2uTransform(pParse, p);
+  pWhere = p->pWhere;
+  if( rc ) goto select_end; /* An error occurred - likely a malloc failure. */
+#endif
+
   /* Begin generating code.
   */
   v = sqlite3GetVdbe(pParse);

-----------------------------------------------------------------------------
To unsubscribe, send email to [EMAIL PROTECTED]
-----------------------------------------------------------------------------

Reply via email to