Changeset: 8204fd20a908 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8204fd20a908
Modified Files:
        gdk/gdk_sample.c
Branch: default
Log Message:

sampling: fix for high sampling ratios


diffs (90 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -27,6 +27,11 @@
  * This implementation has a logarithmic complexity that only depends on the
  * sample size.
  *
+ * There is a pathological case when the sample size is almost the size of the 
BAT.
+ * Then, many collisions occur and performance degrades. To catch this, we 
+ * switch to antiset semantics when the sample size is larger than half the BAT
+ * size. Then, we generate the values that should be omitted from the sample.
+ *
  */
 
 #include "monetdb_config.h"
@@ -92,6 +97,22 @@ static void OIDTreeToBAT(struct oidtreen
                OIDTreeToBAT(node->right, bat);
 }
 
+/* Antiset traversal, give us all values but the ones in the tree */
+static void OIDTreeToBATAntiset(struct oidtreenode* node, BAT *bat, BUN start, 
BUN stop) {
+       BUN noid;
+       if (node->left != NULL)
+               OIDTreeToBATAntiset(node->left, bat, start, node->oid);
+       else 
+               for (noid = start+1; noid < node->oid; noid++)
+                       ((oid *) bat->T->heap.base)[bat->batFirst + 
bat->batCount++] = noid;                    
+       
+        if (node->right != NULL)
+               OIDTreeToBATAntiset(node->right, bat, node->oid, stop);
+       else
+               for (noid = node->oid+1; noid < stop; noid++)
+                        ((oid *) bat->T->heap.base)[bat->batFirst + 
bat->batCount++] = noid;
+}
+
 static void OIDTreeDestroy(struct oidtreenode* node) {
        if (node == NULL) {
                return;
@@ -110,7 +131,7 @@ static void OIDTreeDestroy(struct oidtre
 BAT *
 BATsample(BAT *b, BUN n) {
        BAT *bn;
-       BUN cnt;
+       BUN cnt, slen;
        BUN rescnt = 0;
        struct oidtreenode* tree = NULL;
 
@@ -128,7 +149,7 @@ BATsample(BAT *b, BUN n) {
                BATseqbase(bn, 0);
                BATseqbase(BATmirror(bn), 0);
                /* sample size is larger than the input BAT, return all oids */
-       } else if (cnt <= n) {
+       } else if (n >= cnt) {
                bn = BATnew(TYPE_void, TYPE_void, cnt);
                BATsetcount(bn, cnt);
                BATseqbase(bn, 0);
@@ -136,9 +157,13 @@ BATsample(BAT *b, BUN n) {
        } else {
                BUN minoid = b->hseqbase;
                BUN maxoid = b->hseqbase + cnt;
-               //oid *o;
-               bn = BATnew(TYPE_void, TYPE_oid, n);
-
+               /* if someone samples more than half of our tree, we do the 
antiset */
+               bit antiset = n > cnt/2;
+               slen = n;
+               if (antiset) 
+                       n = cnt - n;
+               
+               bn = BATnew(TYPE_void, TYPE_oid, slen);
                if (bn == NULL ) {
                        GDKerror("#BATsample: memory allocation error");
                        return NULL;
@@ -162,10 +187,14 @@ BATsample(BAT *b, BUN n) {
                        tree = ttree;
                        rescnt++;
                }
-               OIDTreeToBAT(tree, bn);
+               if (!antiset) {
+                       OIDTreeToBAT(tree, bn);
+               } else {
+                       OIDTreeToBATAntiset(tree, bn, minoid-1, maxoid+1);
+               }
                OIDTreeDestroy(tree);
 
-               BATsetcount(bn, n);
+               BATsetcount(bn, slen);
                bn->trevsorted = bn->batCount <= 1;
                bn->tsorted = 1;
                bn->tkey = 1;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to