Changeset: 496fcf31e11a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=496fcf31e11a
Modified Files:
        gdk/gdk_sample.c
Branch: default
Log Message:

New sample code


diffs (164 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -31,22 +31,73 @@
 
 #define DRAND ((double)rand()/(double)RAND_MAX)
 
-/*
- * @+ Uniform Sampling.
- *
- * The implementation of the uniform sampling is based on the
- * algorithm A as described in the paper "Faster Methods for Random
- * Sampling" by Jeffrey Scott Vitter. Algorithm A is not the fastest
- * one, but it only makes s calls in function random() and it is
- * simpler than the other more complex and CPU intensive algorithms in
- * the literature.
- *
- * Algorithm A instead of performing one random experiment for each
- * row to decide if it should be included in the sample or not, it
- * skips S rows and includes the S+1 row. The algorithm scans the
- * input relation sequentially and maintains the unique and sort
- * properties. The sample is without replacement.
- */
+/* this is a straightforward implementation of a binary tree */
+struct oidtreenode {
+       BUN oid;
+       struct oidtreenode* left;
+       struct oidtreenode* right;
+};
+
+static int OIDTreeLookup(struct oidtreenode* node, BUN target) {
+       if (node == NULL) {
+               return (FALSE);
+       } else {
+               if (target == node->oid)
+                       return (TRUE);
+               else {
+                       if (target < node->oid)
+                               return (OIDTreeLookup(node->left, target));
+                       else
+                               return (OIDTreeLookup(node->right, target));
+               }
+       }
+}
+
+static struct oidtreenode* OIDTreeNew(BUN oid) {
+       struct oidtreenode * node = malloc(sizeof(struct oidtreenode));
+       node->oid = oid;
+       node->left = NULL;
+       node->right = NULL;
+       return (node);
+}
+
+static struct oidtreenode* OIDTreeInsert(struct oidtreenode* node, BUN oid) {
+       if (node == NULL) {
+               return (OIDTreeNew(oid));
+       } else {
+               if (oid <= node->oid)
+                       node->left = OIDTreeInsert(node->left, oid);
+               else
+                       node->right = OIDTreeInsert(node->right, oid);
+               return (node);
+       }
+}
+
+#define APPEND(b, o)           (((oid *) b->T->heap.base)[b->batFirst + 
b->batCount++] = (o))
+
+// inorder traversal
+static void OIDTreeToBAT(struct oidtreenode* node, BAT *bat) {
+       if (node->left != NULL) OIDTreeToBAT(node->left,bat);
+        APPEND(bat, node->oid);
+       if (node->right != NULL) OIDTreeToBAT(node->right,bat);
+}
+
+static void OIDTreeDestroy(struct oidtreenode* node) {
+       if (node->left != NULL) {
+               OIDTreeDestroy(node->left);
+               free(node->left);
+       }
+       if (node->right != NULL) {
+               OIDTreeDestroy(node->right);
+               free(node->right);
+       }
+}
+
+// TODO: clean up tree
+
+// TODO: describe
+
+// nice macro, could be globally available
 
 /* BATsample implements sampling for void headed BATs */
 BAT *
@@ -54,6 +105,8 @@ BATsample(BAT *b, BUN n)
 {
        BAT *bn;
        BUN cnt;
+       BUN rescnt = 0;
+       struct oidtreenode* tree = NULL;
 
        BATcheck(b, "BATsample");
        assert(BAThdense(b));
@@ -74,36 +127,31 @@ BATsample(BAT *b, BUN n)
                BATseqbase(bn, 0);
                BATseqbase(BATmirror(bn), b->H->seq);
        } else {
-               BUN smp = 0;
-               /* we use wrd and not BUN since p may be -1 */
-               wrd top = b->hseqbase + cnt - n;
-               wrd p = ((wrd) b->hseqbase) - 1;
-               oid *o;
+               BUN minoid = b->hseqbase;
+               BUN maxoid = b->hseqbase + cnt;
+               //oid *o;
                bn = BATnew(TYPE_void, TYPE_oid, n);
+
                if (bn == NULL) {
                        GDKerror("#BATsample: memory allocation error");
                        return NULL;
                }
-               o = (oid *) Tloc(bn, BUNfirst(bn));
-               while (smp < n-1) { /* loop until all but 1 values are sampled 
*/
-                       double v = DRAND;
-                       double quot = (double)top/(double)cnt;
-                       BUN jump = 0;
-                       while (quot > v) { /* determine how many positions to 
jump */
-                               jump++;
-                               top--;
-                               cnt--;
-                               quot *= (double)top/(double)cnt;
-                       }
-                       p += (jump+1);
-                       cnt--;
-                       o[smp++] = (oid) p;
+               /* while we do not have enough sample OIDs yet */
+               while (rescnt < n) {
+                       BUN candoid;
+                       do {
+                               double v = DRAND;
+                               /* generate a new random OID */
+                               candoid = minoid + v * (maxoid - minoid);
+                               /* if the OID was already in the BAT, try again 
*/
+                       } while (OIDTreeLookup(tree, candoid));
+                       tree = OIDTreeInsert(tree, candoid);
+                       rescnt++;
                }
-               /* 1 left */
-               p += (BUN) rand() % cnt;
-               o[smp] = (oid) p+1;
+               OIDTreeToBAT(tree, bn);
+               OIDTreeDestroy(tree);
+               free(tree);
 
-               /* property management */
                BATsetcount(bn, n);
                bn->trevsorted = bn->batCount <= 1;
                bn->tkey = 1;
@@ -115,6 +163,5 @@ BATsample(BAT *b, BUN n)
                bn->hkey = 1;
                bn->hrevsorted = bn->batCount <= 1;
        }
-
        return bn;
 }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to