On 25/05/16 04:13, Emilio G. Cota wrote: > diff --git a/util/qdist.c b/util/qdist.c > new file mode 100644 > index 0000000..3343640 > --- /dev/null > +++ b/util/qdist.c > @@ -0,0 +1,386 @@ (snip) > + > +void qdist_add(struct qdist *dist, double x, long count) > +{ > + struct qdist_entry *entry = NULL; > + > + if (dist->entries) { > + struct qdist_entry e; > + > + e.x = x; > + entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp); > + } > + > + if (entry) { > + entry->count += count; > + return; > + } > + > + dist->entries = g_realloc(dist->entries, > + sizeof(*dist->entries) * (dist->n + 1));
Repeated doubling? > + dist->n++; > + entry = &dist->entries[dist->n - 1]; What if we combine the above two lines: entry = &dist->entries[dist->n++]; or just reverse them: entry = &dist->entries[dist->n]; dist->n++; > + entry->x = x; > + entry->count = count; > + qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp); > +} > + (snip) > +static char *qdist_pr_internal(const struct qdist *dist) > +{ > + double min, max, step; > + GString *s = g_string_new(""); > + size_t i; > + > + /* if only one entry, its printout will be either full or empty */ > + if (dist->n == 1) { > + if (dist->entries[0].count) { > + g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - > 1]); > + } else { > + g_string_append_c(s, ' '); > + } > + goto out; > + } > + > + /* get min and max counts */ > + min = dist->entries[0].count; > + max = min; > + for (i = 0; i < dist->n; i++) { > + struct qdist_entry *e = &dist->entries[i]; > + > + if (e->count < min) { > + min = e->count; > + } > + if (e->count > max) { > + max = e->count; > + } > + } > + > + /* floor((count - min) * step) will give us the block index */ > + step = (QDIST_NR_BLOCK_CODES - 1) / (max - min); > + > + for (i = 0; i < dist->n; i++) { > + struct qdist_entry *e = &dist->entries[i]; > + int index; > + > + /* make an exception with 0; instead of using block[0], print a > space */ > + if (e->count) { > + index = (int)((e->count - min) * step); So "e->count == min" gives us one eighth block instead of just space? > + g_string_append_unichar(s, qdist_blocks[index]); > + } else { > + g_string_append_c(s, ' '); > + } > + } > + out: > + return g_string_free(s, FALSE); > +} > + > +/* > + * Bin the distribution in @from into @n bins of consecutive, non-overlapping > + * intervals, copying the result to @to. > + * > + * This function is internal to qdist: only this file and test code should > + * ever call it. > + * > + * Note: calling this function on an already-binned qdist is a bug. > + * > + * If @n == 0 or @from->n == 1, use @from->n. > + */ > +void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t > n) > +{ > + double xmin, xmax; > + double step; > + size_t i, j, j_min; > + > + qdist_init(to); > + > + if (!from->entries) { > + return; > + } > + if (!n || from->n == 1) { > + n = from->n; > + } > + > + /* set equally-sized bins between @from's left and right */ > + xmin = qdist_xmin(from); > + xmax = qdist_xmax(from); > + step = (xmax - xmin) / n; > + > + if (n == from->n) { > + /* if @from's entries are equally spaced, no need to re-bin */ > + for (i = 0; i < from->n; i++) { > + if (from->entries[i].x != xmin + i * step) { > + goto rebin; static inline function instead of goto? > + } > + } > + /* they're equally spaced, so copy the dist and bail out */ > + to->entries = g_malloc(sizeof(*to->entries) * from->n); g_new()? > + to->n = from->n; > + memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); > + return; > + } > + > + rebin: > + j_min = 0; > + for (i = 0; i < n; i++) { > + double x; > + double left, right; > + > + left = xmin + i * step; > + right = xmin + (i + 1) * step; > + > + /* Add x, even if it might not get any counts later */ > + x = left; This way we round down to the left margin of each bin like this: xmin [*---*---*---*---*] xmax -- from | /| /| /| / | / | / | / | / |/ |/ |/ |/ | | | | V V V V [* * * *] -- to instead of e.g. rounding to the middle of each bin with: x = left + step / 2; which would give the picture like this: xmin [*---*---*---*---*] xmax -- from | | | | | \ / \ / \ / \ / | | | | V V V V [* * * *] -- to or even: left = xmin + (i - 0.5) * step; right = left + step; x = left + step / 2; with corresponding changes in the following loop. That would give us this picture (with the same 'n'): xmin [*----*----*----*] xmax -- from \ /\ /\ /\ / \ / \ / \ / \ / | | | | V V V V [* * * *] -- to I'm not sure which is the more correct option from the mathematical point of view; but multiple-binning with the last variant of the algorithm we would still give the same result. > + qdist_add(to, x, 0); > + > + /* > + * To avoid double-counting we capture [left, right) ranges, except > for > + * the righmost bin, which captures a [left, right] range. > + */ > + for (j = j_min; j < from->n; j++) { Looks like we don't need to keep both 'j' and 'j_min'. We could just use 'j', initialize it before the outer loop, and do the inner loop with "while". > + struct qdist_entry *o = &from->entries[j]; > + > + /* entries are ordered so do not check beyond right */ > + if (o->x > right) { > + break; > + } > + if (o->x >= left && (o->x < right || > + (i == n - 1 && o->x == right))) { > + qdist_add(to, x, o->count); > + /* don't check this entry again */ > + j_min = j + 1; > + } > + } > + } > +} > + (snip) > +double qdist_avg(const struct qdist *dist) > +{ > + unsigned long count; > + size_t i; > + double ret = 0; > + > + count = qdist_sample_count(dist); > + if (!count) { > + return NAN; > + } > + for (i = 0; i < dist->n; i++) { > + struct qdist_entry *e = &dist->entries[i]; > + > + ret += e->x * e->count / count; Please use Welford’s method or something like that, see http://stackoverflow.com/a/1346890. > + } > + return ret; > +}