On 07/06/16 04:05, Emilio G. Cota wrote: > On Sat, May 28, 2016 at 21:15:06 +0300, Sergey Fedorov wrote: >> On 25/05/16 04:13, Emilio G. Cota wrote: >>> diff --git a/util/qdist.c b/util/qdist.c >>> new file mode 100644 >>> index 0000000..3343640 >>> --- /dev/null >>> +++ b/util/qdist.c >>> @@ -0,0 +1,386 @@ >> (snip) >>> + >>> +void qdist_add(struct qdist *dist, double x, long count) >>> +{ >>> + struct qdist_entry *entry = NULL; >>> + >>> + if (dist->entries) { >>> + struct qdist_entry e; >>> + >>> + e.x = x; >>> + entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp); >>> + } >>> + >>> + if (entry) { >>> + entry->count += count; >>> + return; >>> + } >>> + >>> + dist->entries = g_realloc(dist->entries, >>> + sizeof(*dist->entries) * (dist->n + 1)); >> Repeated doubling? > Can you please elaborate?
I mean dynamic array with a growth factor of 2 [https://en.wikipedia.org/wiki/Dynamic_array]. > >>> + dist->n++; >>> + entry = &dist->entries[dist->n - 1]; >> What if we combine the above two lines: >> >> entry = &dist->entries[dist->n++]; >> >> or just reverse them: >> >> entry = &dist->entries[dist->n]; >> dist->n++; > I have less trouble understanding the original. Okay. > >>> + entry->x = x; >>> + entry->count = count; >>> + qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp); >>> +} >>> + >> (snip) >>> +static char *qdist_pr_internal(const struct qdist *dist) >>> +{ >>> + double min, max, step; >>> + GString *s = g_string_new(""); >>> + size_t i; >>> + >>> + /* if only one entry, its printout will be either full or empty */ >>> + if (dist->n == 1) { >>> + if (dist->entries[0].count) { >>> + g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - >>> 1]); >>> + } else { >>> + g_string_append_c(s, ' '); >>> + } >>> + goto out; >>> + } >>> + >>> + /* get min and max counts */ >>> + min = dist->entries[0].count; >>> + max = min; >>> + for (i = 0; i < dist->n; i++) { >>> + struct qdist_entry *e = &dist->entries[i]; >>> + >>> + if (e->count < min) { >>> + min = e->count; >>> + } >>> + if (e->count > max) { >>> + max = e->count; >>> + } >>> + } >>> + >>> + /* floor((count - min) * step) will give us the block index */ >>> + step = (QDIST_NR_BLOCK_CODES - 1) / (max - min); >>> + >>> + for (i = 0; i < dist->n; i++) { >>> + struct qdist_entry *e = &dist->entries[i]; >>> + int index; >>> + >>> + /* make an exception with 0; instead of using block[0], print a >>> space */ >>> + if (e->count) { >>> + index = (int)((e->count - min) * step); >> So "e->count == min" gives us one eighth block instead of just space? > Yes, only 0 can print a space. So our scale is not linear. I think some users might get confused by this. > >>> + g_string_append_unichar(s, qdist_blocks[index]); >>> + } else { >>> + g_string_append_c(s, ' '); >>> + } >>> + } >>> + out: >>> + return g_string_free(s, FALSE); >>> +} >>> + >>> +/* >>> + * Bin the distribution in @from into @n bins of consecutive, >>> non-overlapping >>> + * intervals, copying the result to @to. >>> + * >>> + * This function is internal to qdist: only this file and test code should >>> + * ever call it. >>> + * >>> + * Note: calling this function on an already-binned qdist is a bug. >>> + * >>> + * If @n == 0 or @from->n == 1, use @from->n. >>> + */ >>> +void qdist_bin__internal(struct qdist *to, const struct qdist *from, >>> size_t n) >>> +{ >>> + double xmin, xmax; >>> + double step; >>> + size_t i, j, j_min; >>> + >>> + qdist_init(to); >>> + >>> + if (!from->entries) { >>> + return; >>> + } >>> + if (!n || from->n == 1) { >>> + n = from->n; >>> + } >>> + >>> + /* set equally-sized bins between @from's left and right */ >>> + xmin = qdist_xmin(from); >>> + xmax = qdist_xmax(from); >>> + step = (xmax - xmin) / n; >>> + >>> + if (n == from->n) { >>> + /* if @from's entries are equally spaced, no need to re-bin */ >>> + for (i = 0; i < from->n; i++) { >>> + if (from->entries[i].x != xmin + i * step) { >>> + goto rebin; >> static inline function instead of goto? > It would have quite a few arguments, I think the goto is fine. Actually, it would be 'xmin', 'xmax', and 'step' in addition to 'to', 'from', and 'n'. But yes, probably goto is fine here. > >>> + } >>> + } >>> + /* they're equally spaced, so copy the dist and bail out */ >>> + to->entries = g_malloc(sizeof(*to->entries) * from->n); >> g_new()? > Changed. > >>> + to->n = from->n; >>> + memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); >>> + return; >>> + } >>> + >>> + rebin: By the way, here's a space before the 'rebin' label. >>> + j_min = 0; >>> + for (i = 0; i < n; i++) { >>> + double x; >>> + double left, right; >>> + >>> + left = xmin + i * step; >>> + right = xmin + (i + 1) * step; >>> + >>> + /* Add x, even if it might not get any counts later */ >>> + x = left; >> This way we round down to the left margin of each bin like this: >> >> xmin [*---*---*---*---*] xmax -- from >> | /| /| /| / >> | / | / | / | / >> |/ |/ |/ |/ >> | | | | >> V V V V >> [* * * *] -- to > (snip) >> xmin [*----*----*----*] xmax -- from >> \ /\ /\ /\ / >> \ / \ / \ / \ / >> | | | | >> V V V V >> [* * * *] -- to >> >> I'm not sure which is the more correct option from the mathematical >> point of view; but multiple-binning with the last variant of the >> algorithm we would still give the same result. > There's no "right" or "wrong" way as long as we're consistent > and we print the right counts in the right bins. I think the > convention I chose is simple enough, and leads to simple printing > of the labels. But yes other alternatives would be OK here. Well, if we go ahead with my last suggestion the code would look like this: rebin: /* We do the binning using the following scheme: * * xmin [*----*----*----*] xmax -- from * \ /\ /\ /\ / * \ / \ / \ / \ / * | | | | * V V V V * [* * * *] -- to * */ step = (xmax - xmin) / (n - 1); j = 0; for (i = 0; i < n; i++) { double x; double right; x = xmin + i * step; right = x + 0.5 * step; /* Add x, even if it might not get any counts later */ qdist_add(to, x, 0); /* To avoid double-counting we capture [left, right) ranges */ while (from->entries[j].x < right && j < from->n) { qdist_add(to, x, from->entries[j].count); j++; } } assert(j == from->n); } Actually it's simpler than current version. Kind regards, Sergey