orhankislal commented on a change in pull request #484: MFVSketch: Remove
duplicate results on GPDB
URL: https://github.com/apache/madlib/pull/484#discussion_r387366763
##########
File path: methods/sketch/src/pg_gp/mfvsketch.c
##########
@@ -568,35 +572,77 @@ bytea *mfvsketch_merge_c(bytea *transblob1, bytea
*transblob2)
qsort(transval1->mfvs, transval1->next_mfv, sizeof(offsetcnt),
cnt_cmp_desc);
qsort(transval2->mfvs, transval2->next_mfv, sizeof(offsetcnt),
cnt_cmp_desc);
+ Datum *values_seen = palloc0((2 * newval->max_mfvs) * sizeof(Datum));
+ unsigned int k;
+
/* choose top k from transval1 and transval2 */
for (i = j = cnt = 0;
cnt < newval->max_mfvs
- && (j < transval2->next_mfv || i < transval1->next_mfv);
- cnt++) {
+ && (j < transval2->next_mfv || i < transval1->next_mfv);) {
Datum iDatum, jDatum;
- if (i < transval1->next_mfv &&
+ if (i < transval1->next_mfv &&
(j == transval2->next_mfv
|| transval1->mfvs[i].cnt >= transval2->mfvs[j].cnt)) {
- /* next item comes from transval1 */
- iDatum = PointerExtractDatum(mfv_transval_getval(transblob1, i),
- transval1->typByVal);
- newblob = mfv_transval_append(newblob, iDatum);
- newval = (mfvtransval *)VARDATA(newblob);
- newval->mfvs[cnt].cnt = transval1->mfvs[i].cnt;
- i++;
+ /* next item comes from transval1 */
+ iDatum = PointerExtractDatum(mfv_transval_getval(transblob1, i),
+ transval1->typByVal);
+ for (k = 0; k < cnt; k++) {
+ if (datumIsEqual(iDatum, values_seen[k], transval1->typByVal,
transval1->typLen)) {
+ break;
+ }
+ }
+
+ if (k == cnt) {
+ newblob = mfv_transval_append(newblob, iDatum);
+ newval = (mfvtransval *)VARDATA(newblob);
+ newval->mfvs[cnt].cnt = transval1->mfvs[i].cnt;
+ if (cnt < 2 * transval1->max_mfvs) {
+ values_seen[cnt] = datumCopy(iDatum, transval1->typByVal,
+ transval1->typLen);
+ cnt++;
+ } else {
+ elog(WARNING, "Merge received more than %d values from
transition functions", 2*transval1->max_mfvs);
+ }
+ }
+ i++;
}
else if (j < transval2->next_mfv &&
(i == transval1->next_mfv
|| transval1->mfvs[i].cnt < transval2->mfvs[j].cnt)) {
- /* next item comes from transval2 */
- jDatum = PointerExtractDatum(mfv_transval_getval(transblob2, j),
+ /* next item comes from transval2 */
+ jDatum = PointerExtractDatum(mfv_transval_getval(transblob2, j),
transval2->typByVal);
- newblob = mfv_transval_append(newblob, jDatum);
- newval = (mfvtransval *)VARDATA(newblob);
- newval->mfvs[cnt].cnt = transval2->mfvs[j].cnt;
- j++;
+
+ for (k = 0; k < cnt; k++) {
+ if (datumIsEqual(jDatum, values_seen[k], transval2->typByVal,
+ transval2->typLen)) {
+ break;
+ }
+ }
+
+ if (k == cnt) {
+ newblob = mfv_transval_append(newblob, jDatum);
+ newval = (mfvtransval *)VARDATA(newblob);
+ newval->mfvs[cnt].cnt = transval2->mfvs[j].cnt;
+ if (cnt < 2*transval1->max_mfvs) {
+ values_seen[cnt] = datumCopy(jDatum, transval2->typByVal,
transval2->typLen);
+ cnt++;
+ } else {
+ elog(WARNING, "Merge received more than %d values from
transition functions", 2*transval2->max_mfvs);
+ }
+ }
+ j++;
}
}
+
+ if (!transval1->typByVal){
Review comment:
`transval1->typByVal` is a boolean and set to `true` or `false` during
initialization (`mfv_init_transval`) by a db function (`get_typbyval`). We have
this check here because we want to free the Datums if they were passed by
reference. Pass by value Datums are like primitive types and they are taken
care of once the array is freed at line 645
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services