Tomas Vondra <tomas.von...@2ndquadrant.com> writes:
> Attached is a slightly improved version of the serialization patch.

I reviewed this patch, and tested it on hppa and ppc.  I found one
serious bug: in the deserialization varlena case, you need

-                                       dataptr += MAXALIGN(len);
+                                       dataptr += MAXALIGN(len + VARHDRSZ);

(approx. line 1100 in mcv.c).  Without this, the output data is corrupt,
plus the Assert a few lines further down about dataptr having been
advanced by the correct amount fires.  (On one machine I tested on,
that happened during the core regression tests.  The other machine
got through regression, but trying to do "select * from pg_stats_ext;"
afterwards exhibited the crash.  I didn't investigate closely, but
I suspect the difference has to do with different MAXALIGN values,
4 and 8 respectively.)

The attached patch (a delta atop your v2) corrects that plus some
cosmetic issues.

If we're going to push this, it would be considerably less complicated
to do so before v12 gets branched --- not long after that, there will be
catversion differences to cope with.  I'm planning to make the branch
tomorrow (Monday), probably ~1500 UTC.  Just sayin'.

                        regards, tom lane

diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 256728a..cfe7e54 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -40,13 +40,14 @@
  * stored in a separate array (deduplicated, to minimize the size), and
  * so the serialized items only store uint16 indexes into that array.
  *
- * Each serialized item store (in this order):
+ * Each serialized item stores (in this order):
  *
  * - indexes to values	  (ndim * sizeof(uint16))
  * - null flags			  (ndim * sizeof(bool))
  * - frequency			  (sizeof(double))
  * - base_frequency		  (sizeof(double))
  *
+ * There is no alignment padding within an MCV item.
  * So in total each MCV item requires this many bytes:
  *
  *	 ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
@@ -61,7 +62,7 @@
 	(VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
 
 /*
- * Size of the serialized MCV list, exluding the space needed for
+ * Size of the serialized MCV list, excluding the space needed for
  * deduplicated per-dimension values. The macro is meant to be used
  * when it's not yet safe to access the serialized info about amount
  * of data for each column.
@@ -619,6 +620,7 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
 		else if (info[dim].typlen == -1)	/* varlena */
 		{
 			info[dim].nbytes = 0;
+			info[dim].nbytes_aligned = 0;
 			for (i = 0; i < info[dim].nvalues; i++)
 			{
 				Size		len;
@@ -646,6 +648,7 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
 		else if (info[dim].typlen == -2)	/* cstring */
 		{
 			info[dim].nbytes = 0;
+			info[dim].nbytes_aligned = 0;
 			for (i = 0; i < info[dim].nvalues; i++)
 			{
 				Size		len;
@@ -743,7 +746,7 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
 				 * assumes little endian behavior.  store_att_byval does
 				 * almost what we need, but it requires properly aligned
 				 * buffer - the output buffer does not guarantee that. So we
-				 * simply use a static Datum variable (which guarantees proper
+				 * simply use a local Datum variable (which guarantees proper
 				 * alignment), and then copy the value from it.
 				 */
 				store_att_byval(&tmp, value, info[dim].typlen);
@@ -759,14 +762,14 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
 			}
 			else if (info[dim].typlen == -1)	/* varlena */
 			{
-				uint32		len = VARSIZE_ANY_EXHDR(value);
+				uint32		len = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
 
 				/* copy the length */
 				memcpy(ptr, &len, sizeof(uint32));
 				ptr += sizeof(uint32);
 
 				/* data from the varlena value (without the header) */
-				memcpy(ptr, VARDATA(DatumGetPointer(value)), len);
+				memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len);
 				ptr += len;
 			}
 			else if (info[dim].typlen == -2)	/* cstring */
@@ -1100,7 +1103,7 @@ statext_mcv_deserialize(bytea *data)
 					map[dim][i] = PointerGetDatum(dataptr);
 
 					/* skip to place of the next deserialized value */
-					dataptr += MAXALIGN(len);
+					dataptr += MAXALIGN(len + VARHDRSZ);
 				}
 			}
 			else if (info[dim].typlen == -2)
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 6778746..8fc5419 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -36,7 +36,7 @@ typedef struct DimensionInfo
 {
 	int			nvalues;		/* number of deduplicated values */
 	int			nbytes;			/* number of bytes (serialized) */
-	int			nbytes_aligned;	/* deserialized data with alignment */
+	int			nbytes_aligned;	/* size of deserialized data with alignment */
 	int			typlen;			/* pg_type.typlen */
 	bool		typbyval;		/* pg_type.typbyval */
 } DimensionInfo;

Reply via email to