On 08/14/2014 04:01 AM, Tom Lane wrote:
I wrote:
That's a fair question.  I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-).  I then loaded up Larry's
example of a noncompressible JSON value, and compared pg_column_size()
which is just about the right thing here since it reports datum size after
compression.  Remembering that the textual representation is 12353 bytes:

json:                           382 bytes
jsonb, using offsets:           12593 bytes
jsonb, using lengths:           406 bytes

Oh, one more result: if I leave the representation alone, but change
the compression parameters to set first_success_by to INT_MAX, this
value takes up 1397 bytes.  So that's better, but still more than a
3X penalty compared to using lengths.  (Admittedly, this test value
probably is an outlier compared to normal practice, since it's a hundred
or so repetitions of the same two strings.)

For comparison, here's a patch that implements the scheme that Alexander Korotkov suggested, where we store an offset every 8th element, and a length in the others. It compresses Larry's example to 525 bytes. Increasing the "stride" from 8 to 16 entries, it compresses to 461 bytes.

A nice thing about this patch is that it's on-disk compatible with the current format, hence initdb is not required.

(The current comments claim that the first element in an array always has the JENTRY_ISFIRST flags set; that is wrong, there is no such flag. I removed the flag in commit d9daff0e, but apparently failed to update the comment and the accompanying JBE_ISFIRST macro. Sorry about that, will fix. This patch uses the bit that used to be JENTRY_ISFIRST to mark entries that store a length instead of an end offset.).

- Heikki

diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..47b2998 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -1378,8 +1378,10 @@ convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level
 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 							JENTRY_POSMASK)));
 
-		if (i > 0)
+		if (i % JBE_STORE_LEN_STRIDE == 0)
 			meta = (meta & ~JENTRY_POSMASK) | totallen;
+		else
+			meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 	}
@@ -1430,11 +1432,14 @@ convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int leve
 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 							JENTRY_POSMASK)));
 
-		if (i > 0)
+		if (i % JBE_STORE_LEN_STRIDE == 0)
 			meta = (meta & ~JENTRY_POSMASK) | totallen;
+		else
+			meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 
+		/* put value */
 		convertJsonbValue(buffer, &meta, &pair->value, level);
 		len = meta & JENTRY_POSMASK;
 		totallen += len;
@@ -1445,7 +1450,7 @@ convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int leve
 					 errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
 							JENTRY_POSMASK)));
 
-		meta = (meta & ~JENTRY_POSMASK) | totallen;
+		meta |= JENTRY_HAS_LEN;
 		copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
 		metaoffset += sizeof(JEntry);
 	}
@@ -1592,3 +1597,39 @@ uniqueifyJsonbObject(JsonbValue *object)
 		object->val.object.nPairs = res + 1 - object->val.object.pairs;
 	}
 }
+
+uint32
+jsonb_get_offset(const JEntry *ja, int index)
+{
+	uint32		off = 0;
+	int			i;
+
+	/*
+	 * Each absolute entry contains the *end* offset. Start offset of this
+	 * entry is equal to the end offset of the previous entry.
+	 */
+	for (i = index - 1; i >= 0; i--)
+	{
+		off += JBE_POSFLD(ja[i]);
+		if (!JBE_HAS_LEN(ja[i]))
+			break;
+	}
+	return off;
+}
+
+uint32
+jsonb_get_length(const JEntry *ja, int index)
+{
+	uint32		off;
+	uint32		len;
+
+	if (JBE_HAS_LEN(ja[index]))
+		len = JBE_POSFLD(ja[index]);
+	else
+	{
+		off = jsonb_get_offset(ja, index);
+		len = JBE_POSFLD(ja[index]) - off;
+	}
+
+	return len;
+}
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 5f2594b..dae6512 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -102,12 +102,12 @@ typedef struct JsonbValue JsonbValue;
  * to JB_FSCALAR | JB_FARRAY.
  *
  * To encode the length and offset of the variable-length portion of each
- * node in a compact way, the JEntry stores only the end offset within the
- * variable-length portion of the container node. For the first JEntry in the
- * container's JEntry array, that equals to the length of the node data. For
- * convenience, the JENTRY_ISFIRST flag is set. The begin offset and length
- * of the rest of the entries can be calculated using the end offset of the
- * previous JEntry in the array.
+ * node in a compact way, the JEntry stores either the length of the element,
+ * or its end offset within the variable-length portion of the container node.
+ * Entries that store a length are marked with the JENTRY_HAS_LEN flag, other
+ * entries store an end offset. The begin offset and length of each entry
+ * can be calculated by scanning backwards to the previous entry storing an
+ * end offset, and adding up the lengths of the elements in between.
  *
  * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
  * the variable-length portion of some node types is aligned to a 4-byte
@@ -121,7 +121,8 @@ typedef struct JsonbValue JsonbValue;
 /*
  * Jentry format.
  *
- * The least significant 28 bits store the end offset of the entry (see
+ * The least significant 28 bits store the end offset or the length of the
+ * entry, depending on whether JENTRY_HAS_LEN flag is set (see
  * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
  * are used to store the type of the entry. The most significant bit
  * is set on the first entry in an array of JEntrys.
@@ -139,8 +140,10 @@ typedef uint32 JEntry;
 #define JENTRY_ISNULL			0x40000000
 #define JENTRY_ISCONTAINER		0x50000000		/* array or object */
 
+#define JENTRY_HAS_LEN			0x80000000
+
 /* Note possible multiple evaluations */
-#define JBE_ISFIRST(je_)		(((je_) & JENTRY_ISFIRST) != 0)
+#define JBE_HAS_LEN(je_)		(((je_) & JENTRY_HAS_LEN) != 0)
 #define JBE_ISSTRING(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING)
 #define JBE_ISNUMERIC(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC)
 #define JBE_ISCONTAINER(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER)
@@ -150,13 +153,20 @@ typedef uint32 JEntry;
 #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
 
 /*
- * Macros for getting the offset and length of an element. Note multiple
- * evaluations and access to prior array element.
+ * Macros for getting the offset and length of an element.
  */
-#define JBE_ENDPOS(je_)			((je_) & JENTRY_POSMASK)
-#define JBE_OFF(ja, i)			((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
-#define JBE_LEN(ja, i)			((i) == 0 ? JBE_ENDPOS((ja)[i]) \
-								 : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
+#define JBE_POSFLD(je_)			((je_) & JENTRY_POSMASK)
+#define JBE_OFF(ja, i)			jsonb_get_offset(ja, i)
+#define JBE_LEN(ja, i)			jsonb_get_length(ja, i)
+
+/*
+ * Store an absolute end offset every JBE_STORE_LEN_STRIDE elements (for an
+ * array) or key/value pairs (for an object). Others are stored as lengths.
+ */
+#define JBE_STORE_LEN_STRIDE	8
+
+extern uint32 jsonb_get_offset(const JEntry *ja, int index);
+extern uint32 jsonb_get_length(const JEntry *ja, int index);
 
 /*
  * A jsonb array or object node, within a Jsonb Datum.
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to