On 08/14/2014 04:01 AM, Tom Lane wrote:
I wrote:
That's a fair question. I did a very very simple hack to replace the item
offsets with item lengths -- turns out that that mostly requires removing
some code that changes lengths to offsets ;-). I then loaded up Larry's
example of a noncompressible JSON value, and compared pg_column_size()
which is just about the right thing here since it reports datum size after
compression. Remembering that the textual representation is 12353 bytes:
json: 382 bytes
jsonb, using offsets: 12593 bytes
jsonb, using lengths: 406 bytes
Oh, one more result: if I leave the representation alone, but change
the compression parameters to set first_success_by to INT_MAX, this
value takes up 1397 bytes. So that's better, but still more than a
3X penalty compared to using lengths. (Admittedly, this test value
probably is an outlier compared to normal practice, since it's a hundred
or so repetitions of the same two strings.)
For comparison, here's a patch that implements the scheme that Alexander
Korotkov suggested, where we store an offset every 8th element, and a
length in the others. It compresses Larry's example to 525 bytes.
Increasing the "stride" from 8 to 16 entries, it compresses to 461 bytes.
A nice thing about this patch is that it's on-disk compatible with the
current format, hence initdb is not required.
(The current comments claim that the first element in an array always
has the JENTRY_ISFIRST flags set; that is wrong, there is no such flag.
I removed the flag in commit d9daff0e, but apparently failed to update
the comment and the accompanying JBE_ISFIRST macro. Sorry about that,
will fix. This patch uses the bit that used to be JENTRY_ISFIRST to mark
entries that store a length instead of an end offset.).
- Heikki
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 04f35bf..47b2998 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -1378,8 +1378,10 @@ convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level
errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
JENTRY_POSMASK)));
- if (i > 0)
+ if (i % JBE_STORE_LEN_STRIDE == 0)
meta = (meta & ~JENTRY_POSMASK) | totallen;
+ else
+ meta |= JENTRY_HAS_LEN;
copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
metaoffset += sizeof(JEntry);
}
@@ -1430,11 +1432,14 @@ convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int leve
errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
JENTRY_POSMASK)));
- if (i > 0)
+ if (i % JBE_STORE_LEN_STRIDE == 0)
meta = (meta & ~JENTRY_POSMASK) | totallen;
+ else
+ meta |= JENTRY_HAS_LEN;
copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
metaoffset += sizeof(JEntry);
+ /* put value */
convertJsonbValue(buffer, &meta, &pair->value, level);
len = meta & JENTRY_POSMASK;
totallen += len;
@@ -1445,7 +1450,7 @@ convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int leve
errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
JENTRY_POSMASK)));
- meta = (meta & ~JENTRY_POSMASK) | totallen;
+ meta |= JENTRY_HAS_LEN;
copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
metaoffset += sizeof(JEntry);
}
@@ -1592,3 +1597,39 @@ uniqueifyJsonbObject(JsonbValue *object)
object->val.object.nPairs = res + 1 - object->val.object.pairs;
}
}
+
+uint32
+jsonb_get_offset(const JEntry *ja, int index)
+{
+ uint32 off = 0;
+ int i;
+
+ /*
+ * Each absolute entry contains the *end* offset. Start offset of this
+ * entry is equal to the end offset of the previous entry.
+ */
+ for (i = index - 1; i >= 0; i--)
+ {
+ off += JBE_POSFLD(ja[i]);
+ if (!JBE_HAS_LEN(ja[i]))
+ break;
+ }
+ return off;
+}
+
+uint32
+jsonb_get_length(const JEntry *ja, int index)
+{
+ uint32 off;
+ uint32 len;
+
+ if (JBE_HAS_LEN(ja[index]))
+ len = JBE_POSFLD(ja[index]);
+ else
+ {
+ off = jsonb_get_offset(ja, index);
+ len = JBE_POSFLD(ja[index]) - off;
+ }
+
+ return len;
+}
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 5f2594b..dae6512 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -102,12 +102,12 @@ typedef struct JsonbValue JsonbValue;
* to JB_FSCALAR | JB_FARRAY.
*
* To encode the length and offset of the variable-length portion of each
- * node in a compact way, the JEntry stores only the end offset within the
- * variable-length portion of the container node. For the first JEntry in the
- * container's JEntry array, that equals to the length of the node data. For
- * convenience, the JENTRY_ISFIRST flag is set. The begin offset and length
- * of the rest of the entries can be calculated using the end offset of the
- * previous JEntry in the array.
+ * node in a compact way, the JEntry stores either the length of the element,
+ * or its end offset within the variable-length portion of the container node.
+ * Entries that store a length are marked with the JENTRY_HAS_LEN flag, other
+ * entries store an end offset. The begin offset and length of each entry
+ * can be calculated by scanning backwards to the previous entry storing an
+ * end offset, and adding up the lengths of the elements in between.
*
* Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
* the variable-length portion of some node types is aligned to a 4-byte
@@ -121,7 +121,8 @@ typedef struct JsonbValue JsonbValue;
/*
* Jentry format.
*
- * The least significant 28 bits store the end offset of the entry (see
+ * The least significant 28 bits store the end offset or the length of the
+ * entry, depending on whether JENTRY_HAS_LEN flag is set (see
* JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits
* are used to store the type of the entry. The most significant bit
* is set on the first entry in an array of JEntrys.
@@ -139,8 +140,10 @@ typedef uint32 JEntry;
#define JENTRY_ISNULL 0x40000000
#define JENTRY_ISCONTAINER 0x50000000 /* array or object */
+#define JENTRY_HAS_LEN 0x80000000
+
/* Note possible multiple evaluations */
-#define JBE_ISFIRST(je_) (((je_) & JENTRY_ISFIRST) != 0)
+#define JBE_HAS_LEN(je_) (((je_) & JENTRY_HAS_LEN) != 0)
#define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING)
#define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC)
#define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER)
@@ -150,13 +153,20 @@ typedef uint32 JEntry;
#define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
/*
- * Macros for getting the offset and length of an element. Note multiple
- * evaluations and access to prior array element.
+ * Macros for getting the offset and length of an element.
*/
-#define JBE_ENDPOS(je_) ((je_) & JENTRY_POSMASK)
-#define JBE_OFF(ja, i) ((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
-#define JBE_LEN(ja, i) ((i) == 0 ? JBE_ENDPOS((ja)[i]) \
- : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
+#define JBE_POSFLD(je_) ((je_) & JENTRY_POSMASK)
+#define JBE_OFF(ja, i) jsonb_get_offset(ja, i)
+#define JBE_LEN(ja, i) jsonb_get_length(ja, i)
+
+/*
+ * Store an absolute end offset every JBE_STORE_LEN_STRIDE elements (for an
+ * array) or key/value pairs (for an object). Others are stored as lengths.
+ */
+#define JBE_STORE_LEN_STRIDE 8
+
+extern uint32 jsonb_get_offset(const JEntry *ja, int index);
+extern uint32 jsonb_get_length(const JEntry *ja, int index);
/*
* A jsonb array or object node, within a Jsonb Datum.
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers