I wrote:
> I think the idea of hashing only keys/values that are "too long" is a
> reasonable compromise.  I've not finished coding it (because I keep
> getting distracted by other problems in the code :-() but it does not
> look to be very difficult.  I'm envisioning the cutoff as being something
> like 128 bytes; in practice that would mean that few if any keys get
> hashed, I think.

Attached is a draft patch for this.  In addition to the hash logic per se,
I made these changes:

* Replaced the K/V prefix bytes with a code that distinguishes the types
of JSON values.  While this is not of any huge significance for the
current index search operators, it's basically free to store the info,
so I think we should do it for possible future use.

* Fixed the problem with "exists" returning rows it shouldn't.  I
concluded that the best fix is just to force recheck for exists, which
allows considerable simplification in the consistent functions.

* Tried to improve the comments in jsonb_gin.c.

Barring objections I'll commit this tomorrow, and also try to improve the
user-facing documentation about the jsonb opclasses.

                        regards, tom lane

diff --git a/src/backend/utils/adt/jsonb_gin.c b/src/backend/utils/adt/jsonb_gin.c
index 592036a..2c4ade2 100644
*** a/src/backend/utils/adt/jsonb_gin.c
--- b/src/backend/utils/adt/jsonb_gin.c
***************
*** 14,19 ****
--- 14,20 ----
  #include "postgres.h"
  
  #include "access/gin.h"
+ #include "access/hash.h"
  #include "access/skey.h"
  #include "catalog/pg_collation.h"
  #include "catalog/pg_type.h"
*************** typedef struct PathHashStack
*** 26,39 ****
  	struct PathHashStack *parent;
  } PathHashStack;
  
! static text *make_text_key(const char *str, int len, char flag);
! static text *make_scalar_key(const JsonbValue *scalarVal, char flag);
  
  /*
   *
   * jsonb_ops GIN opclass support functions
   *
   */
  Datum
  gin_compare_jsonb(PG_FUNCTION_ARGS)
  {
--- 27,41 ----
  	struct PathHashStack *parent;
  } PathHashStack;
  
! static Datum make_text_key(char flag, const char *str, int len);
! static Datum make_scalar_key(const JsonbValue *scalarVal, bool is_key);
  
  /*
   *
   * jsonb_ops GIN opclass support functions
   *
   */
+ 
  Datum
  gin_compare_jsonb(PG_FUNCTION_ARGS)
  {
*************** gin_extract_jsonb(PG_FUNCTION_ARGS)
*** 65,144 ****
  {
  	Jsonb	   *jb = (Jsonb *) PG_GETARG_JSONB(0);
  	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
- 	Datum	   *entries = NULL;
  	int			total = 2 * JB_ROOT_COUNT(jb);
- 	int			i = 0,
- 				r;
  	JsonbIterator *it;
  	JsonbValue	v;
  
  	if (total == 0)
  	{
  		*nentries = 0;
  		PG_RETURN_POINTER(NULL);
  	}
  
  	entries = (Datum *) palloc(sizeof(Datum) * total);
  
  	it = JsonbIteratorInit(&jb->root);
  
  	while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
  	{
  		if (i >= total)
  		{
  			total *= 2;
  			entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
  		}
  
- 		/*
- 		 * Serialize keys and elements equivalently,  but only when elements
- 		 * are Jsonb strings.  Otherwise, serialize elements as values.  Array
- 		 * elements are indexed as keys, for the benefit of
- 		 * JsonbExistsStrategyNumber.  Our definition of existence does not
- 		 * allow for checking the existence of a non-jbvString element (just
- 		 * like the definition of the underlying operator), because the
- 		 * operator takes a text rhs argument (which is taken as a proxy for
- 		 * an equivalent Jsonb string).
- 		 *
- 		 * The way existence is represented does not preclude an alternative
- 		 * existence operator, that takes as its rhs value an arbitrarily
- 		 * internally-typed Jsonb.  The only reason that isn't the case here
- 		 * is that the existence operator is only really intended to determine
- 		 * if an object has a certain key (object pair keys are of course
- 		 * invariably strings), which is extended to jsonb arrays.  You could
- 		 * think of the default Jsonb definition of existence as being
- 		 * equivalent to a definition where all types of scalar array elements
- 		 * are keys that we can check the existence of, while just forbidding
- 		 * non-string notation.  This inflexibility prevents the user from
- 		 * having to qualify that the rhs string is a raw scalar string (that
- 		 * is, naturally no internal string quoting in required for the text
- 		 * argument), and allows us to not set the reset flag for
- 		 * JsonbExistsStrategyNumber, since we know that keys are strings for
- 		 * both objects and arrays, and don't have to further account for type
- 		 * mismatch.  Not having to set the reset flag makes it less than
- 		 * tempting to tighten up the definition of existence to preclude
- 		 * array elements entirely, which would arguably be a simpler
- 		 * alternative. In any case the infrastructure used to implement the
- 		 * existence operator could trivially support this hypothetical,
- 		 * slightly distinct definition of existence.
- 		 */
  		switch (r)
  		{
  			case WJB_KEY:
! 				/* Serialize key separately, for existence strategies */
! 				entries[i++] = PointerGetDatum(make_scalar_key(&v, JKEYELEM));
  				break;
  			case WJB_ELEM:
! 				if (v.type == jbvString)
! 					entries[i++] = PointerGetDatum(make_scalar_key(&v, JKEYELEM));
! 				else
! 					entries[i++] = PointerGetDatum(make_scalar_key(&v, JVAL));
  				break;
  			case WJB_VALUE:
! 				entries[i++] = PointerGetDatum(make_scalar_key(&v, JVAL));
  				break;
  			default:
! 				continue;
  		}
  	}
  
--- 67,115 ----
  {
  	Jsonb	   *jb = (Jsonb *) PG_GETARG_JSONB(0);
  	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
  	int			total = 2 * JB_ROOT_COUNT(jb);
  	JsonbIterator *it;
  	JsonbValue	v;
+ 	int			i = 0,
+ 				r;
+ 	Datum	   *entries;
  
+ 	/* If the root level is empty, we certainly have no keys */
  	if (total == 0)
  	{
  		*nentries = 0;
  		PG_RETURN_POINTER(NULL);
  	}
  
+ 	/* Otherwise, use 2 * root count as initial estimate of result size */
  	entries = (Datum *) palloc(sizeof(Datum) * total);
  
  	it = JsonbIteratorInit(&jb->root);
  
  	while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
  	{
+ 		/* Since we recurse into the object, we might need more space */
  		if (i >= total)
  		{
  			total *= 2;
  			entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
  		}
  
  		switch (r)
  		{
  			case WJB_KEY:
! 				entries[i++] = make_scalar_key(&v, true);
  				break;
  			case WJB_ELEM:
! 				/* Pretend string array elements are keys, see jsonb.h */
! 				entries[i++] = make_scalar_key(&v, (v.type == jbvString));
  				break;
  			case WJB_VALUE:
! 				entries[i++] = make_scalar_key(&v, false);
  				break;
  			default:
! 				/* we can ignore structural items */
! 				break;
  		}
  	}
  
*************** gin_extract_jsonb_query(PG_FUNCTION_ARGS
*** 168,192 ****
  	}
  	else if (strategy == JsonbExistsStrategyNumber)
  	{
  		text	   *query = PG_GETARG_TEXT_PP(0);
- 		text	   *item;
  
  		*nentries = 1;
  		entries = (Datum *) palloc(sizeof(Datum));
! 		item = make_text_key(VARDATA_ANY(query), VARSIZE_ANY_EXHDR(query),
! 							 JKEYELEM);
! 		entries[0] = PointerGetDatum(item);
  	}
  	else if (strategy == JsonbExistsAnyStrategyNumber ||
  			 strategy == JsonbExistsAllStrategyNumber)
  	{
  		ArrayType  *query = PG_GETARG_ARRAYTYPE_P(0);
  		Datum	   *key_datums;
  		bool	   *key_nulls;
  		int			key_count;
  		int			i,
  					j;
- 		text	   *item;
  
  		deconstruct_array(query,
  						  TEXTOID, -1, false, 'i',
--- 139,163 ----
  	}
  	else if (strategy == JsonbExistsStrategyNumber)
  	{
+ 		/* Query is a text string, which we treat as a key */
  		text	   *query = PG_GETARG_TEXT_PP(0);
  
  		*nentries = 1;
  		entries = (Datum *) palloc(sizeof(Datum));
! 		entries[0] = make_text_key(JGINFLAG_KEY,
! 								   VARDATA_ANY(query),
! 								   VARSIZE_ANY_EXHDR(query));
  	}
  	else if (strategy == JsonbExistsAnyStrategyNumber ||
  			 strategy == JsonbExistsAllStrategyNumber)
  	{
+ 		/* Query is a text array; each element is treated as a key */
  		ArrayType  *query = PG_GETARG_ARRAYTYPE_P(0);
  		Datum	   *key_datums;
  		bool	   *key_nulls;
  		int			key_count;
  		int			i,
  					j;
  
  		deconstruct_array(query,
  						  TEXTOID, -1, false, 'i',
*************** gin_extract_jsonb_query(PG_FUNCTION_ARGS
*** 194,208 ****
  
  		entries = (Datum *) palloc(sizeof(Datum) * key_count);
  
! 		for (i = 0, j = 0; i < key_count; ++i)
  		{
  			/* Nulls in the array are ignored */
  			if (key_nulls[i])
  				continue;
! 			item = make_text_key(VARDATA(key_datums[i]),
! 								 VARSIZE(key_datums[i]) - VARHDRSZ,
! 								 JKEYELEM);
! 			entries[j++] = PointerGetDatum(item);
  		}
  
  		*nentries = j;
--- 165,178 ----
  
  		entries = (Datum *) palloc(sizeof(Datum) * key_count);
  
! 		for (i = 0, j = 0; i < key_count; i++)
  		{
  			/* Nulls in the array are ignored */
  			if (key_nulls[i])
  				continue;
! 			entries[j++] = make_text_key(JGINFLAG_KEY,
! 										 VARDATA_ANY(key_datums[i]),
! 										 VARSIZE_ANY_EXHDR(key_datums[i]));
  		}
  
  		*nentries = j;
*************** gin_consistent_jsonb(PG_FUNCTION_ARGS)
*** 236,248 ****
  	if (strategy == JsonbContainsStrategyNumber)
  	{
  		/*
! 		 * Index doesn't have information about correspondence of Jsonb keys
! 		 * and values (as distinct from GIN keys, which a key/value pair is
! 		 * stored as), so invariably we recheck.  Besides, there are some
! 		 * special rules around the containment of raw scalar arrays and
! 		 * regular arrays that are not represented here.  However, if all of
! 		 * the keys are not present, that's sufficient reason to return false
! 		 * and finish immediately.
  		 */
  		*recheck = true;
  		for (i = 0; i < nkeys; i++)
--- 206,217 ----
  	if (strategy == JsonbContainsStrategyNumber)
  	{
  		/*
! 		 * We must always recheck, since we can't tell from the index whether
! 		 * the positions of the matched items match the structure of the query
! 		 * object.  (Even if we could, we'd also have to worry about hashed
! 		 * keys and the index's failure to distinguish keys from string array
! 		 * elements.)  However, the tuple certainly doesn't match unless it
! 		 * contains all the query keys.
  		 */
  		*recheck = true;
  		for (i = 0; i < nkeys; i++)
*************** gin_consistent_jsonb(PG_FUNCTION_ARGS)
*** 256,275 ****
  	}
  	else if (strategy == JsonbExistsStrategyNumber)
  	{
! 		/* Existence of key guaranteed in default search mode */
! 		*recheck = false;
  		res = true;
  	}
  	else if (strategy == JsonbExistsAnyStrategyNumber)
  	{
! 		/* Existence of key guaranteed in default search mode */
! 		*recheck = false;
  		res = true;
  	}
  	else if (strategy == JsonbExistsAllStrategyNumber)
  	{
! 		/* Testing for the presence of all keys gives an exact result */
! 		*recheck = false;
  		for (i = 0; i < nkeys; i++)
  		{
  			if (!check[i])
--- 225,251 ----
  	}
  	else if (strategy == JsonbExistsStrategyNumber)
  	{
! 		/*
! 		 * Although the key is certainly present in the index, we must recheck
! 		 * because (1) the key might be hashed, and (2) the index match might
! 		 * be for a key that's not at top level of the JSON object.  For (1),
! 		 * we could look at the query key to see if it's hashed and not
! 		 * recheck if not, but the index lacks enough info to tell about (2).
! 		 */
! 		*recheck = true;
  		res = true;
  	}
  	else if (strategy == JsonbExistsAnyStrategyNumber)
  	{
! 		/* As for plain exists, we must recheck */
! 		*recheck = true;
  		res = true;
  	}
  	else if (strategy == JsonbExistsAllStrategyNumber)
  	{
! 		/* As for plain exists, we must recheck */
! 		*recheck = true;
! 		/* ... but unless all the keys are present, we can say "false" */
  		for (i = 0; i < nkeys; i++)
  		{
  			if (!check[i])
*************** gin_triconsistent_jsonb(PG_FUNCTION_ARGS
*** 295,313 ****
  	int32		nkeys = PG_GETARG_INT32(3);
  
  	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
! 	GinTernaryValue res = GIN_TRUE;
! 
  	int32		i;
  
! 	if (strategy == JsonbContainsStrategyNumber)
  	{
! 		bool		has_maybe = false;
! 
! 		/*
! 		 * All extracted keys must be present.  Combination of GIN_MAYBE and
! 		 * GIN_TRUE gives GIN_MAYBE result because then all keys may be
! 		 * present.
! 		 */
  		for (i = 0; i < nkeys; i++)
  		{
  			if (check[i] == GIN_FALSE)
--- 271,288 ----
  	int32		nkeys = PG_GETARG_INT32(3);
  
  	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
! 	GinTernaryValue res = GIN_MAYBE;
  	int32		i;
  
! 	/*
! 	 * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
! 	 * corresponds to always forcing recheck in the regular consistent
! 	 * function, for the reasons listed there.
! 	 */
! 	if (strategy == JsonbContainsStrategyNumber ||
! 		strategy == JsonbExistsAllStrategyNumber)
  	{
! 		/* All extracted keys must be present */
  		for (i = 0; i < nkeys; i++)
  		{
  			if (check[i] == GIN_FALSE)
*************** gin_triconsistent_jsonb(PG_FUNCTION_ARGS
*** 315,369 ****
  				res = GIN_FALSE;
  				break;
  			}
- 			if (check[i] == GIN_MAYBE)
- 			{
- 				res = GIN_MAYBE;
- 				has_maybe = true;
- 			}
  		}
- 
- 		/*
- 		 * Index doesn't have information about correspondence of Jsonb keys
- 		 * and values (as distinct from GIN keys, which a key/value pair is
- 		 * stored as), so invariably we recheck.  This is also reflected in
- 		 * how GIN_MAYBE is given in response to there being no GIN_MAYBE
- 		 * input.
- 		 */
- 		if (!has_maybe && res == GIN_TRUE)
- 			res = GIN_MAYBE;
  	}
  	else if (strategy == JsonbExistsStrategyNumber ||
  			 strategy == JsonbExistsAnyStrategyNumber)
  	{
! 		/* Existence of key guaranteed in default search mode */
  		res = GIN_FALSE;
  		for (i = 0; i < nkeys; i++)
  		{
! 			if (check[i] == GIN_TRUE)
! 			{
! 				res = GIN_TRUE;
! 				break;
! 			}
! 			if (check[i] == GIN_MAYBE)
  			{
  				res = GIN_MAYBE;
- 			}
- 		}
- 	}
- 	else if (strategy == JsonbExistsAllStrategyNumber)
- 	{
- 		/* Testing for the presence of all keys gives an exact result */
- 		for (i = 0; i < nkeys; i++)
- 		{
- 			if (check[i] == GIN_FALSE)
- 			{
- 				res = GIN_FALSE;
  				break;
  			}
- 			if (check[i] == GIN_MAYBE)
- 			{
- 				res = GIN_MAYBE;
- 			}
  		}
  	}
  	else
--- 290,310 ----
  				res = GIN_FALSE;
  				break;
  			}
  		}
  	}
  	else if (strategy == JsonbExistsStrategyNumber ||
  			 strategy == JsonbExistsAnyStrategyNumber)
  	{
! 		/* At least one extracted key must be present */
  		res = GIN_FALSE;
  		for (i = 0; i < nkeys; i++)
  		{
! 			if (check[i] == GIN_TRUE ||
! 				check[i] == GIN_MAYBE)
  			{
  				res = GIN_MAYBE;
  				break;
  			}
  		}
  	}
  	else
*************** gin_triconsistent_jsonb(PG_FUNCTION_ARGS
*** 376,382 ****
--- 317,330 ----
   *
   * jsonb_hash_ops GIN opclass support functions
   *
+  * In a jsonb_hash_ops index, the keys are uint32 hashes, one per value; but
+  * the key(s) leading to each value are also included in its hash computation.
+  * This means we can only support containment queries, but the index can
+  * distinguish, for example, {"foo": 42} from {"bar": 42} since different
+  * hashes will be generated.
+  *
   */
+ 
  Datum
  gin_consistent_jsonb_hash(PG_FUNCTION_ARGS)
  {
*************** gin_consistent_jsonb_hash(PG_FUNCTION_AR
*** 395,407 ****
  		elog(ERROR, "unrecognized strategy number: %d", strategy);
  
  	/*
! 	 * jsonb_hash_ops index doesn't have information about correspondence of
! 	 * Jsonb keys and values (as distinct from GIN keys, which a key/value
! 	 * pair is stored as), so invariably we recheck.  Besides, there are some
  	 * special rules around the containment of raw scalar arrays and regular
! 	 * arrays that are not represented here.  However, if all of the keys are
! 	 * not present, that's sufficient reason to return false and finish
! 	 * immediately.
  	 */
  	*recheck = true;
  	for (i = 0; i < nkeys; i++)
--- 343,355 ----
  		elog(ERROR, "unrecognized strategy number: %d", strategy);
  
  	/*
! 	 * jsonb_hash_ops is necessarily lossy, not only because of hash
! 	 * collisions but also because it doesn't preserve complete information
! 	 * about the structure of the JSON object.  Besides, there are some
  	 * special rules around the containment of raw scalar arrays and regular
! 	 * arrays that are not handled here.  So we must always recheck a match.
! 	 * However, if not all of the keys are present, the tuple certainly
! 	 * doesn't match.
  	 */
  	*recheck = true;
  	for (i = 0; i < nkeys; i++)
*************** gin_triconsistent_jsonb_hash(PG_FUNCTION
*** 426,442 ****
  	int32		nkeys = PG_GETARG_INT32(3);
  
  	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
! 	GinTernaryValue res = GIN_TRUE;
  	int32		i;
- 	bool		has_maybe = false;
  
  	if (strategy != JsonbContainsStrategyNumber)
  		elog(ERROR, "unrecognized strategy number: %d", strategy);
  
  	/*
! 	 * All extracted keys must be present.  A combination of GIN_MAYBE and
! 	 * GIN_TRUE induces a GIN_MAYBE result, because then all keys may be
! 	 * present.
  	 */
  	for (i = 0; i < nkeys; i++)
  	{
--- 374,389 ----
  	int32		nkeys = PG_GETARG_INT32(3);
  
  	/* Pointer	   *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
! 	GinTernaryValue res = GIN_MAYBE;
  	int32		i;
  
  	if (strategy != JsonbContainsStrategyNumber)
  		elog(ERROR, "unrecognized strategy number: %d", strategy);
  
  	/*
! 	 * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
! 	 * corresponds to always forcing recheck in the regular consistent
! 	 * function, for the reasons listed there.
  	 */
  	for (i = 0; i < nkeys; i++)
  	{
*************** gin_triconsistent_jsonb_hash(PG_FUNCTION
*** 445,467 ****
  			res = GIN_FALSE;
  			break;
  		}
- 		if (check[i] == GIN_MAYBE)
- 		{
- 			res = GIN_MAYBE;
- 			has_maybe = true;
- 		}
  	}
  
- 	/*
- 	 * jsonb_hash_ops index doesn't have information about correspondence of
- 	 * Jsonb keys and values (as distinct from GIN keys, which for this
- 	 * opclass are a hash of a pair, or a hash of just an element), so
- 	 * invariably we recheck.  This is also reflected in how GIN_MAYBE is
- 	 * given in response to there being no GIN_MAYBE input.
- 	 */
- 	if (!has_maybe && res == GIN_TRUE)
- 		res = GIN_MAYBE;
- 
  	PG_RETURN_GIN_TERNARY_VALUE(res);
  }
  
--- 392,399 ----
*************** gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
*** 477,502 ****
  	PathHashStack *stack;
  	int			i = 0,
  				r;
! 	Datum	   *entries = NULL;
  
  	if (total == 0)
  	{
  		*nentries = 0;
  		PG_RETURN_POINTER(NULL);
  	}
  
  	entries = (Datum *) palloc(sizeof(Datum) * total);
  
! 	it = JsonbIteratorInit(&jb->root);
! 
  	tail.parent = NULL;
  	tail.hash = 0;
  	stack = &tail;
  
  	while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
  	{
! 		PathHashStack *tmp;
  
  		if (i >= total)
  		{
  			total *= 2;
--- 409,438 ----
  	PathHashStack *stack;
  	int			i = 0,
  				r;
! 	Datum	   *entries;
  
+ 	/* If the root level is empty, we certainly have no keys */
  	if (total == 0)
  	{
  		*nentries = 0;
  		PG_RETURN_POINTER(NULL);
  	}
  
+ 	/* Otherwise, use 2 * root count as initial estimate of result size */
  	entries = (Datum *) palloc(sizeof(Datum) * total);
  
! 	/* We keep a stack of hashes corresponding to parent key levels */
  	tail.parent = NULL;
  	tail.hash = 0;
  	stack = &tail;
  
+ 	it = JsonbIteratorInit(&jb->root);
+ 
  	while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
  	{
! 		PathHashStack *parent;
  
+ 		/* Since we recurse into the object, we might need more space */
  		if (i >= total)
  		{
  			total *= 2;
*************** gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
*** 507,521 ****
  		{
  			case WJB_BEGIN_ARRAY:
  			case WJB_BEGIN_OBJECT:
! 				tmp = stack;
  				stack = (PathHashStack *) palloc(sizeof(PathHashStack));
  
! 				/*
! 				 * Nesting an array within another array will not alter
! 				 * innermost scalar element hash values, but that seems
! 				 * inconsequential
! 				 */
! 				if (tmp->parent)
  				{
  					/*
  					 * We pass forward hashes from previous container nesting
--- 443,453 ----
  		{
  			case WJB_BEGIN_ARRAY:
  			case WJB_BEGIN_OBJECT:
! 				/* Push a stack level for this object */
! 				parent = stack;
  				stack = (PathHashStack *) palloc(sizeof(PathHashStack));
  
! 				if (parent->parent)
  				{
  					/*
  					 * We pass forward hashes from previous container nesting
*************** gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
*** 524,561 ****
  					 * outermost key.  It's also somewhat useful to have
  					 * nested objects innermost values have hashes that are a
  					 * function of not just their own key, but outer keys too.
  					 */
! 					stack->hash = tmp->hash;
  				}
  				else
  				{
  					/*
! 					 * At least nested level, initialize with stable container
! 					 * type proxy value
  					 */
  					stack->hash = (r == WJB_BEGIN_ARRAY) ? JB_FARRAY : JB_FOBJECT;
  				}
! 				stack->parent = tmp;
  				break;
  			case WJB_KEY:
! 				/* Initialize hash from parent */
  				stack->hash = stack->parent->hash;
  				JsonbHashScalarValue(&v, &stack->hash);
  				break;
  			case WJB_ELEM:
! 				/* Elements have parent hash mixed in separately */
  				stack->hash = stack->parent->hash;
  			case WJB_VALUE:
! 				/* Element/value case */
  				JsonbHashScalarValue(&v, &stack->hash);
  				entries[i++] = UInt32GetDatum(stack->hash);
  				break;
  			case WJB_END_ARRAY:
  			case WJB_END_OBJECT:
  				/* Pop the stack */
! 				tmp = stack->parent;
  				pfree(stack);
! 				stack = tmp;
  				break;
  			default:
  				elog(ERROR, "invalid JsonbIteratorNext rc: %d", r);
--- 456,504 ----
  					 * outermost key.  It's also somewhat useful to have
  					 * nested objects innermost values have hashes that are a
  					 * function of not just their own key, but outer keys too.
+ 					 *
+ 					 * Nesting an array within another array will not alter
+ 					 * innermost scalar element hash values, but that seems
+ 					 * inconsequential.
  					 */
! 					stack->hash = parent->hash;
  				}
  				else
  				{
  					/*
! 					 * At the outermost level, initialize hash with container
! 					 * type proxy value.  Note that this makes JB_FARRAY and
+ 					 * JB_FOBJECT part of the on-disk representation, but they
+ 					 * are already part of it in the base jsonb object storage.
  					 */
  					stack->hash = (r == WJB_BEGIN_ARRAY) ? JB_FARRAY : JB_FOBJECT;
  				}
! 				stack->parent = parent;
  				break;
  			case WJB_KEY:
! 				/* initialize hash from parent */
  				stack->hash = stack->parent->hash;
+ 				/* and mix in this key */
  				JsonbHashScalarValue(&v, &stack->hash);
+ 				/* hash is now ready to incorporate the value */
  				break;
  			case WJB_ELEM:
! 				/* array elements use parent hash mixed with element's hash */
  				stack->hash = stack->parent->hash;
+ 				/* FALL THRU */
  			case WJB_VALUE:
! 				/* mix the element or value's hash into the prepared hash */
  				JsonbHashScalarValue(&v, &stack->hash);
+ 				/* and emit an index entry */
  				entries[i++] = UInt32GetDatum(stack->hash);
+ 				/* Note: we assume we'll see KEY before another VALUE */
  				break;
  			case WJB_END_ARRAY:
  			case WJB_END_OBJECT:
  				/* Pop the stack */
! 				parent = stack->parent;
  				pfree(stack);
! 				stack = parent;
  				break;
  			default:
  				elog(ERROR, "invalid JsonbIteratorNext rc: %d", r);
*************** gin_extract_jsonb_query_hash(PG_FUNCTION
*** 592,605 ****
  }
  
  /*
!  * Build a text value from a cstring and flag suitable for storage as a key
!  * value
   */
! static text *
! make_text_key(const char *str, int len, char flag)
  {
  	text	   *item;
  
  	item = (text *) palloc(VARHDRSZ + len + 1);
  	SET_VARSIZE(item, VARHDRSZ + len + 1);
  
--- 535,563 ----
  }
  
  /*
!  * Construct a GIN key from a flag byte and a textual representation
!  * (which need not be null-terminated).  This function is responsible
!  * for hashing overlength text representations; it will add the
!  * JGINFLAG_HASHED bit to the flag value if it does that.
   */
! static Datum
! make_text_key(char flag, const char *str, int len)
  {
  	text	   *item;
+ 	char		hashbuf[10];
+ 
+ 	if (len > JGIN_MAXLENGTH)
+ 	{
+ 		uint32		hashval;
  
+ 		hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len));
+ 		snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval);
+ 		str = hashbuf;
+ 		len = 8;
+ 		flag |= JGINFLAG_HASHED;
+ 	}
+ 
+ 	/* Now build the text Datum */
  	item = (text *) palloc(VARHDRSZ + len + 1);
  	SET_VARSIZE(item, VARHDRSZ + len + 1);
  
*************** make_text_key(const char *str, int len, 
*** 607,637 ****
  
  	memcpy(VARDATA(item) + 1, str, len);
  
! 	return item;
  }
  
  /*
!  * Create a textual representation of a jsonbValue for GIN storage.
   */
! static text *
! make_scalar_key(const JsonbValue *scalarVal, char flag)
  {
! 	text	   *item;
  	char	   *cstr;
  
  	switch (scalarVal->type)
  	{
  		case jbvNull:
! 			item = make_text_key("n", 1, flag);
  			break;
  		case jbvBool:
! 			item = make_text_key(scalarVal->val.boolean ? "t" : "f", 1, flag);
  			break;
  		case jbvNumeric:
  
  			/*
! 			 * A normalized textual representation, free of trailing zeroes is
! 			 * is required.
  			 *
  			 * It isn't ideal that numerics are stored in a relatively bulky
  			 * textual format.  However, it's a notationally convenient way of
--- 565,603 ----
  
  	memcpy(VARDATA(item) + 1, str, len);
  
! 	return PointerGetDatum(item);
  }
  
  /*
!  * Create a textual representation of a JsonbValue that will serve as a GIN
!  * key in a jsonb_ops index.  is_key is true if the JsonbValue is a key,
!  * or if it is a string array element (since we pretend those are keys,
!  * see jsonb.h).
   */
! static Datum
! make_scalar_key(const JsonbValue *scalarVal, bool is_key)
  {
! 	Datum		item;
  	char	   *cstr;
  
  	switch (scalarVal->type)
  	{
  		case jbvNull:
! 			Assert(!is_key);
! 			item = make_text_key(JGINFLAG_NULL, "", 0);
  			break;
  		case jbvBool:
! 			Assert(!is_key);
! 			item = make_text_key(JGINFLAG_BOOL,
! 								 scalarVal->val.boolean ? "t" : "f", 1);
  			break;
  		case jbvNumeric:
+ 			Assert(!is_key);
  
  			/*
! 			 * A normalized textual representation, free of trailing zeroes,
! 			 * is required so that numerically equal values will produce equal
! 			 * strings.
  			 *
  			 * It isn't ideal that numerics are stored in a relatively bulky
  			 * textual format.  However, it's a notationally convenient way of
*************** make_scalar_key(const JsonbValue *scalar
*** 639,653 ****
  			 * strings takes precedence.
  			 */
  			cstr = numeric_normalize(scalarVal->val.numeric);
! 			item = make_text_key(cstr, strlen(cstr), flag);
  			pfree(cstr);
  			break;
  		case jbvString:
! 			item = make_text_key(scalarVal->val.string.val, scalarVal->val.string.len,
! 								 flag);
  			break;
  		default:
! 			elog(ERROR, "invalid jsonb scalar type");
  	}
  
  	return item;
--- 605,622 ----
  			 * strings takes precedence.
  			 */
  			cstr = numeric_normalize(scalarVal->val.numeric);
! 			item = make_text_key(JGINFLAG_NUM, cstr, strlen(cstr));
  			pfree(cstr);
  			break;
  		case jbvString:
! 			item = make_text_key(is_key ? JGINFLAG_KEY : JGINFLAG_STR,
! 								 scalarVal->val.string.val,
! 								 scalarVal->val.string.len);
  			break;
  		default:
! 			elog(ERROR, "unrecognized jsonb scalar type: %d", scalarVal->type);
! 			item = 0;			/* keep compiler quiet */
! 			break;
  	}
  
  	return item;
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index fc746c8..1a6409a 100644
*** a/src/include/utils/jsonb.h
--- b/src/include/utils/jsonb.h
*************** typedef enum
*** 29,53 ****
  	WJB_END_OBJECT
  } JsonbIteratorToken;
  
! /*
!  * When using a GIN index for jsonb, we choose to index both keys and values.
!  * The storage format is text, with K, or V prepended to the string to indicate
!  * key/element or value/element.
!  *
!  * Jsonb Keys and string array elements are treated equivalently when
!  * serialized to text index storage.  One day we may wish to create an opclass
!  * that only indexes values, but for now keys and values are stored in GIN
!  * indexes in a way that doesn't really consider their relationship to each
!  * other.
!  */
! #define JKEYELEM	'K'
! #define JVAL		'V'
! 
  #define JsonbContainsStrategyNumber		7
  #define JsonbExistsStrategyNumber		9
  #define JsonbExistsAnyStrategyNumber	10
  #define JsonbExistsAllStrategyNumber	11
  
  /* Convenience macros */
  #define DatumGetJsonb(d)	((Jsonb *) PG_DETOAST_DATUM(d))
  #define JsonbGetDatum(p)	PointerGetDatum(p)
--- 29,69 ----
  	WJB_END_OBJECT
  } JsonbIteratorToken;
  
! /* Strategy numbers for GIN index opclasses */
  #define JsonbContainsStrategyNumber		7
  #define JsonbExistsStrategyNumber		9
  #define JsonbExistsAnyStrategyNumber	10
  #define JsonbExistsAllStrategyNumber	11
  
+ /*
+  * In the standard jsonb_ops GIN opclass for jsonb, we choose to index both
+  * keys and values.  The storage format is text.  The first byte of the text
+  * string distinguishes whether this is a key (always a string), null value,
+  * boolean value, numeric value, or string value.  However, array elements
+  * that are strings are marked as though they were keys; this imprecision
+  * supports the definition of the "exists" operator, which treats array
+  * elements like keys.  The remainder of the text string is empty for a null
+  * value, "t" or "f" for a boolean value, a normalized print representation of
+  * a numeric value, or the text of a string value.  However, if the length of
+  * this text representation would exceed JGIN_MAXLENGTH bytes, we instead hash
+  * the text representation and store an 8-hex-digit representation of the
+  * uint32 hash value, marking the prefix byte with an additional bit to
+  * indicate that this has happened.  Hashing long strings saves space and
+  * ensures that we won't overrun the maximum entry length for a GIN index.
+  * (But JGIN_MAXLENGTH is quite a bit shorter than GIN's limit.  It's chosen
+  * to ensure that the on-disk text datum will have a short varlena header.)
+  * Note that when any hashed item appears in a query, we must recheck index
+  * matches against the heap tuple; currently, this costs nothing because we
+  * must always recheck for other reasons.
+  */
+ #define JGINFLAG_KEY	0x01	/* key (or string array element) */
+ #define JGINFLAG_NULL	0x02	/* null value */
+ #define JGINFLAG_BOOL	0x03	/* boolean value */
+ #define JGINFLAG_NUM	0x04	/* numeric value */
+ #define JGINFLAG_STR	0x05	/* string value (if not an array element) */
+ #define JGINFLAG_HASHED 0x10	/* OR'd into flag if value was hashed */
+ #define JGIN_MAXLENGTH	125		/* max length of text part before hashing */
+ 
  /* Convenience macros */
  #define DatumGetJsonb(d)	((Jsonb *) PG_DETOAST_DATUM(d))
  #define JsonbGetDatum(p)	PointerGetDatum(p)
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to