> We would really appreciate help on finalizing this patch, especially in
> regard to style issues.  Thank you for all the help.

Here is a cleaned-up version.  I fixed a number of whitespace issues,
improved a few comments, and rearranged one set of nested if-else
statements (hopefully without breaking anything in the process).

Josh / eggyknap -

Can you rerun your performance tests with this version of the patch?

...Robert
*** a/src/backend/executor/nodeHash.c
--- b/src/backend/executor/nodeHash.c
***************
*** 53,58 **** ExecHash(HashState *node)
--- 53,222 ----
  	return NULL;
  }
  
+ /*
+  * ----------------------------------------------------------------
+  *		ExecHashGetIMBucket
+  *
+  *  	Returns the index of the in-memory bucket for this
+  *		hashvalue, or IM_INVALID_BUCKET if the hashvalue is not
+  *		associated with any unfrozen bucket (or if skew
+  *		optimization is not being used).
+  *  
+  *		It is possible for a tuple whose join attribute value is
+  *		not a MCV to hash to an in-memory bucket due to the limited
+  * 		number of hash values but it is unlikely and everything
+  *		continues to work even if it does happen. We would
+  *		accidentally cache some less optimal tuples in memory
+  *		but the join result would still be accurate.
+  *
+  *		hashtable->imBucket is an open addressing hashtable of
+  *		in-memory buckets (HashJoinIMBucket).
+  * ----------------------------------------------------------------
+  */
+ int 
+ ExecHashGetIMBucket(HashJoinTable hashtable, uint32 hashvalue)
+ {
+ 	int bucket;
+ 
+ 	if (!hashtable->enableSkewOptimization)
+ 		return IM_INVALID_BUCKET;
+ 	
+ 	/* Modulo the hashvalue (using bitmask) to find the IM bucket. */
+ 	bucket = hashvalue & (hashtable->nIMBuckets - 1);
+ 
+ 	/*
+ 	 * While we have not hit a hole in the hashtable and have not hit the 
+ 	 * actual bucket we have collided in the hashtable so try the next
+ 	 * bucket location.
+ 	 */
+ 	while (hashtable->imBucket[bucket] != NULL
+ 		&& hashtable->imBucket[bucket]->hashvalue != hashvalue)
+ 		bucket = (bucket + 1) & (hashtable->nIMBuckets - 1);
+ 
+ 	/*
+ 	 * If the bucket exists and has been correctly determined return
+ 	 * the bucket index.
+ 	 */
+ 	if (hashtable->imBucket[bucket] != NULL
+ 		&& hashtable->imBucket[bucket]->hashvalue == hashvalue)
+ 		return bucket;
+ 
+ 	/*
+ 	 * Must have run into an empty location or a frozen bucket which means the
+ 	 * tuple with this hashvalue is not to be handled as if it matches with an
+ 	 * in-memory bucket.
+ 	 */
+ 	return IM_INVALID_BUCKET;
+ }
+ 
+ /*
+  * ----------------------------------------------------------------
+  *		ExecHashFreezeNextIMBucket
+  *
+  *		Freeze the tuples of the next in-memory bucket by pushing
+  *		them into the main hashtable.  Buckets are frozen in order
+  *		so that the best tuples are kept in memory the longest.
+  * ----------------------------------------------------------------
+  */
+ static bool 
+ ExecHashFreezeNextIMBucket(HashJoinTable hashtable)
+ {
+ 	int bucketToFreeze;
+ 	int bucketno;
+ 	int batchno;
+ 	uint32 hashvalue;
+ 	HashJoinTuple hashTuple;
+ 	HashJoinTuple nextHashTuple;
+ 	HashJoinIMBucket *bucket;
+ 	MinimalTuple mintuple;
+ 
+ 	/* Calculate the imBucket index of the bucket to freeze. */
+ 	bucketToFreeze = hashtable->imBucketFreezeOrder
+ 		[hashtable->nUsedIMBuckets - 1 - hashtable->nIMBucketsFrozen];
+ 
+ 	/* Grab a pointer to the actual IM bucket. */
+ 	bucket = hashtable->imBucket[bucketToFreeze];
+ 	hashvalue = bucket->hashvalue;
+ 
+ 	/*
+ 	 * Grab a pointer to the first tuple in the soon to be frozen IM bucket.
+ 	 */
+ 	hashTuple = bucket->tuples;
+ 
+ 	/*
+ 	 * Calculate which bucket and batch the tuples belong to in the main
+ 	 * non-IM hashtable.
+ 	 */
+ 	ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
+ 
+ 	/* until we have read all tuples from this bucket */
+ 	while (hashTuple != NULL)
+ 	{
+ 		/*
+ 		 * Some of this code is very similar to that of ExecHashTableInsert.
+ 		 * We do not call ExecHashTableInsert directly as
+ 		 * ExecHashTableInsert expects a TupleTableSlot and we already have
+ 		 * HashJoinTuples.
+ 		 */
+ 		mintuple = HJTUPLE_MINTUPLE(hashTuple);
+ 
+ 		/* Decide whether to put the tuple in the hash table or a temp file. */
+ 		if (batchno == hashtable->curbatch)
+ 		{
+ 			/* Put the tuple in hash table. */
+ 			nextHashTuple = hashTuple->next;
+ 			hashTuple->next = hashtable->buckets[bucketno];
+ 			hashtable->buckets[bucketno] = hashTuple;
+ 			hashTuple = nextHashTuple;
+ 			hashtable->spaceUsedIM -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ 		}
+ 		else
+ 		{
+ 			/* Put the tuples into a temp file for later batches. */
+ 			Assert(batchno > hashtable->curbatch);
+ 			ExecHashJoinSaveTuple(mintuple, hashvalue,
+ 								  &hashtable->innerBatchFile[batchno]);
+ 			/*
+ 			 * Some memory has been freed up. This must be done before we
+ 			 * pfree the hashTuple of we lose access to the tuple size.
+ 			 */
+ 			hashtable->spaceUsed -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ 			hashtable->spaceUsedIM -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ 			nextHashTuple = hashTuple->next;
+ 			pfree(hashTuple);
+ 			hashTuple = nextHashTuple;
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Free the memory the bucket struct was using as it is not necessary
+ 	 * any more.  All code treats a frozen in-memory bucket the same as one
+ 	 * that did not exist; by setting the pointer to null the rest of the code
+ 	 * will function as if we had not created this bucket at all.
+ 	 */
+ 	pfree(bucket);
+ 	hashtable->imBucket[bucketToFreeze] = NULL;
+ 	hashtable->spaceUsed -= IM_BUCKET_OVERHEAD;
+ 	hashtable->spaceUsedIM -= IM_BUCKET_OVERHEAD;
+ 	hashtable->nIMBucketsFrozen++;
+ 
+ 	/*
+ 	 * All buckets have been frozen and deleted from memory so turn off
+ 	 * skew aware partitioning and remove the structs from memory as they are
+ 	 * just wasting space from now on.
+ 	 */
+ 	if (hashtable->nUsedIMBuckets == hashtable->nIMBucketsFrozen)
+ 	{
+ 		hashtable->enableSkewOptimization = false;
+ 		pfree(hashtable->imBucket);
+ 		pfree(hashtable->imBucketFreezeOrder);
+ 		hashtable->spaceUsed -= hashtable->spaceUsedIM;
+ 		hashtable->spaceUsedIM = 0;
+ 	}
+ 
+ 	return true;
+ }
+ 
  /* ----------------------------------------------------------------
   *		MultiExecHash
   *
***************
*** 69,74 **** MultiExecHash(HashState *node)
--- 233,240 ----
  	TupleTableSlot *slot;
  	ExprContext *econtext;
  	uint32		hashvalue;
+ 	MinimalTuple mintuple;
+ 	int bucketNumber;
  
  	/* must provide our own instrumentation support */
  	if (node->ps.instrument)
***************
*** 99,106 **** MultiExecHash(HashState *node)
  		if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
  								 &hashvalue))
  		{
! 			ExecHashTableInsert(hashtable, slot, hashvalue);
! 			hashtable->totalTuples += 1;
  		}
  	}
  
--- 265,306 ----
  		if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
  								 &hashvalue))
  		{
! 			bucketNumber = ExecHashGetIMBucket(hashtable, hashvalue);
! 
! 			/* handle tuples not destined for an in-memory bucket normally */
! 			if (bucketNumber == IM_INVALID_BUCKET)
! 				ExecHashTableInsert(hashtable, slot, hashvalue);
! 			else
! 			{
! 				HashJoinTuple hashTuple;
! 				int			hashTupleSize;
! 				
! 				/* get the HashJoinTuple */
! 				mintuple = ExecFetchSlotMinimalTuple(slot);
! 				hashTupleSize = HJTUPLE_OVERHEAD + mintuple->t_len;
! 				hashTuple = (HashJoinTuple)
! 					MemoryContextAlloc(hashtable->batchCxt, hashTupleSize);
! 				hashTuple->hashvalue = hashvalue;
! 				memcpy(HJTUPLE_MINTUPLE(hashTuple), mintuple, mintuple->t_len);
! 
! 				/* Push the HashJoinTuple onto the front of the IM bucket. */
! 				hashTuple->next = hashtable->imBucket[bucketNumber]->tuples;
! 				hashtable->imBucket[bucketNumber]->tuples = hashTuple;
! 
! 				/*
! 				 * More memory is now in use so make sure we are not over
! 				 * spaceAllowedIM.
! 				 */
! 				hashtable->spaceUsed += hashTupleSize;
! 				hashtable->spaceUsedIM += hashTupleSize;
! 				while (hashtable->spaceUsedIM > hashtable->spaceAllowedIM
! 					&& ExecHashFreezeNextIMBucket(hashtable))
! 					;
! 				/* Guarantee we are not over the spaceAllowed. */
! 				if (hashtable->spaceUsed > hashtable->spaceAllowed)
! 					ExecHashIncreaseNumBatches(hashtable);
! 			}
! 			hashtable->totalTuples++;
  		}
  	}
  
***************
*** 269,274 **** ExecHashTableCreate(Hash *node, List *hashOperators)
--- 469,483 ----
  	hashtable->outerBatchFile = NULL;
  	hashtable->spaceUsed = 0;
  	hashtable->spaceAllowed = work_mem * 1024L;
+ 	/* Initialize skew optimization related hashtable variables. */
+ 	hashtable->spaceUsedIM = 0;
+ 	hashtable->spaceAllowedIM
+ 		= hashtable->spaceAllowed * IM_WORK_MEM_PERCENT / 100;
+ 	hashtable->enableSkewOptimization = false;
+ 	hashtable->nUsedIMBuckets = 0;
+ 	hashtable->nIMBuckets = 0;
+ 	hashtable->imBucket = NULL;
+ 	hashtable->nIMBucketsFrozen = 0;
  
  	/*
  	 * Get info about the hash functions to be used for each hash key. Also
***************
*** 815,825 **** ExecScanHashBucket(HashJoinState *hjstate,
  	/*
  	 * hj_CurTuple is NULL to start scanning a new bucket, or the address of
  	 * the last tuple returned from the current bucket.
  	 */
! 	if (hashTuple == NULL)
! 		hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
! 	else
  		hashTuple = hashTuple->next;
  
  	while (hashTuple != NULL)
  	{
--- 1024,1040 ----
  	/*
  	 * hj_CurTuple is NULL to start scanning a new bucket, or the address of
  	 * the last tuple returned from the current bucket.
+ 	 *
+ 	 * If the tuple hashed to an IM bucket then scan the IM bucket
+ 	 * otherwise scan the standard hashtable bucket.
  	 */
! 	if (hashTuple != NULL)
  		hashTuple = hashTuple->next;
+ 	else if (hjstate->hj_OuterTupleIMBucketNo != IM_INVALID_BUCKET)
+ 		hashTuple = hashtable->imBucket[hjstate->hj_OuterTupleIMBucketNo]
+ 			->tuples;
+ 	else
+ 		hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
  
  	while (hashTuple != NULL)
  	{
*** a/src/backend/executor/nodeHashjoin.c
--- b/src/backend/executor/nodeHashjoin.c
***************
*** 20,25 ****
--- 20,29 ----
  #include "executor/nodeHash.h"
  #include "executor/nodeHashjoin.h"
  #include "utils/memutils.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "parser/parsetree.h"
+ #include "catalog/pg_statistic.h"
  
  
  /* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
***************
*** 34,39 **** static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
--- 38,227 ----
  						  TupleTableSlot *tupleSlot);
  static int	ExecHashJoinNewBatch(HashJoinState *hjstate);
  
+ /*
+  * ----------------------------------------------------------------
+  *		ExecHashJoinDetectSkew
+  *
+  *		If MCV statistics can be found for the join attribute of
+  *		this hashjoin then create a hash table of buckets. Each
+  *		bucket will correspond to an MCV hashvalue and will be
+  *		filled with inner relation tuples whose join attribute
+  *		hashes to the same value as that MCV.  If a join
+  *		attribute value is a MCV for the join attribute in the
+  *		outer (probe) relation, tuples with this value in the
+  *		inner (build) relation are more likely to join with outer
+  *		relation tuples and a benefit can be gained by keeping
+  *		them in memory while joining the first batch of tuples.
+  * ----------------------------------------------------------------
+  */
+ static void 
+ ExecHashJoinDetectSkew(EState *estate, HashJoinState *hjstate, int tupwidth)
+ {
+ 	HeapTupleData	*statsTuple;
+ 	FuncExprState	*clause;
+ 	ExprState		*argstate;
+ 	Var				*variable;
+ 	HashJoinTable	hashtable;
+ 	Datum			*values;
+ 	int				nvalues;
+ 	float4			*numbers;
+ 	int				nnumbers;
+ 	Oid				relid;
+ 	Oid				atttype;
+ 	int				i;
+ 	int				mcvsToUse;
+ 
+ 	/* Only use statistics if there is a single join attribute. */
+ 	if (hjstate->hashclauses->length != 1)
+ 		return; /* Histojoin is not defined for more than one join key */
+ 	
+ 	hashtable = hjstate->hj_HashTable;
+ 	
+ 	/*
+ 	 * Estimate the number of IM buckets that will fit in
+ 	 * the memory allowed for IM buckets.
+ 	 *
+ 	 * hashtable->imBucket will have up to 8 times as many HashJoinIMBucket
+ 	 * pointers as the number of MCV hashvalues. A uint16 index in
+ 	 * hashtable->imBucketFreezeOrder will be created for each IM bucket. One
+ 	 * actual HashJoinIMBucket struct will be created for each
+ 	 * unique MCV hashvalue so up to one struct per MCV.
+ 	 *
+ 	 * It is also estimated that each IM bucket will have a single build
+ 	 * tuple stored in it after partitioning the build relation input.  This
+ 	 * estimate could be high if tuples are filtered out before this join but
+ 	 * in that case the extra memory is used by the regular hashjoin batch.
+ 	 * This estimate could be low if it is a many to many join but in that
+ 	 * case IM buckets will be frozen to free up memory as needed
+ 	 * during the inner relation partitioning phase.
+ 	 */
+ 	mcvsToUse = hashtable->spaceAllowedIM / (
+ 		/* size of a hash tuple */
+ 		HJTUPLE_OVERHEAD + MAXALIGN(sizeof(MinimalTupleData))
+ 			+ MAXALIGN(tupwidth)
+ 		/* max size of hashtable pointers per MCV */
+ 		+ (8 * sizeof(HashJoinIMBucket*))
+ 		+ sizeof(uint16) /* size of imBucketFreezeOrder entry */
+ 		+ IM_BUCKET_OVERHEAD /* size of IM bucket struct */
+ 		);
+ 	if (mcvsToUse == 0)
+ 		return;	/* No point in considering this any further. */
+ 
+ 	/*
+ 	 * Determine the relation id and attribute id of the single join
+ 	 * attribute of the probe relation.
+ 	 */
+ 	clause = (FuncExprState *) lfirst(list_head(hjstate->hashclauses));
+ 	argstate = (ExprState *) lfirst(list_head(clause->args));
+ 
+ 	/*
+ 	 * Do not try to exploit stats if the join attribute is an expression
+ 	 * instead of just a simple attribute.
+ 	 */		
+ 	if (argstate->expr->type != T_Var)
+ 		return;
+ 
+ 	variable = (Var *) argstate->expr;
+ 	relid = getrelid(variable->varnoold, estate->es_range_table);
+ 	atttype = variable->vartype;
+ 
+ 	statsTuple = SearchSysCache(STATRELATT, ObjectIdGetDatum(relid),
+ 		Int16GetDatum(variable->varoattno), 0, 0);
+ 	if (!HeapTupleIsValid(statsTuple))
+ 		return;
+ 
+ 	/* Look for MCV statistics for the attribute. */
+ 	if (get_attstatsslot(statsTuple, atttype, variable->vartypmod,
+ 		STATISTIC_KIND_MCV, InvalidOid, &values, &nvalues,
+ 		&numbers, &nnumbers))
+ 	{
+ 		FmgrInfo   *hashfunctions;
+ 		int nbuckets = 2;
+ 
+ 		/*
+ 		 * IM buckets (imBucket) is an open addressing hashtable with a 
+ 		 * power of 2 size that is greater than the number of MCV values.
+ 		 */
+ 		if (mcvsToUse > nvalues)
+ 			mcvsToUse = nvalues;
+ 		while (nbuckets <= mcvsToUse)
+ 			nbuckets <<= 1;
+ 		/* use two more bits just to help avoid collisions */
+ 		nbuckets <<= 2;
+ 		hashtable->nIMBuckets = nbuckets;
+ 		hashtable->enableSkewOptimization = true;
+ 
+ 		/*
+ 		 * Allocate the bucket memory in the hashtable's batch context
+ 		 * because it is only relevant and necessary during the first batch
+ 		 * and will be nicely removed once the first batch is done.
+ 		 */
+ 		hashtable->imBucket = 
+ 			MemoryContextAllocZero(hashtable->batchCxt,
+ 				nbuckets * sizeof(HashJoinIMBucket*));
+ 		hashtable->imBucketFreezeOrder = 
+ 			MemoryContextAllocZero(hashtable->batchCxt,
+ 				mcvsToUse * sizeof(uint16));
+ 		/* Count the overhead of the IM pointers immediately. */
+ 		hashtable->spaceUsed += nbuckets * sizeof(HashJoinIMBucket*)
+ 			+ mcvsToUse * sizeof(uint16);
+ 		hashtable->spaceUsedIM +=  nbuckets * sizeof(HashJoinIMBucket*)
+ 			+ mcvsToUse * sizeof(uint16);
+ 
+ 		/*
+ 		 * Grab the hash functions as we will be generating the hashvalues
+ 		 * in this section.
+ 		 */
+ 		hashfunctions = hashtable->outer_hashfunctions;
+ 
+ 		/* Create the buckets */
+ 		for (i = 0; i < mcvsToUse; i++)
+ 		{
+ 			uint32 hashvalue = DatumGetUInt32(
+ 				FunctionCall1(&hashfunctions[0], values[i]));
+ 			int bucket = hashvalue & (nbuckets - 1);
+ 
+ 			/*
+ 			 * While we have not hit a hole in the hashtable and have not hit
+ 			 * a bucket with the same hashvalue we have collided in the
+ 			 * hashtable so try the next bucket location (remember it is an
+ 			 * open addressing hashtable).
+ 			 */
+ 			while (hashtable->imBucket[bucket] != NULL
+ 				&& hashtable->imBucket[bucket]->hashvalue != hashvalue)
+ 				bucket = (bucket + 1) & (nbuckets - 1);
+ 
+ 			/*
+ 			 * Leave bucket alone if it has the same hashvalue as current
+ 			 * MCV. We only want one bucket per hashvalue. Even if two MCV
+ 			 * values hash to the same bucket we are fine.
+ 			 */
+ 			if (hashtable->imBucket[bucket] == NULL)
+ 			{
+ 				/*
+ 				 * Allocate the actual bucket structure in the hashtable's batch
+ 				 * context because it is only relevant and necessary during
+ 				 * the first batch and will be nicely removed once the first
+ 				 * batch is done.
+ 				 */
+ 				hashtable->imBucket[bucket]
+ 					= MemoryContextAllocZero(hashtable->batchCxt,
+ 						sizeof(HashJoinIMBucket));
+ 				hashtable->imBucket[bucket]->hashvalue = hashvalue;
+ 				hashtable->imBucketFreezeOrder[hashtable->nUsedIMBuckets]
+ 					= bucket;
+ 				hashtable->nUsedIMBuckets++;
+ 				/* Count the overhead of the IM bucket struct */
+ 				hashtable->spaceUsed += IM_BUCKET_OVERHEAD;
+ 				hashtable->spaceUsedIM += IM_BUCKET_OVERHEAD;
+ 			}
+ 		}
+ 
+ 		free_attstatsslot(atttype, values, nvalues, numbers, nnumbers);
+ 	}
+ 
+ 	ReleaseSysCache(statsTuple);
+ }
  
  /* ----------------------------------------------------------------
   *		ExecHashJoin
***************
*** 147,152 **** ExecHashJoin(HashJoinState *node)
--- 335,345 ----
  										node->hj_HashOperators);
  		node->hj_HashTable = hashtable;
  
+ 		/* Use skew optimization only when there is more than one batch. */
+ 		if (hashtable->nbatch > 1)
+ 			ExecHashJoinDetectSkew(estate, node,
+ 				(outerPlan((Hash *) hashNode->ps.plan))->plan_width);
+ 
  		/*
  		 * execute the Hash node, to build the hash table
  		 */
***************
*** 205,216 **** ExecHashJoin(HashJoinState *node)
  			ExecHashGetBucketAndBatch(hashtable, hashvalue,
  									  &node->hj_CurBucketNo, &batchno);
  			node->hj_CurTuple = NULL;
  
  			/*
  			 * Now we've got an outer tuple and the corresponding hash bucket,
! 			 * but this tuple may not belong to the current batch.
  			 */
! 			if (batchno != hashtable->curbatch)
  			{
  				/*
  				 * Need to postpone this outer tuple to a later batch. Save it
--- 398,415 ----
  			ExecHashGetBucketAndBatch(hashtable, hashvalue,
  									  &node->hj_CurBucketNo, &batchno);
  			node->hj_CurTuple = NULL;
+ 			
+ 			/* Does the outer tuple match an IM bucket? */
+ 			node->hj_OuterTupleIMBucketNo = 
+ 				ExecHashGetIMBucket(hashtable, hashvalue);
  
  			/*
  			 * Now we've got an outer tuple and the corresponding hash bucket,
! 			 * but in might not belong to the current batch, or it might need
! 			 * to go into an in-memory bucket.
  			 */
! 			if (node->hj_OuterTupleIMBucketNo == IM_INVALID_BUCKET
! 				&& batchno != hashtable->curbatch)
  			{
  				/*
  				 * Need to postpone this outer tuple to a later batch. Save it
***************
*** 641,647 **** start_over:
  	nbatch = hashtable->nbatch;
  	curbatch = hashtable->curbatch;
  
! 	if (curbatch > 0)
  	{
  		/*
  		 * We no longer need the previous outer batch file; close it right
--- 840,866 ----
  	nbatch = hashtable->nbatch;
  	curbatch = hashtable->curbatch;
  
! 	/* if we just finished the first batch */
! 	if (curbatch == 0)
! 	{
! 		/*
! 		 * Reset some of the skew optimization state variables. IM buckets are
! 		 * no longer being used as of this point because they are only
! 		 * necessary while joining the first batch (before the cleanup phase).
! 		 *
! 		 * Especially need to make sure ExecHashGetIMBucket returns
! 		 * IM_INVALID_BUCKET quickly for all subsequent calls.
! 		 *
! 		 * IM buckets are only taking up memory if this is a multi-batch join
! 		 * and in that case ExecHashTableReset is about to be called which
! 		 * will free all memory currently used by IM buckets and tuples when
! 		 * it deletes hashtable->batchCxt.  If this is a single batch join
! 		 * then imBucket and imBucketFreezeOrder are already NULL and empty.
! 		 */
! 		hashtable->enableSkewOptimization = false;
! 		hashtable->spaceUsedIM = 0;
! 	}
! 	else if (curbatch > 0)
  	{
  		/*
  		 * We no longer need the previous outer batch file; close it right
*** a/src/include/executor/hashjoin.h
--- b/src/include/executor/hashjoin.h
***************
*** 72,77 **** typedef struct HashJoinTupleData
--- 72,96 ----
  #define HJTUPLE_MINTUPLE(hjtup)  \
  	((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
  
+ /*
+  * Stores a hashvalue and linked list of tuples that share that hashvalue.
+  *
+  * When processing MCVs to detect skew in the probe relation of a hash join
+  * the hashvalue is generated and stored before any tuples have been read 
+  * (see ExecHashJoinDetectSkew).
+  *
+  * Build tuples that hash to the same hashvalue are placed in the bucket while
+  * reading the build relation.
+  */
+ typedef struct HashJoinIMBucket
+ {
+ 	uint32 hashvalue;
+ 	HashJoinTuple tuples;
+ } HashJoinIMBucket;
+ 
+ #define IM_INVALID_BUCKET -1
+ #define IM_WORK_MEM_PERCENT 2
+ #define IM_BUCKET_OVERHEAD MAXALIGN(sizeof(HashJoinIMBucket))
  
  typedef struct HashJoinTableData
  {
***************
*** 113,121 **** typedef struct HashJoinTableData
--- 132,161 ----
  
  	Size		spaceUsed;		/* memory space currently used by tuples */
  	Size		spaceAllowed;	/* upper limit for space used */
+ 	/* memory space currently used by IM buckets and tuples */
+ 	Size		spaceUsedIM;
+ 	/* upper limit for space used by IM buckets and tuples */
+ 	Size		spaceAllowedIM;
  
  	MemoryContext hashCxt;		/* context for whole-hash-join storage */
  	MemoryContext batchCxt;		/* context for this-batch-only storage */
+ 	
+ 	/* will the join optimize memory usage when probe relation is skewed */
+ 	bool enableSkewOptimization;
+ 	HashJoinIMBucket **imBucket; /* hashtable of IM buckets */
+ 	/*
+ 	 * array of imBucket indexes to the created IM buckets sorted
+ 	 * in the opposite order that they would be frozen to disk
+ 	 */
+ 	uint16 *imBucketFreezeOrder;
+ 	int nIMBuckets; /* # of buckets in the IM buckets hashtable */
+ 	/*
+ 	 * # of used buckets in the IM buckets hashtable and length of
+ 	 * imBucketFreezeOrder array
+ 	 */
+ 	int nUsedIMBuckets;
+ 	/* # of IM buckets that have already been frozen to disk */
+ 	int nIMBucketsFrozen;
  } HashJoinTableData;
  
  #endif   /* HASHJOIN_H */
*** a/src/include/executor/nodeHash.h
--- b/src/include/executor/nodeHash.h
***************
*** 45,48 **** extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
--- 45,50 ----
  						int *numbuckets,
  						int *numbatches);
  
+ extern int ExecHashGetIMBucket(HashJoinTable hashtable, uint32 hashvalue);
+ 
  #endif   /* NODEHASH_H */
*** a/src/include/nodes/execnodes.h
--- b/src/include/nodes/execnodes.h
***************
*** 1381,1386 **** typedef struct MergeJoinState
--- 1381,1387 ----
   *		hj_NeedNewOuter			true if need new outer tuple on next call
   *		hj_MatchedOuter			true if found a join match for current outer
   *		hj_OuterNotEmpty		true if outer relation known not empty
+  *		hj_OuterTupleIMBucketNo	IM bucket# for the current outer tuple
   * ----------------
   */
  
***************
*** 1406,1411 **** typedef struct HashJoinState
--- 1407,1413 ----
  	bool		hj_NeedNewOuter;
  	bool		hj_MatchedOuter;
  	bool		hj_OuterNotEmpty;
+ 	int			hj_OuterTupleIMBucketNo;
  } HashJoinState;
  
  
*** a/src/include/nodes/primnodes.h
--- b/src/include/nodes/primnodes.h
***************
*** 121,128 **** typedef struct Expr
   * subplans; for example, in a join node varno becomes INNER or OUTER and
   * varattno becomes the index of the proper element of that subplan's target
   * list.  But varnoold/varoattno continue to hold the original values.
!  * The code doesn't really need varnoold/varoattno, but they are very useful
!  * for debugging and interpreting completed plans, so we keep them around.
   */
  #define    INNER		65000
  #define    OUTER		65001
--- 121,132 ----
   * subplans; for example, in a join node varno becomes INNER or OUTER and
   * varattno becomes the index of the proper element of that subplan's target
   * list.  But varnoold/varoattno continue to hold the original values.
!  *
!  * For the most part, the code doesn't really need varnoold/varoattno, but
!  * they are very useful for debugging and interpreting completed plans, so we
!  * keep them around.  As of PostgreSQL 8.4, these values are also used by
!  * ExecHashJoinDetectSkew to fetch MCV statistics when performing multi-batch
!  * hash joins.
   */
  #define    INNER		65000
  #define    OUTER		65001
***************
*** 142,148 **** typedef struct Var
  	Index		varlevelsup;	/* for subquery variables referencing outer
  								 * relations; 0 in a normal var, >0 means N
  								 * levels up */
! 	Index		varnoold;		/* original value of varno, for debugging */
  	AttrNumber	varoattno;		/* original value of varattno */
  	int			location;		/* token location, or -1 if unknown */
  } Var;
--- 146,152 ----
  	Index		varlevelsup;	/* for subquery variables referencing outer
  								 * relations; 0 in a normal var, >0 means N
  								 * levels up */
! 	Index		varnoold;		/* original value of varno */
  	AttrNumber	varoattno;		/* original value of varattno */
  	int			location;		/* token location, or -1 if unknown */
  } Var;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to