I wrote:
> I chewed on this for a while and decided that there'd be no real harm in
> taking identification of the unique expressions out of 
> create_unique_path() and doing it earlier, in initsplan.c; we'd need a
> couple more fields in SpecialJoinInfo but that doesn't seem like a
> problem.  However, rel->rows is a *big* problem; we simply have not made
> any join size estimates yet, and can't, because these things are done
> bottom up.

> However ... estimate_num_groups's dependency on its rowcount input is not
> large (it's basically using it as a clamp).  So conceivably we could have
> get_loop_count just multiply together the sizes of the base relations
> included in the semijoin's RHS to get a preliminary estimate of that
> number.  This would be the right thing anyway for a single relation in the
> RHS, which is the most common case.  It would usually be an overestimate
> for a join RHS, but we could hope that the output of estimate_num_groups
> wouldn't be affected too badly.

Attached is a draft patch that does those two things.  I like the first
part (replacing SpecialJoinInfo's rather ad-hoc join_quals field with
something more explicitly attuned to semijoin uniqueness processing).
The second part is still pretty much a kluge, but then get_loop_count
was a kluge already.  This arguably makes it better.
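
For anyone following along without the original tables, here is a minimal
toy version of the kind of query at issue (the tables "big" and "dupl" are
made up for illustration, not the random_fk_* setup from the thread): a
semijoin whose RHS has many duplicates, where unique-ifying the RHS and
using a parameterized index scan on the outer rel only looks attractive if
the loop count reflects the number of distinct RHS values rather than the
raw RHS rowcount.

    -- Toy illustration only; not the original random_fk_dupl/random_fk_uniq case.
    CREATE TABLE big (id int PRIMARY KEY, payload text);
    CREATE TABLE dupl (fk int);    -- 100000 rows, only ~100 distinct fk values
    INSERT INTO big SELECT g, 'x' FROM generate_series(1, 100000) g;
    INSERT INTO dupl SELECT (g % 100) + 1 FROM generate_series(1, 100000) g;
    ANALYZE big;
    ANALYZE dupl;

    -- Planned as a semijoin.  With the patch, the loop count for a
    -- parameterized index scan on big (parameterized by dupl) is clamped to
    -- the estimated number of distinct fk values (~100) rather than dupl's
    -- raw 100000-row estimate.  Which plan actually wins still depends on
    -- the cost settings, per below.
    EXPLAIN SELECT * FROM big WHERE id IN (SELECT fk FROM dupl);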

Now, on the test case you presented, this has the unfortunate effect that
it now reliably chooses the "wrong" plan for both cases :-(.  But I think
that's a reflection of poor cost parameters (i.e., the test case fits handily
in RAM but we've not set the cost parameters to reflect that).  We do get the
same rowcount and roughly the same cost estimates for both the random_fk_dupl
and random_fk_uniq queries, so from that standpoint it's doing the right
thing.  If I reduce random_page_cost to 2 or so, it makes the choices you
wanted.
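
(In case it helps with reproducing that: it's just the usual GUC tweak, shown
here against the toy tables above; the value 2 is illustrative and the right
setting depends on how much of the data set is actually cached.)

    -- Tell the planner random I/O is relatively cheap because the data is
    -- mostly cached, then recheck which plan gets chosen.
    SET random_page_cost = 2;
    EXPLAIN SELECT * FROM big WHERE id IN (SELECT fk FROM dupl);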

                        regards, tom lane

diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 9fe8008..9f65d66 100644
*** a/src/backend/nodes/copyfuncs.c
--- b/src/backend/nodes/copyfuncs.c
*************** _copySpecialJoinInfo(const SpecialJoinIn
*** 1942,1948 ****
  	COPY_SCALAR_FIELD(jointype);
  	COPY_SCALAR_FIELD(lhs_strict);
  	COPY_SCALAR_FIELD(delay_upper_joins);
! 	COPY_NODE_FIELD(join_quals);
  
  	return newnode;
  }
--- 1942,1951 ----
  	COPY_SCALAR_FIELD(jointype);
  	COPY_SCALAR_FIELD(lhs_strict);
  	COPY_SCALAR_FIELD(delay_upper_joins);
! 	COPY_SCALAR_FIELD(semi_can_btree);
! 	COPY_SCALAR_FIELD(semi_can_hash);
! 	COPY_NODE_FIELD(semi_operators);
! 	COPY_NODE_FIELD(semi_rhs_exprs);
  
  	return newnode;
  }
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index fe509b0..fd876fb 100644
*** a/src/backend/nodes/equalfuncs.c
--- b/src/backend/nodes/equalfuncs.c
*************** _equalSpecialJoinInfo(const SpecialJoinI
*** 798,804 ****
  	COMPARE_SCALAR_FIELD(jointype);
  	COMPARE_SCALAR_FIELD(lhs_strict);
  	COMPARE_SCALAR_FIELD(delay_upper_joins);
! 	COMPARE_NODE_FIELD(join_quals);
  
  	return true;
  }
--- 798,807 ----
  	COMPARE_SCALAR_FIELD(jointype);
  	COMPARE_SCALAR_FIELD(lhs_strict);
  	COMPARE_SCALAR_FIELD(delay_upper_joins);
! 	COMPARE_SCALAR_FIELD(semi_can_btree);
! 	COMPARE_SCALAR_FIELD(semi_can_hash);
! 	COMPARE_NODE_FIELD(semi_operators);
! 	COMPARE_NODE_FIELD(semi_rhs_exprs);
  
  	return true;
  }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 775f482..75c57a2 100644
*** a/src/backend/nodes/outfuncs.c
--- b/src/backend/nodes/outfuncs.c
*************** _outSpecialJoinInfo(StringInfo str, cons
*** 1945,1951 ****
  	WRITE_ENUM_FIELD(jointype, JoinType);
  	WRITE_BOOL_FIELD(lhs_strict);
  	WRITE_BOOL_FIELD(delay_upper_joins);
! 	WRITE_NODE_FIELD(join_quals);
  }
  
  static void
--- 1945,1954 ----
  	WRITE_ENUM_FIELD(jointype, JoinType);
  	WRITE_BOOL_FIELD(lhs_strict);
  	WRITE_BOOL_FIELD(delay_upper_joins);
! 	WRITE_BOOL_FIELD(semi_can_btree);
! 	WRITE_BOOL_FIELD(semi_can_hash);
! 	WRITE_NODE_FIELD(semi_operators);
! 	WRITE_NODE_FIELD(semi_rhs_exprs);
  }
  
  static void
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 5a9daf0..1a0d358 100644
*** a/src/backend/optimizer/path/costsize.c
--- b/src/backend/optimizer/path/costsize.c
*************** compute_semi_anti_join_factors(PlannerIn
*** 3294,3300 ****
  	/* we don't bother trying to make the remaining fields valid */
  	norm_sjinfo.lhs_strict = false;
  	norm_sjinfo.delay_upper_joins = false;
! 	norm_sjinfo.join_quals = NIL;
  
  	nselec = clauselist_selectivity(root,
  									joinquals,
--- 3294,3303 ----
  	/* we don't bother trying to make the remaining fields valid */
  	norm_sjinfo.lhs_strict = false;
  	norm_sjinfo.delay_upper_joins = false;
! 	norm_sjinfo.semi_can_btree = false;
! 	norm_sjinfo.semi_can_hash = false;
! 	norm_sjinfo.semi_operators = NIL;
! 	norm_sjinfo.semi_rhs_exprs = NIL;
  
  	nselec = clauselist_selectivity(root,
  									joinquals,
*************** approx_tuple_count(PlannerInfo *root, Jo
*** 3456,3462 ****
  	/* we don't bother trying to make the remaining fields valid */
  	sjinfo.lhs_strict = false;
  	sjinfo.delay_upper_joins = false;
! 	sjinfo.join_quals = NIL;
  
  	/* Get the approximate selectivity */
  	foreach(l, quals)
--- 3459,3468 ----
  	/* we don't bother trying to make the remaining fields valid */
  	sjinfo.lhs_strict = false;
  	sjinfo.delay_upper_joins = false;
! 	sjinfo.semi_can_btree = false;
! 	sjinfo.semi_can_hash = false;
! 	sjinfo.semi_operators = NIL;
! 	sjinfo.semi_rhs_exprs = NIL;
  
  	/* Get the approximate selectivity */
  	foreach(l, quals)
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index b86a3cd..1b16012 100644
*** a/src/backend/optimizer/path/indxpath.c
--- b/src/backend/optimizer/path/indxpath.c
*************** static Relids get_bitmap_tree_required_o
*** 130,136 ****
  static void find_indexpath_quals(Path *bitmapqual, List **quals, List **preds);
  static int	find_list_position(Node *node, List **nodelist);
  static bool check_index_only(RelOptInfo *rel, IndexOptInfo *index);
! static double get_loop_count(PlannerInfo *root, Relids outer_relids);
  static void match_restriction_clauses_to_index(RelOptInfo *rel,
  								   IndexOptInfo *index,
  								   IndexClauseSet *clauseset);
--- 130,141 ----
  static void find_indexpath_quals(Path *bitmapqual, List **quals, List **preds);
  static int	find_list_position(Node *node, List **nodelist);
  static bool check_index_only(RelOptInfo *rel, IndexOptInfo *index);
! static double get_loop_count(PlannerInfo *root, Index cur_relid, Relids outer_relids);
! static double adjust_rowcount_for_semijoins(PlannerInfo *root,
! 							  Index cur_relid,
! 							  Index outer_relid,
! 							  double rowcount);
! static double approximate_joinrel_size(PlannerInfo *root, Relids relids);
  static void match_restriction_clauses_to_index(RelOptInfo *rel,
  								   IndexOptInfo *index,
  								   IndexClauseSet *clauseset);
*************** create_index_paths(PlannerInfo *root, Re
*** 402,408 ****
  
  			/* And push that path into the mix */
  			required_outer = get_bitmap_tree_required_outer(bitmapqual);
! 			loop_count = get_loop_count(root, required_outer);
  			bpath = create_bitmap_heap_path(root, rel, bitmapqual,
  											required_outer, loop_count);
  			add_path(rel, (Path *) bpath);
--- 407,413 ----
  
  			/* And push that path into the mix */
  			required_outer = get_bitmap_tree_required_outer(bitmapqual);
! 			loop_count = get_loop_count(root, rel->relid, required_outer);
  			bpath = create_bitmap_heap_path(root, rel, bitmapqual,
  											required_outer, loop_count);
  			add_path(rel, (Path *) bpath);
*************** build_index_paths(PlannerInfo *root, Rel
*** 969,975 ****
  		outer_relids = NULL;
  
  	/* Compute loop_count for cost estimation purposes */
! 	loop_count = get_loop_count(root, outer_relids);
  
  	/*
  	 * 2. Compute pathkeys describing index's ordering, if any, then see how
--- 974,980 ----
  		outer_relids = NULL;
  
  	/* Compute loop_count for cost estimation purposes */
! 	loop_count = get_loop_count(root, rel->relid, outer_relids);
  
  	/*
  	 * 2. Compute pathkeys describing index's ordering, if any, then see how
*************** bitmap_scan_cost_est(PlannerInfo *root, 
*** 1553,1559 ****
  	cost_bitmap_heap_scan(&bpath.path, root, rel,
  						  bpath.path.param_info,
  						  ipath,
! 						  get_loop_count(root, required_outer));
  
  	return bpath.path.total_cost;
  }
--- 1558,1564 ----
  	cost_bitmap_heap_scan(&bpath.path, root, rel,
  						  bpath.path.param_info,
  						  ipath,
! 						  get_loop_count(root, rel->relid, required_outer));
  
  	return bpath.path.total_cost;
  }
*************** bitmap_and_cost_est(PlannerInfo *root, R
*** 1594,1600 ****
  	cost_bitmap_heap_scan(&bpath.path, root, rel,
  						  bpath.path.param_info,
  						  (Path *) &apath,
! 						  get_loop_count(root, required_outer));
  
  	return bpath.path.total_cost;
  }
--- 1599,1605 ----
  	cost_bitmap_heap_scan(&bpath.path, root, rel,
  						  bpath.path.param_info,
  						  (Path *) &apath,
! 						  get_loop_count(root, rel->relid, required_outer));
  
  	return bpath.path.total_cost;
  }
*************** check_index_only(RelOptInfo *rel, IndexO
*** 1861,1892 ****
   * answer for single-other-relation cases, and it seems like a reasonable
   * zero-order approximation for multiway-join cases.
   *
   * Note: for this to work, allpaths.c must establish all baserel size
   * estimates before it begins to compute paths, or at least before it
   * calls create_index_paths().
   */
  static double
! get_loop_count(PlannerInfo *root, Relids outer_relids)
  {
  	double		result = 1.0;
  
  	/* For a non-parameterized path, just return 1.0 quickly */
  	if (outer_relids != NULL)
  	{
! 		int			relid;
  
! 		relid = -1;
! 		while ((relid = bms_next_member(outer_relids, relid)) >= 0)
  		{
  			RelOptInfo *outer_rel;
  
  			/* Paranoia: ignore bogus relid indexes */
! 			if (relid >= root->simple_rel_array_size)
  				continue;
! 			outer_rel = root->simple_rel_array[relid];
  			if (outer_rel == NULL)
  				continue;
! 			Assert(outer_rel->relid == relid);	/* sanity check on array */
  
  			/* Other relation could be proven empty, if so ignore */
  			if (IS_DUMMY_REL(outer_rel))
--- 1866,1904 ----
   * answer for single-other-relation cases, and it seems like a reasonable
   * zero-order approximation for multiway-join cases.
   *
+  * In addition, we check to see if the other side of each join clause is on
+  * the inside of some semijoin that the current relation is on the outside of.
+  * If so, the only way that a parameterized path could be used is if the
+  * semijoin RHS has been unique-ified, so we should use the number of unique
+  * RHS rows rather than using the relation's raw rowcount.
+  *
   * Note: for this to work, allpaths.c must establish all baserel size
   * estimates before it begins to compute paths, or at least before it
   * calls create_index_paths().
   */
  static double
! get_loop_count(PlannerInfo *root, Index cur_relid, Relids outer_relids)
  {
  	double		result = 1.0;
  
  	/* For a non-parameterized path, just return 1.0 quickly */
  	if (outer_relids != NULL)
  	{
! 		int			outer_relid;
  
! 		outer_relid = -1;
! 		while ((outer_relid = bms_next_member(outer_relids, outer_relid)) >= 0)
  		{
  			RelOptInfo *outer_rel;
+ 			double		rowcount;
  
  			/* Paranoia: ignore bogus relid indexes */
! 			if (outer_relid >= root->simple_rel_array_size)
  				continue;
! 			outer_rel = root->simple_rel_array[outer_relid];
  			if (outer_rel == NULL)
  				continue;
! 			Assert(outer_rel->relid == outer_relid);	/* sanity check on array */
  
  			/* Other relation could be proven empty, if so ignore */
  			if (IS_DUMMY_REL(outer_rel))
*************** get_loop_count(PlannerInfo *root, Relids
*** 1895,1908 ****
  			/* Otherwise, rel's rows estimate should be valid by now */
  			Assert(outer_rel->rows > 0);
  
  			/* Remember smallest row count estimate among the outer rels */
! 			if (result == 1.0 || result > outer_rel->rows)
! 				result = outer_rel->rows;
  		}
  	}
  	return result;
  }
  
  
  /****************************************************************************
   *				----  ROUTINES TO CHECK QUERY CLAUSES  ----
--- 1907,2006 ----
  			/* Otherwise, rel's rows estimate should be valid by now */
  			Assert(outer_rel->rows > 0);
  
+ 			/* Check to see if rel is on the inside of any semijoins */
+ 			rowcount = adjust_rowcount_for_semijoins(root,
+ 													 cur_relid,
+ 													 outer_relid,
+ 													 outer_rel->rows);
+ 
  			/* Remember smallest row count estimate among the outer rels */
! 			if (result == 1.0 || result > rowcount)
! 				result = rowcount;
  		}
  	}
  	return result;
  }
  
+ /*
+  * Check to see if outer_relid is on the inside of any semijoin that cur_relid
+  * is on the outside of.  If so, replace rowcount with the estimated number of
+  * unique rows from the semijoin RHS.  The estimate is crude but it's the best
+  * we can do at this stage of the proceedings.
+  */
+ static double
+ adjust_rowcount_for_semijoins(PlannerInfo *root,
+ 							  Index cur_relid,
+ 							  Index outer_relid,
+ 							  double rowcount)
+ {
+ 	ListCell   *lc;
+ 
+ 	foreach(lc, root->join_info_list)
+ 	{
+ 		SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc);
+ 
+ 		if (sjinfo->jointype == JOIN_SEMI &&
+ 			bms_is_member(cur_relid, sjinfo->syn_lefthand) &&
+ 			bms_is_member(outer_relid, sjinfo->syn_righthand))
+ 		{
+ 			/* Estimate number of unique-ified rows */
+ 			double		nraw;
+ 			double		nunique;
+ 
+ 			nraw = approximate_joinrel_size(root, sjinfo->syn_righthand);
+ 			nunique = estimate_num_groups(root,
+ 										  sjinfo->semi_rhs_exprs,
+ 										  nraw);
+ 			if (rowcount > nunique)
+ 				rowcount = nunique;
+ 		}
+ 	}
+ 	return rowcount;
+ }
+ 
+ /*
+  * Make an approximate estimate of the size of a joinrel.
+  *
+  * We don't have enough info at this point to get a good estimate, so we
+  * just multiply the base relation sizes together.  Fortunately, this is
+  * the right answer anyway for the most common case with a single relation
+  * on the RHS of a semijoin.  Also, estimate_num_groups() has only a weak
+  * dependency on its input_rows argument (it basically uses it as a clamp).
+  * So we might be able to get a fairly decent estimate even with a severe
+  * overestimate of the RHS's raw size.
+  */
+ static double
+ approximate_joinrel_size(PlannerInfo *root, Relids relids)
+ {
+ 	double		rowcount = 1.0;
+ 	int			relid;
+ 
+ 	relid = -1;
+ 	while ((relid = bms_next_member(relids, relid)) >= 0)
+ 	{
+ 		RelOptInfo *rel;
+ 
+ 		/* Paranoia: ignore bogus relid indexes */
+ 		if (relid >= root->simple_rel_array_size)
+ 			continue;
+ 		rel = root->simple_rel_array[relid];
+ 		if (rel == NULL)
+ 			continue;
+ 		Assert(rel->relid == relid);	/* sanity check on array */
+ 
+ 		/* Relation could be proven empty, if so ignore */
+ 		if (IS_DUMMY_REL(rel))
+ 			continue;
+ 
+ 		/* Otherwise, rel's rows estimate should be valid by now */
+ 		Assert(rel->rows > 0);
+ 
+ 		/* Accumulate product */
+ 		rowcount *= rel->rows;
+ 	}
+ 	return rowcount;
+ }
+ 
  
  /****************************************************************************
   *				----  ROUTINES TO CHECK QUERY CLAUSES  ----
diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c
index e7e9a1a..fe9fd57 100644
*** a/src/backend/optimizer/path/joinrels.c
--- b/src/backend/optimizer/path/joinrels.c
*************** make_join_rel(PlannerInfo *root, RelOptI
*** 624,630 ****
  		/* we don't bother trying to make the remaining fields valid */
  		sjinfo->lhs_strict = false;
  		sjinfo->delay_upper_joins = false;
! 		sjinfo->join_quals = NIL;
  	}
  
  	/*
--- 624,633 ----
  		/* we don't bother trying to make the remaining fields valid */
  		sjinfo->lhs_strict = false;
  		sjinfo->delay_upper_joins = false;
! 		sjinfo->semi_can_btree = false;
! 		sjinfo->semi_can_hash = false;
! 		sjinfo->semi_operators = NIL;
! 		sjinfo->semi_rhs_exprs = NIL;
  	}
  
  	/*
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c
index 49d776d..a7655e4 100644
*** a/src/backend/optimizer/plan/initsplan.c
--- b/src/backend/optimizer/plan/initsplan.c
***************
*** 17,22 ****
--- 17,23 ----
  #include "catalog/pg_type.h"
  #include "nodes/nodeFuncs.h"
  #include "optimizer/clauses.h"
+ #include "optimizer/cost.h"
  #include "optimizer/joininfo.h"
  #include "optimizer/pathnode.h"
  #include "optimizer/paths.h"
*************** static SpecialJoinInfo *make_outerjoinin
*** 55,60 ****
--- 56,62 ----
  				   Relids left_rels, Relids right_rels,
  				   Relids inner_join_rels,
  				   JoinType jointype, List *clause);
+ static void compute_semijoin_info(SpecialJoinInfo *sjinfo, List *clause);
  static void distribute_qual_to_rels(PlannerInfo *root, Node *clause,
  						bool is_deduced,
  						bool below_outer_join,
*************** make_outerjoininfo(PlannerInfo *root,
*** 1085,1091 ****
  	sjinfo->jointype = jointype;
  	/* this always starts out false */
  	sjinfo->delay_upper_joins = false;
! 	sjinfo->join_quals = clause;
  
  	/* If it's a full join, no need to be very smart */
  	if (jointype == JOIN_FULL)
--- 1087,1094 ----
  	sjinfo->jointype = jointype;
  	/* this always starts out false */
  	sjinfo->delay_upper_joins = false;
! 
! 	compute_semijoin_info(sjinfo, clause);
  
  	/* If it's a full join, no need to be very smart */
  	if (jointype == JOIN_FULL)
*************** make_outerjoininfo(PlannerInfo *root,
*** 1237,1242 ****
--- 1240,1421 ----
  	return sjinfo;
  }
  
+ /*
+  * compute_semijoin_info
+  *	  Fill semijoin-related fields of a new SpecialJoinInfo
+  *
+  * Note: this relies on only the jointype and syn_righthand fields of the
+  * SpecialJoinInfo; the rest may not be set yet.
+  */
+ static void
+ compute_semijoin_info(SpecialJoinInfo *sjinfo, List *clause)
+ {
+ 	List	   *semi_operators;
+ 	List	   *semi_rhs_exprs;
+ 	bool		all_btree;
+ 	bool		all_hash;
+ 	ListCell   *lc;
+ 
+ 	/* Initialize semijoin-related fields in case we can't unique-ify */
+ 	sjinfo->semi_can_btree = false;
+ 	sjinfo->semi_can_hash = false;
+ 	sjinfo->semi_operators = NIL;
+ 	sjinfo->semi_rhs_exprs = NIL;
+ 
+ 	/* Nothing more to do if it's not a semijoin */
+ 	if (sjinfo->jointype != JOIN_SEMI)
+ 		return;
+ 
+ 	/*
+ 	 * Look to see whether the semijoin's join quals consist of AND'ed
+ 	 * equality operators, with (only) RHS variables on only one side of each
+ 	 * one.  If so, we can figure out how to enforce uniqueness for the RHS.
+ 	 *
+ 	 * Note that the input clause list is the list of quals that are
+ 	 * *syntactically* associated with the semijoin, which in practice means
+ 	 * the synthesized comparison list for an IN or the WHERE of an EXISTS.
+ 	 * Particularly in the latter case, it might contain clauses that aren't
+ 	 * *semantically* associated with the join, but refer to just one side or
+ 	 * the other.  We can ignore such clauses here, as they will just drop
+ 	 * down to be processed within one side or the other.  (It is okay to
+ 	 * consider only the syntactically-associated clauses here because for a
+ 	 * semijoin, no higher-level quals could refer to the RHS, and so there
+ 	 * can be no other quals that are semantically associated with this join.
+ 	 * We do things this way because it is useful to have the set of potential
+ 	 * unique-ification expressions before we can extract the list of quals
+ 	 * that are actually semantically associated with the particular join.)
+ 	 *
+ 	 * Note that the semi_operators list consists of the joinqual operators
+ 	 * themselves (but commuted if needed to put the RHS value on the right).
+ 	 * These could be cross-type operators, in which case the operator
+ 	 * actually needed for uniqueness is a related single-type operator. We
+ 	 * assume here that that operator will be available from the btree or hash
+ 	 * opclass when the time comes ... if not, create_unique_plan() will fail.
+ 	 */
+ 	semi_operators = NIL;
+ 	semi_rhs_exprs = NIL;
+ 	all_btree = true;
+ 	all_hash = enable_hashagg;	/* don't consider hash if not enabled */
+ 	foreach(lc, clause)
+ 	{
+ 		OpExpr	   *op = (OpExpr *) lfirst(lc);
+ 		Oid			opno;
+ 		Node	   *left_expr;
+ 		Node	   *right_expr;
+ 		Relids		left_varnos;
+ 		Relids		right_varnos;
+ 		Relids		all_varnos;
+ 		Oid			opinputtype;
+ 
+ 		/* Is it a binary opclause? */
+ 		if (!IsA(op, OpExpr) ||
+ 			list_length(op->args) != 2)
+ 		{
+ 			/* No, but does it reference both sides? */
+ 			all_varnos = pull_varnos((Node *) op);
+ 			if (!bms_overlap(all_varnos, sjinfo->syn_righthand) ||
+ 				bms_is_subset(all_varnos, sjinfo->syn_righthand))
+ 			{
+ 				/*
+ 				 * Clause refers to only one rel, so ignore it --- unless it
+ 				 * contains volatile functions, in which case we'd better
+ 				 * punt.
+ 				 */
+ 				if (contain_volatile_functions((Node *) op))
+ 					return;
+ 				continue;
+ 			}
+ 			/* Non-operator clause referencing both sides, must punt */
+ 			return;
+ 		}
+ 
+ 		/* Extract data from binary opclause */
+ 		opno = op->opno;
+ 		left_expr = linitial(op->args);
+ 		right_expr = lsecond(op->args);
+ 		left_varnos = pull_varnos(left_expr);
+ 		right_varnos = pull_varnos(right_expr);
+ 		all_varnos = bms_union(left_varnos, right_varnos);
+ 		opinputtype = exprType(left_expr);
+ 
+ 		/* Does it reference both sides? */
+ 		if (!bms_overlap(all_varnos, sjinfo->syn_righthand) ||
+ 			bms_is_subset(all_varnos, sjinfo->syn_righthand))
+ 		{
+ 			/*
+ 			 * Clause refers to only one rel, so ignore it --- unless it
+ 			 * contains volatile functions, in which case we'd better punt.
+ 			 */
+ 			if (contain_volatile_functions((Node *) op))
+ 				return;
+ 			continue;
+ 		}
+ 
+ 		/* check rel membership of arguments */
+ 		if (!bms_is_empty(right_varnos) &&
+ 			bms_is_subset(right_varnos, sjinfo->syn_righthand) &&
+ 			!bms_overlap(left_varnos, sjinfo->syn_righthand))
+ 		{
+ 			/* typical case, right_expr is RHS variable */
+ 		}
+ 		else if (!bms_is_empty(left_varnos) &&
+ 				 bms_is_subset(left_varnos, sjinfo->syn_righthand) &&
+ 				 !bms_overlap(right_varnos, sjinfo->syn_righthand))
+ 		{
+ 			/* flipped case, left_expr is RHS variable */
+ 			opno = get_commutator(opno);
+ 			if (!OidIsValid(opno))
+ 				return;
+ 			right_expr = left_expr;
+ 		}
+ 		else
+ 		{
+ 			/* mixed membership of args, punt */
+ 			return;
+ 		}
+ 
+ 		/* all operators must be btree equality or hash equality */
+ 		if (all_btree)
+ 		{
+ 			/* oprcanmerge is considered a hint... */
+ 			if (!op_mergejoinable(opno, opinputtype) ||
+ 				get_mergejoin_opfamilies(opno) == NIL)
+ 				all_btree = false;
+ 		}
+ 		if (all_hash)
+ 		{
+ 			/* ... but oprcanhash had better be correct */
+ 			if (!op_hashjoinable(opno, opinputtype))
+ 				all_hash = false;
+ 		}
+ 		if (!(all_btree || all_hash))
+ 			return;
+ 
+ 		/* so far so good, keep building lists */
+ 		semi_operators = lappend_oid(semi_operators, opno);
+ 		semi_rhs_exprs = lappend(semi_rhs_exprs, copyObject(right_expr));
+ 	}
+ 
+ 	/* Punt if we didn't find at least one column to unique-ify */
+ 	if (semi_rhs_exprs == NIL)
+ 		return;
+ 
+ 	/*
+ 	 * The expressions we'd need to unique-ify mustn't be volatile.
+ 	 */
+ 	if (contain_volatile_functions((Node *) semi_rhs_exprs))
+ 		return;
+ 
+ 	/*
+ 	 * If we get here, we can unique-ify the semijoin's RHS using at least one
+ 	 * of sorting and hashing.  Save the information about how to do that.
+ 	 */
+ 	sjinfo->semi_can_btree = all_btree;
+ 	sjinfo->semi_can_hash = all_hash;
+ 	sjinfo->semi_operators = semi_operators;
+ 	sjinfo->semi_rhs_exprs = semi_rhs_exprs;
+ }
+ 
  
  /*****************************************************************************
   *
diff --git a/src/backend/optimizer/util/orclauses.c b/src/backend/optimizer/util/orclauses.c
index d1c4e99..f0acc14 100644
*** a/src/backend/optimizer/util/orclauses.c
--- b/src/backend/optimizer/util/orclauses.c
*************** consider_new_or_clause(PlannerInfo *root
*** 335,341 ****
  		/* we don't bother trying to make the remaining fields valid */
  		sjinfo.lhs_strict = false;
  		sjinfo.delay_upper_joins = false;
! 		sjinfo.join_quals = NIL;
  
  		/* Compute inner-join size */
  		orig_selec = clause_selectivity(root, (Node *) join_or_rinfo,
--- 335,344 ----
  		/* we don't bother trying to make the remaining fields valid */
  		sjinfo.lhs_strict = false;
  		sjinfo.delay_upper_joins = false;
! 		sjinfo.semi_can_btree = false;
! 		sjinfo.semi_can_hash = false;
! 		sjinfo.semi_operators = NIL;
! 		sjinfo.semi_rhs_exprs = NIL;
  
  		/* Compute inner-join size */
  		orig_selec = clause_selectivity(root, (Node *) join_or_rinfo,
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 1395a21..faca30b 100644
*** a/src/backend/optimizer/util/pathnode.c
--- b/src/backend/optimizer/util/pathnode.c
*************** create_unique_path(PlannerInfo *root, Re
*** 1088,1099 ****
  	Path		sort_path;		/* dummy for result of cost_sort */
  	Path		agg_path;		/* dummy for result of cost_agg */
  	MemoryContext oldcontext;
- 	List	   *in_operators;
- 	List	   *uniq_exprs;
- 	bool		all_btree;
- 	bool		all_hash;
  	int			numCols;
- 	ListCell   *lc;
  
  	/* Caller made a mistake if subpath isn't cheapest_total ... */
  	Assert(subpath == rel->cheapest_total_path);
--- 1088,1094 ----
*************** create_unique_path(PlannerInfo *root, Re
*** 1106,1113 ****
  	if (rel->cheapest_unique_path)
  		return (UniquePath *) rel->cheapest_unique_path;
  
! 	/* If we previously failed, return NULL quickly */
! 	if (sjinfo->join_quals == NIL)
  		return NULL;
  
  	/*
--- 1101,1108 ----
  	if (rel->cheapest_unique_path)
  		return (UniquePath *) rel->cheapest_unique_path;
  
! 	/* If it's not possible to unique-ify, return NULL */
! 	if (!(sjinfo->semi_can_btree || sjinfo->semi_can_hash))
  		return NULL;
  
  	/*
*************** create_unique_path(PlannerInfo *root, Re
*** 1116,1265 ****
  	 */
  	oldcontext = MemoryContextSwitchTo(root->planner_cxt);
  
- 	/*----------
- 	 * Look to see whether the semijoin's join quals consist of AND'ed
- 	 * equality operators, with (only) RHS variables on only one side of
- 	 * each one.  If so, we can figure out how to enforce uniqueness for
- 	 * the RHS.
- 	 *
- 	 * Note that the input join_quals list is the list of quals that are
- 	 * *syntactically* associated with the semijoin, which in practice means
- 	 * the synthesized comparison list for an IN or the WHERE of an EXISTS.
- 	 * Particularly in the latter case, it might contain clauses that aren't
- 	 * *semantically* associated with the join, but refer to just one side or
- 	 * the other.  We can ignore such clauses here, as they will just drop
- 	 * down to be processed within one side or the other.  (It is okay to
- 	 * consider only the syntactically-associated clauses here because for a
- 	 * semijoin, no higher-level quals could refer to the RHS, and so there
- 	 * can be no other quals that are semantically associated with this join.
- 	 * We do things this way because it is useful to be able to run this test
- 	 * before we have extracted the list of quals that are actually
- 	 * semantically associated with the particular join.)
- 	 *
- 	 * Note that the in_operators list consists of the joinqual operators
- 	 * themselves (but commuted if needed to put the RHS value on the right).
- 	 * These could be cross-type operators, in which case the operator
- 	 * actually needed for uniqueness is a related single-type operator.
- 	 * We assume here that that operator will be available from the btree
- 	 * or hash opclass when the time comes ... if not, create_unique_plan()
- 	 * will fail.
- 	 *----------
- 	 */
- 	in_operators = NIL;
- 	uniq_exprs = NIL;
- 	all_btree = true;
- 	all_hash = enable_hashagg;	/* don't consider hash if not enabled */
- 	foreach(lc, sjinfo->join_quals)
- 	{
- 		OpExpr	   *op = (OpExpr *) lfirst(lc);
- 		Oid			opno;
- 		Node	   *left_expr;
- 		Node	   *right_expr;
- 		Relids		left_varnos;
- 		Relids		right_varnos;
- 		Relids		all_varnos;
- 		Oid			opinputtype;
- 
- 		/* Is it a binary opclause? */
- 		if (!IsA(op, OpExpr) ||
- 			list_length(op->args) != 2)
- 		{
- 			/* No, but does it reference both sides? */
- 			all_varnos = pull_varnos((Node *) op);
- 			if (!bms_overlap(all_varnos, sjinfo->syn_righthand) ||
- 				bms_is_subset(all_varnos, sjinfo->syn_righthand))
- 			{
- 				/*
- 				 * Clause refers to only one rel, so ignore it --- unless it
- 				 * contains volatile functions, in which case we'd better
- 				 * punt.
- 				 */
- 				if (contain_volatile_functions((Node *) op))
- 					goto no_unique_path;
- 				continue;
- 			}
- 			/* Non-operator clause referencing both sides, must punt */
- 			goto no_unique_path;
- 		}
- 
- 		/* Extract data from binary opclause */
- 		opno = op->opno;
- 		left_expr = linitial(op->args);
- 		right_expr = lsecond(op->args);
- 		left_varnos = pull_varnos(left_expr);
- 		right_varnos = pull_varnos(right_expr);
- 		all_varnos = bms_union(left_varnos, right_varnos);
- 		opinputtype = exprType(left_expr);
- 
- 		/* Does it reference both sides? */
- 		if (!bms_overlap(all_varnos, sjinfo->syn_righthand) ||
- 			bms_is_subset(all_varnos, sjinfo->syn_righthand))
- 		{
- 			/*
- 			 * Clause refers to only one rel, so ignore it --- unless it
- 			 * contains volatile functions, in which case we'd better punt.
- 			 */
- 			if (contain_volatile_functions((Node *) op))
- 				goto no_unique_path;
- 			continue;
- 		}
- 
- 		/* check rel membership of arguments */
- 		if (!bms_is_empty(right_varnos) &&
- 			bms_is_subset(right_varnos, sjinfo->syn_righthand) &&
- 			!bms_overlap(left_varnos, sjinfo->syn_righthand))
- 		{
- 			/* typical case, right_expr is RHS variable */
- 		}
- 		else if (!bms_is_empty(left_varnos) &&
- 				 bms_is_subset(left_varnos, sjinfo->syn_righthand) &&
- 				 !bms_overlap(right_varnos, sjinfo->syn_righthand))
- 		{
- 			/* flipped case, left_expr is RHS variable */
- 			opno = get_commutator(opno);
- 			if (!OidIsValid(opno))
- 				goto no_unique_path;
- 			right_expr = left_expr;
- 		}
- 		else
- 			goto no_unique_path;
- 
- 		/* all operators must be btree equality or hash equality */
- 		if (all_btree)
- 		{
- 			/* oprcanmerge is considered a hint... */
- 			if (!op_mergejoinable(opno, opinputtype) ||
- 				get_mergejoin_opfamilies(opno) == NIL)
- 				all_btree = false;
- 		}
- 		if (all_hash)
- 		{
- 			/* ... but oprcanhash had better be correct */
- 			if (!op_hashjoinable(opno, opinputtype))
- 				all_hash = false;
- 		}
- 		if (!(all_btree || all_hash))
- 			goto no_unique_path;
- 
- 		/* so far so good, keep building lists */
- 		in_operators = lappend_oid(in_operators, opno);
- 		uniq_exprs = lappend(uniq_exprs, copyObject(right_expr));
- 	}
- 
- 	/* Punt if we didn't find at least one column to unique-ify */
- 	if (uniq_exprs == NIL)
- 		goto no_unique_path;
- 
- 	/*
- 	 * The expressions we'd need to unique-ify mustn't be volatile.
- 	 */
- 	if (contain_volatile_functions((Node *) uniq_exprs))
- 		goto no_unique_path;
- 
- 	/*
- 	 * If we get here, we can unique-ify using at least one of sorting and
- 	 * hashing.  Start building the result Path object.
- 	 */
  	pathnode = makeNode(UniquePath);
  
  	pathnode->path.pathtype = T_Unique;
--- 1111,1116 ----
*************** create_unique_path(PlannerInfo *root, Re
*** 1273,1290 ****
  	pathnode->path.pathkeys = NIL;
  
  	pathnode->subpath = subpath;
! 	pathnode->in_operators = in_operators;
! 	pathnode->uniq_exprs = uniq_exprs;
  
  	/*
  	 * If the input is a relation and it has a unique index that proves the
! 	 * uniq_exprs are unique, then we don't need to do anything.  Note that
! 	 * relation_has_unique_index_for automatically considers restriction
  	 * clauses for the rel, as well.
  	 */
! 	if (rel->rtekind == RTE_RELATION && all_btree &&
  		relation_has_unique_index_for(root, rel, NIL,
! 									  uniq_exprs, in_operators))
  	{
  		pathnode->umethod = UNIQUE_PATH_NOOP;
  		pathnode->path.rows = rel->rows;
--- 1124,1142 ----
  	pathnode->path.pathkeys = NIL;
  
  	pathnode->subpath = subpath;
! 	pathnode->in_operators = sjinfo->semi_operators;
! 	pathnode->uniq_exprs = sjinfo->semi_rhs_exprs;
  
  	/*
  	 * If the input is a relation and it has a unique index that proves the
! 	 * semi_rhs_exprs are unique, then we don't need to do anything.  Note
! 	 * that relation_has_unique_index_for automatically considers restriction
  	 * clauses for the rel, as well.
  	 */
! 	if (rel->rtekind == RTE_RELATION && sjinfo->semi_can_btree &&
  		relation_has_unique_index_for(root, rel, NIL,
! 									  sjinfo->semi_rhs_exprs,
! 									  sjinfo->semi_operators))
  	{
  		pathnode->umethod = UNIQUE_PATH_NOOP;
  		pathnode->path.rows = rel->rows;
*************** create_unique_path(PlannerInfo *root, Re
*** 1304,1310 ****
  	 * don't need to do anything.  The test for uniqueness has to consider
  	 * exactly which columns we are extracting; for example "SELECT DISTINCT
  	 * x,y" doesn't guarantee that x alone is distinct. So we cannot check for
! 	 * this optimization unless uniq_exprs consists only of simple Vars
  	 * referencing subquery outputs.  (Possibly we could do something with
  	 * expressions in the subquery outputs, too, but for now keep it simple.)
  	 */
--- 1156,1162 ----
  	 * don't need to do anything.  The test for uniqueness has to consider
  	 * exactly which columns we are extracting; for example "SELECT DISTINCT
  	 * x,y" doesn't guarantee that x alone is distinct. So we cannot check for
! 	 * this optimization unless semi_rhs_exprs consists only of simple Vars
  	 * referencing subquery outputs.  (Possibly we could do something with
  	 * expressions in the subquery outputs, too, but for now keep it simple.)
  	 */
*************** create_unique_path(PlannerInfo *root, Re
*** 1316,1326 ****
  		{
  			List	   *sub_tlist_colnos;
  
! 			sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid);
  
  			if (sub_tlist_colnos &&
  				query_is_distinct_for(rte->subquery,
! 									  sub_tlist_colnos, in_operators))
  			{
  				pathnode->umethod = UNIQUE_PATH_NOOP;
  				pathnode->path.rows = rel->rows;
--- 1168,1180 ----
  		{
  			List	   *sub_tlist_colnos;
  
! 			sub_tlist_colnos = translate_sub_tlist(sjinfo->semi_rhs_exprs,
! 												   rel->relid);
  
  			if (sub_tlist_colnos &&
  				query_is_distinct_for(rte->subquery,
! 									  sub_tlist_colnos,
! 									  sjinfo->semi_operators))
  			{
  				pathnode->umethod = UNIQUE_PATH_NOOP;
  				pathnode->path.rows = rel->rows;
*************** create_unique_path(PlannerInfo *root, Re
*** 1338,1347 ****
  	}
  
  	/* Estimate number of output rows */
! 	pathnode->path.rows = estimate_num_groups(root, uniq_exprs, rel->rows);
! 	numCols = list_length(uniq_exprs);
  
! 	if (all_btree)
  	{
  		/*
  		 * Estimate cost for sort+unique implementation
--- 1192,1203 ----
  	}
  
  	/* Estimate number of output rows */
! 	pathnode->path.rows = estimate_num_groups(root,
! 											  sjinfo->semi_rhs_exprs,
! 											  rel->rows);
! 	numCols = list_length(sjinfo->semi_rhs_exprs);
  
! 	if (sjinfo->semi_can_btree)
  	{
  		/*
  		 * Estimate cost for sort+unique implementation
*************** create_unique_path(PlannerInfo *root, Re
*** 1363,1369 ****
  		sort_path.total_cost += cpu_operator_cost * rel->rows * numCols;
  	}
  
! 	if (all_hash)
  	{
  		/*
  		 * Estimate the overhead per hashtable entry at 64 bytes (same as in
--- 1219,1225 ----
  		sort_path.total_cost += cpu_operator_cost * rel->rows * numCols;
  	}
  
! 	if (sjinfo->semi_can_hash)
  	{
  		/*
  		 * Estimate the overhead per hashtable entry at 64 bytes (same as in
*************** create_unique_path(PlannerInfo *root, Re
*** 1372,1378 ****
  		int			hashentrysize = rel->width + 64;
  
  		if (hashentrysize * pathnode->path.rows > work_mem * 1024L)
! 			all_hash = false;	/* don't try to hash */
  		else
  			cost_agg(&agg_path, root,
  					 AGG_HASHED, NULL,
--- 1228,1240 ----
  		int			hashentrysize = rel->width + 64;
  
  		if (hashentrysize * pathnode->path.rows > work_mem * 1024L)
! 		{
! 			/*
! 			 * We should not try to hash.  Hack the SpecialJoinInfo to
! 			 * remember this, in case we come through here again.
! 			 */
! 			sjinfo->semi_can_hash = false;
! 		}
  		else
  			cost_agg(&agg_path, root,
  					 AGG_HASHED, NULL,
*************** create_unique_path(PlannerInfo *root, Re
*** 1382,1400 ****
  					 rel->rows);
  	}
  
! 	if (all_btree && all_hash)
  	{
  		if (agg_path.total_cost < sort_path.total_cost)
  			pathnode->umethod = UNIQUE_PATH_HASH;
  		else
  			pathnode->umethod = UNIQUE_PATH_SORT;
  	}
! 	else if (all_btree)
  		pathnode->umethod = UNIQUE_PATH_SORT;
! 	else if (all_hash)
  		pathnode->umethod = UNIQUE_PATH_HASH;
  	else
! 		goto no_unique_path;
  
  	if (pathnode->umethod == UNIQUE_PATH_HASH)
  	{
--- 1244,1266 ----
  					 rel->rows);
  	}
  
! 	if (sjinfo->semi_can_btree && sjinfo->semi_can_hash)
  	{
  		if (agg_path.total_cost < sort_path.total_cost)
  			pathnode->umethod = UNIQUE_PATH_HASH;
  		else
  			pathnode->umethod = UNIQUE_PATH_SORT;
  	}
! 	else if (sjinfo->semi_can_btree)
  		pathnode->umethod = UNIQUE_PATH_SORT;
! 	else if (sjinfo->semi_can_hash)
  		pathnode->umethod = UNIQUE_PATH_HASH;
  	else
! 	{
! 		/* we can get here only if we abandoned hashing above */
! 		MemoryContextSwitchTo(oldcontext);
! 		return NULL;
! 	}
  
  	if (pathnode->umethod == UNIQUE_PATH_HASH)
  	{
*************** create_unique_path(PlannerInfo *root, Re
*** 1412,1426 ****
  	MemoryContextSwitchTo(oldcontext);
  
  	return pathnode;
- 
- no_unique_path:			/* failure exit */
- 
- 	/* Mark the SpecialJoinInfo as not unique-able */
- 	sjinfo->join_quals = NIL;
- 
- 	MemoryContextSwitchTo(oldcontext);
- 
- 	return NULL;
  }
  
  /*
--- 1278,1283 ----
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 6845a40..e6dd4e8 100644
*** a/src/include/nodes/relation.h
--- b/src/include/nodes/relation.h
*************** typedef struct PlannerGlobal
*** 101,108 ****
  
  	bool		transientPlan;	/* redo plan when TransactionXmin changes? */
  
! 	bool		hasRowSecurity;	/* row security applied? */
! 
  } PlannerGlobal;
  
  /* macro for fetching the Plan associated with a SubPlan node */
--- 101,107 ----
  
  	bool		transientPlan;	/* redo plan when TransactionXmin changes? */
  
! 	bool		hasRowSecurity; /* row security applied? */
  } PlannerGlobal;
  
  /* macro for fetching the Plan associated with a SubPlan node */
*************** typedef struct PlaceHolderVar
*** 1374,1384 ****
   * commute with this join, because that would leave noplace to check the
   * pushed-down clause.  (We don't track this for FULL JOINs, either.)
   *
!  * join_quals is an implicit-AND list of the quals syntactically associated
!  * with the join (they may or may not end up being applied at the join level).
!  * This is just a side list and does not drive actual application of quals.
!  * For JOIN_SEMI joins, this is cleared to NIL in create_unique_path() if
!  * the join is found not to be suitable for a uniqueify-the-RHS plan.
   *
   * jointype is never JOIN_RIGHT; a RIGHT JOIN is handled by switching
   * the inputs to make it a LEFT JOIN.  So the allowed values of jointype
--- 1373,1385 ----
   * commute with this join, because that would leave noplace to check the
   * pushed-down clause.  (We don't track this for FULL JOINs, either.)
   *
!  * For a semijoin, we also extract the join operators and their RHS arguments
!  * and set semi_operators, semi_rhs_exprs, semi_can_btree, and semi_can_hash.
!  * This is done in support of possibly unique-ifying the RHS, so we don't
!  * bother unless at least one of semi_can_btree and semi_can_hash can be set
!  * true.  (You might expect that this information would be computed during
!  * join planning; but it's helpful to have it available during planning of
!  * parameterized table scans, so we store it in the SpecialJoinInfo structs.)
   *
   * jointype is never JOIN_RIGHT; a RIGHT JOIN is handled by switching
   * the inputs to make it a LEFT JOIN.  So the allowed values of jointype
*************** typedef struct PlaceHolderVar
*** 1391,1397 ****
   * SpecialJoinInfos with jointype == JOIN_INNER for outer joins, since for
   * cost estimation purposes it is sometimes useful to know the join size under
   * plain innerjoin semantics.  Note that lhs_strict, delay_upper_joins, and
!  * join_quals are not set meaningfully within such structs.
   */
  
  typedef struct SpecialJoinInfo
--- 1392,1398 ----
   * SpecialJoinInfos with jointype == JOIN_INNER for outer joins, since for
   * cost estimation purposes it is sometimes useful to know the join size under
   * plain innerjoin semantics.  Note that lhs_strict, delay_upper_joins, and
!  * of course the semi_xxx fields are not set meaningfully within such structs.
   */
  
  typedef struct SpecialJoinInfo
*************** typedef struct SpecialJoinInfo
*** 1404,1410 ****
  	JoinType	jointype;		/* always INNER, LEFT, FULL, SEMI, or ANTI */
  	bool		lhs_strict;		/* joinclause is strict for some LHS rel */
  	bool		delay_upper_joins;		/* can't commute with upper RHS */
! 	List	   *join_quals;		/* join quals, in implicit-AND list format */
  } SpecialJoinInfo;
  
  /*
--- 1405,1415 ----
  	JoinType	jointype;		/* always INNER, LEFT, FULL, SEMI, or ANTI */
  	bool		lhs_strict;		/* joinclause is strict for some LHS rel */
  	bool		delay_upper_joins;		/* can't commute with upper RHS */
! 	/* Remaining fields are set only for JOIN_SEMI jointype: */
! 	bool		semi_can_btree; /* true if semi_operators are all btree */
! 	bool		semi_can_hash;	/* true if semi_operators are all hash */
! 	List	   *semi_operators; /* OIDs of equality join operators */
! 	List	   *semi_rhs_exprs; /* righthand-side expressions of these ops */
  } SpecialJoinInfo;
  
  /*