Alvaro Herrera wrote:
> I tried patch 0002 today and again there are conflicts, so I rebased and
> fixed the merge problems.

... and attached the patch.

-- 
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 2c2da2a..b5c4129 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -296,6 +296,11 @@
      </row>
 
      <row>
+      <entry><link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link></entry>
+      <entry>extended planner statistics</entry>
+     </row>
+
+     <row>
       <entry><link linkend="catalog-pg-subscription"><structname>pg_subscription</structname></link></entry>
       <entry>logical replication subscriptions</entry>
      </row>
@@ -4223,6 +4228,98 @@
   </table>
  </sect1>
 
+ <sect1 id="catalog-pg-statistic-ext">
+  <title><structname>pg_statistic_ext</structname></title>
+
+  <indexterm zone="catalog-pg-statistic-ext">
+   <primary>pg_statistic_ext</primary>
+  </indexterm>
+
+  <para>
+   The catalog <structname>pg_statistic_ext</structname>
+   holds extended planner statistics.
+  </para>
+
+  <table>
+   <title><structname>pg_statistic_ext</> Columns</title>
+
+   <tgroup cols="4">
+    <thead>
+     <row>
+      <entry>Name</entry>
+      <entry>Type</entry>
+      <entry>References</entry>
+      <entry>Description</entry>
+     </row>
+    </thead>
+
+    <tbody>
+
+     <row>
+      <entry><structfield>starelid</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
+      <entry>The table that the described columns belong to</entry>
+     </row>
+
+     <row>
+      <entry><structfield>staname</structfield></entry>
+      <entry><type>name</type></entry>
+      <entry></entry>
+      <entry>Name of the statistic</entry>
+     </row>
+
+     <row>
+      <entry><structfield>stanamespace</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry>
+      <entry>
+       The OID of the namespace that contains this statistic
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>staowner</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
+      <entry>Owner of the statistic</entry>
+     </row>
+
+     <row>
+      <entry><structfield>staenabled</structfield></entry>
+      <entry><type>char[]</type></entry>
+      <entry></entry>
+      <entry>
+        An array with the modes of the enabled statistic types, encoded as
+        <literal>d</literal> for ndistinct coefficients.
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>stakeys</structfield></entry>
+      <entry><type>int2vector</type></entry>
+      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
+      <entry>
+       This is an array of values that indicate which table columns this
+       statistic covers. For example, a value of <literal>1 3</literal> would
+       mean that the first and the third table columns make up the statistic key.
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>standistinct</structfield></entry>
+      <entry><type>pg_ndistinct</type></entry>
+      <entry></entry>
+      <entry>
+       Ndistinct coefficients, serialized as the <structname>pg_ndistinct</> type.
+      </entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+ </sect1>
+
  <sect1 id="catalog-pg-namespace">
   <title><structname>pg_namespace</structname></title>
 
diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml
index b73c66b..76955e5 100644
--- a/doc/src/sgml/planstats.sgml
+++ b/doc/src/sgml/planstats.sgml
@@ -448,4 +448,145 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
 
  </sect1>
 
+ <sect1 id="extended-statistics">
+  <title>Extended Statistics</title>
+
+  <indexterm zone="extended-statistics">
+   <primary>extended statistics</primary>
+   <secondary>planner</secondary>
+  </indexterm>
+
+  <para>
+   The examples presented in <xref linkend="row-estimation-examples"> used
+   statistics about individual columns to compute selectivity estimates.
+   When estimating conditions on multiple columns, the planner assumes
+   independence of the conditions and multiplies the selectivities. When the
+   columns are correlated, the independence assumption is violated, and the
+   estimates may be off by several orders of magnitude, resulting in poor
+   plan choices.
+  </para>
+
+  <para>
+   The examples presented below demonstrate such estimation errors on simple
+   data sets, and also how to resolve them by creating extended statistics
+   using the <command>CREATE STATISTICS</> command.
+  </para>
+
+  <para>
+   Let's start with a very simple data set - a table with two columns,
+   containing exactly the same values:
+
+<programlisting>
+CREATE TABLE t (a INT, b INT);
+INSERT INTO t SELECT i % 100, i % 100 FROM generate_series(1, 10000) s(i);
+ANALYZE t;
+</programlisting>
+
+   As explained in <xref linkend="planner-stats">, the planner can determine
+   the cardinality of <structname>t</structname> using the number of pages and
+   rows, which is looked up in <structname>pg_class</structname>:
+
+<programlisting>
+SELECT relpages, reltuples FROM pg_class WHERE relname = 't';
+
+ relpages | reltuples
+----------+-----------
+       45 |     10000
+</programlisting>
+
+   The data distribution is very simple - there are only 100 distinct values
+   in each column, uniformly distributed.
+  </para>
+
+  <para>
+   The following example shows the result of estimating a <literal>WHERE</>
+   condition on the <structfield>a</> column:
+
+<programlisting>
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1;
+                                           QUERY PLAN
+-------------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..170.00 rows=100 width=8) (actual time=0.031..2.870 rows=100 loops=1)
+   Filter: (a = 1)
+   Rows Removed by Filter: 9900
+ Planning time: 0.092 ms
+ Execution time: 3.103 ms
+(5 rows)
+</programlisting>
+
+   The planner examines the condition and computes the estimate using
+   <function>eqsel</>, the selectivity function for <literal>=</>, and
+   statistics stored in the <structname>pg_stats</> view. In this case
+   the planner estimates that the condition matches 1% of the rows, and by
+   comparing the estimated and actual number of rows, we see that the estimate
+   is very accurate (in fact exact, as the table is very small).
+  </para>
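+
+  <para>
+   The per-column statistics that <function>eqsel</> works from can be
+   inspected through the <structname>pg_stats</> view, for example (the
+   actual values depend on the sample collected by <command>ANALYZE</>):
+
+<programlisting>
+SELECT null_frac, n_distinct, most_common_vals
+  FROM pg_stats
+ WHERE tablename = 't' AND attname = 'a';
+</programlisting>
+  </para>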
+
+  <para>
+   Adding a condition on the second column results in the following plan:
+
+<programlisting>
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;
+                                          QUERY PLAN
+-----------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=1 width=8) (actual time=0.033..3.006 rows=100 loops=1)
+   Filter: ((a = 1) AND (b = 1))
+   Rows Removed by Filter: 9900
+ Planning time: 0.121 ms
+ Execution time: 3.220 ms
+(5 rows)
+</programlisting>
+
+   The planner estimates the selectivity for each condition individually,
+   arriving at the same 1% estimate as above, and then multiplies them,
+   getting the final 0.01% estimate. The plan, however, shows that this
+   results in a significant underestimate, as the actual number of rows
+   matching the conditions is two orders of magnitude higher than estimated.
+  </para>
+
+  <para>
+   Overestimates, i.e. errors in the opposite direction, are also possible.
+   Consider for example the following combination of range conditions, each
+   matching about half of the table rows on its own:
+
+<programlisting>
+EXPLAIN ANALYZE SELECT * FROM t WHERE a <= 49 AND b > 49;
+                                           QUERY PLAN
+------------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=2500 width=8) (actual time=1.607..1.607 rows=0 loops=1)
+   Filter: ((a <= 49) AND (b > 49))
+   Rows Removed by Filter: 10000
+ Planning time: 0.050 ms
+ Execution time: 1.623 ms
+(5 rows)
+</programlisting>
+
+   The planner examines both <literal>WHERE</> clauses and estimates them
+   using the <function>scalarltsel</> and <function>scalargtsel</> functions,
+   specified as the selectivity functions matching the <literal>&lt;=</> and
+   <literal>&gt;</literal> operators. Both conditions match 50% of the
+   table, and assuming independence, the planner multiplies them to compute
+   the total estimate of 25%. However, as the <command>EXPLAIN</> output shows, the actual
+   number of rows is 0, because the columns are correlated and the conditions
+   contradict each other.
+  </para>
+
+  <para>
+   Both estimation errors are caused by violation of the independence
+   assumption, as the two columns contain exactly the same values, and are
+   therefore perfectly correlated. Providing additional information about
+   correlation between columns is the purpose of extended statistics,
+   and the rest of this section explains in more detail how the planner
+   leverages them to improve estimates.
+  </para>
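+
+  <para>
+   For the table used in these examples, extended statistics covering both
+   columns could be defined with the <command>CREATE STATISTICS</> command
+   (the statistics name <literal>stts</> below is chosen arbitrarily) and
+   collected by a subsequent <command>ANALYZE</>:
+
+<programlisting>
+CREATE STATISTICS stts ON (a, b) FROM t;
+ANALYZE t;
+</programlisting>
+  </para>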
+
+  <para>
+   For additional details about extended statistics, see
+   <filename>src/backend/statistics/README</>. There are further
+   <literal>README</> files for each type of statistics, mentioned in the
+   following sections.
+  </para>
+
+ </sect1>
+
 </chapter>
diff --git a/doc/src/sgml/ref/allfiles.sgml b/doc/src/sgml/ref/allfiles.sgml
index 2bc4d9f..255e800 100644
--- a/doc/src/sgml/ref/allfiles.sgml
+++ b/doc/src/sgml/ref/allfiles.sgml
@@ -34,6 +34,7 @@ Complete list of usable sgml source files in this directory.
 <!ENTITY alterSequence      SYSTEM "alter_sequence.sgml">
 <!ENTITY alterSubscription  SYSTEM "alter_subscription.sgml">
 <!ENTITY alterSystem        SYSTEM "alter_system.sgml">
+<!ENTITY alterStatistics    SYSTEM "alter_statistics.sgml">
 <!ENTITY alterTable         SYSTEM "alter_table.sgml">
 <!ENTITY alterTableSpace    SYSTEM "alter_tablespace.sgml">
 <!ENTITY alterTSConfig      SYSTEM "alter_tsconfig.sgml">
@@ -80,6 +81,7 @@ Complete list of usable sgml source files in this directory.
 <!ENTITY createSchema       SYSTEM "create_schema.sgml">
 <!ENTITY createSequence     SYSTEM "create_sequence.sgml">
 <!ENTITY createServer       SYSTEM "create_server.sgml">
+<!ENTITY createStatistics   SYSTEM "create_statistics.sgml">
 <!ENTITY createSubscription SYSTEM "create_subscription.sgml">
 <!ENTITY createTable        SYSTEM "create_table.sgml">
 <!ENTITY createTableAs      SYSTEM "create_table_as.sgml">
@@ -126,6 +128,7 @@ Complete list of usable sgml source files in this directory.
 <!ENTITY dropSchema         SYSTEM "drop_schema.sgml">
 <!ENTITY dropSequence       SYSTEM "drop_sequence.sgml">
 <!ENTITY dropServer         SYSTEM "drop_server.sgml">
+<!ENTITY dropStatistics     SYSTEM "drop_statistics.sgml">
 <!ENTITY dropSubscription   SYSTEM "drop_subscription.sgml">
 <!ENTITY dropTable          SYSTEM "drop_table.sgml">
 <!ENTITY dropTableSpace     SYSTEM "drop_tablespace.sgml">
diff --git a/doc/src/sgml/ref/alter_statistics.sgml b/doc/src/sgml/ref/alter_statistics.sgml
new file mode 100644
index 0000000..35cbc09
--- /dev/null
+++ b/doc/src/sgml/ref/alter_statistics.sgml
@@ -0,0 +1,115 @@
+<!--
+doc/src/sgml/ref/alter_statistics.sgml
+PostgreSQL documentation
+-->
+
+<refentry id="SQL-ALTERSTATISTICS">
+ <indexterm zone="sql-alterstatistics">
+  <primary>ALTER STATISTICS</primary>
+ </indexterm>
+
+ <refmeta>
+  <refentrytitle>ALTER STATISTICS</refentrytitle>
+  <manvolnum>7</manvolnum>
+  <refmiscinfo>SQL - Language Statements</refmiscinfo>
+ </refmeta>
+
+ <refnamediv>
+  <refname>ALTER STATISTICS</refname>
+  <refpurpose>
+   change the definition of an extended statistics
+  </refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+<synopsis>
+ALTER STATISTICS <replaceable class="parameter">name</replaceable> OWNER TO { <replaceable class="PARAMETER">new_owner</replaceable> | CURRENT_USER | SESSION_USER }
+ALTER STATISTICS <replaceable class="parameter">name</replaceable> RENAME TO <replaceable class="parameter">new_name</replaceable>
+ALTER STATISTICS <replaceable class="parameter">name</replaceable> SET SCHEMA <replaceable class="parameter">new_schema</replaceable>
+</synopsis>
+ </refsynopsisdiv>
+
+ <refsect1>
+  <title>Description</title>
+
+  <para>
+   <command>ALTER STATISTICS</command> changes the parameters of an existing
+   extended statistics.  Any parameters not specifically set in the
+   <command>ALTER STATISTICS</command> command retain their prior settings.
+  </para>
+
+  <para>
+   You must own the statistics to use <command>ALTER STATISTICS</>.
+   To change a statistics' schema, you must also have <literal>CREATE</>
+   privilege on the new schema.
+   To alter the owner, you must also be a direct or indirect member of the new
+   owning role, and that role must have <literal>CREATE</literal> privilege on
+   the statistics' schema.  (These restrictions enforce that altering the owner
+   doesn't do anything you couldn't do by dropping and recreating the statistics.
+   However, a superuser can alter ownership of any statistics anyway.)
+  </para>
+ </refsect1>
+
+ <refsect1>
+  <title>Parameters</title>
+
+   <para>
+    <variablelist>
+     <varlistentry>
+      <term><replaceable class="parameter">name</replaceable></term>
+      <listitem>
+       <para>
+        The name (optionally schema-qualified) of the statistics to be altered.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><replaceable class="PARAMETER">new_owner</replaceable></term>
+      <listitem>
+       <para>
+        The user name of the new owner of the statistics.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><replaceable class="parameter">new_name</replaceable></term>
+      <listitem>
+       <para>
+        The new name for the statistics.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><replaceable class="parameter">new_schema</replaceable></term>
+      <listitem>
+       <para>
+        The new schema for the statistics.
+       </para>
+      </listitem>
+     </varlistentry>
+
+    </variablelist>
+   </para>
+  </refsect1>
+
+ <refsect1>
+  <title>Compatibility</title>
+
+  <para>
+   There is no <command>ALTER STATISTICS</command> command in the SQL standard.
+  </para>
+ </refsect1>
+
+ <refsect1>
+  <title>See Also</title>
+
+  <simplelist type="inline">
+   <member><xref linkend="sql-createstatistics"></member>
+   <member><xref linkend="sql-dropstatistics"></member>
+  </simplelist>
+ </refsect1>
+
+</refentry>
diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml
index 077c003..f3ad5ed 100644
--- a/doc/src/sgml/ref/alter_table.sgml
+++ b/doc/src/sgml/ref/alter_table.sgml
@@ -119,9 +119,12 @@ ALTER TABLE [ IF EXISTS ] <replaceable class="PARAMETER">name</replaceable>
      <para>
       This form drops a column from a table.  Indexes and
       table constraints involving the column will be automatically
-      dropped as well.  You will need to say <literal>CASCADE</> if
-      anything outside the table depends on the column, for example,
-      foreign key references or views.
+      dropped as well.
+      Multivariate statistics referencing the dropped column will also be
+      removed if the removal of the column would cause the statistics to
+      contain data for only a single column.
+      You will need to say <literal>CASCADE</> if anything outside the table
+      depends on the column, for example, foreign key references or views.
       If <literal>IF EXISTS</literal> is specified and the column
       does not exist, no error is thrown. In this case a notice
       is issued instead.
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml
new file mode 100644
index 0000000..5919a25
--- /dev/null
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -0,0 +1,152 @@
+<!--
+doc/src/sgml/ref/create_statistics.sgml
+PostgreSQL documentation
+-->
+
+<refentry id="SQL-CREATESTATISTICS">
+ <indexterm zone="sql-createstatistics">
+  <primary>CREATE STATISTICS</primary>
+ </indexterm>
+
+ <refmeta>
+  <refentrytitle>CREATE STATISTICS</refentrytitle>
+  <manvolnum>7</manvolnum>
+  <refmiscinfo>SQL - Language Statements</refmiscinfo>
+ </refmeta>
+
+ <refnamediv>
+  <refname>CREATE STATISTICS</refname>
+  <refpurpose>define extended statistics</refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+<synopsis>
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable> ON (
+  <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
+  FROM <replaceable class="PARAMETER">table_name</replaceable>
+</synopsis>
+
+ </refsynopsisdiv>
+
+ <refsect1 id="SQL-CREATESTATISTICS-description">
+  <title>Description</title>
+
+  <para>
+   <command>CREATE STATISTICS</command> will create new extended statistics
+   on the table. The statistics will be created in the current database and
+   will be owned by the user issuing the command.
+  </para>
+
+  <para>
+   If a schema name is given (for example, <literal>CREATE STATISTICS
+   myschema.mystat ...</>) then the statistics is created in the specified
+   schema.  Otherwise it is created in the current schema.  The name of
+   the statistics must be distinct from the name of any other statistics in
+   the same schema.
+  </para>
+
+  <para>
+   You must be the owner of the table to create statistics on it.
+  </para>
+ </refsect1>
+
+ <refsect1>
+  <title>Parameters</title>
+
+  <variablelist>
+
+   <varlistentry>
+    <term><literal>IF NOT EXISTS</></term>
+    <listitem>
+     <para>
+      Do not throw an error if a statistics with the same name already exists.
+      A notice is issued in this case.  Note that there is no guarantee that
+      the existing statistics is anything like the one that would have been
+      created.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><replaceable class="PARAMETER">statistics_name</replaceable></term>
+    <listitem>
+     <para>
+      The name (optionally schema-qualified) of the statistics to be created.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><replaceable class="PARAMETER">table_name</replaceable></term>
+    <listitem>
+     <para>
+      The name (optionally schema-qualified) of the table the statistics should
+      be created on.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><replaceable class="PARAMETER">column_name</replaceable></term>
+    <listitem>
+     <para>
+      The name of a column to be included in the statistics.
+     </para>
+    </listitem>
+   </varlistentry>
+
+  </variablelist>
+
+ </refsect1>
+
+ <refsect1 id="SQL-CREATESTATISTICS-examples">
+  <title>Examples</title>
+
+  <para>
+   Create table <structname>t1</> with two functionally dependent columns, i.e.
+   knowledge of a value in the first column is sufficient for determining the
+   value in the other column. Then functional dependencies are built on those
+   columns:
+
+<programlisting>
+CREATE TABLE t1 (
+    a   int,
+    b   int
+);
+
+INSERT INTO t1 SELECT i/100, i/500
+                 FROM generate_series(1,1000000) s(i);
+
+CREATE STATISTICS s1 ON (a, b) FROM t1;
+
+ANALYZE t1;
+
+-- valid combination of values
+EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
+
+-- invalid combination of values
+EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 1);
+</programlisting>
+  </para>
+
+ </refsect1>
+
+ <refsect1>
+  <title>Compatibility</title>
+
+  <para>
+   There is no <command>CREATE STATISTICS</command> command in the SQL standard.
+  </para>
+ </refsect1>
+
+ <refsect1>
+  <title>See Also</title>
+
+  <simplelist type="inline">
+   <member><xref linkend="sql-alterstatistics"></member>
+   <member><xref linkend="sql-dropstatistics"></member>
+  </simplelist>
+ </refsect1>
+</refentry>
diff --git a/doc/src/sgml/ref/drop_statistics.sgml b/doc/src/sgml/ref/drop_statistics.sgml
new file mode 100644
index 0000000..d7c657f
--- /dev/null
+++ b/doc/src/sgml/ref/drop_statistics.sgml
@@ -0,0 +1,91 @@
+<!--
+doc/src/sgml/ref/drop_statistics.sgml
+PostgreSQL documentation
+-->
+
+<refentry id="SQL-DROPSTATISTICS">
+ <indexterm zone="sql-dropstatistics">
+  <primary>DROP STATISTICS</primary>
+ </indexterm>
+
+ <refmeta>
+  <refentrytitle>DROP STATISTICS</refentrytitle>
+  <manvolnum>7</manvolnum>
+  <refmiscinfo>SQL - Language Statements</refmiscinfo>
+ </refmeta>
+
+ <refnamediv>
+  <refname>DROP STATISTICS</refname>
+  <refpurpose>remove extended statistics</refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+<synopsis>
+DROP STATISTICS [ IF EXISTS ] <replaceable class="PARAMETER">name</replaceable> [, ...]
+</synopsis>
+ </refsynopsisdiv>
+
+ <refsect1>
+  <title>Description</title>
+
+  <para>
+   <command>DROP STATISTICS</command> removes statistics from the database.
+   Only the statistics owner, the schema owner, and a superuser can drop
+   statistics.
+  </para>
+
+ </refsect1>
+
+ <refsect1>
+  <title>Parameters</title>
+
+  <variablelist>
+   <varlistentry>
+    <term><literal>IF EXISTS</literal></term>
+    <listitem>
+     <para>
+      Do not throw an error if the statistics do not exist. A notice is
+      issued in this case.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><replaceable class="PARAMETER">name</replaceable></term>
+    <listitem>
+     <para>
+      The name (optionally schema-qualified) of the statistics to drop.
+     </para>
+    </listitem>
+   </varlistentry>
+
+  </variablelist>
+ </refsect1>
+
+ <refsect1>
+  <title>Examples</title>
+
+  <para>
+   ...
+  </para>
+
+ </refsect1>
+
+ <refsect1>
+  <title>Compatibility</title>
+
+  <para>
+   There is no <command>DROP STATISTICS</command> command in the SQL standard.
+  </para>
+ </refsect1>
+
+ <refsect1>
+  <title>See Also</title>
+
+  <simplelist type="inline">
+   <member><xref linkend="sql-alterstatistics"></member>
+   <member><xref linkend="sql-createstatistics"></member>
+  </simplelist>
+ </refsect1>
+
+</refentry>
diff --git a/doc/src/sgml/reference.sgml b/doc/src/sgml/reference.sgml
index c8191de..aa8a157 100644
--- a/doc/src/sgml/reference.sgml
+++ b/doc/src/sgml/reference.sgml
@@ -60,6 +60,7 @@
    &alterSchema;
    &alterSequence;
    &alterServer;
+   &alterStatistics;
    &alterSubscription;
    &alterSystem;
    &alterTable;
@@ -108,6 +109,7 @@
    &createSchema;
    &createSequence;
    &createServer;
+   &createStatistics;
    &createSubscription;
    &createTable;
    &createTableAs;
@@ -154,6 +156,7 @@
    &dropSchema;
    &dropSequence;
    &dropServer;
+   &dropStatistics;
    &dropSubscription;
    &dropTable;
    &dropTableSpace;
diff --git a/src/backend/Makefile b/src/backend/Makefile
index 7a0bbb2..426ef4f 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -19,7 +19,7 @@ include $(top_builddir)/src/Makefile.global
 
 SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \
        main nodes optimizer port postmaster regex replication rewrite \
-       storage tcop tsearch utils $(top_builddir)/src/timezone
+       statistics storage tcop tsearch utils $(top_builddir)/src/timezone
 
 include $(srcdir)/common.mk
 
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index 3136858..ff7cc79 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -33,6 +33,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\
        pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \
        pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \
        pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h 
\
+       pg_statistic_ext.h \
        pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h 
pg_description.h \
        pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \
        pg_database.h pg_db_role_setting.h pg_tablespace.h pg_pltemplate.h \
diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c
index be86d76..1d71c7c 100644
--- a/src/backend/catalog/aclchk.c
+++ b/src/backend/catalog/aclchk.c
@@ -48,6 +48,7 @@
 #include "catalog/pg_operator.h"
 #include "catalog/pg_opfamily.h"
 #include "catalog/pg_proc.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_subscription.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_type.h"
@@ -5104,6 +5105,32 @@ pg_subscription_ownercheck(Oid sub_oid, Oid roleid)
 }
 
 /*
+ * Ownership check for an extended statistics (specified by OID).
+ */
+bool
+pg_statistics_ownercheck(Oid stat_oid, Oid roleid)
+{
+       HeapTuple       tuple;
+       Oid                     ownerId;
+
+       /* Superusers bypass all permission checking. */
+       if (superuser_arg(roleid))
+               return true;
+
+       tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stat_oid));
+       if (!HeapTupleIsValid(tuple))
+               ereport(ERROR,
+                               (errcode(ERRCODE_UNDEFINED_OBJECT),
+                                errmsg("statistics with OID %u do not exist", 
stat_oid)));
+
+       ownerId = ((Form_pg_statistic_ext) GETSTRUCT(tuple))->staowner;
+
+       ReleaseSysCache(tuple);
+
+       return has_privs_of_role(roleid, ownerId);
+}
+
+/*
  * Check whether specified role has CREATEROLE privilege (or is a superuser)
  *
  * Note: roles do not have owners per se; instead we use this test in
diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c
index fc088b2..ee27cae 100644
--- a/src/backend/catalog/dependency.c
+++ b/src/backend/catalog/dependency.c
@@ -51,6 +51,7 @@
 #include "catalog/pg_publication.h"
 #include "catalog/pg_publication_rel.h"
 #include "catalog/pg_rewrite.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_subscription.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_transform.h"
@@ -154,6 +155,7 @@ static const Oid object_classes[] = {
        RewriteRelationId,                      /* OCLASS_REWRITE */
        TriggerRelationId,                      /* OCLASS_TRIGGER */
        NamespaceRelationId,            /* OCLASS_SCHEMA */
+       StatisticExtRelationId,         /* OCLASS_STATISTIC_EXT */
        TSParserRelationId,                     /* OCLASS_TSPARSER */
        TSDictionaryRelationId,         /* OCLASS_TSDICT */
        TSTemplateRelationId,           /* OCLASS_TSTEMPLATE */
@@ -1263,6 +1265,10 @@ doDeletion(const ObjectAddress *object, int flags)
                        DropTransformById(object->objectId);
                        break;
 
+               case OCLASS_STATISTIC_EXT:
+                       RemoveStatisticsById(object->objectId);
+                       break;
+
                default:
                        elog(ERROR, "unrecognized object class: %u",
                                 object->classId);
@@ -2377,6 +2383,9 @@ getObjectClass(const ObjectAddress *object)
                case NamespaceRelationId:
                        return OCLASS_SCHEMA;
 
+               case StatisticExtRelationId:
+                       return OCLASS_STATISTIC_EXT;
+
                case TSParserRelationId:
                        return OCLASS_TSPARSER;
 
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 41c0056..c944b57 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -52,6 +52,7 @@
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_partitioned_table.h"
 #include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_type_fn.h"
@@ -1608,7 +1609,10 @@ RemoveAttributeById(Oid relid, AttrNumber attnum)
        heap_close(attr_rel, RowExclusiveLock);
 
        if (attnum > 0)
+       {
                RemoveStatistics(relid, attnum);
+               RemoveStatisticsExt(relid, attnum);
+       }
 
        relation_close(rel, NoLock);
 }
@@ -1856,6 +1860,11 @@ heap_drop_with_catalog(Oid relid)
        RemoveStatistics(relid, 0);
 
        /*
+        * delete extended statistics
+        */
+       RemoveStatisticsExt(relid, 0);
+
+       /*
         * delete attribute tuples
         */
        DeleteAttributeTuples(relid);
@@ -2766,6 +2775,98 @@ RemoveStatistics(Oid relid, AttrNumber attnum)
 
 
 /*
+ * RemoveStatisticsExt --- remove entries in pg_statistic_ext for a rel
+ *
+ * If attnum is zero, remove all entries for rel; else remove only the one(s)
+ * for that column.
+ */
+void
+RemoveStatisticsExt(Oid relid, AttrNumber attnum)
+{
+       Relation        pgstatisticext;
+       TupleDesc       tupdesc = NULL;
+       SysScanDesc scan;
+       ScanKeyData key;
+       HeapTuple       tuple;
+
+       /*
+        * When dropping a column, we'll also drop any statistics that would be
+        * left with only a single remaining (undropped) column. To do that, we
+        * need the tuple descriptor.
+        *
+        * We already have the relation locked (as we're running ALTER TABLE ...
+        * DROP COLUMN), so we'll just get the descriptor here.
+        */
+       if (attnum != 0)
+       {
+               Relation        rel = relation_open(relid, NoLock);
+
+               /* extended stats are supported on tables and matviews */
+               if (rel->rd_rel->relkind == RELKIND_RELATION ||
+                       rel->rd_rel->relkind == RELKIND_MATVIEW)
+                       tupdesc = RelationGetDescr(rel);
+
+               relation_close(rel, NoLock);
+       }
+
+       if (tupdesc == NULL)
+               return;
+
+       pgstatisticext = heap_open(StatisticExtRelationId, RowExclusiveLock);
+
+       ScanKeyInit(&key,
+                               Anum_pg_statistic_ext_starelid,
+                               BTEqualStrategyNumber, F_OIDEQ,
+                               ObjectIdGetDatum(relid));
+
+       scan = systable_beginscan(pgstatisticext,
+                                                         
StatisticExtRelidIndexId,
+                                                         true, NULL, 1, &key);
+
+       /* we must loop even when attnum != 0, in case of inherited stats */
+       while (HeapTupleIsValid(tuple = systable_getnext(scan)))
+       {
+               bool            delete = true;
+
+               if (attnum != 0)
+               {
+                       Datum           adatum;
+                       bool            isnull;
+                       int                     i;
+                       int                     ncolumns = 0;
+                       ArrayType  *arr;
+                       int16      *attnums;
+
+                       /* get the columns */
+                       adatum = SysCacheGetAttr(STATEXTOID, tuple,
+                                                                        
Anum_pg_statistic_ext_stakeys, &isnull);
+                       Assert(!isnull);
+
+                       arr = DatumGetArrayTypeP(adatum);
+                       attnums = (int16 *) ARR_DATA_PTR(arr);
+
+                       for (i = 0; i < ARR_DIMS(arr)[0]; i++)
+                       {
+                               /* count the column unless it has been / is being dropped */
+                               if ((!tupdesc->attrs[attnums[i] - 1]->attisdropped) &&
+                                       (attnums[i] != attnum))
+                                       ncolumns += 1;
+                       }
+
+                       /* delete if there are fewer than two attributes */
+                       delete = (ncolumns < 2);
+               }
+
+               if (delete)
+                       simple_heap_delete(pgstatisticext, &tuple->t_self);
+       }
+
+       systable_endscan(scan);
+
+       heap_close(pgstatisticext, RowExclusiveLock);
+}
+
+
+/*
  * RelationTruncateIndexes - truncate all indexes associated
  * with the heap relation to zero tuples.
  *
diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c
index a38da30..e521bd9 100644
--- a/src/backend/catalog/namespace.c
+++ b/src/backend/catalog/namespace.c
@@ -2086,6 +2086,62 @@ ConversionIsVisible(Oid conid)
 }
 
 /*
+ * get_statistics_oid - find a statistics by possibly qualified name
+ *
+ * If not found, returns InvalidOid if missing_ok, else throws error
+ */
+Oid
+get_statistics_oid(List *names, bool missing_ok)
+{
+       char       *schemaname;
+       char       *stats_name;
+       Oid                     namespaceId;
+       Oid                     stats_oid = InvalidOid;
+       ListCell   *l;
+
+       /* deconstruct the name list */
+       DeconstructQualifiedName(names, &schemaname, &stats_name);
+
+       if (schemaname)
+       {
+               /* use exact schema given */
+               namespaceId = LookupExplicitNamespace(schemaname, missing_ok);
+               if (missing_ok && !OidIsValid(namespaceId))
+                       stats_oid = InvalidOid;
+               else
+                       stats_oid = GetSysCacheOid2(STATEXTNAMENSP,
+                                                                               
PointerGetDatum(stats_name),
+                                                                               
ObjectIdGetDatum(namespaceId));
+       }
+       else
+       {
+               /* search for it in search path */
+               recomputeNamespacePath();
+
+               foreach(l, activeSearchPath)
+               {
+                       namespaceId = lfirst_oid(l);
+
+                       if (namespaceId == myTempNamespace)
+                               continue;               /* do not look in temp 
namespace */
+                       stats_oid = GetSysCacheOid2(STATEXTNAMENSP,
+                                                                               
PointerGetDatum(stats_name),
+                                                                               
ObjectIdGetDatum(namespaceId));
+                       if (OidIsValid(stats_oid))
+                               break;
+               }
+       }
+
+       if (!OidIsValid(stats_oid) && !missing_ok)
+               ereport(ERROR,
+                               (errcode(ERRCODE_UNDEFINED_OBJECT),
+                                errmsg("statistics \"%s\" do not exist",
+                                               NameListToString(names))));
+
+       return stats_oid;
+}
+
+/*
  * get_ts_parser_oid - find a TS parser by possibly qualified name
  *
  * If not found, returns InvalidOid if missing_ok, else throws error
diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c
index 3a7f049..a346215 100644
--- a/src/backend/catalog/objectaddress.c
+++ b/src/backend/catalog/objectaddress.c
@@ -48,6 +48,7 @@
 #include "catalog/pg_publication.h"
 #include "catalog/pg_publication_rel.h"
 #include "catalog/pg_rewrite.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_subscription.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_transform.h"
@@ -478,6 +479,18 @@ static const ObjectPropertyType ObjectProperty[] =
                InvalidAttrNumber,
                ACL_KIND_SUBSCRIPTION,
                true
+       },
+       {
+               StatisticExtRelationId,
+               StatisticExtOidIndexId,
+               STATEXTOID,
+               STATEXTNAMENSP,
+               Anum_pg_statistic_ext_staname,
+               Anum_pg_statistic_ext_stanamespace,
+               Anum_pg_statistic_ext_staowner,
+               InvalidAttrNumber,              /* no ACL (same as relation) */
+               -1,                                             /* no ACL */
+               true
        }
 };
 
@@ -696,6 +709,10 @@ static const struct object_type_map
        /* OCLASS_TRANSFORM */
        {
                "transform", OBJECT_TRANSFORM
+       },
+       /* OCLASS_STATISTIC_EXT */
+       {
+               "statistics", OBJECT_STATISTICS
        }
 };
 
@@ -974,6 +991,12 @@ get_object_address(ObjectType objtype, Node *object,
                                address = 
get_object_address_defacl(castNode(List, object),
                                                                                
                        missing_ok);
                                break;
+                       case OBJECT_STATISTICS:
+                               address.classId = StatisticExtRelationId;
+                               address.objectId = 
get_statistics_oid(castNode(List, object),
+                                                                               
                          missing_ok);
+                               address.objectSubId = 0;
+                               break;
                        default:
                                elog(ERROR, "unrecognized objtype: %d", (int) 
objtype);
                                /* placate compiler, in case it thinks elog 
might return */
@@ -2079,6 +2102,7 @@ pg_get_object_address(PG_FUNCTION_ARGS)
                case OBJECT_ATTRIBUTE:
                case OBJECT_COLLATION:
                case OBJECT_CONVERSION:
+               case OBJECT_STATISTICS:
                case OBJECT_TSPARSER:
                case OBJECT_TSDICTIONARY:
                case OBJECT_TSTEMPLATE:
@@ -2366,6 +2390,10 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address,
                                                
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                                                 errmsg("must be superuser")));
                        break;
+               case OBJECT_STATISTICS:
+                       if (!pg_statistics_ownercheck(address.objectId, roleid))
+                               aclcheck_error_type(ACLCHECK_NOT_OWNER, 
address.objectId);
+                       break;
                default:
                        elog(ERROR, "unrecognized object type: %d",
                                 (int) objtype);
@@ -3853,6 +3881,10 @@ getObjectTypeDescription(const ObjectAddress *object)
                        appendStringInfoString(&buffer, "subscription");
                        break;
 
+               case OCLASS_STATISTIC_EXT:
+                       appendStringInfoString(&buffer, "extended statistics");
+                       break;
+
                default:
                        appendStringInfo(&buffer, "unrecognized %u", 
object->classId);
                        break;
@@ -4876,6 +4908,29 @@ getObjectIdentityParts(const ObjectAddress *object,
                                break;
                        }
 
+               case OCLASS_STATISTIC_EXT:
+                       {
+                               HeapTuple       tup;
+                               Form_pg_statistic_ext formStatistic;
+                               char       *schema;
+
+                               tup = SearchSysCache1(STATEXTOID,
+                                                                         
ObjectIdGetDatum(object->objectId));
+                               if (!HeapTupleIsValid(tup))
+                                       elog(ERROR, "cache lookup failed for 
statistics %u",
+                                                object->objectId);
+                               formStatistic = (Form_pg_statistic_ext) 
GETSTRUCT(tup);
+                               schema = 
get_namespace_name_or_temp(formStatistic->stanamespace);
+                               appendStringInfoString(&buffer,
+                                                                          
quote_qualified_identifier(schema,
+                                                                               
   NameStr(formStatistic->staname)));
+                               if (objname)
+                                       *objname = list_make2(schema,
+                                                                  
pstrdup(NameStr(formStatistic->staname)));
+                               ReleaseSysCache(tup);
+                       }
+                       break;
+
                default:
                        appendStringInfo(&buffer, "unrecognized object %u %u 
%d",
                                                         object->classId,
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 0bce209..f3b3578 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -186,6 +186,16 @@ CREATE OR REPLACE VIEW pg_sequences AS
     WHERE NOT pg_is_other_temp_schema(N.oid)
           AND relkind = 'S';
 
+CREATE VIEW pg_stats_ext AS
+    SELECT
+        N.nspname AS schemaname,
+        C.relname AS tablename,
+        S.staname AS staname,
+        S.stakeys AS attnums,
+        length(S.standistinct) AS ndistbytes
+    FROM (pg_statistic_ext S JOIN pg_class C ON (C.oid = S.starelid))
+        LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);
+
 CREATE VIEW pg_stats WITH (security_barrier) AS
     SELECT
         nspname AS schemaname,
diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile
index e0fab38..4a6c99e 100644
--- a/src/backend/commands/Makefile
+++ b/src/backend/commands/Makefile
@@ -18,8 +18,8 @@ OBJS = amcmds.o aggregatecmds.o alter.o analyze.o async.o cluster.o comment.o \
        event_trigger.o explain.o extension.o foreigncmds.o functioncmds.o \
        indexcmds.o lockcmds.o matview.o operatorcmds.o opclasscmds.o \
        policy.o portalcmds.o prepare.o proclang.o publicationcmds.o \
-       schemacmds.o seclabel.o sequence.o subscriptioncmds.o tablecmds.o \
-       tablespace.o trigger.o tsearchcmds.o typecmds.o user.o vacuum.o \
-       vacuumlazy.o variable.o view.o
+       schemacmds.o seclabel.o sequence.o statscmds.o subscriptioncmds.o \
+       tablecmds.o tablespace.o trigger.o tsearchcmds.o typecmds.o user.o \
+       vacuum.o vacuumlazy.o variable.o view.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c
index cf1391c..bf1aba1 100644
--- a/src/backend/commands/alter.c
+++ b/src/backend/commands/alter.c
@@ -373,6 +373,7 @@ ExecRenameStmt(RenameStmt *stmt)
                case OBJECT_OPCLASS:
                case OBJECT_OPFAMILY:
                case OBJECT_LANGUAGE:
+               case OBJECT_STATISTICS:
                case OBJECT_TSCONFIGURATION:
                case OBJECT_TSDICTIONARY:
                case OBJECT_TSPARSER:
@@ -489,6 +490,7 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt,
                case OBJECT_OPERATOR:
                case OBJECT_OPCLASS:
                case OBJECT_OPFAMILY:
+               case OBJECT_STATISTICS:
                case OBJECT_TSCONFIGURATION:
                case OBJECT_TSDICTIONARY:
                case OBJECT_TSPARSER:
@@ -803,6 +805,7 @@ ExecAlterOwnerStmt(AlterOwnerStmt *stmt)
                case OBJECT_OPERATOR:
                case OBJECT_OPCLASS:
                case OBJECT_OPFAMILY:
+               case OBJECT_STATISTICS:
                case OBJECT_TABLESPACE:
                case OBJECT_TSDICTIONARY:
                case OBJECT_TSCONFIGURATION:
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index b91df98..39d9bdb 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -17,6 +17,7 @@
 #include <math.h>
 
 #include "access/multixact.h"
+#include "access/sysattr.h"
 #include "access/transam.h"
 #include "access/tupconvert.h"
 #include "access/tuptoaster.h"
@@ -28,6 +29,7 @@
 #include "catalog/pg_collation.h"
 #include "catalog/pg_inherits_fn.h"
 #include "catalog/pg_namespace.h"
+#include "catalog/pg_statistic_ext.h"
 #include "commands/dbcommands.h"
 #include "commands/tablecmds.h"
 #include "commands/vacuum.h"
@@ -39,13 +41,17 @@
 #include "parser/parse_relation.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
+#include "statistics/common.h"
+#include "statistics/stats.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
 #include "utils/acl.h"
 #include "utils/attoptcache.h"
+#include "utils/builtins.h"
 #include "utils/datum.h"
+#include "utils/fmgroids.h"
 #include "utils/guc.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -566,6 +572,9 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params,
                        update_attstats(RelationGetRelid(Irel[ind]), false,
                                                        thisdata->attr_cnt, 
thisdata->vacattrstats);
                }
+
+               /* Build extended statistics (if there are any). */
+               build_ext_stats(onerel, totalrows, numrows, rows, attr_cnt, 
vacattrstats);
        }
 
        /*
@@ -1683,19 +1692,6 @@ ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull)
  */
 typedef struct
 {
-       Oid                     eqopr;                  /* '=' operator for 
datatype, if any */
-       Oid                     eqfunc;                 /* and associated 
function */
-       Oid                     ltopr;                  /* '<' operator for 
datatype, if any */
-} StdAnalyzeData;
-
-typedef struct
-{
-       Datum           value;                  /* a data value */
-       int                     tupno;                  /* position index for 
tuple it came from */
-} ScalarItem;
-
-typedef struct
-{
        int                     count;                  /* # of duplicates */
        int                     first;                  /* values[] index of 
first occurrence */
 } ScalarMCVItem;
diff --git a/src/backend/commands/dropcmds.c b/src/backend/commands/dropcmds.c
index ab73fbf..e7ae4a5 100644
--- a/src/backend/commands/dropcmds.c
+++ b/src/backend/commands/dropcmds.c
@@ -286,6 +286,13 @@ does_not_exist_skipping(ObjectType objtype, Node *object)
                        msg = gettext_noop("schema \"%s\" does not exist, 
skipping");
                        name = strVal((Value *) object);
                        break;
+               case OBJECT_STATISTICS:
+                       if (!schema_does_not_exist_skipping(castNode(List, 
object), &msg, &name))
+                       {
+                               msg = gettext_noop("statistics \"%s\" do not 
exist, skipping");
+                               name = NameListToString(castNode(List, object));
+                       }
+                       break;
                case OBJECT_TSPARSER:
                        if (!schema_does_not_exist_skipping(castNode(List, 
object), &msg, &name))
                        {
diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c
index 346b347..b84a10f 100644
--- a/src/backend/commands/event_trigger.c
+++ b/src/backend/commands/event_trigger.c
@@ -112,6 +112,7 @@ static event_trigger_support_data event_trigger_support[] = {
        {"SCHEMA", true},
        {"SEQUENCE", true},
        {"SERVER", true},
+       {"STATISTICS", true},
        {"SUBSCRIPTION", true},
        {"TABLE", true},
        {"TABLESPACE", false},
@@ -1108,6 +1109,7 @@ EventTriggerSupportsObjectType(ObjectType obtype)
                case OBJECT_SCHEMA:
                case OBJECT_SEQUENCE:
                case OBJECT_SUBSCRIPTION:
+               case OBJECT_STATISTICS:
                case OBJECT_TABCONSTRAINT:
                case OBJECT_TABLE:
                case OBJECT_TRANSFORM:
@@ -1173,6 +1175,7 @@ EventTriggerSupportsObjectClass(ObjectClass objclass)
                case OCLASS_PUBLICATION:
                case OCLASS_PUBLICATION_REL:
                case OCLASS_SUBSCRIPTION:
+               case OCLASS_STATISTIC_EXT:
                        return true;
        }
 
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
new file mode 100644
index 0000000..77d7a36
--- /dev/null
+++ b/src/backend/commands/statscmds.c
@@ -0,0 +1,270 @@
+/*-------------------------------------------------------------------------
+ *
+ * statscmds.c
+ *       Commands for creating and altering extended statistics
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *       src/backend/commands/statscmds.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "catalog/dependency.h"
+#include "catalog/indexing.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_namespace.h"
+#include "catalog/pg_statistic_ext.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "statistics/stats.h"
+#include "utils/builtins.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+
+/* used for sorting the attnums in CreateStatistics */
+static int
+compare_int16(const void *a, const void *b)
+{
+       return memcmp(a, b, sizeof(int16));
+}
+
+/*
+ * Implements the CREATE STATISTICS name ON (columns) FROM table
+ *
+ * We do require that the types support sorting (ltopr), although some
+ * statistics might work with equality only.
+ */
+ObjectAddress
+CreateStatistics(CreateStatsStmt *stmt)
+{
+       int                     i;
+       ListCell   *l;
+       int16           attnums[STATS_MAX_DIMENSIONS];
+       int                     numcols = 0;
+       ObjectAddress address = InvalidObjectAddress;
+       char       *namestr;
+       NameData        staname;
+       Oid                     statoid;
+       Oid                     namespaceId;
+
+       HeapTuple       htup;
+       Datum           values[Natts_pg_statistic_ext];
+       bool            nulls[Natts_pg_statistic_ext];
+       int2vector *stakeys;
+       Relation        statrel;
+       Relation        rel;
+       Oid                     relid;
+       ObjectAddress parentobject,
+                               childobject;
+
+       /* construction of the array of enabled statistic types */
+       Datum           types[1];               /* only ndistinct defined now */
+       int                     ntypes;
+       ArrayType  *staenabled;
+
+       Assert(IsA(stmt, CreateStatsStmt));
+
+       /* resolve the pieces of the name (namespace etc.) */
+       namespaceId = QualifiedNameGetCreationNamespace(stmt->defnames, 
&namestr);
+       namestrcpy(&staname, namestr);
+
+       /*
+        * If if_not_exists was given and the statistics already exists, bail 
out.
+        */
+       if (SearchSysCacheExists2(STATEXTNAMENSP,
+                                                         
PointerGetDatum(&staname),
+                                                         
ObjectIdGetDatum(namespaceId)))
+       {
+               if (stmt->if_not_exists)
+               {
+                       ereport(NOTICE,
+                                       (errcode(ERRCODE_DUPLICATE_OBJECT),
+                                        errmsg("statistics \"%s\" already 
exist, skipping",
+                                                       namestr)));
+                       return InvalidObjectAddress;
+               }
+
+               ereport(ERROR,
+                               (errcode(ERRCODE_DUPLICATE_OBJECT),
+                                errmsg("statistics \"%s\" already exist", 
namestr)));
+       }
+
+       rel = heap_openrv(stmt->relation, AccessExclusiveLock);
+       relid = RelationGetRelid(rel);
+
+       /* ndistinct coefficients are the only known type of extended statistics */
+       ntypes = 1;
+       types[0] = CharGetDatum(STATS_EXT_NDISTINCT);
+
+       /*
+        * Transform column names to array of attnums. While doing that, we also
+        * enforce the maximum number of keys.
+        */
+       foreach(l, stmt->keys)
+       {
+               char       *attname = strVal(lfirst(l));
+               HeapTuple       atttuple;
+
+               atttuple = SearchSysCacheAttName(relid, attname);
+
+               if (!HeapTupleIsValid(atttuple))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_UNDEFINED_COLUMN),
+                         errmsg("column \"%s\" referenced in statistics does 
not exist",
+                                        attname)));
+
+               /* more than STATS_MAX_DIMENSIONS columns not allowed */
+               if (numcols >= STATS_MAX_DIMENSIONS)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_TOO_MANY_COLUMNS),
+                                        errmsg("cannot have more than %d keys 
in statistics",
+                                                       STATS_MAX_DIMENSIONS)));
+
+               attnums[numcols] = ((Form_pg_attribute) 
GETSTRUCT(atttuple))->attnum;
+               ReleaseSysCache(atttuple);
+               numcols++;
+       }
+
+       /*
+        * Check that at least two columns were specified in the statement. The
+        * upper bound was already checked in the loop above.
+        */
+       if (numcols < 2)
+               ereport(ERROR,
+                               (errcode(ERRCODE_TOO_MANY_COLUMNS),
+                                errmsg("statistics require at least 2 
columns")));
+
+       /*
+        * Sort the attnums, which makes detecting duplicates somewhat easier, and
+        * it does not hurt (it does not affect the efficiency, unlike for
+        * indexes, for example).
+        */
+       qsort(attnums, numcols, sizeof(int16), compare_int16);
+
+       /*
+        * Look for duplicates in the list of columns. The attnums are sorted, so
+        * just check consecutive elements.
+        */
+       for (i = 1; i < numcols; i++)
+               if (attnums[i] == attnums[i - 1])
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_UNDEFINED_COLUMN),
+                                 errmsg("duplicate column name in statistics 
definition")));
+
+       stakeys = buildint2vector(attnums, numcols);
+
+       /* construct the char array of enabled statistic types */
+       staenabled = construct_array(types, ntypes, CHAROID, 1, true, 'c');
+
+       /*
+        * Everything seems fine, so let's build the pg_statistic_ext entry. At
+        * this point we obviously only have the keys and options.
+        */
+
+       memset(values, 0, sizeof(values));
+       memset(nulls, false, sizeof(nulls));
+
+       /* metadata */
+       values[Anum_pg_statistic_ext_starelid - 1] = ObjectIdGetDatum(relid);
+       values[Anum_pg_statistic_ext_staname - 1] = NameGetDatum(&staname);
+       values[Anum_pg_statistic_ext_stanamespace - 1] = 
ObjectIdGetDatum(namespaceId);
+       values[Anum_pg_statistic_ext_staowner - 1] = 
ObjectIdGetDatum(GetUserId());
+
+       values[Anum_pg_statistic_ext_stakeys - 1] = PointerGetDatum(stakeys);
+
+       /* enabled statistics */
+       values[Anum_pg_statistic_ext_staenabled - 1] = 
PointerGetDatum(staenabled);
+
+       /* no statistics built yet */
+       nulls[Anum_pg_statistic_ext_standistinct - 1] = true;
+
+       /* insert the tuple into pg_statistic_ext */
+       statrel = heap_open(StatisticExtRelationId, RowExclusiveLock);
+
+       htup = heap_form_tuple(statrel->rd_att, values, nulls);
+
+       CatalogTupleInsert(statrel, htup);
+
+       statoid = HeapTupleGetOid(htup);
+
+       heap_freetuple(htup);
+
+       /*
+        * Add a dependency on a table, so that stats get dropped on DROP TABLE.
+        */
+       ObjectAddressSet(parentobject, RelationRelationId, relid);
+       ObjectAddressSet(childobject, StatisticExtRelationId, statoid);
+
+       recordDependencyOn(&childobject, &parentobject, DEPENDENCY_AUTO);
+
+       /*
+        * Also add a dependency on the schema (to drop statistics on DROP SCHEMA).
+        * This is not handled automatically by DROP TABLE because statistics have
+        * their own schema.
+        */
+       ObjectAddressSet(parentobject, NamespaceRelationId, namespaceId);
+
+       recordDependencyOn(&childobject, &parentobject, DEPENDENCY_AUTO);
+
+       heap_close(statrel, RowExclusiveLock);
+
+       /*
+        * Invalidate relcache so that others see the new statistics, and only
+        * then close the relation (the lock is kept till end of transaction).
+        */
+       CacheInvalidateRelcache(rel);
+
+       relation_close(rel, NoLock);
+
+       ObjectAddressSet(address, StatisticExtRelationId, statoid);
+
+       return address;
+}
+
+
+/*
+ * Implements DROP STATISTICS:
+ *
+ *        DROP STATISTICS stats_name
+ */
+void
+RemoveStatisticsById(Oid statsOid)
+{
+       Relation        relation;
+       Oid                     relid;
+       Relation        rel;
+       HeapTuple       tup;
+       Form_pg_statistic_ext statext;
+
+       /*
+        * Delete the pg_statistic_ext tuple.
+        */
+       relation = heap_open(StatisticExtRelationId, RowExclusiveLock);
+
+       tup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statsOid));
+
+       if (!HeapTupleIsValid(tup)) /* should not happen */
+               elog(ERROR, "cache lookup failed for statistics %u", statsOid);
+
+       statext = (Form_pg_statistic_ext) GETSTRUCT(tup);
+       relid = statext->starelid;
+
+       rel = heap_open(relid, AccessExclusiveLock);
+
+       simple_heap_delete(relation, &tup->t_self);
+
+       CacheInvalidateRelcache(rel);
+
+       ReleaseSysCache(tup);
+
+       heap_close(relation, RowExclusiveLock);
+       heap_close(rel, NoLock);
+}
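
The AUTO dependencies recorded above give statistics objects the usual
lifecycle. A quick sketch of the intended behaviour (the statistics and table
names are only illustrative):

    CREATE STATISTICS s1 ON (a, b) FROM t1;
    DROP STATISTICS s1;            -- removed via RemoveStatisticsById()
    CREATE STATISTICS s1 ON (a, b) FROM t1;
    DROP TABLE t1;                 -- drops s1 as well, thanks to DEPENDENCY_AUTO
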
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index bfc2ac1..9a34f94 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -4447,6 +4447,19 @@ _copyDropSubscriptionStmt(const DropSubscriptionStmt *from)
        return newnode;
 }
 
+static CreateStatsStmt *
+_copyCreateStatsStmt(const CreateStatsStmt *from)
+{
+       CreateStatsStmt *newnode = makeNode(CreateStatsStmt);
+
+       COPY_NODE_FIELD(defnames);
+       COPY_NODE_FIELD(relation);
+       COPY_NODE_FIELD(keys);
+       COPY_SCALAR_FIELD(if_not_exists);
+
+       return newnode;
+}
+
 /* ****************************************************************
  *                                     pg_list.h copy functions
  * ****************************************************************
@@ -5385,6 +5398,9 @@ copyObject(const void *from)
                case T_CommonTableExpr:
                        retval = _copyCommonTableExpr(from);
                        break;
+               case T_CreateStatsStmt:
+                       retval = _copyCreateStatsStmt(from);
+                       break;
                case T_ObjectWithArgs:
                        retval = _copyObjectWithArgs(from);
                        break;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 7418fbe..953e6e2 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -2266,6 +2266,18 @@ _outForeignKeyOptInfo(StringInfo str, const ForeignKeyOptInfo *node)
 }
 
 static void
+_outStatisticExtInfo(StringInfo str, const StatisticExtInfo *node)
+{
+       WRITE_NODE_TYPE("STATISTICEXTINFO");
+
+       /* NB: this isn't a complete set of fields */
+       WRITE_OID_FIELD(statOid);
+
+       /* built/available statistics */
+       WRITE_BOOL_FIELD(ndist_built);
+}
+
+static void
 _outEquivalenceClass(StringInfo str, const EquivalenceClass *node)
 {
        /*
@@ -3915,6 +3927,9 @@ outNode(StringInfo str, const void *obj)
                        case T_PlannerParamItem:
                                _outPlannerParamItem(str, obj);
                                break;
+                       case T_StatisticExtInfo:
+                               _outStatisticExtInfo(str, obj);
+                               break;
 
                        case T_ExtensibleNode:
                                _outExtensibleNode(str, obj);
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 463f806..d90f199 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -29,6 +29,7 @@
 #include "catalog/heap.h"
 #include "catalog/partition.h"
 #include "catalog/pg_am.h"
+#include "catalog/pg_statistic_ext.h"
 #include "foreign/fdwapi.h"
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
@@ -40,8 +41,11 @@
 #include "parser/parse_relation.h"
 #include "parser/parsetree.h"
 #include "rewrite/rewriteManip.h"
+#include "statistics/stats.h"
 #include "storage/bufmgr.h"
+#include "utils/builtins.h"
 #include "utils/lsyscache.h"
+#include "utils/syscache.h"
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
@@ -63,7 +67,7 @@ static List *get_relation_constraints(PlannerInfo *root,
                                                 bool include_notnull);
 static List *build_index_tlist(PlannerInfo *root, IndexOptInfo *index,
                                  Relation heapRelation);
-
+static List *get_relation_statistics(RelOptInfo *rel, Relation relation);
 
 /*
  * get_relation_info -
@@ -398,6 +402,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
        rel->indexlist = indexinfos;
 
+       rel->statlist = get_relation_statistics(rel, relation);
+
        /* Grab foreign-table info using the relcache, while we have it */
        if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
        {
@@ -1251,6 +1257,64 @@ get_relation_constraints(PlannerInfo *root,
        return result;
 }
 
+/*
+ * get_relation_statistics
+ *
+ * Retrieve extended statistics defined on the table.
+ *
+ * Returns a List (possibly empty) of StatisticExtInfo objects describing
+ * the statistics.  Only attributes needed for selecting statistics are
+ * retrieved (columns covered by the statistics, etc.).
+ */
+static List *
+get_relation_statistics(RelOptInfo *rel, Relation relation)
+{
+       List       *statoidlist;
+       ListCell   *l;
+       List       *stainfos = NIL;
+
+       statoidlist = RelationGetStatExtList(relation);
+
+       foreach(l, statoidlist)
+       {
+               ArrayType  *arr;
+               Datum           adatum;
+               bool            isnull;
+               Oid                     statOid = lfirst_oid(l);
+
+               HeapTuple       htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
+
+               /* unavailable stats are not interesting for the planner */
+               if (stats_are_built(htup, STATS_EXT_NDISTINCT))
+               {
+                       StatisticExtInfo *info = makeNode(StatisticExtInfo);
+
+                       info->statOid = statOid;
+                       info->rel = rel;
+
+                       /* built/available statistics */
+                       info->ndist_built = true;
+
+                       /* decode the stakeys array */
+                       adatum = SysCacheGetAttr(STATEXTOID, htup,
+                                                                        Anum_pg_statistic_ext_stakeys, &isnull);
+                       Assert(!isnull);
+
+                       arr = DatumGetArrayTypeP(adatum);
+
+                       info->stakeys = buildint2vector((int16 *) ARR_DATA_PTR(arr),
+                                                                                       ARR_DIMS(arr)[0]);
+
+                       stainfos = lcons(info, stainfos);
+               }
+
+               ReleaseSysCache(htup);
+       }
+
+       list_free(statoidlist);
+
+       return stainfos;
+}
 
 /*
  * relation_excluded_by_constraints
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index e7acc2d..a0801dc 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -257,7 +257,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
                ConstraintsSetStmt CopyStmt CreateAsStmt CreateCastStmt
                CreateDomainStmt CreateExtensionStmt CreateGroupStmt CreateOpClassStmt
                CreateOpFamilyStmt AlterOpFamilyStmt CreatePLangStmt
-               CreateSchemaStmt CreateSeqStmt CreateStmt CreateTableSpaceStmt
+               CreateSchemaStmt CreateSeqStmt CreateStmt CreateStatsStmt CreateTableSpaceStmt
                CreateFdwStmt CreateForeignServerStmt CreateForeignTableStmt
                CreateAssertStmt CreateTransformStmt CreateTrigStmt CreateEventTrigStmt
                CreateUserStmt CreateUserMappingStmt CreateRoleStmt CreatePolicyStmt
@@ -873,6 +873,7 @@ stmt :
                        | CreateSeqStmt
                        | CreateStmt
                        | CreateSubscriptionStmt
+                       | CreateStatsStmt
                        | CreateTableSpaceStmt
                        | CreateTransformStmt
                        | CreateTrigStmt
@@ -3746,6 +3747,34 @@ OptConsTableSpace:   USING INDEX TABLESPACE name { $$ = $4; }
 ExistingIndex:   USING INDEX index_name                                { $$ = $3; }
                ;
 
+/*****************************************************************************
+ *
+ *             QUERY :
+ *                             CREATE STATISTICS stats_name ON (columns) FROM relname
+ *
+ *****************************************************************************/
+
+
+CreateStatsStmt:       CREATE STATISTICS any_name ON '(' columnList ')' FROM qualified_name
+                                               {
+                                                       CreateStatsStmt *n = makeNode(CreateStatsStmt);
+                                                       n->defnames = $3;
+                                                       n->relation = $9;
+                                                       n->keys = $6;
+                                                       n->if_not_exists = false;
+                                                       $$ = (Node *)n;
+                                               }
+                                       | CREATE STATISTICS IF_P NOT EXISTS any_name ON '(' columnList ')' FROM qualified_name
+                                               {
+                                                       CreateStatsStmt *n = makeNode(CreateStatsStmt);
+                                                       n->defnames = $6;
+                                                       n->relation = $12;
+                                                       n->keys = $9;
+                                                       n->if_not_exists = true;
+                                                       $$ = (Node *)n;
+                                               }
+                       ;
+
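
For reference, the two productions above accept statements of this shape (the
statistics, table and column names are only examples):

    CREATE STATISTICS s1 ON (a, b) FROM t1;
    CREATE STATISTICS IF NOT EXISTS s1 ON (a, b) FROM t1;
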
 
 /*****************************************************************************
  *
@@ -6033,6 +6062,7 @@ drop_type_name:
                        | PUBLICATION                                                   { $$ = OBJECT_PUBLICATION; }
                        | SCHEMA                                                                { $$ = OBJECT_SCHEMA; }
                        | SERVER                                                                { $$ = OBJECT_FOREIGN_SERVER; }
+                       | STATISTICS                                                    { $$ = OBJECT_STATISTICS; }
                ;
 
 /* object types attached to a table */
@@ -8377,6 +8407,15 @@ RenameStmt: ALTER AGGREGATE aggregate_with_argtypes RENAME TO name
                                        n->missing_ok = false;
                                        $$ = (Node *)n;
                                }
+                       | ALTER STATISTICS any_name RENAME TO name
+                               {
+                                       RenameStmt *n = makeNode(RenameStmt);
+                                       n->renameType = OBJECT_STATISTICS;
+                                       n->object = (Node *) $3;
+                                       n->newname = $6;
+                                       n->missing_ok = false;
+                                       $$ = (Node *)n;
+                               }
                        | ALTER TEXT_P SEARCH PARSER any_name RENAME TO name
                                {
                                        RenameStmt *n = makeNode(RenameStmt);
@@ -8592,6 +8631,15 @@ AlterObjectSchemaStmt:
                                        n->missing_ok = true;
                                        $$ = (Node *)n;
                                }
+                       | ALTER STATISTICS any_name SET SCHEMA name
+                               {
+                                       AlterObjectSchemaStmt *n = makeNode(AlterObjectSchemaStmt);
+                                       n->objectType = OBJECT_STATISTICS;
+                                       n->object = (Node *) $3;
+                                       n->newschema = $6;
+                                       n->missing_ok = false;
+                                       $$ = (Node *)n;
+                               }
                        | ALTER TEXT_P SEARCH PARSER any_name SET SCHEMA name
                                {
                                        AlterObjectSchemaStmt *n = makeNode(AlterObjectSchemaStmt);
@@ -8855,6 +8903,14 @@ AlterOwnerStmt: ALTER AGGREGATE aggregate_with_argtypes OWNER TO RoleSpec
                                        n->newowner = $6;
                                        $$ = (Node *)n;
                                }
+                       | ALTER STATISTICS name OWNER TO RoleSpec
+                               {
+                                       AlterOwnerStmt *n = makeNode(AlterOwnerStmt);
+                                       n->objectType = OBJECT_STATISTICS;
+                                       n->object = (Node *) makeString($3);
+                                       n->newowner = $6;
+                                       $$ = (Node *)n;
+                               }
                        | ALTER TEXT_P SEARCH DICTIONARY any_name OWNER TO RoleSpec
                                {
                                        AlterOwnerStmt *n = makeNode(AlterOwnerStmt);
diff --git a/src/backend/statistics/Makefile b/src/backend/statistics/Makefile
new file mode 100644
index 0000000..e77b350
--- /dev/null
+++ b/src/backend/statistics/Makefile
@@ -0,0 +1,17 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for statistics
+#
+# IDENTIFICATION
+#    src/backend/statistics/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/statistics
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = common.o mvdist.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/statistics/README b/src/backend/statistics/README
new file mode 100644
index 0000000..beb7c24
--- /dev/null
+++ b/src/backend/statistics/README
@@ -0,0 +1,34 @@
+Extended statistics
+===================
+
+When estimating various quantities (e.g. condition selectivities) the default
+approach relies on the assumption of independence. In practice that's often
+not true, resulting in estimation errors.
+
+Extended statistics track different types of dependencies between the columns,
+hopefully improving the estimates and producing better plans.
+
+Currently we only have one type of extended statistics - ndistinct
+coefficients, and we use it to improve estimates of grouping queries. See
+README.ndistinct for details.
+
+
+Size of sample in ANALYZE
+-------------------------
+When performing ANALYZE, the number of rows to sample is determined as
+
+    (300 * statistics_target)
+
+That works reasonably well for statistics on individual columns, but perhaps
+it's not enough for extended statistics. Papers analyzing estimation errors
+all use samples proportional to the table (usually finding that 1-3% of the
+table is enough to build accurate stats).
+
+The requested accuracy (number of MCV items or histogram bins) should also
+be considered when determining the sample size, and in extended statistics
+those are not necessarily limited by statistics_target.
+
+This however merits further discussion, because collecting the sample is quite
+expensive and increasing it further would make ANALYZE even more painful.
+Judging by the experiments with the current implementation, the fixed size
+seems to work reasonably well for now, so we leave this as future work.
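
As an illustration of the current behaviour (object names are only examples),
the extended statistics are built from the very same sample that ANALYZE
collects for the per-column stats:

    CREATE TABLE t (a int, b int);
    CREATE STATISTICS s1 ON (a, b) FROM t;

    -- with statistics_target = 100 the sample is 300 * 100 = 30000 rows
    SET default_statistics_target = 100;
    ANALYZE t;
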
diff --git a/src/backend/statistics/README.ndistinct b/src/backend/statistics/README.ndistinct
new file mode 100644
index 0000000..9365b17
--- /dev/null
+++ b/src/backend/statistics/README.ndistinct
@@ -0,0 +1,22 @@
+ndistinct coefficients
+======================
+
+Estimating the number of groups in a combination of columns (e.g. for GROUP BY)
+is tricky, and the estimation error is often significant.
+
+The ndistinct coefficients address this by storing ndistinct estimates not
+only for individual columns, but also for (all) combinations of columns.
+So for example given three columns (a,b,c) the statistics will estimate
+ndistinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column estimates
+are already available in pg_statistic.
+
+
+GROUP BY estimation (estimate_num_groups)
+-----------------------------------------
+
+Although ndistinct coefficients might be used for selectivity estimation
+(of equality conditions in the WHERE clause), that is not implemented at this
+point.
+
+Instead, ndistinct coefficients are only used in estimate_num_groups() to
+estimate grouped queries.
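
A minimal example of the intended use, assuming columns a and b are perfectly
correlated, so that the product of per-column ndistincts badly overestimates
the number of groups:

    CREATE TABLE t (a int, b int);
    INSERT INTO t SELECT i/100, i/100 FROM generate_series(1, 100000) s(i);

    CREATE STATISTICS s1 ON (a, b) FROM t;
    ANALYZE t;

    -- estimate_num_groups() can now use the (a,b) ndistinct estimate
    EXPLAIN SELECT a, b FROM t GROUP BY a, b;
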
diff --git a/src/backend/statistics/common.c b/src/backend/statistics/common.c
new file mode 100644
index 0000000..f63d8cc
--- /dev/null
+++ b/src/backend/statistics/common.c
@@ -0,0 +1,454 @@
+/*-------------------------------------------------------------------------
+ *
+ * common.c
+ *       POSTGRES extended statistics
+ *
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *       src/backend/statistics/common.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_statistic_ext.h"
+#include "nodes/relation.h"
+#include "statistics/common.h"
+#include "statistics/stats.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+
+static VacAttrStats **lookup_var_attr_stats(int2vector *attrs,
+                                         int natts, VacAttrStats **vacattrstats);
+
+static List *list_ext_stats(Oid relid);
+
+static void update_ext_stats(Oid statOid, MVNDistinct ndistinct,
+                                int2vector *attrs, VacAttrStats **stats);
+
+
+/*
+ * Compute requested extended stats, using the rows sampled for the plain
+ * (single-column) stats.
+ *
+ * This fetches a list of stats from pg_statistic_ext, computes the stats
+ * and serializes them back into the catalog (as bytea values).
+ */
+void
+build_ext_stats(Relation onerel, double totalrows,
+                               int numrows, HeapTuple *rows,
+                               int natts, VacAttrStats **vacattrstats)
+{
+       ListCell   *lc;
+       List       *stats;
+
+       TupleDesc       tupdesc = RelationGetDescr(onerel);
+
+       /* Fetch defined statistics from pg_statistic_ext, and compute them. */
+       stats = list_ext_stats(RelationGetRelid(onerel));
+
+       foreach(lc, stats)
+       {
+               int                     j;
+               StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc);
+               MVNDistinct ndistinct = NULL;
+
+               VacAttrStats **stats = NULL;
+               int                     numatts = 0;
+
+               /* int2 vector of attnums the stats should be computed on */
+               int2vector *attrs = stat->stakeys;
+
+               /* see how many of the columns are not dropped */
+               for (j = 0; j < attrs->dim1; j++)
+                       if (!tupdesc->attrs[attrs->values[j] - 1]->attisdropped)
+                               numatts += 1;
+
+               /* if there are dropped attributes, build a filtered int2vector */
+               if (numatts != attrs->dim1)
+               {
+                       int16      *tmp = palloc0(numatts * sizeof(int16));
+                       int                     attnum = 0;
+
+                       for (j = 0; j < attrs->dim1; j++)
+                               if (!tupdesc->attrs[attrs->values[j] - 1]->attisdropped)
+                                       tmp[attnum++] = attrs->values[j];
+
+                       pfree(attrs);
+                       attrs = buildint2vector(tmp, numatts);
+               }
+
+               /* filter only the interesting vacattrstats records */
+               stats = lookup_var_attr_stats(attrs, natts, vacattrstats);
+
+               /* check allowed number of dimensions */
+               Assert((attrs->dim1 >= 2) && (attrs->dim1 <= STATS_MAX_DIMENSIONS));
+
+               /* compute ndistinct coefficients */
+               if (stat->ndist_enabled)
+                       ndistinct = build_ext_ndistinct(totalrows, numrows, rows, attrs, stats);
+
+               /* store the statistics in the catalog */
+               update_ext_stats(stat->statOid, ndistinct, attrs, stats);
+       }
+}
+
+/*
+ * Lookup the VacAttrStats info for the selected columns, with indexes
+ * matching the attrs vector (to make it easy to work with when
+ * computing extended stats).
+ */
+static VacAttrStats **
+lookup_var_attr_stats(int2vector *attrs, int natts, VacAttrStats **vacattrstats)
+{
+       int                     i,
+                               j;
+       int                     numattrs = attrs->dim1;
+       VacAttrStats **stats = (VacAttrStats **) palloc0(numattrs * sizeof(VacAttrStats *));
+
+       /* lookup VacAttrStats info for the requested columns (same attnum) */
+       for (i = 0; i < numattrs; i++)
+       {
+               stats[i] = NULL;
+               for (j = 0; j < natts; j++)
+               {
+                       if (attrs->values[i] == vacattrstats[j]->tupattnum)
+                       {
+                               stats[i] = vacattrstats[j];
+                               break;
+                       }
+               }
+
+               /*
+                * Check that we found the info, that the attnum matches, and that
+                * the requested 'lt' operator is actually available.
+                */
+               Assert(stats[i] != NULL);
+               Assert(stats[i]->tupattnum == attrs->values[i]);
+
+               /*
+                * FIXME This is a rather ugly way to check for 'ltopr' (which is
+                * defined for 'scalar' attributes).
+                */
+               Assert(((StdAnalyzeData *) stats[i]->extra_data)->ltopr != InvalidOid);
+       }
+
+       return stats;
+}
+
+/*
+ * Fetch list of MV stats defined on a table, without the actual data
+ * for histograms, MCV lists etc.
+ */
+static List *
+list_ext_stats(Oid relid)
+{
+       Relation        indrel;
+       SysScanDesc indscan;
+       ScanKeyData skey;
+       HeapTuple       htup;
+       List       *result = NIL;
+
+       /*
+        * Prepare to scan pg_statistic_ext for entries having starelid = this
+        * rel.
+        */
+       ScanKeyInit(&skey,
+                               Anum_pg_statistic_ext_starelid,
+                               BTEqualStrategyNumber, F_OIDEQ,
+                               ObjectIdGetDatum(relid));
+
+       indrel = heap_open(StatisticExtRelationId, AccessShareLock);
+       indscan = systable_beginscan(indrel, StatisticExtRelidIndexId, true,
+                                                                NULL, 1, &skey);
+
+       while (HeapTupleIsValid(htup = systable_getnext(indscan)))
+       {
+               StatisticExtInfo *info = makeNode(StatisticExtInfo);
+               Form_pg_statistic_ext stats = (Form_pg_statistic_ext) GETSTRUCT(htup);
+
+               info->statOid = HeapTupleGetOid(htup);
+               info->stakeys = buildint2vector(stats->stakeys.values, stats->stakeys.dim1);
+
+               info->ndist_enabled = stats_are_enabled(htup, STATS_EXT_NDISTINCT);
+               info->ndist_built = stats_are_built(htup, STATS_EXT_NDISTINCT);
+
+               result = lappend(result, info);
+       }
+
+       systable_endscan(indscan);
+
+       heap_close(indrel, AccessShareLock);
+
+       /*
+        * TODO maybe save the list into the relcache, as in RelationGetIndexList
+        * (which was used as an inspiration for this one).
+        */
+
+       return result;
+}
+
+/*
+ * update_ext_stats
+ *     Serializes the statistics and stores them into the pg_statistic_ext tuple.
+ */
+static void
+update_ext_stats(Oid statOid, MVNDistinct ndistinct,
+                                int2vector *attrs, VacAttrStats **stats)
+{
+       HeapTuple       stup,
+                               oldtup;
+       Datum           values[Natts_pg_statistic_ext];
+       bool            nulls[Natts_pg_statistic_ext];
+       bool            replaces[Natts_pg_statistic_ext];
+
+       Relation        sd = heap_open(StatisticExtRelationId, RowExclusiveLock);
+
+       memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool));
+       memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool));
+       memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum));
+
+       /*
+        * Construct a new pg_statistic_ext tuple - replace only the ndistinct
+        * value, depending on whether it actually was computed.
+        */
+       if (ndistinct != NULL)
+       {
+               bytea      *data = serialize_ext_ndistinct(ndistinct);
+
+               nulls[Anum_pg_statistic_ext_standistinct - 1] = (data == NULL);
+               values[Anum_pg_statistic_ext_standistinct - 1] = PointerGetDatum(data);
+       }
+
+       /* always replace the value (either by bytea or NULL) */
+       replaces[Anum_pg_statistic_ext_standistinct - 1] = true;
+
+       /* the stakeys value is always non-NULL */
+       nulls[Anum_pg_statistic_ext_stakeys - 1] = false;
+
+       /* use the new attnums, in case we removed some dropped ones */
+       replaces[Anum_pg_statistic_ext_stakeys - 1] = true;
+
+       values[Anum_pg_statistic_ext_stakeys - 1] = PointerGetDatum(attrs);
+
+       /* Is there already a pg_statistic_ext tuple for this statistics object? */
+       oldtup = SearchSysCache1(STATEXTOID,
+                                                        ObjectIdGetDatum(statOid));
+
+       if (!HeapTupleIsValid(oldtup))
+               elog(ERROR, "cache lookup failed for extended statistics %u", statOid);
+
+       /* replace it */
+       stup = heap_modify_tuple(oldtup,
+                                                        RelationGetDescr(sd),
+                                                        values,
+                                                        nulls,
+                                                        replaces);
+       ReleaseSysCache(oldtup);
+       CatalogTupleUpdate(sd, &stup->t_self, stup);
+
+       heap_freetuple(stup);
+       heap_close(sd, RowExclusiveLock);
+}
+
+/* multi-variate stats comparator */
+
+/*
+ * qsort_arg comparator for sorting Datums (MV stats)
+ *
+ * This does not maintain the tupnoLink array.
+ */
+int
+compare_scalars_simple(const void *a, const void *b, void *arg)
+{
+       Datum           da = *(Datum *) a;
+       Datum           db = *(Datum *) b;
+       SortSupport ssup = (SortSupport) arg;
+
+       return ApplySortComparator(da, false, db, false, ssup);
+}
+
+/*
+ * qsort_arg comparator for sorting data when partitioning a MV bucket
+ */
+int
+compare_scalars_partition(const void *a, const void *b, void *arg)
+{
+       Datum           da = ((ScalarItem *) a)->value;
+       Datum           db = ((ScalarItem *) b)->value;
+       SortSupport ssup = (SortSupport) arg;
+
+       return ApplySortComparator(da, false, db, false, ssup);
+}
+
+/* initialize multi-dimensional sort */
+MultiSortSupport
+multi_sort_init(int ndims)
+{
+       MultiSortSupport mss;
+
+       Assert(ndims >= 2);
+
+       mss = (MultiSortSupport) palloc0(offsetof(MultiSortSupportData, ssup)
+                                                                        + sizeof(SortSupportData) * ndims);
+
+       mss->ndims = ndims;
+
+       return mss;
+}
+
+/*
+ * Prepare sort support info for dimension 'dim' (index into vacattrstats) to
+ * 'mss', at the position 'sortdim'
+ */
+void
+multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
+                                                int dim, VacAttrStats **vacattrstats)
+{
+       /* first, lookup StdAnalyzeData for the dimension (attribute) */
+       SortSupportData ssup;
+       StdAnalyzeData *tmp = (StdAnalyzeData *) vacattrstats[dim]->extra_data;
+
+       Assert(mss != NULL);
+       Assert(sortdim < mss->ndims);
+
+       /* initialize sort support, etc. */
+       memset(&ssup, 0, sizeof(ssup));
+       ssup.ssup_cxt = CurrentMemoryContext;
+
+       /* We always use the default collation for statistics */
+       ssup.ssup_collation = DEFAULT_COLLATION_OID;
+       ssup.ssup_nulls_first = false;
+
+       PrepareSortSupportFromOrderingOp(tmp->ltopr, &ssup);
+
+       mss->ssup[sortdim] = ssup;
+}
+
+/* compare all the dimensions in the selected order */
+int
+multi_sort_compare(const void *a, const void *b, void *arg)
+{
+       int                     i;
+       SortItem   *ia = (SortItem *) a;
+       SortItem   *ib = (SortItem *) b;
+
+       MultiSortSupport mss = (MultiSortSupport) arg;
+
+       for (i = 0; i < mss->ndims; i++)
+       {
+               int                     compare;
+
+               compare = ApplySortComparator(ia->values[i], ia->isnull[i],
+                                                                         ib->values[i], ib->isnull[i],
+                                                                         &mss->ssup[i]);
+
+               if (compare != 0)
+                       return compare;
+       }
+
+       /* equal by default */
+       return 0;
+}
+
+/* compare selected dimension */
+int
+multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b,
+                                          MultiSortSupport mss)
+{
+       return ApplySortComparator(a->values[dim], a->isnull[dim],
+                                                          b->values[dim], b->isnull[dim],
+                                                          &mss->ssup[dim]);
+}
+
+int
+multi_sort_compare_dims(int start, int end,
+                                               const SortItem *a, const 
SortItem *b,
+                                               MultiSortSupport mss)
+{
+       int                     dim;
+
+       for (dim = start; dim <= end; dim++)
+       {
+               int                     r = ApplySortComparator(a->values[dim], a->isnull[dim],
+                                                                                       b->values[dim], b->isnull[dim],
+                                                                                       &mss->ssup[dim]);
+
+               if (r != 0)
+                       return r;
+       }
+
+       return 0;
+}
+
+bool
+stats_are_enabled(HeapTuple htup, char type)
+{
+       Datum           datum;
+       bool            isnull;
+       int                     i,
+                               nenabled;
+       char       *enabled;
+       ArrayType  *enabledArray;
+
+       /* see which statistics are enabled */
+       datum = SysCacheGetAttr(STATEXTOID, htup,
+                                                       Anum_pg_statistic_ext_staenabled, &isnull);
+
+       /* if there are no values in the staenabled field, nothing is enabled */
+       if (isnull || (datum == PointerGetDatum(NULL)))
+               return false;
+
+       /*
+        * We expect the array to be a 1-D CHAR array; verify that. We don't need
+        * to use deconstruct_array() since the array data is just going to look
+        * like a C array of char values.
+        */
+       enabledArray = DatumGetArrayTypeP(datum);
+
+       if (ARR_NDIM(enabledArray) != 1 ||
+               ARR_HASNULL(enabledArray) ||
+               ARR_ELEMTYPE(enabledArray) != CHAROID)
+               elog(ERROR, "enabled statistics (staenabled) is not a 1-D char array");
+
+       nenabled = ARR_DIMS(enabledArray)[0];
+       enabled = (char *) ARR_DATA_PTR(enabledArray);
+
+       for (i = 0; i < nenabled; i++)
+               if (enabled[i] == type)
+                       return true;
+
+       return false;
+}
+
+bool
+stats_are_built(HeapTuple htup, char type)
+{
+       bool            isnull;
+
+       switch (type)
+       {
+               case STATS_EXT_NDISTINCT:
+                       SysCacheGetAttr(STATEXTOID, htup,
+                                                       Anum_pg_statistic_ext_standistinct, &isnull);
+                       break;
+
+               default:
+                       elog(ERROR, "unexpected statistics type requested: %d", type);
+       }
+
+       return !isnull;
+}
diff --git a/src/backend/statistics/mvdist.c b/src/backend/statistics/mvdist.c
new file mode 100644
index 0000000..8f318da
--- /dev/null
+++ b/src/backend/statistics/mvdist.c
@@ -0,0 +1,621 @@
+/*-------------------------------------------------------------------------
+ *
+ * mvdist.c
+ *       POSTGRES multivariate ndistinct coefficients
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/statistics/mvdist.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/htup_details.h"
+#include "catalog/pg_statistic_ext.h"
+#include "utils/fmgrprotos.h"
+#include "utils/lsyscache.h"
+#include "lib/stringinfo.h"
+#include "utils/syscache.h"
+#include "statistics/common.h"
+#include "statistics/stats.h"
+
+
+static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
+
+/* internal state for generator of k-combinations of n elements */
+typedef struct CombinationGeneratorData
+{
+
+       int                     k;                              /* size of the combination */
+       int                     current;                /* index of the next combination to return */
+
+       int                     ncombinations;  /* number of combinations (size of array) */
+       AttrNumber *combinations;       /* array of pre-built combinations */
+
+} CombinationGeneratorData;
+
+typedef CombinationGeneratorData *CombinationGenerator;
+
+/* generator API */
+static CombinationGenerator generator_init(int2vector *attrs, int k);
+static void generator_free(CombinationGenerator state);
+static AttrNumber *generator_next(CombinationGenerator state, int2vector *attrs);
+
+static int     n_choose_k(int n, int k);
+static int     num_combinations(int n);
+static double ndistinct_for_combination(double totalrows, int numrows,
+                                       HeapTuple *rows, int2vector *attrs, VacAttrStats **stats,
+                                                 int k, AttrNumber *combination);
+
+/*
+ * Compute ndistinct coefficient for the combination of attributes. This
+ * computes the ndistinct estimate using the same estimator used in analyze.c
+ * and then computes the coefficient.
+ */
+MVNDistinct
+build_ext_ndistinct(double totalrows, int numrows, HeapTuple *rows,
+                                       int2vector *attrs, VacAttrStats **stats)
+{
+       int                     i,
+                               k;
+       int                     numattrs = attrs->dim1;
+       int                     numcombs = num_combinations(numattrs);
+
+       MVNDistinct result;
+
+       result = palloc0(offsetof(MVNDistinctData, items) +
+                                        numcombs * sizeof(MVNDistinctItem));
+
+       result->nitems = numcombs;
+
+       i = 0;
+       for (k = 2; k <= numattrs; k++)
+       {
+               AttrNumber *combination;
+               CombinationGenerator generator;
+
+               generator = generator_init(attrs, k);
+
+               while ((combination = generator_next(generator, attrs)))
+               {
+                       MVNDistinctItem *item = &result->items[i++];
+
+                       item->nattrs = k;
+                       item->ndistinct = ndistinct_for_combination(totalrows, numrows, rows,
+                                                                                          attrs, stats, k, combination);
+
+                       item->attrs = palloc(k * sizeof(AttrNumber));
+                       memcpy(item->attrs, combination, k * sizeof(AttrNumber));
+
+                       /* must not overflow the output array */
+                       Assert(i <= result->nitems);
+               }
+
+               generator_free(generator);
+       }
+
+       /* must consume exactly the whole output array */
+       Assert(i == result->nitems);
+
+       return result;
+}
+
+/*
+ * ndistinct_for_combination
+ *     Estimates number of distinct values in a combination of columns.
+ *
+ * This uses the same ndistinct estimator as compute_scalar_stats() in
+ * ANALYZE, i.e.,
+ *             n*d / (n - f1 + f1*n/N)
+ *
+ * except that instead of values in a single column we are dealing with
+ * combination of multiple columns.
+ */
+static double
+ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
+                                                 int2vector *attrs, VacAttrStats **stats,
+                                                 int k, AttrNumber *combination)
+{
+       int                     i,
+                               j;
+       int                     f1,
+                               cnt,
+                               d;
+       int                     nmultiple,
+                               summultiple;
+       bool       *isnull;
+       Datum      *values;
+       SortItem   *items;
+       MultiSortSupport mss;
+
+       /*
+        * It's possible to sort the sample rows directly, but this seemed somehow
+        * simpler / less error prone. Another option would be to allocate the
+        * arrays for each SortItem separately, but that'd be significant overhead
+        * (not just CPU, but especially memory bloat).
+        */
+       mss = multi_sort_init(k);
+       items = (SortItem *) palloc0(numrows * sizeof(SortItem));
+       values = (Datum *) palloc0(sizeof(Datum) * numrows * k);
+       isnull = (bool *) palloc0(sizeof(bool) * numrows * k);
+
+       Assert((k >= 2) && (k <= attrs->dim1));
+
+       for (i = 0; i < numrows; i++)
+       {
+               items[i].values = &values[i * k];
+               items[i].isnull = &isnull[i * k];
+       }
+
+       for (i = 0; i < k; i++)
+       {
+               /* prepare the sort function for this dimension */
+               multi_sort_add_dimension(mss, i, combination[i], stats);
+
+               /* accumulate all the data into the array and sort it */
+               for (j = 0; j < numrows; j++)
+               {
+                       items[j].values[i] =
+                               heap_getattr(rows[j], attrs->values[combination[i]],
+                                                        stats[combination[i]]->tupDesc,
+                                                        &items[j].isnull[i]);
+               }
+       }
+
+       qsort_arg((void *) items, numrows, sizeof(SortItem),
+                         multi_sort_compare, mss);
+
+       /* count number of distinct combinations */
+
+       f1 = 0;
+       cnt = 1;
+       d = 1;
+       nmultiple = 0;
+       summultiple = 0;
+       for (i = 1; i < numrows; i++)
+       {
+               if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
+               {
+                       if (cnt == 1)
+                               f1 += 1;
+                       else
+                       {
+                               nmultiple += 1;
+                               summultiple += cnt;
+                       }
+
+                       d++;
+                       cnt = 0;
+               }
+
+               cnt += 1;
+       }
+
+       if (cnt == 1)
+               f1 += 1;
+       else
+       {
+               nmultiple += 1;
+               summultiple += cnt;
+       }
+
+       return estimate_ndistinct(totalrows, numrows, d, f1);
+}
+
+MVNDistinct
+load_ext_ndistinct(Oid mvoid)
+{
+       bool            isnull = false;
+       Datum           ndist;
+
+       /*
+        * Fetch the pg_statistic_ext tuple for the statistics object from the
+        * syscache.
+        */
+       HeapTuple       htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid));
+
+       Assert(stats_are_enabled(htup, STATS_EXT_NDISTINCT));
+       Assert(stats_are_built(htup, STATS_EXT_NDISTINCT));
+
+       ndist = SysCacheGetAttr(STATEXTOID, htup,
+                                                       Anum_pg_statistic_ext_standistinct, &isnull);
+
+       Assert(!isnull);
+
+       ReleaseSysCache(htup);
+
+       return deserialize_ext_ndistinct(DatumGetByteaP(ndist));
+}
+
+/* The Duj1 estimator (already used in analyze.c). */
+static double
+estimate_ndistinct(double totalrows, int numrows, int d, int f1)
+{
+       double          numer,
+                               denom,
+                               ndistinct;
+
+       numer = (double) numrows *(double) d;
+
+       denom = (double) (numrows - f1) +
+               (double) f1 *(double) numrows / totalrows;
+
+       ndistinct = numer / denom;
+
+       /* Clamp to sane range in case of roundoff error */
+       if (ndistinct < (double) d)
+               ndistinct = (double) d;
+
+       if (ndistinct > totalrows)
+               ndistinct = totalrows;
+
+       return floor(ndistinct + 0.5);
+}
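
To make the estimator concrete, here is the same arithmetic spelled out in
SQL, with made-up inputs (numrows = 30000 sampled rows, totalrows = 1000000,
d = 2500 distinct combinations in the sample, f1 = 1000 of them seen only
once):

    SELECT floor((30000.0 * 2500) /
                 ((30000 - 1000) + 1000 * 30000.0 / 1000000) + 0.5) AS ndistinct;
    -- 2584, i.e. the sample estimate scaled towards the whole table
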
+
+/*
+ * pg_ndistinct_in             - input routine for type pg_ndistinct.
+ *
+ * pg_ndistinct is real enough to be a table column, but it has no operations
+ * of its own, and disallows input too
+ *
+ * XXX This is inspired by what pg_node_tree does.
+ */
+Datum
+pg_ndistinct_in(PG_FUNCTION_ARGS)
+{
+       /*
+        * pg_ndistinct stores the data in binary form and parsing text input is
+        * not needed, so disallow this.
+        */
+       ereport(ERROR,
+                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                        errmsg("cannot accept a value of type %s", "pg_ndistinct")));
+
+       PG_RETURN_VOID();                       /* keep compiler quiet */
+}
+
+/*
+ * pg_ndistinct_out            - output routine for type pg_ndistinct.
+ *
+ * The ndistinct coefficients are serialized into a bytea value, so we
+ * deserialize them and print a simple text representation. It'd be nice to
+ * produce a more meaningful representation (e.g. for inspection by people).
+ */
+Datum
+pg_ndistinct_out(PG_FUNCTION_ARGS)
+{
+       int                     i,
+                               j;
+       StringInfoData str;
+
+       bytea      *data = PG_GETARG_BYTEA_PP(0);
+
+       MVNDistinct ndist = deserialize_ext_ndistinct(data);
+
+       initStringInfo(&str);
+       appendStringInfoChar(&str, '[');
+
+       for (i = 0; i < ndist->nitems; i++)
+       {
+               MVNDistinctItem item = ndist->items[i];
+
+               if (i > 0)
+                       appendStringInfoString(&str, ", ");
+
+               appendStringInfoChar(&str, '{');
+
+               for (j = 0; j < item.nattrs; j++)
+               {
+                       if (j > 0)
+                               appendStringInfoString(&str, ", ");
+
+                       appendStringInfo(&str, "%d", item.attrs[j]);
+               }
+
+               appendStringInfo(&str, ", %f", item.ndistinct);
+
+               appendStringInfoChar(&str, '}');
+       }
+
+       appendStringInfoChar(&str, ']');
+
+       PG_RETURN_CSTRING(str.data);
+}
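
Given the output function above, the serialized coefficients can be inspected
straight from the catalog; the values shown below are of course only
illustrative:

    SELECT staname, standistinct FROM pg_statistic_ext;
    --  staname |     standistinct
    -- ---------+-----------------------
    --  s1      | [{1, 2, 101.000000}]
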
+
+/*
+ * pg_ndistinct_recv           - binary input routine for type pg_ndistinct.
+ */
+Datum
+pg_ndistinct_recv(PG_FUNCTION_ARGS)
+{
+       ereport(ERROR,
+                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                        errmsg("cannot accept a value of type %s", "pg_ndistinct")));
+
+       PG_RETURN_VOID();                       /* keep compiler quiet */
+}
+
+/*
+ * pg_ndistinct_send           - binary output routine for type pg_ndistinct.
+ *
+ * XXX The ndistinct coefficients are serialized into a bytea value, so let's
+ * just send that.
+ */
+Datum
+pg_ndistinct_send(PG_FUNCTION_ARGS)
+{
+       return byteasend(fcinfo);
+}
+
+/*
+ * n_choose_k
+ *             computes binomial coefficients using an algorithm that is both
+ *             efficient and resistant to overflows
+ */
+static int
+n_choose_k(int n, int k)
+{
+       int                     d,
+                               r;
+
+       Assert((k > 0) && (n >= k));
+
+       /* use symmetry of the binomial coefficients */
+       k = Min(k, n - k);
+
+       r = 1;
+       for (d = 1; d <= k; ++d)
+       {
+               r *= n--;
+               r /= d;
+       }
+
+       return r;
+}
+
+/*
+ * num_combinations
+ *             computes number of combinations, excluding single-value combinations
+ */
+static int
+num_combinations(int n)
+{
+       int                     k;
+       int                     ncombs = 1;
+
+       for (k = 1; k <= n; k++)
+               ncombs *= 2;
+
+       ncombs -= (n + 1);
+
+       return ncombs;
+}
+
+/*
+ * generate all combinations (k elements from n)
+ */
+static void
+generate_combinations_recurse(CombinationGenerator state, AttrNumber n,
+                                                       int index, AttrNumber start, AttrNumber *current)
+{
+       /* If we haven't filled all the elements, simply recurse. */
+       if (index < state->k)
+       {
+               AttrNumber      i;
+
+               /*
+                * The values have to be in ascending order, so make sure we start
+                * with the value passed by parameter.
+                */
+
+               for (i = start; i < n; i++)
+               {
+                       current[index] = i;
+                       generate_combinations_recurse(state, n, (index + 1), (i + 1), current);
+               }
+
+               return;
+       }
+       else
+       {
+               /* we got a correct combination */
+               state->combinations = (AttrNumber *) repalloc(state->combinations,
+                                          state->k * (state->current + 1) * sizeof(AttrNumber));
+               memcpy(&state->combinations[(state->k * state->current)],
+                          current, state->k * sizeof(AttrNumber));
+               state->current++;
+       }
+}
+
+/* generate all k-combinations of n elements */
+static void
+generate_combinations(CombinationGenerator state, int n)
+{
+       AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
+
+       generate_combinations_recurse(state, n, 0, 0, current);
+
+       pfree(current);
+}
+
+/*
+ * initialize the generator of combinations, and prebuild them.
+ *
+ * This pre-builds all the combinations. We could also generate them in
+ * generator_next(), but this seems simpler.
+ */
+static CombinationGenerator
+generator_init(int2vector *attrs, int k)
+{
+       int                     n = attrs->dim1;
+       CombinationGenerator state;
+
+       Assert((n >= k) && (k > 0));
+
+       /* allocate the generator state and an initial array for the combinations */
+       state = (CombinationGenerator) palloc0(sizeof(CombinationGeneratorData));
+       state->combinations = (AttrNumber *) palloc(k * sizeof(AttrNumber));
+
+       state->ncombinations = n_choose_k(n, k);
+       state->current = 0;
+       state->k = k;
+
+       /* now actually pre-generate all the combinations */
+       generate_combinations(state, n);
+
+       /* make sure we got the expected number of combinations */
+       Assert(state->current == state->ncombinations);
+
+       /* reset the number, so we start with the first one */
+       state->current = 0;
+
+       return state;
+}
+
+/* free the generator state */
+static void
+generator_free(CombinationGenerator state)
+{
+       /* free the pre-built combinations and the state itself */
+       pfree(state->combinations);
+       pfree(state);
+}
+
+/* generate next combination */
+static AttrNumber *
+generator_next(CombinationGenerator state, int2vector *attrs)
+{
+       if (state->current == state->ncombinations)
+               return NULL;
+
+       return &state->combinations[state->k * state->current++];
+}
+
+/*
+ * serialize list of ndistinct items into a bytea
+ */
+bytea *
+serialize_ext_ndistinct(MVNDistinct ndistinct)
+{
+       int                     i;
+       bytea      *output;
+       char       *tmp;
+
+       /* we need to store nitems */
+       Size            len = VARHDRSZ + offsetof(MVNDistinctData, items) +
+       ndistinct->nitems * offsetof(MVNDistinctItem, attrs);
+
+       /* and also include space for the actual attribute numbers */
+       for (i = 0; i < ndistinct->nitems; i++)
+               len += (sizeof(AttrNumber) * ndistinct->items[i].nattrs);
+
+       output = (bytea *) palloc0(len);
+       SET_VARSIZE(output, len);
+
+       tmp = VARDATA(output);
+
+       ndistinct->magic = STATS_NDISTINCT_MAGIC;
+       ndistinct->type = STATS_NDISTINCT_TYPE_BASIC;
+
+       /* first, store the number of items */
+       memcpy(tmp, ndistinct, offsetof(MVNDistinctData, items));
+       tmp += offsetof(MVNDistinctData, items);
+
+       /*
+        * store number of attributes and attribute numbers for each ndistinct
+        * entry
+        */
+       for (i = 0; i < ndistinct->nitems; i++)
+       {
+               MVNDistinctItem item = ndistinct->items[i];
+
+               memcpy(tmp, &item, offsetof(MVNDistinctItem, attrs));
+               tmp += offsetof(MVNDistinctItem, attrs);
+
+               memcpy(tmp, item.attrs, sizeof(AttrNumber) * item.nattrs);
+               tmp += sizeof(AttrNumber) * item.nattrs;
+
+               Assert(tmp <= ((char *) output + len));
+       }
+
+       return output;
+}
+
+/*
+ * Reads serialized ndistinct into MVNDistinct structure.
+ */
+MVNDistinct
+deserialize_ext_ndistinct(bytea *data)
+{
+       int                     i;
+       Size            expected_size;
+       MVNDistinct ndistinct;
+       char       *tmp;
+
+       if (data == NULL)
+               return NULL;
+
+       if (VARSIZE_ANY_EXHDR(data) < offsetof(MVNDistinctData, items))
+               elog(ERROR, "invalid MVNDistinct size %ld (expected at least %ld)",
+                        VARSIZE_ANY_EXHDR(data), offsetof(MVNDistinctData, items));
+
+       /* read the MVNDistinct header */
+       ndistinct = (MVNDistinct) palloc0(sizeof(MVNDistinctData));
+
+       /* initialize pointer to the data part (skip the varlena header) */
+       tmp = VARDATA_ANY(data);
+
+       /* get the header and perform basic sanity checks */
+       memcpy(ndistinct, tmp, offsetof(MVNDistinctData, items));
+       tmp += offsetof(MVNDistinctData, items);
+
+       if (ndistinct->magic != STATS_NDISTINCT_MAGIC)
+               elog(ERROR, "invalid ndistinct magic %d (expected %d)",
+                        ndistinct->magic, STATS_NDISTINCT_MAGIC);
+
+       if (ndistinct->type != STATS_NDISTINCT_TYPE_BASIC)
+               elog(ERROR, "invalid ndistinct type %d (expected %d)",
+                        ndistinct->type, STATS_NDISTINCT_TYPE_BASIC);
+
+       Assert(ndistinct->nitems > 0);
+
+       /* what minimum bytea size do we expect for those parameters */
+       expected_size = offsetof(MVNDistinctData, items) +
+               ndistinct->nitems * (offsetof(MVNDistinctItem, attrs) +
+                                                        sizeof(AttrNumber) * 2);
+
+       if (VARSIZE_ANY_EXHDR(data) < expected_size)
+               elog(ERROR, "invalid ndistinct size %ld (expected at least %ld)",
+                        VARSIZE_ANY_EXHDR(data), expected_size);
+
+       /* allocate space for the ndistinct items */
+       ndistinct = repalloc(ndistinct, offsetof(MVNDistinctData, items) +
+                                                (ndistinct->nitems * sizeof(MVNDistinctItem)));
+
+       for (i = 0; i < ndistinct->nitems; i++)
+       {
+               MVNDistinctItem *item = &ndistinct->items[i];
+
+               /* number of attributes */
+               memcpy(item, tmp, offsetof(MVNDistinctItem, attrs));
+               tmp += offsetof(MVNDistinctItem, attrs);
+
+               /* is the number of attributes valid? */
+               Assert((item->nattrs >= 2) && (item->nattrs <= STATS_MAX_DIMENSIONS));
+
+               /* now that we know the number of attributes, allocate the array */
+               item->attrs = (AttrNumber *) palloc0(item->nattrs * sizeof(AttrNumber));
+
+               /* copy attribute numbers */
+               memcpy(item->attrs, tmp, sizeof(AttrNumber) * item->nattrs);
+               tmp += sizeof(AttrNumber) * item->nattrs;
+
+               /* still within the bytea */
+               Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
+       }
+
+       /* we should have consumed the whole bytea exactly */
+       Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
+
+       return ndistinct;
+}
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 20b5273..0af8c34 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -1623,6 +1623,10 @@ ProcessUtilitySlow(ParseState *pstate,
                                commandCollected = true;
                                break;
 
+                       case T_CreateStatsStmt:         /* CREATE STATISTICS */
+                               address = CreateStatistics((CreateStatsStmt *) parsetree);
+                               break;
+
                        default:
                                elog(ERROR, "unrecognized node type: %d",
                                         (int) nodeTag(parsetree));
@@ -1988,6 +1992,8 @@ AlterObjectTypeCommandTag(ObjectType objtype)
                        break;
                case OBJECT_SUBSCRIPTION:
                        tag = "ALTER SUBSCRIPTION";
+                       break;
+               case OBJECT_STATISTICS:
+                       tag = "ALTER STATISTICS";
                        break;
                default:
                        tag = "???";
@@ -2282,6 +2288,8 @@ CreateCommandTag(Node *parsetree)
                                        break;
                                case OBJECT_PUBLICATION:
                                        tag = "DROP PUBLICATION";
+                                       break;
+                               case OBJECT_STATISTICS:
+                                       tag = "DROP STATISTICS";
                                        break;
                                default:
                                        tag = "???";
@@ -2681,6 +2689,10 @@ CreateCommandTag(Node *parsetree)
                        tag = "EXECUTE";
                        break;
 
+               case T_CreateStatsStmt:
+                       tag = "CREATE STATISTICS";
+                       break;
+
                case T_DeallocateStmt:
                        {
                                DeallocateStmt *stmt = (DeallocateStmt *) parsetree;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 04bd9b9..5ea9e5b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -126,6 +126,7 @@
 #include "parser/parse_clause.h"
 #include "parser/parse_coerce.h"
 #include "parser/parsetree.h"
+#include "statistics/stats.h"
 #include "utils/builtins.h"
 #include "utils/bytea.h"
 #include "utils/date.h"
@@ -208,6 +209,8 @@ static Const *string_to_const(const char *str, Oid datatype);
 static Const *string_to_bytea_const(const char *str, size_t str_len);
 static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
 
+static double find_ndistinct(PlannerInfo *root, RelOptInfo *rel, List *varinfos,
+                          bool *found);
 
 /*
  *             eqsel                   - Selectivity of "=" for any data types.
@@ -3437,12 +3440,26 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
                         * don't know by how much.  We should never clamp to less than the
                         * largest ndistinct value for any of the Vars, though, since
                         * there will surely be at least that many groups.
+                        *
+                        * However, we don't need to do this if we have ndistinct stats on
+                        * the columns - in that case we can simply use the coefficient to
+                        * get the (probably way more accurate) estimate.
+                        *
+                        * XXX Might benefit from some refactoring, mixing the ndistinct
+                        * coefficients and clamp seems a bit unfortunate.
                         */
                        double          clamp = rel->tuples;
 
                        if (relvarcount > 1)
                        {
-                               clamp *= 0.1;
+                               bool            found;
+                               double          ndist = find_ndistinct(root, rel, varinfos, &found);
+
+                               if (found)
+                                       reldistinct = ndist;
+                               else
+                                       clamp *= 0.1;
+
                                if (clamp < relmaxndistinct)
                                {
                                        clamp = relmaxndistinct;
@@ -3451,6 +3468,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
                                                clamp = rel->tuples;
                                }
                        }
+
                        if (reldistinct > clamp)
                                reldistinct = clamp;
 
@@ -7592,3 +7610,155 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 
        /* XXX what about pages_per_range? */
 }
+
+/*
+ * Find applicable ndistinct statistics and compute the coefficient to
+ * correct the estimate (simply a product of per-column ndistincts).
+ *
+ * XXX Currently we only look for a perfect match, i.e. a single ndistinct
+ * estimate exactly matching all the columns of the statistics. This may be
+ * a bit problematic as adding a column (not covered by the ndistinct stats)
+ * will prevent us from using the stats entirely. So instead this needs to
+ * estimate the covered attributes, and then combine that with the extra
+ * attributes somehow (probably the old way).
+ */
+static double
+find_ndistinct(PlannerInfo *root, RelOptInfo *rel, List *varinfos, bool *found)
+{
+       ListCell   *lc;
+       Bitmapset  *attnums = NULL;
+       VariableStatData vardata;
+
+       /* assume we haven't found any suitable ndistinct statistics */
+       *found = false;
+
+       /* bail out immediately if the table has no extended statistics */
+       if (!rel->statlist)
+               return 0.0;
+
+       foreach(lc, varinfos)
+       {
+               GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
+
+               if (varinfo->rel != rel)
+                       continue;
+
+               /* FIXME handle general expressions, not only plain Vars */
+
+               /*
+                * examine the variable (or expression) so that we know which
+                * attribute we're dealing with - we need this for matching the
+                * ndistinct coefficient
+                *
+                * FIXME we could probably remember this from estimate_num_groups
+                */
+               examine_variable(root, varinfo->var, 0, &vardata);
+
+               if (HeapTupleIsValid(vardata.statsTuple))
+               {
+                       Form_pg_statistic stats
+                       = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+
+                       attnums = bms_add_member(attnums, stats->staattnum);
+
+                       ReleaseVariableStats(vardata);
+               }
+       }
+
+       /* look for a matching ndistinct statistics */
+       foreach(lc, rel->statlist)
+       {
+               int                     i,
+                                       k;
+               bool            matches;
+               StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+
+               /* skip statistics without ndistinct coefficient built */
+               if (!info->ndist_built)
+                       continue;
+
+               /*
+                * Only ndistinct stats covering all Vars are acceptable, which can't
+                * happen if the statistics has fewer attributes than we have Vars.
+                */
+               if (bms_num_members(attnums) > info->stakeys->dim1)
+                       continue;
+
+               /* check that all Vars are covered by the statistic */
+               matches = true;                 /* assume match until we find unmatched
+                                                                * attribute */
+               k = -1;
+               while ((k = bms_next_member(attnums, k)) >= 0)
+               {
+                       bool            attr_found = false;
+
+                       for (i = 0; i < info->stakeys->dim1; i++)
+                       {
+                               if (info->stakeys->values[i] == k)
+                               {
+                                       attr_found = true;
+                                       break;
+                               }
+                       }
+
+                       /* found attribute not covered by this ndistinct stats, skip */
+                       if (!attr_found)
+                       {
+                               matches = false;
+                               break;
+                       }
+               }
+
+               if (!matches)
+                       continue;
+
+               /* hey, this statistics matches! great, let's extract the value */
+               *found = true;
+
+               {
+                       int                     j;
+                       MVNDistinct stat = load_ext_ndistinct(info->statOid);
+
+                       for (j = 0; j < stat->nitems; j++)
+                       {
+                               bool            item_matches = true;
+                               MVNDistinctItem *item = &stat->items[j];
+
+                               /* not the right item (different number of attributes) */
+                               if (item->nattrs != bms_num_members(attnums))
+                                       continue;
+
+                               /* check the attribute numbers */
+                               k = -1;
+                               while ((k = bms_next_member(attnums, k)) >= 0)
+                               {
+                                       bool            attr_found = false;
+
+                                       for (i = 0; i < item->nattrs; i++)
+                                       {
+                                               if (info->stakeys->values[item->attrs[i]] == k)
+                                               {
+                                                       attr_found = true;
+                                                       break;
+                                               }
+                                       }
+
+                                       if (!attr_found)
+                                       {
+                                               item_matches = false;
+                                               break;
+                                       }
+                               }
+
+                               if (!item_matches)
+                                       continue;
+
+                               return item->ndistinct;
+                       }
+               }
+       }
+
+       Assert(!(*found));
+
+       return 0.0;
+}
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index ce55fc5..a6b60c6 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -56,6 +56,7 @@
 #include "catalog/pg_publication.h"
 #include "catalog/pg_rewrite.h"
 #include "catalog/pg_shseclabel.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_subscription.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_trigger.h"
@@ -4452,6 +4453,82 @@ RelationGetIndexList(Relation relation)
 }
 
 /*
+ * RelationGetStatExtList
+ *             get a list of OIDs of extended statistics on this relation
+ *
+ * The statistics list is created only if someone requests it, in a way
+ * similar to RelationGetIndexList().  We scan pg_statistic_ext to find
+ * relevant statistics, and add the list to the relcache entry so that we
+ * won't have to compute it again.  Note that shared cache inval of a
+ * relcache entry will delete the old list and set rd_statvalid to 0,
+ * so that we must recompute the statistics list on next request.  This
+ * handles creation or deletion of a statistic.
+ *
+ * The returned list is guaranteed to be sorted in order by OID, although
+ * this is not currently needed.
+ *
+ * Since shared cache inval causes the relcache's copy of the list to go away,
+ * we return a copy of the list palloc'd in the caller's context.  The caller
+ * may list_free() the returned list after scanning it. This is necessary
+ * since the caller will typically be doing syscache lookups on the relevant
+ * statistics, and syscache lookup could cause SI messages to be processed!
+ */
+List *
+RelationGetStatExtList(Relation relation)
+{
+       Relation        indrel;
+       SysScanDesc indscan;
+       ScanKeyData skey;
+       HeapTuple       htup;
+       List       *result;
+       List       *oldlist;
+       MemoryContext oldcxt;
+
+       /* Quick exit if we already computed the list. */
+       if (relation->rd_statvalid != 0)
+               return list_copy(relation->rd_statlist);
+
+       /*
+        * We build the list we intend to return (in the caller's context) while
+        * doing the scan.  After successfully completing the scan, we copy that
+        * list into the relcache entry.  This avoids cache-context memory leakage
+        * if we get some sort of error partway through.
+        */
+       result = NIL;
+
+       /* Prepare to scan pg_statistic_ext for entries having starelid = this rel. */
+       ScanKeyInit(&skey,
+                               Anum_pg_statistic_ext_starelid,
+                               BTEqualStrategyNumber, F_OIDEQ,
+                               ObjectIdGetDatum(RelationGetRelid(relation)));
+
+       indrel = heap_open(StatisticExtRelationId, AccessShareLock);
+       indscan = systable_beginscan(indrel, StatisticExtRelidIndexId, true,
+                                                                NULL, 1, &skey);
+
+       while (HeapTupleIsValid(htup = systable_getnext(indscan)))
+               /* TODO maybe include only already built statistics? */
+               result = insert_ordered_oid(result, HeapTupleGetOid(htup));
+
+       systable_endscan(indscan);
+
+       heap_close(indrel, AccessShareLock);
+
+       /* Now save a copy of the completed list in the relcache entry. */
+       oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+       oldlist = relation->rd_statlist;
+       relation->rd_statlist = list_copy(result);
+
+       relation->rd_statvalid = true;
+       MemoryContextSwitchTo(oldcxt);
+
+       /* Don't leak the old list, if there is one */
+       list_free(oldlist);
+
+       return result;
+}
+
+/*
  * insert_ordered_oid
  *             Insert a new Oid into a sorted list of Oids, preserving ordering
  *
@@ -5560,6 +5637,8 @@ load_relcache_init_file(bool shared)
                rel->rd_pkattr = NULL;
                rel->rd_idattr = NULL;
                rel->rd_pubactions = NULL;
+               rel->rd_statvalid = false;
+               rel->rd_statlist = NIL;
                rel->rd_createSubid = InvalidSubTransactionId;
                rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
                rel->rd_amcache = NULL;
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index b1c0b4b..4a9cb76 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -61,6 +61,7 @@
 #include "catalog/pg_shseclabel.h"
 #include "catalog/pg_replication_origin.h"
 #include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_subscription.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_transform.h"
@@ -725,6 +726,28 @@ static const struct cachedesc cacheinfo[] = {
                },
                32
        },
+       {StatisticExtRelationId,        /* STATEXTNAMENSP */
+               StatisticExtNameIndexId,
+               2,
+               {
+                       Anum_pg_statistic_ext_staname,
+                       Anum_pg_statistic_ext_stanamespace,
+                       0,
+                       0
+               },
+               4
+       },
+       {StatisticExtRelationId,        /* STATEXTOID */
+               StatisticExtOidIndexId,
+               1,
+               {
+                       ObjectIdAttributeNumber,
+                       0,
+                       0,
+                       0
+               },
+               4
+       },
        {StatisticRelationId,           /* STATRELATTINH */
                StatisticRelidAttnumInhIndexId,
                3,
diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c
index 61a3e2a..3001dee 100644
--- a/src/bin/psql/describe.c
+++ b/src/bin/psql/describe.c
@@ -2320,6 +2320,50 @@ describeOneTableDetails(const char *schemaname,
                        PQclear(result);
                }
 
+               /* print any extended statistics */
+               if (pset.sversion >= 100000)
+               {
+                       printfPQExpBuffer(&buf,
+                                                         "SELECT oid, stanamespace::regnamespace AS nsp, staname, stakeys,\n"
+                                                         "  (staenabled::char[] @> '{d}'::char[]) AS ndist_enabled,\n"
+                                                         "  (standistinct IS NOT NULL) AS ndist_built,\n"
+                                                         "  (SELECT string_agg(attname::text,', ')\n"
+                                                  "    FROM ((SELECT unnest(stakeys) AS attnum) s\n"
+                                                         "         JOIN pg_attribute a ON (starelid = a.attrelid and a.attnum = s.attnum))) AS attnums\n"
+                         "FROM pg_statistic_ext stat WHERE starelid  = '%s' ORDER BY 1;",
+                                                         oid);
+
+                       result = PSQLexec(buf.data);
+                       if (!result)
+                               goto error_return;
+                       else
+                               tuples = PQntuples(result);
+
+                       if (tuples > 0)
+                       {
+                               printTableAddFooter(&cont, _("Statistics:"));
+                               for (i = 0; i < tuples; i++)
+                               {
+                                       printfPQExpBuffer(&buf, "    ");
+
+                                       /* statistics name (qualified with namespace) */
+                                       appendPQExpBuffer(&buf, "\"%s.%s\" ",
+                                                                         PQgetvalue(result, i, 1),
+                                                                         PQgetvalue(result, i, 2));
+
+                                       /* options */
+                                       if (!strcmp(PQgetvalue(result, i, 4), "t"))
+                                               appendPQExpBuffer(&buf, "(ndistinct)");
+
+                                       appendPQExpBuffer(&buf, " ON (%s)",
+                                                                         PQgetvalue(result, i, 6));
+
+                                       printTableAddFooter(&cont, buf.data);
+                               }
+                       }
+                       PQclear(result);
+               }
+
                /* print rules */
                if (tableinfo.hasrules && tableinfo.relkind != RELKIND_MATVIEW)
                {
diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h
index 10759c7..9effbce 100644
--- a/src/include/catalog/dependency.h
+++ b/src/include/catalog/dependency.h
@@ -147,6 +147,7 @@ typedef enum ObjectClass
        OCLASS_REWRITE,                         /* pg_rewrite */
        OCLASS_TRIGGER,                         /* pg_trigger */
        OCLASS_SCHEMA,                          /* pg_namespace */
+       OCLASS_STATISTIC_EXT,           /* pg_statistic_ext */
        OCLASS_TSPARSER,                        /* pg_ts_parser */
        OCLASS_TSDICT,                          /* pg_ts_dict */
        OCLASS_TSTEMPLATE,                      /* pg_ts_template */
diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h
index 1187797..473fe17 100644
--- a/src/include/catalog/heap.h
+++ b/src/include/catalog/heap.h
@@ -119,6 +119,7 @@ extern void RemoveAttrDefault(Oid relid, AttrNumber attnum,
                                  DropBehavior behavior, bool complain, bool internal);
 extern void RemoveAttrDefaultById(Oid attrdefId);
 extern void RemoveStatistics(Oid relid, AttrNumber attnum);
+extern void RemoveStatisticsExt(Oid relid, AttrNumber attnum);
 
 extern Form_pg_attribute SystemAttributeDefinition(AttrNumber attno,
                                                  bool relhasoids);
diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h
index 6bce732..8130581 100644
--- a/src/include/catalog/indexing.h
+++ b/src/include/catalog/indexing.h
@@ -182,6 +182,13 @@ DECLARE_UNIQUE_INDEX(pg_largeobject_loid_pn_index, 2683, on pg_largeobject using
 DECLARE_UNIQUE_INDEX(pg_largeobject_metadata_oid_index, 2996, on pg_largeobject_metadata using btree(oid oid_ops));
 #define LargeObjectMetadataOidIndexId  2996
 
+DECLARE_UNIQUE_INDEX(pg_statistic_ext_oid_index, 3380, on pg_statistic_ext using btree(oid oid_ops));
+#define StatisticExtOidIndexId 3380
+DECLARE_UNIQUE_INDEX(pg_statistic_ext_name_index, 3997, on pg_statistic_ext using btree(staname name_ops, stanamespace oid_ops));
+#define StatisticExtNameIndexId 3997
+DECLARE_INDEX(pg_statistic_ext_relid_index, 3379, on pg_statistic_ext using btree(starelid oid_ops));
+#define StatisticExtRelidIndexId 3379
+
 DECLARE_UNIQUE_INDEX(pg_namespace_nspname_index, 2684, on pg_namespace using btree(nspname name_ops));
 #define NamespaceNameIndexId  2684
 DECLARE_UNIQUE_INDEX(pg_namespace_oid_index, 2685, on pg_namespace using btree(oid oid_ops));
diff --git a/src/include/catalog/namespace.h b/src/include/catalog/namespace.h
index dbeb25b..35e0e2b 100644
--- a/src/include/catalog/namespace.h
+++ b/src/include/catalog/namespace.h
@@ -141,6 +141,8 @@ extern Oid  get_collation_oid(List *collname, bool missing_ok);
 extern Oid     get_conversion_oid(List *conname, bool missing_ok);
 extern Oid     FindDefaultConversionProc(int32 for_encoding, int32 to_encoding);
 
+extern Oid     get_statistics_oid(List *names, bool missing_ok);
+
 /* initialization & transaction cleanup code */
 extern void InitializeSearchPath(void);
 extern void AtEOXact_Namespace(bool isCommit, bool parallel);
diff --git a/src/include/catalog/pg_cast.h b/src/include/catalog/pg_cast.h
index 80a40ab..5bcdce7 100644
--- a/src/include/catalog/pg_cast.h
+++ b/src/include/catalog/pg_cast.h
@@ -254,6 +254,10 @@ DATA(insert (      23       18   78 e f ));
 /* pg_node_tree can be coerced to, but not from, text */
 DATA(insert (  194      25    0 i b ));
 
+/* pg_ndistinct can be coerced to, but not from, bytea and text */
+DATA(insert (  3353  17    0 i b ));
+DATA(insert (  3353  25    0 i i ));
+
 /*
  * Datetime category
  */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index ec4aedb..05baa80 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2726,6 +2726,15 @@ DESCR("current user privilege on any column by rel name");
 DATA(insert OID = 3029 (  has_any_column_privilege        PGNSP PGUID 12 10 0 0 0 f f f f t f s s 2 0 16 "26 25" _null_ _null_ _null_ _null_ _null_ has_any_column_privilege_id _null_ _null_ _null_ ));
 DESCR("current user privilege on any column by rel oid");
 
+DATA(insert OID = 3354 (  pg_ndistinct_in      PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3353 "2275" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_in _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 3355 (  pg_ndistinct_out     PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2275 "3353" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_out _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 3356 (  pg_ndistinct_recv PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 3353 "2281" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_recv _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 3357 (  pg_ndistinct_send PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 17 "3353" _null_ _null_ _null_ _null_ _null_ pg_ndistinct_send _null_ _null_ _null_ ));
+DESCR("I/O");
+
 DATA(insert OID = 1928 (  pg_stat_get_numscans                 PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_numscans _null_ _null_ _null_ ));
 DESCR("statistics: number of scans done for table/index");
 DATA(insert OID = 1929 (  pg_stat_get_tuples_returned  PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_returned _null_ _null_ _null_ ));
diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h
new file mode 100644
index 0000000..94b23a2
--- /dev/null
+++ b/src/include/catalog/pg_statistic_ext.h
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_statistic_ext.h
+ *       definition of the system "extended statistic" relation (pg_statistic_ext)
+ *       along with the relation's initial contents.
+ *
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/catalog/pg_statistic_ext.h
+ *
+ * NOTES
+ *       the genbki.pl script reads this file and generates .bki
+ *       information from the DATA() statements.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_STATISTIC_EXT_H
+#define PG_STATISTIC_EXT_H
+
+#include "catalog/genbki.h"
+
+/* ----------------
+ *             pg_statistic_ext definition.  cpp turns this into
+ *             typedef struct FormData_pg_statistic_ext
+ * ----------------
+ */
+#define StatisticExtRelationId 3381
+
+CATALOG(pg_statistic_ext,3381)
+{
+       /* These fields form the unique key for the entry: */
+       Oid                     starelid;               /* relation containing attributes */
+       NameData        staname;                /* statistics name */
+       Oid                     stanamespace;   /* OID of namespace containing this statistics */
+       Oid                     staowner;               /* statistics owner */
+
+       /*
+        * variable-length fields start here, but we allow direct access to
+        * stakeys
+        */
+       int2vector      stakeys;                /* array of column keys */
+
+#ifdef CATALOG_VARLEN
+       char            staenabled[1];  /* statistics requested to build */
+       pg_ndistinct standistinct;      /* ndistinct coefficients (serialized) */
+#endif
+
+} FormData_pg_statistic_ext;
+
+/* ----------------
+ *             Form_pg_statistic_ext corresponds to a pointer to a tuple with
+ *             the format of pg_statistic_ext relation.
+ * ----------------
+ */
+typedef FormData_pg_statistic_ext *Form_pg_statistic_ext;
+
+/* ----------------
+ *             compiler constants for pg_statistic_ext
+ * ----------------
+ */
+#define Natts_pg_statistic_ext                                 7
+#define Anum_pg_statistic_ext_starelid                 1
+#define Anum_pg_statistic_ext_staname                  2
+#define Anum_pg_statistic_ext_stanamespace             3
+#define Anum_pg_statistic_ext_staowner                 4
+#define Anum_pg_statistic_ext_stakeys                  5
+#define Anum_pg_statistic_ext_staenabled               6
+#define Anum_pg_statistic_ext_standistinct             7
+
+#define STATS_EXT_NDISTINCT            'd'
+
+#endif   /* PG_STATISTIC_EXT_H */
diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h
index 6e4c65e..9c9caf3 100644
--- a/src/include/catalog/pg_type.h
+++ b/src/include/catalog/pg_type.h
@@ -364,6 +364,10 @@ DATA(insert OID = 194 ( pg_node_tree       PGNSP PGUID -1 f b S f t \054 0 0 0 pg_node
 DESCR("string representing an internal node tree");
 #define PGNODETREEOID  194
 
+DATA(insert OID = 3353 ( pg_ndistinct          PGNSP PGUID -1 f b S f t \054 0 0 0 pg_ndistinct_in pg_ndistinct_out pg_ndistinct_recv pg_ndistinct_send - - - i x f 0 -1 0 100 _null_ _null_ _null_ ));
+DESCR("multivariate ndistinct coefficients");
+#define PGNDISTINCTOID 3353
+
 DATA(insert OID = 32 ( pg_ddl_command  PGNSP PGUID SIZEOF_POINTER t p P f t \054 0 0 0 pg_ddl_command_in pg_ddl_command_out pg_ddl_command_recv pg_ddl_command_send - - - ALIGNOF_POINTER p f 0 -1 0 0 _null_ _null_ _null_ ));
 DESCR("internal type for passing CollectedCommand");
 #define PGDDLCOMMANDOID 32
diff --git a/src/include/catalog/toasting.h b/src/include/catalog/toasting.h
index db7f145..00d0a83 100644
--- a/src/include/catalog/toasting.h
+++ b/src/include/catalog/toasting.h
@@ -53,6 +53,7 @@ DECLARE_TOAST(pg_proc, 2836, 2837);
 DECLARE_TOAST(pg_rewrite, 2838, 2839);
 DECLARE_TOAST(pg_seclabel, 3598, 3599);
 DECLARE_TOAST(pg_statistic, 2840, 2841);
+DECLARE_TOAST(pg_statistic_ext, 3439, 3440);
 DECLARE_TOAST(pg_trigger, 2336, 2337);
 
 /* shared catalogs */
diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h
index 8740cee..c323e81 100644
--- a/src/include/commands/defrem.h
+++ b/src/include/commands/defrem.h
@@ -77,6 +77,10 @@ extern ObjectAddress DefineOperator(List *names, List *parameters);
 extern void RemoveOperatorById(Oid operOid);
 extern ObjectAddress AlterOperator(AlterOperatorStmt *stmt);
 
+/* commands/statscmds.c */
+extern ObjectAddress CreateStatistics(CreateStatsStmt *stmt);
+extern void RemoveStatisticsById(Oid statsOid);
+
 /* commands/aggregatecmds.c */
 extern ObjectAddress DefineAggregate(ParseState *pstate, List *name, List *args, bool oldstyle,
                                List *parameters);
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 2bc7a5d..d269e77 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -278,6 +278,7 @@ typedef enum NodeTag
        T_PlaceHolderInfo,
        T_MinMaxAggInfo,
        T_PlannerParamItem,
+       T_StatisticExtInfo,
 
        /*
         * TAGS FOR MEMORY NODES (memnodes.h)
@@ -423,6 +424,7 @@ typedef enum NodeTag
        T_CreateSubscriptionStmt,
        T_AlterSubscriptionStmt,
        T_DropSubscriptionStmt,
+       T_CreateStatsStmt,
 
        /*
         * TAGS FOR PARSE TREE NODES (parsenodes.h)
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index a44d217..0a7a8d5c 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -644,6 +644,16 @@ typedef struct ColumnDef
        int                     location;               /* parse location, or -1 if none/unknown */
 } ColumnDef;
 
+typedef struct CreateStatsStmt
+{
+       NodeTag         type;
+       List       *defnames;           /* qualified name (list of Value strings) */
+       RangeVar   *relation;           /* relation to build statistics on */
+       List       *keys;                       /* String nodes naming referenced column(s) */
+       bool            if_not_exists;  /* do nothing if statistics already exists */
+} CreateStatsStmt;
+
+
 /*
  * TableLikeClause - CREATE TABLE ( ... LIKE ... ) clause
  */
@@ -1593,6 +1603,7 @@ typedef enum ObjectType
        OBJECT_SCHEMA,
        OBJECT_SEQUENCE,
        OBJECT_SUBSCRIPTION,
+       OBJECT_STATISTICS,
        OBJECT_TABCONSTRAINT,
        OBJECT_TABLE,
        OBJECT_TABLESPACE,
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 05d6f07..5923b5f 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -525,6 +525,7 @@ typedef struct RelOptInfo
        List       *lateral_vars;       /* LATERAL Vars and PHVs referenced by rel */
        Relids          lateral_referencers;    /* rels that reference me laterally */
        List       *indexlist;          /* list of IndexOptInfo */
+       List       *statlist;           /* list of StatisticExtInfo */
        BlockNumber pages;                      /* size estimates derived from pg_class */
        double          tuples;
        double          allvisfrac;
@@ -664,6 +665,31 @@ typedef struct ForeignKeyOptInfo
        List       *rinfos[INDEX_MAX_KEYS];
 } ForeignKeyOptInfo;
 
+/*
+ * StatisticExtInfo
+ *             Information about extended statistics for planning/optimization
+ *
+ * This contains information about which columns are covered by the
+ * statistics (stakeys), which options were requested while adding the
+ * statistics (*_enabled), and which kinds of statistics were actually
+ * built and are available for the optimizer (*_built).
+ */
+typedef struct StatisticExtInfo
+{
+       NodeTag         type;
+
+       Oid                     statOid;                /* OID of the statistics row */
+       RelOptInfo *rel;                        /* back-link to statistics' table */
+
+       /* enabled statistics */
+       bool            ndist_enabled;  /* ndistinct coefficient enabled */
+
+       /* built/available statistics */
+       bool            ndist_built;    /* ndistinct coefficient built */
+
+       /* columns in the statistics (attnums) */
+       int2vector *stakeys;            /* attnums of the columns covered */
+} StatisticExtInfo;
 
 /*
  * EquivalenceClasses
diff --git a/src/include/statistics/common.h b/src/include/statistics/common.h
new file mode 100644
index 0000000..39c62bd
--- /dev/null
+++ b/src/include/statistics/common.h
@@ -0,0 +1,62 @@
+/*-------------------------------------------------------------------------
+ *
+ * common.h
+ *       POSTGRES extended statistics internal declarations
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/include/statistics/common.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STATISTICS_COMMON_H
+#define STATISTICS_COMMON_H
+
+#include "commands/vacuum.h"
+#include "utils/sortsupport.h"
+
+
+typedef struct
+{
+       Oid                     eqopr;                  /* '=' operator for datatype, if any */
+       Oid                     eqfunc;                 /* and associated function */
+       Oid                     ltopr;                  /* '<' operator for datatype, if any */
+} StdAnalyzeData;
+
+typedef struct
+{
+       Datum           value;                  /* a data value */
+       int                     tupno;                  /* position index for tuple it came from */
+} ScalarItem;
+
+/* multi-sort */
+typedef struct MultiSortSupportData
+{
+       int                     ndims;                  /* number of dimensions supported by the */
+       SortSupportData ssup[1];        /* sort support data for each dimension */
+} MultiSortSupportData;
+
+typedef MultiSortSupportData *MultiSortSupport;
+
+typedef struct SortItem
+{
+       Datum      *values;
+       bool       *isnull;
+} SortItem;
+
+extern MultiSortSupport multi_sort_init(int ndims);
+extern void multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
+                                                int dim, VacAttrStats **vacattrstats);
+extern int     multi_sort_compare(const void *a, const void *b, void *arg);
+extern int multi_sort_compare_dim(int dim, const SortItem * a,
+                                          const SortItem * b, MultiSortSupport mss);
+extern int multi_sort_compare_dims(int start, int end, const SortItem * a,
+                                               const SortItem * b, MultiSortSupport mss);
+
+/* comparators, used when constructing extended stats */
+extern int     compare_scalars_simple(const void *a, const void *b, void *arg);
+extern int     compare_scalars_partition(const void *a, const void *b, void *arg);
+
+#endif   /* STATISTICS_COMMON_H */
diff --git a/src/include/statistics/stats.h b/src/include/statistics/stats.h
new file mode 100644
index 0000000..ed14459
--- /dev/null
+++ b/src/include/statistics/stats.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * stats.h
+ *       Multivariate statistics and selectivity estimation functions.
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/statistics/stats.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STATS_H
+#define STATS_H
+
+#include "commands/vacuum.h"
+
+#define STATS_MAX_DIMENSIONS   8               /* max number of attributes */
+
+#define STATS_NDISTINCT_MAGIC          0xA352BFA4      /* marks serialized bytea */
+#define STATS_NDISTINCT_TYPE_BASIC     1       /* basic ndistinct type */
+
+/* Multivariate distinct coefficients. */
+typedef struct MVNDistinctItem
+{
+       double          ndistinct;
+       AttrNumber      nattrs;
+       AttrNumber *attrs;
+} MVNDistinctItem;
+
+typedef struct MVNDistinctData
+{
+       uint32          magic;                  /* magic constant marker */
+       uint32          type;                   /* type of ndistinct (BASIC) */
+       uint32          nitems;                 /* number of items in the statistic */
+       MVNDistinctItem items[FLEXIBLE_ARRAY_MEMBER];
+} MVNDistinctData;
+
+typedef MVNDistinctData *MVNDistinct;
+
+extern MVNDistinct load_ext_ndistinct(Oid mvoid);
+
+extern bytea *serialize_ext_ndistinct(MVNDistinct ndistinct);
+
+/* deserialization of stats (serialization is private to analyze) */
+extern MVNDistinct deserialize_ext_ndistinct(bytea *data);
+
+extern MVNDistinct build_ext_ndistinct(double totalrows, int numrows, HeapTuple *rows,
+                                       int2vector *attrs, VacAttrStats **stats);
+
+extern void build_ext_stats(Relation onerel, double totalrows,
+                               int numrows, HeapTuple *rows,
+                               int natts, VacAttrStats **vacattrstats);
+extern bool stats_are_enabled(HeapTuple htup, char type);
+extern bool stats_are_built(HeapTuple htup, char type);
+
+#endif   /* STATS_H */
diff --git a/src/include/utils/acl.h b/src/include/utils/acl.h
index 0d11852..90dac93 100644
--- a/src/include/utils/acl.h
+++ b/src/include/utils/acl.h
@@ -326,6 +326,7 @@ extern bool pg_event_trigger_ownercheck(Oid et_oid, Oid roleid);
 extern bool pg_extension_ownercheck(Oid ext_oid, Oid roleid);
 extern bool pg_publication_ownercheck(Oid pub_oid, Oid roleid);
 extern bool pg_subscription_ownercheck(Oid sub_oid, Oid roleid);
+extern bool pg_statistics_ownercheck(Oid stat_oid, Oid roleid);
 extern bool has_createrole_privilege(Oid roleid);
 extern bool has_bypassrls_privilege(Oid roleid);
 
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index a617a7c..5772804 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -92,6 +92,7 @@ typedef struct RelationData
        bool            rd_isvalid;             /* relcache entry is valid */
        char            rd_indexvalid;  /* state of rd_indexlist: 0 = not valid, 1 =
                                                                 * valid, 2 = temporarily forced */
+       bool            rd_statvalid;   /* state of rd_statlist: true/false */
 
        /*
         * rd_createSubid is the ID of the highest subtransaction the rel has
@@ -136,6 +137,9 @@ typedef struct RelationData
        Oid                     rd_pkindex;             /* OID of primary key, if any */
        Oid                     rd_replidindex; /* OID of replica identity index, if any */
 
+       /* data managed by RelationGetStatExtList: */
+       List       *rd_statlist;        /* list of OIDs of extended stats */
+
        /* data managed by RelationGetIndexAttrBitmap: */
        Bitmapset  *rd_indexattr;       /* identifies columns used in indexes */
        Bitmapset  *rd_keyattr;         /* cols that can be ref'd by foreign keys */
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index da36b67..81af3ae 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -39,6 +39,7 @@ extern void RelationClose(Relation relation);
  */
 extern List *RelationGetFKeyList(Relation relation);
 extern List *RelationGetIndexList(Relation relation);
+extern List *RelationGetStatExtList(Relation relation);
 extern Oid     RelationGetOidIndex(Relation relation);
 extern Oid     RelationGetPrimaryKeyIndex(Relation relation);
 extern Oid     RelationGetReplicaIndex(Relation relation);
diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h
index 66f60d2..048541e 100644
--- a/src/include/utils/syscache.h
+++ b/src/include/utils/syscache.h
@@ -86,6 +86,8 @@ enum SysCacheIdentifier
        PUBLICATIONRELMAP,
        RULERELNAME,
        SEQRELID,
+       STATEXTNAMENSP,
+       STATEXTOID,
        STATRELATTINH,
        SUBSCRIPTIONOID,
        SUBSCRIPTIONNAME,
diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out
index 836773f..07b3701 100644
--- a/src/test/regress/expected/object_address.out
+++ b/src/test/regress/expected/object_address.out
@@ -38,6 +38,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL (
        TO SQL WITH FUNCTION int4recv(internal));
 CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable;
 CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT);
+CREATE STATISTICS addr_nsp.gentable_stat ON (a,b) FROM addr_nsp.gentable;
 -- test some error cases
 SELECT pg_get_object_address('stone', '{}', '{}');
 ERROR:  unrecognized object type "stone"
@@ -399,7 +400,8 @@ WITH objects (type, name, args) AS (VALUES
                                ('access method', '{btree}', '{}'),
                                ('publication', '{addr_pub}', '{}'),
                                ('publication relation', '{addr_nsp, gentable}', '{addr_pub}'),
-                               ('subscription', '{addr_sub}', '{}')
+                               ('subscription', '{addr_sub}', '{}'),
+                               ('statistics', '{addr_nsp, gentable_stat}', '{}')
         )
 SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*,
        -- test roundtrip through pg_identify_object_as_address
@@ -447,6 +449,7 @@ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*,
  trigger                   |            |                   | t on 
addr_nsp.gentable                                               | t
  operator family           | pg_catalog | integer_ops       | 
pg_catalog.integer_ops USING btree                                   | t
  policy                    |            |                   | genpol on 
addr_nsp.gentable                                          | t
+ statistics                | addr_nsp   | gentable_stat     | 
addr_nsp.gentable_stat                                               | t
  collation                 | pg_catalog | "default"         | 
pg_catalog."default"                                                 | t
  transform                 |            |                   | for integer on 
language sql                                          | t
  text search dictionary    | addr_nsp   | addr_ts_dict      | 
addr_nsp.addr_ts_dict                                                | t
@@ -456,7 +459,7 @@ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*,
  subscription              |            | addr_sub          | addr_sub         
                                                    | t
  publication               |            | addr_pub          | addr_pub         
                                                    | t
  publication relation      |            |                   | gentable in 
publication addr_pub                                     | t
-(45 rows)
+(46 rows)
 
 ---
 --- Cleanup resources
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 0bcec13..9a26205 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -817,11 +817,12 @@ WHERE c.castmethod = 'b' AND
  text              | character         |        0 | i
  character varying | character         |        0 | i
  pg_node_tree      | text              |        0 | i
+ pg_ndistinct      | bytea             |        0 | i
  cidr              | inet              |        0 | i
  xml               | text              |        0 | a
  xml               | character varying |        0 | a
  xml               | character         |        0 | a
-(7 rows)
+(8 rows)
 
 -- **************** pg_conversion ****************
 -- Look for illegal values in pg_conversion fields.
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index bd13ae6..d4b2158 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2160,6 +2160,14 @@ pg_stats| SELECT n.nspname AS schemaname,
      JOIN pg_attribute a ON (((c.oid = a.attrelid) AND (a.attnum = s.staattnum))))
      LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)))
   WHERE ((NOT a.attisdropped) AND has_column_privilege(c.oid, a.attnum, 'select'::text) AND ((c.relrowsecurity = false) OR (NOT row_security_active(c.oid))));
+pg_stats_ext| SELECT n.nspname AS schemaname,
+    c.relname AS tablename,
+    s.staname,
+    s.stakeys AS attnums,
+    length((s.standistinct)::text) AS ndistbytes
+   FROM ((pg_statistic_ext s
+     JOIN pg_class c ON ((c.oid = s.starelid)))
+     LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)));
 pg_tables| SELECT n.nspname AS schemaname,
     c.relname AS tablename,
     pg_get_userbyid(c.relowner) AS tableowner,
diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out
index b5eff55..9edba4f 100644
--- a/src/test/regress/expected/sanity_check.out
+++ b/src/test/regress/expected/sanity_check.out
@@ -142,6 +142,7 @@ pg_shdepend|t
 pg_shdescription|t
 pg_shseclabel|t
 pg_statistic|t
+pg_statistic_ext|t
 pg_subscription|t
 pg_tablespace|t
 pg_transform|t
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
new file mode 100644
index 0000000..77ce1f1
--- /dev/null
+++ b/src/test/regress/expected/stats_ext.out
@@ -0,0 +1,117 @@
+-- data type passed by value
+CREATE TABLE ndistinct (
+    a INT,
+    b INT,
+    c INT,
+    d INT
+);
+-- unknown column
+CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
+ERROR:  column "unknown_column" referenced in statistics does not exist
+-- single column
+CREATE STATISTICS s10 ON (a) FROM ndistinct;
+ERROR:  statistics require at least 2 columns
+-- single column, duplicated
+CREATE STATISTICS s10 ON (a,a) FROM ndistinct;
+ERROR:  duplicate column name in statistics definition
+-- two columns, one duplicated
+CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
+ERROR:  duplicate column name in statistics definition
+-- correct command
+CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
+-- perfectly correlated groups
+INSERT INTO ndistinct
+     SELECT i/100, i/100, i/100 FROM generate_series(1,10000) s(i);
+ANALYZE ndistinct;
+SELECT staenabled, standistinct
+  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
+ staenabled |                                    standistinct                                      
+------------+-------------------------------------------------------------------------------------
+ {d}        | [{0, 1, 101.000000}, {0, 2, 101.000000}, {1, 2, 101.000000}, {0, 1, 2, 101.000000}]
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
+         QUERY PLAN          
+-----------------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on ndistinct
+(3 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+         QUERY PLAN          
+-----------------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on ndistinct
+(3 rows)
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+         QUERY PLAN          
+-----------------------------
+ HashAggregate
+   Group Key: a, b, c, d
+   ->  Seq Scan on ndistinct
+(3 rows)
+
+TRUNCATE TABLE ndistinct;
+-- partially correlated groups
+INSERT INTO ndistinct
+     SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE ndistinct;
+SELECT staenabled, standistinct
+  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
+ staenabled |                                    standistinct                                      
+------------+-------------------------------------------------------------------------------------
+ {d}        | [{0, 1, 201.000000}, {0, 2, 201.000000}, {1, 2, 101.000000}, {0, 1, 2, 201.000000}]
+(1 row)
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ HashAggregate  (cost=230.00..232.01 rows=201 width=16)
+   Group Key: a, b
+   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=8)
+(3 rows)
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ HashAggregate  (cost=255.00..257.01 rows=201 width=20)
+   Group Key: a, b, c
+   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=12)
+(3 rows)
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ HashAggregate  (cost=280.00..290.00 rows=1000 width=24)
+   Group Key: a, b, c, d
+   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=16)
+(3 rows)
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ HashAggregate  (cost=255.00..265.00 rows=1000 width=20)
+   Group Key: b, c, d
+   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=12)
+(3 rows)
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ HashAggregate  (cost=230.00..240.00 rows=1000 width=16)
+   Group Key: a, d
+   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=8)
+(3 rows)
+
+DROP TABLE ndistinct;
diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out
index 8d75bbf..f6b799a 100644
--- a/src/test/regress/expected/type_sanity.out
+++ b/src/test/regress/expected/type_sanity.out
@@ -59,7 +59,7 @@ WHERE (p1.typtype = 'c' AND p1.typrelid = 0) OR
 -- Look for types that should have an array type according to their typtype,
 -- but don't.  We exclude composites here because we have not bothered to
 -- make array types corresponding to the system catalogs' rowtypes.
--- NOTE: as of v10, this check finds pg_node_tree and smgr.
+-- NOTE: as of v10, this check finds pg_node_tree, pg_ndistinct, smgr.
 SELECT p1.oid, p1.typname
 FROM pg_type as p1
 WHERE p1.typtype not in ('c','d','p') AND p1.typname NOT LIKE E'\\_%'
@@ -67,11 +67,12 @@ WHERE p1.typtype not in ('c','d','p') AND p1.typname NOT LIKE E'\\_%'
     (SELECT 1 FROM pg_type as p2
      WHERE p2.typname = ('_' || p1.typname)::name AND
            p2.typelem = p1.oid and p1.typarray = p2.oid);
- oid |   typname    
------+--------------
- 194 | pg_node_tree
- 210 | smgr
-(2 rows)
+ oid  |   typname    
+------+--------------
+  194 | pg_node_tree
+ 3353 | pg_ndistinct
+  210 | smgr
+(3 rows)
 
 -- Make sure typarray points to a varlena array type of our own base
 SELECT p1.oid, p1.typname as basetype, p2.typname as arraytype,
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 9f38349..a8ebf93 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -89,7 +89,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview
 # ----------
 # Another group of parallel tests
 # ----------
-test: alter_generic alter_operator misc psql async dbsize misc_functions sysviews tsrf
+test: alter_generic alter_operator misc psql async dbsize misc_functions sysviews tsrf stats_ext
 
 # rules cannot run concurrently with any test that creates a view
 test: rules psql_crosstab amutils
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index 2987b24..bff9432 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -128,6 +128,7 @@ test: dbsize
 test: misc_functions
 test: sysviews
 test: tsrf
+test: stats_ext
 test: rules
 test: psql_crosstab
 test: select_parallel
diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql
index 0ace4dd..4e34185 100644
--- a/src/test/regress/sql/object_address.sql
+++ b/src/test/regress/sql/object_address.sql
@@ -41,6 +41,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL (
        TO SQL WITH FUNCTION int4recv(internal));
 CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable;
 CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT);
+CREATE STATISTICS addr_nsp.gentable_stat ON (a,b) FROM addr_nsp.gentable;
 
 -- test some error cases
 SELECT pg_get_object_address('stone', '{}', '{}');
@@ -179,7 +180,8 @@ WITH objects (type, name, args) AS (VALUES
                                ('access method', '{btree}', '{}'),
                                ('publication', '{addr_pub}', '{}'),
                                ('publication relation', '{addr_nsp, gentable}', '{addr_pub}'),
-                               ('subscription', '{addr_sub}', '{}')
+                               ('subscription', '{addr_sub}', '{}'),
+                               ('statistics', '{addr_nsp, gentable_stat}', '{}')
         )
 SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*,
        -- test roundtrip through pg_identify_object_as_address
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
new file mode 100644
index 0000000..6381157
--- /dev/null
+++ b/src/test/regress/sql/stats_ext.sql
@@ -0,0 +1,75 @@
+-- Generic extended statistics support
+CREATE TABLE ab1 (a int, b int);
+CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
+ALTER TABLE ab1 DROP COLUMN a;
+DROP TABLE ab1;
+
+
+-- data type passed by value
+CREATE TABLE ndistinct (
+    a INT,
+    b INT,
+    c INT,
+    d INT
+);
+
+-- unknown column
+CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
+
+-- single column
+CREATE STATISTICS s10 ON (a) FROM ndistinct;
+
+-- single column, duplicated
+CREATE STATISTICS s10 ON (a,a) FROM ndistinct;
+
+-- two columns, one duplicated
+CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
+
+-- correct command
+CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
+
+-- perfectly correlated groups
+INSERT INTO ndistinct
+     SELECT i/100, i/100, i/100 FROM generate_series(1,10000) s(i);
+
+ANALYZE ndistinct;
+
+SELECT staenabled, standistinct
+  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+
+EXPLAIN (COSTS off)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+
+TRUNCATE TABLE ndistinct;
+
+-- partially correlated groups
+INSERT INTO ndistinct
+     SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE ndistinct;
+
+SELECT staenabled, standistinct
+  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
+
+EXPLAIN
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
+
+DROP TABLE ndistinct;
diff --git a/src/test/regress/sql/type_sanity.sql b/src/test/regress/sql/type_sanity.sql
index 0a31249..4c65814 100644
--- a/src/test/regress/sql/type_sanity.sql
+++ b/src/test/regress/sql/type_sanity.sql
@@ -53,7 +53,7 @@ WHERE (p1.typtype = 'c' AND p1.typrelid = 0) OR
 -- Look for types that should have an array type according to their typtype,
 -- but don't.  We exclude composites here because we have not bothered to
 -- make array types corresponding to the system catalogs' rowtypes.
--- NOTE: as of v10, this check finds pg_node_tree and smgr.
+-- NOTE: as of v10, this check finds pg_node_tree, pg_ndistinct, smgr.
 
 SELECT p1.oid, p1.typname
 FROM pg_type as p1
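
For anyone who wants to try the feature without reading the whole patch, the new
regression test boils down to roughly the following session (just a sketch; the
table and statistics names are taken from stats_ext.sql above, and the row
estimates shown by EXPLAIN will of course depend on the data):

CREATE TABLE ndistinct (a int, b int, c int, d int);
CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;

-- load perfectly correlated data and build the extended statistics
INSERT INTO ndistinct (a, b, c)
     SELECT i/100, i/100, i/100 FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;

-- the serialized coefficients end up in pg_statistic_ext.standistinct
SELECT staname, stakeys, staenabled
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;

-- estimate_num_groups() should now use the multivariate ndistinct here
EXPLAIN SELECT COUNT(*) FROM ndistinct GROUP BY a, b;

DROP TABLE ndistinct;
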
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
