On Tue, Apr 2, 2019 at 6:56 AM Michael Paquier <mich...@paquier.xyz> wrote:
>
> On Sat, Mar 30, 2019 at 06:15:11PM +0100, Julien Rouhaud wrote:
> > I'd also have to get more feedback on this.  For now, I'll add this
> > thread to the pg12 open items, as a follow up of the initial code
> > drop.
>
> Catching up here...  I think that having a completely separate view
> with one row for each database and one row for shared objects makes
> the most sense based on what has been proposed on this thread.  Being
> able to track checksum failures for shared catalogs is really
> something I'd like to be able to see easily, and I have seen
> corruption involving such objects from time to time.  I think that we
> should have a design which is extensible.

Ok!

>  One thing which is not
> proposed on this patch, and I am fine with it as a first draft, is
> that we don't have any information about the broken block number and
> the file involved.  My gut tells me that we'd want a separate view,
> like pg_stat_checksums_details with one tuple per (dboid, rel, fork,
> blck) to be complete.  But that's just for future work.

That could indeed be nice.

> For the progress part, we would most likely have a separate view for
> that as well, as the view should show no rows if there is no operation
> in progress.

Ok.

> The patch looks rather clean to me, I have some comments.
>
> -   <application>pg_checksums</application>. The exit status is zero if there
> -   are no checksum errors when checking them, and nonzero if at least one
> -   checksum failure is detected. If enabling or disabling checksums, the
> -   exit status is nonzero if the operation failed.
> +   <application>pg_checksums</application>. As a consequence, the
> +   <structname>pg_stat_checksums</structnameview won't reflect this activity.
> +   The exit status is zero if there are no checksum errors when checking 
> them,
> +   and nonzero if at least one checksum failure is detected. If enabling or
> +   disabling checksums, the exit status is nonzero if the operation failed.
>
> The docs of pg_checksums already clearly state that the cluster needs
> to be offline, so I am not sure that this addition is necessary.

Agreed, removed.

> @@ -1539,6 +1539,8 @@ pgstat_report_checksum_failures_in_db(Oid dboid,
> int failurecount)
>
> Please note that there is no need to have the list of arguments in the
> comment block at the top of pgstat_report_checksum_failures_in_db().

Indeed, fixed.

> +       if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
> +               result = 0;
> +       else
> +               result = dbentry->last_checksum_failure;
> +
> +       if (result == 0)
> +               PG_RETURN_NULL();
> +       else
> +               PG_RETURN_TIMESTAMPTZ(result);
> +}
>
> No need for two ifs here.  What about just that?
> if (NULL)
>     PG_RETURN_NULL();
> else
>     PG_RETURN_TIMESTAMPTZ(last_checksum_failure);

I do agree, but this is done like this everywhere in pgstatfuncs.c, so
I think it's better to keep it as-is for consistency.
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index f1df14bdea..30674f61ce 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -384,6 +384,14 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
      </entry>
      </row>
 
+     <row>
+      <entry><structname>pg_stat_checksums</structname><indexterm><primary>pg_stat_checksums</primary></indexterm></entry>
+      <entry>One row per database, plus one for the shared objects, showing
+      database-wide checksums statistics. See
+       <xref linkend="pg-stat-checksums-view"/> for details.
+      </entry>
+     </row>
+
      <row>
       <entry><structname>pg_stat_database</structname><indexterm><primary>pg_stat_database</primary></indexterm></entry>
       <entry>One row per database, showing database-wide statistics. See
@@ -2418,6 +2426,54 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
    single row, containing global data for the cluster.
   </para>
 
+  <table id="pg-stat-checksums-view" xreflabel="pg_stat_checksums">
+   <title><structname>pg_stat_checksums</structname> View</title>
+   <tgroup cols="3">
+    <thead>
+    <row>
+      <entry>Column</entry>
+      <entry>Type</entry>
+      <entry>Description</entry>
+     </row>
+    </thead>
+
+   <tbody>
+    <row>
+     <entry><structfield>datid</structfield></entry>
+     <entry><type>oid</type></entry>
+     <entry>OID of a database, or 0 for objects belonging to a shared relation</entry>
+    </row>
+    <row>
+     <entry><structfield>datname</structfield></entry>
+     <entry><type>name</type></entry>
+     <entry>Name of this database, or <literal>&lt;shared_objects&gt;</literal></entry>
+    </row>
+    <row>
+     <entry><structfield>checksum_failures</structfield></entry>
+     <entry><type>bigint</type></entry>
+     <entry>Number of data page checksum failures detected in this
+     database</entry>
+    </row>
+    <row>
+     <entry><structfield>checksum_last_failure</structfield></entry>
+     <entry><type>timestamp with time zone</type></entry>
+     <entry>Time at which the last data page checksum failures was detected in
+     this database</entry>
+    </row>
+    <row>
+     <entry><structfield>stats_reset</structfield></entry>
+     <entry><type>timestamp with time zone</type></entry>
+     <entry>Time at which these statistics were last reset</entry>
+    </row>
+   </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   The <structname>pg_stat_database</structname> view will contain one row
+   for each database in the cluster, showing database-wide statistics.
+  </para>
+
   <table id="pg-stat-database-view" xreflabel="pg_stat_database">
    <title><structname>pg_stat_database</structname> View</title>
    <tgroup cols="3">
@@ -2529,11 +2585,6 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
      <entry><type>bigint</type></entry>
      <entry>Number of deadlocks detected in this database</entry>
     </row>
-    <row>
-     <entry><structfield>checksum_failures</structfield></entry>
-     <entry><type>bigint</type></entry>
-     <entry>Number of data page checksum failures detected in this database</entry>
-    </row>
     <row>
      <entry><structfield>blk_read_time</structfield></entry>
      <entry><type>double precision</type></entry>
diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml
index 84fb37c293..e063dd972e 100644
--- a/doc/src/sgml/ref/initdb.sgml
+++ b/doc/src/sgml/ref/initdb.sgml
@@ -218,7 +218,9 @@ PostgreSQL documentation
         I/O system that would otherwise be silent. Enabling checksums
         may incur a noticeable performance penalty. This option can only
         be set during initialization, and cannot be changed later. If
-        set, checksums are calculated for all objects, in all databases.
+        set, checksums are calculated for all objects, in all databases. All
+        checksum failures will be reported in the <xref
+        linkend="pg-stat-checksums-view"/> view.
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index c4f3950e5b..8179db5f2a 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -531,7 +531,8 @@ PostgreSQL documentation
         By default, checksums are verified and checksum failures will result
         in a non-zero exit status. However, the base backup will not be
         removed in such a case, as if the <option>--no-clean</option> option
-        had been used.
+        had been used.  Checksum verifications failures will also be reported
+        in the <xref linkend="pg-stat-checksums-view"/> view.
        </para>
       </listitem>
      </varlistentry>
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index b89df70653..ffe53bf888 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -823,12 +823,24 @@ CREATE VIEW pg_stat_database AS
             pg_stat_get_db_temp_files(D.oid) AS temp_files,
             pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes,
             pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
-            pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures,
             pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
             pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
             pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
     FROM pg_database D;
 
+CREATE VIEW pg_stat_checksums AS
+    SELECT
+            D.oid AS datid,
+            D.datname AS datname,
+            pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures,
+            pg_stat_get_db_checksum_last_failure(D.oid) AS checksum_last_failure,
+            pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
+    FROM (
+        SELECT oid, datname FROM pg_database
+        UNION ALL
+        SELECT 0, '<shared objects>'
+    ) D;
+
 CREATE VIEW pg_stat_database_conflicts AS
     SELECT
             D.oid AS datid,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 2a8472b91a..ed877d6d95 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1523,7 +1523,7 @@ pgstat_report_deadlock(void)
 
 
 /* --------
- * pgstat_report_checksum_failures_in_db(dboid, failure_count) -
+ * pgstat_report_checksum_failures_in_db() -
  *
  *	Tell the collector about one or more checksum failures.
  * --------
@@ -1539,6 +1539,8 @@ pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
 	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
 	msg.m_databaseid = dboid;
 	msg.m_failurecount = failurecount;
+	msg.m_failure_time = GetCurrentTimestamp();
+
 	pgstat_send(&msg, sizeof(msg));
 }
 
@@ -4601,6 +4603,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 	dbentry->n_temp_bytes = 0;
 	dbentry->n_deadlocks = 0;
 	dbentry->n_checksum_failures = 0;
+	dbentry->last_checksum_failure = 0;
 	dbentry->n_block_read_time = 0;
 	dbentry->n_block_write_time = 0;
 
@@ -6257,6 +6260,7 @@ pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
 	dbentry->n_checksum_failures += msg->m_failurecount;
+	dbentry->last_checksum_failure = msg->m_failure_time;
 }
 
 /* ----------
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 537f09e342..36dcb28754 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1584,9 +1584,9 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
 				(errmsg("file \"%s\" has a total of %d checksum verification "
 						"failures", readfilename, checksum_failures)));
 
-		if (dboid != InvalidOid)
-			pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
+		pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
 	}
+
 	total_checksum_failures += checksum_failures;
 
 	return true;
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 90a817a25c..b28d9ab216 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1514,6 +1514,24 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS)
 	PG_RETURN_INT64(result);
 }
 
+Datum
+pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS)
+{
+	Oid			dbid = PG_GETARG_OID(0);
+	TimestampTz result;
+	PgStat_StatDBEntry *dbentry;
+
+	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+		result = 0;
+	else
+		result = dbentry->last_checksum_failure;
+
+	if (result == 0)
+		PG_RETURN_NULL();
+	else
+		PG_RETURN_TIMESTAMPTZ(result);
+}
+
 Datum
 pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS)
 {
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index eac909109c..c4aeeee810 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5281,6 +5281,11 @@
   proname => 'pg_stat_get_db_checksum_failures', provolatile => 's',
   proparallel => 'r', prorettype => 'int8', proargtypes => 'oid',
   prosrc => 'pg_stat_get_db_checksum_failures' },
+{ oid => '8394',
+  descr => 'statistics: when last checksum failure was detected in database',
+  proname => 'pg_stat_get_db_checksum_last_failure', provolatile => 's',
+  proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid',
+  prosrc => 'pg_stat_get_db_checksum_last_failure' },
 { oid => '3074', descr => 'statistics: last reset for a database',
   proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's',
   proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index c080fa6388..d6d1772706 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -541,6 +541,7 @@ typedef struct PgStat_MsgChecksumFailure
 	PgStat_MsgHdr m_hdr;
 	Oid			m_databaseid;
 	int			m_failurecount;
+	TimestampTz	m_failure_time;
 } PgStat_MsgChecksumFailure;
 
 
@@ -607,6 +608,7 @@ typedef struct PgStat_StatDBEntry
 	PgStat_Counter n_temp_bytes;
 	PgStat_Counter n_deadlocks;
 	PgStat_Counter n_checksum_failures;
+	TimestampTz last_checksum_failure;
 	PgStat_Counter n_block_read_time;	/* times in microseconds */
 	PgStat_Counter n_block_write_time;
 
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index d5f309fbfb..193de82cc3 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1801,6 +1801,17 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints
     pg_stat_get_buf_fsync_backend() AS buffers_backend_fsync,
     pg_stat_get_buf_alloc() AS buffers_alloc,
     pg_stat_get_bgwriter_stat_reset_time() AS stats_reset;
+pg_stat_checksums| SELECT d.oid AS datid,
+    d.datname,
+    pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures,
+    pg_stat_get_db_checksum_last_failure(d.oid) AS checksum_last_failure,
+    pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset
+   FROM ( SELECT pg_database.oid,
+            pg_database.datname
+           FROM pg_database
+        UNION ALL
+         SELECT 0,
+            '<shared objects>'::name) d;
 pg_stat_database| SELECT d.oid AS datid,
     d.datname,
     pg_stat_get_db_numbackends(d.oid) AS numbackends,
@@ -1817,7 +1828,6 @@ pg_stat_database| SELECT d.oid AS datid,
     pg_stat_get_db_temp_files(d.oid) AS temp_files,
     pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes,
     pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
-    pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures,
     pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
     pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
     pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset

Reply via email to