Re: pg_rewind WAL segments deletion pitfall

2024-07-12 Thread Alexander Kukushkin
Hi Sutou,

Thank you for picking it up!

On Fri, 12 Jul 2024 at 09:24, Sutou Kouhei  wrote:

Here are my review comments:
>
> @@ -217,6 +221,26 @@ findLastCheckpoint(const char *datadir, XLogRecPtr
> forkptr, int tliIndex,
> +   charxlogfname[MAXFNAMELEN];
> +
> +   tli = xlogreader->seg.ws_tli;
> +   segno = xlogreader->seg.ws_segno;
> +
> +   snprintf(xlogfname, MAXPGPATH, XLOGDIR "/");
> +   XLogFileName(xlogfname + strlen(xlogfname),
> +xlogreader->seg.ws_tli,
> +xlogreader->seg.ws_segno,
> WalSegSz);
> +
> +   /*
> +* Make sure pg_rewind doesn't remove this file,
> because it is
> +* required for postgres to start after rewind.
> +*/
> +   insert_keepwalhash_entry(xlogfname);
>
> MAXFNAMELEN is 64 and MAXPGPATH is 1024. strlen(XLOGDIR "/")
> is 7 because XLOGDIR is "pg_wal". So xlogfname has enough
> size but snprintf(xlogfname, MAXPGPATH) is wrong usage.
> (And XLogFileName() uses snprintf(xlogfname, MAXFNAMELEN)
> internally.)
>

Nice catch!

I don't think we need another buffer here, just need to use MAXFNAMELEN,
because strlen("pg_wal/$wal_filename") + 1 = 32 perfectly fits into 64
bytes.

The new version is attached.

Regards,
--
Alexander Kukushkin
From 19e9a3472782bdc92d0811e27cc3dd0fdcbf965d Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin 
Date: Fri, 12 Jul 2024 10:50:22 +0200
Subject: [PATCH v10] Be more picky with WAL segment deletion in pg_rewind

Make pg_rewind to be a bit wiser in terms of creating filemap:
preserve on the target all WAL segments that contain records between the
last common checkpoint and the point of divergence.

Co-authored-by: Polina Bungina 
---
 src/bin/pg_rewind/filemap.c   | 62 +-
 src/bin/pg_rewind/filemap.h   |  3 +
 src/bin/pg_rewind/meson.build |  1 +
 src/bin/pg_rewind/parsexlog.c | 24 +++
 src/bin/pg_rewind/pg_rewind.c |  3 +
 src/bin/pg_rewind/t/010_keep_recycled_wals.pl | 65 +++
 6 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 src/bin/pg_rewind/t/010_keep_recycled_wals.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index 4458324c9d..b357c28338 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -63,6 +63,28 @@ static file_entry_t *lookup_filehash_entry(const char *path);
 static int	final_filemap_cmp(const void *a, const void *b);
 static bool check_file_excluded(const char *path, bool is_source);
 
+typedef struct skipwal_t
+{
+	const char *path;
+	uint32		status;
+}			skipwal_t;
+
+#define SH_PREFIX		keepwalhash
+#define SH_ELEMENT_TYPE	skipwal_t
+#define SH_KEY_TYPE		const char *
+#define	SH_KEY			path
+#define SH_HASH_KEY(tb, key)	hash_string(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define	SH_SCOPE		static inline
+#define SH_RAW_ALLOCATOR	pg_malloc0
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+static keepwalhash_hash * keepwalhash = NULL;
+
+static bool keepwalhash_entry_exists(const char *path);
+
 /*
  * Definition of one element part of an exclusion list, used to exclude
  * contents when rewinding.  "name" is the name of the file or path to
@@ -206,6 +228,35 @@ lookup_filehash_entry(const char *path)
 	return filehash_lookup(filehash, path);
 }
 
+/* Initialize a hash table to store WAL file names that must be kept */
+void
+keepwalhash_init(void)
+{
+	keepwalhash = keepwalhash_create(FILEHASH_INITIAL_SIZE, NULL);
+}
+
+/* Prevent a given file deletion during rewind */
+void
+insert_keepwalhash_entry(const char *path)
+{
+	skipwal_t  *entry;
+	bool		found;
+
+	/* Should only be called with keepwalhash initialized */
+	Assert(keepwalhash);
+
+	entry = keepwalhash_insert(keepwalhash, path, &found);
+
+	if (!found)
+		entry->path = pg_strdup(path);
+}
+
+static bool
+keepwalhash_entry_exists(const char *path)
+{
+	return keepwalhash_lookup(keepwalhash, path) != NULL;
+}
+
 /*
  * Callback for processing source file list.
  *
@@ -685,7 +736,16 @@ decide_file_action(file_entry_t *entry)
 	}
 	else if (entry->target_exists && !entry->source_exists)
 	{
-		/* File exists in target, but not source. Remove it. */
+		/* File exists in target, but not source. */
+
+		if (keepwalhash_entry_exists(path))
+		{
+			/* This is a WAL file that should be kept. */
+			pg_log_debug("Not removing %s because it is required for recovery", path);
+			return FILE_ACTION_NONE;
+		}
+
+		/* Otherwise remove an unexpected file. */
 		return FILE_ACTION_REMOVE;
 	}
 	else if (!entry->target_exists && !entry->source_exists)
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index 007e0f17cf..0cb6fcae00 100644
--- 

Re: pg_rewind WAL segments deletion pitfall

2024-07-12 Thread Sutou Kouhei
Hi,

I'm reviewing patches in Commitfest 2024-07 from top to bottom:
https://commitfest.postgresql.org/48/

This is the 1st patch:
https://commitfest.postgresql.org/48/3874/

The latest patch can't be applied on master:
https://www.postgresql.org/message-id/CAFh8B=nnjtm9ke4_1mhpwgz2pv9yoyf6hmnyh5xact0aa4v...@mail.gmail.com

I've rebased on master. See the attached patch.
Here are changes for it:

* Resolve conflict
* Update copyright year to 2024 from 2023
* Add an added test to meson.build
* Run pgindent

Here are my review comments:

@@ -217,6 +221,26 @@ findLastCheckpoint(const char *datadir, XLogRecPtr 
forkptr, int tliIndex,
+   charxlogfname[MAXFNAMELEN];
+
+   tli = xlogreader->seg.ws_tli;
+   segno = xlogreader->seg.ws_segno;
+
+   snprintf(xlogfname, MAXPGPATH, XLOGDIR "/");
+   XLogFileName(xlogfname + strlen(xlogfname),
+xlogreader->seg.ws_tli,
+xlogreader->seg.ws_segno, 
WalSegSz);
+
+   /*
+* Make sure pg_rewind doesn't remove this file, 
because it is
+* required for postgres to start after rewind.
+*/
+   insert_keepwalhash_entry(xlogfname);

MAXFNAMELEN is 64 and MAXPGPATH is 1024. strlen(XLOGDIR "/")
is 7 because XLOGDIR is "pg_wal". So xlogfname has enough
size but snprintf(xlogfname, MAXPGPATH) is wrong usage.
(And XLogFileName() uses snprintf(xlogfname, MAXFNAMELEN)
internally.)

How about using one more buffer?


charxlogpath[MAXPGPATH];
charxlogfname[MAXFNAMELEN];

tli = xlogreader->seg.ws_tli;
segno = xlogreader->seg.ws_segno;

XLogFileName(xlogfname,
 xlogreader->seg.ws_tli,
 xlogreader->seg.ws_segno, WalSegSz);
snprintf(xlogpath, MAXPGPATH, "%s/%s", XLOGDIR, xlogfname);

/*
 * Make sure pg_rewind doesn't remove this file, because it is
 * required for postgres to start after rewind.
 */
insert_keepwalhash_entry(xlogpath);



Thanks,
-- 
kou

In 
  "Re: pg_rewind WAL segments deletion pitfall" on Tue, 23 Jan 2024 09:23:29 
+0100,
  Alexander Kukushkin  wrote:

> Hi Peter,
> 
> On Mon, 22 Jan 2024 at 00:38, Peter Smith  wrote:
> 
>> 2024-01 Commitfest.
>>
>> Hi, This patch has a CF status of "Ready for Committer", but it is
>> currently failing some CFbot tests [1]. Please have a look and post an
>> updated version..
>>
>> ==
>> [1]
>> https://cirrus-ci.com/github/postgresql-cfbot/postgresql/commitfest/46/3874
>>
>>
> From what I can see all failures are not related to this patch:
> 1. Windows build failed with
> [10:52:49.679] 126/281 postgresql:recovery / recovery/019_replslot_limit
> ERROR 185.84s (exit status 255 or signal 127 SIGinvalid)
> 2. FreeBSD build failed with
> [09:11:57.656] 190/285 postgresql:psql / psql/010_tab_completion ERROR
> 0.46s exit status 2
> [09:11:57.656] 220/285 postgresql:authentication /
> authentication/001_password ERROR 0.57s exit status 2
> 
> In fact, I don't even see this patch being applied for these builds and the
> introduced TAP test being executed.
> 
> Regards,
> --
> Alexander Kukushkin
>From 38edc07c49970ad0ac1818284f2e59e26591b3a4 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin 
Date: Sun, 6 Aug 2023 15:56:39 +0100
Subject: [PATCH v9] Be more picky with WAL segment deletion in pg_rewind

Make pg_rewind to be a bit wiser in terms of creating filemap:
preserve on the target all WAL segments that contain records between the
last common checkpoint and the point of divergence.

Co-authored-by: Polina Bungina 
---
 src/bin/pg_rewind/filemap.c   | 62 +-
 src/bin/pg_rewind/filemap.h   |  3 +
 src/bin/pg_rewind/meson.build |  1 +
 src/bin/pg_rewind/parsexlog.c | 24 +++
 src/bin/pg_rewind/pg_rewind.c |  3 +
 src/bin/pg_rewind/t/010_keep_recycled_wals.pl | 65 +++
 6 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 src/bin/pg_rewind/t/010_keep_recycled_wals.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index 4458324c9d8..b357c28338a 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -63,6 +63,28 @@ static file_entry_t *lookup_filehash_entry(const char *path);
 static int	final_filemap_cmp(const void *a, const void *b);
 static bool check_file_excluded(const char *path, bool is_source);
 
+typedef struct skipwal_t
+{
+	const char *path;
+	uint32		status;
+}			skipwal_t;
+
+#define SH_PREFIX		keepwalhash
+#define SH_ELE

Re: pg_rewind WAL segments deletion pitfall

2024-01-23 Thread Alexander Kukushkin
Hi Peter,

On Mon, 22 Jan 2024 at 00:38, Peter Smith  wrote:

> 2024-01 Commitfest.
>
> Hi, This patch has a CF status of "Ready for Committer", but it is
> currently failing some CFbot tests [1]. Please have a look and post an
> updated version..
>
> ==
> [1]
> https://cirrus-ci.com/github/postgresql-cfbot/postgresql/commitfest/46/3874
>
>
>From what I can see all failures are not related to this patch:
1. Windows build failed with
[10:52:49.679] 126/281 postgresql:recovery / recovery/019_replslot_limit
ERROR 185.84s (exit status 255 or signal 127 SIGinvalid)
2. FreeBSD build failed with
[09:11:57.656] 190/285 postgresql:psql / psql/010_tab_completion ERROR
0.46s exit status 2
[09:11:57.656] 220/285 postgresql:authentication /
authentication/001_password ERROR 0.57s exit status 2

In fact, I don't even see this patch being applied for these builds and the
introduced TAP test being executed.

Regards,
--
Alexander Kukushkin


Re: pg_rewind WAL segments deletion pitfall

2024-01-21 Thread Peter Smith
2024-01 Commitfest.

Hi, This patch has a CF status of "Ready for Committer", but it is
currently failing some CFbot tests [1]. Please have a look and post an
updated version..

==
[1] https://cirrus-ci.com/github/postgresql-cfbot/postgresql/commitfest/46/3874

Kind Regards,
Peter Smith.




Re: pg_rewind WAL segments deletion pitfall

2023-11-08 Thread torikoshia

On 2023-11-06 23:58, Alexander Kukushkin wrote:

Hi Torikoshia,

On Thu, 2 Nov 2023 at 04:24, torikoshia 
wrote:


+extern void preserve_file(char *filepath);


Is this necessary?
This function was defined in older version patch, but no longer
seems to
exist.

+# We use "perl -e 'exit(1)'" as a alternative to "false", because
the
last one
'a' should be 'an'?


Thanks for the feedback

Please find the new version attached.

Thanks for the update!

I've set the CF entry to "Ready for Committer".

--
Regards,

--
Atsushi Torikoshi
NTT DATA Group Corporation




Re: pg_rewind WAL segments deletion pitfall

2023-11-06 Thread Alexander Kukushkin
Hi Torikoshia,


On Thu, 2 Nov 2023 at 04:24, torikoshia  wrote:

>
>
> > +extern void preserve_file(char *filepath);
>
> Is this necessary?
> This function was defined in older version patch, but no longer seems to
> exist.
>
> +# We use "perl -e 'exit(1)'" as a alternative to "false", because the
> last one
> 'a' should be 'an'?
>
>
Thanks for the feedback

Please find the new version attached.

Regards,
--
Alexander Kukushkin

On Thu, 2 Nov 2023 at 04:24, torikoshia  wrote:

> On 2023-10-31 00:26, Alexander Kukushkin wrote:
> > Hi,
> >
> > On Wed, 18 Oct 2023 at 08:50, torikoshia 
> > wrote:
> >
> >> I have very minor questions on the regression tests mainly regarding
> >> the
> >> consistency with other tests for pg_rewind:
> >
> > Please find attached a new version of the patch. It addresses all your
> > comments.
>
> Thanks for updating the patch!
>
> > +extern void preserve_file(char *filepath);
>
> Is this necessary?
> This function was defined in older version patch, but no longer seems to
> exist.
>
> +# We use "perl -e 'exit(1)'" as a alternative to "false", because the
> last one
> 'a' should be 'an'?
>
>
> --
> Regards,
>
> --
> Atsushi Torikoshi
> NTT DATA Group Corporation
>


-- 
Regards,
--
Alexander Kukushkin
From 9b4e41c91312d8a79a03c929224cd4dfe99e7802 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin 
Date: Mon, 06 Aug 2023 15:56:39 +0100
Subject: [PATCH] Be more picky with WAL segment deletion in pg_rewind

Make pg_rewind to be a bit wiser in terms of creating filemap:
preserve on the target all WAL segments that contain records between the
last common checkpoint and the point of divergence.

Co-authored-by: Polina Bungina 
---
 src/bin/pg_rewind/filemap.c   | 62 +-
 src/bin/pg_rewind/filemap.h   |  3 +
 src/bin/pg_rewind/parsexlog.c | 22 +++
 src/bin/pg_rewind/pg_rewind.c |  3 +
 src/bin/pg_rewind/t/010_keep_recycled_wals.pl | 65 +++
 5 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100644 src/bin/pg_rewind/t/010_keep_recycled_wals.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index ecadd69dc5..ebaebfb149 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -64,6 +64,27 @@ static file_entry_t *lookup_filehash_entry(const char *path);
 static int	final_filemap_cmp(const void *a, const void *b);
 static bool check_file_excluded(const char *path, bool is_source);
 
+typedef struct skipwal_t {
+	const char *path;
+	uint32		status;
+} skipwal_t;
+
+#define SH_PREFIX		keepwalhash
+#define SH_ELEMENT_TYPE	skipwal_t
+#define SH_KEY_TYPE		const char *
+#define	SH_KEY			path
+#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define	SH_SCOPE		static inline
+#define SH_RAW_ALLOCATOR	pg_malloc0
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+static keepwalhash_hash *keepwalhash = NULL;
+
+static bool keepwalhash_entry_exists(const char *path);
+
 /*
  * Definition of one element part of an exclusion list, used to exclude
  * contents when rewinding.  "name" is the name of the file or path to
@@ -207,6 +228,35 @@ lookup_filehash_entry(const char *path)
 	return filehash_lookup(filehash, path);
 }
 
+/* Initialize a hash table to store WAL file names that must be kept */
+void
+keepwalhash_init(void)
+{
+	keepwalhash = keepwalhash_create(FILEHASH_INITIAL_SIZE, NULL);
+}
+
+/* Prevent a given file deletion during rewind */
+void
+insert_keepwalhash_entry(const char *path)
+{
+	skipwal_t   *entry;
+	bool		found;
+
+	/* Should only be called with keepwalhash initialized */
+	Assert(keepwalhash);
+
+	entry = keepwalhash_insert(keepwalhash, path, &found);
+
+	if (!found)
+		entry->path = pg_strdup(path);
+}
+
+static bool
+keepwalhash_entry_exists(const char *path)
+{
+	return keepwalhash_lookup(keepwalhash, path) != NULL;
+}
+
 /*
  * Callback for processing source file list.
  *
@@ -682,7 +732,16 @@ decide_file_action(file_entry_t *entry)
 	}
 	else if (entry->target_exists && !entry->source_exists)
 	{
-		/* File exists in target, but not source. Remove it. */
+		/* File exists in target, but not source. */
+
+		if (keepwalhash_entry_exists(path))
+		{
+			/* This is a WAL file that should be kept. */
+			pg_log_debug("Not removing %s because it is required for recovery", path);
+			return FILE_ACTION_NONE;
+		}
+
+		/* Otherwise remove an unexpected file. */
 		return FILE_ACTION_REMOVE;
 	}
 	else if (!entry->target_exists && !entry->source_exists)
@@ -818,7 +877,6 @@ decide_file_actions(void)
 	return filemap;
 }
 
-
 /*
  * Helper function for filemap hash table.
  */
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index 988d4590e0..611b8cf49e 100644
--- a/src/bin/pg_rewind/filemap.h
+++ b/src/bin/pg_rewind/filemap.h
@@ -110,4 +110,7 @@ extern filemap_t *decide_file_actions(void);
 extern void calculate_totals(filemap_t *filemap);

Re: pg_rewind WAL segments deletion pitfall

2023-11-01 Thread torikoshia

On 2023-10-31 00:26, Alexander Kukushkin wrote:

Hi,

On Wed, 18 Oct 2023 at 08:50, torikoshia 
wrote:


I have very minor questions on the regression tests mainly regarding
the
consistency with other tests for pg_rewind:


Please find attached a new version of the patch. It addresses all your
comments.


Thanks for updating the patch!


+extern void preserve_file(char *filepath);


Is this necessary?
This function was defined in older version patch, but no longer seems to 
exist.


+# We use "perl -e 'exit(1)'" as a alternative to "false", because the 
last one

'a' should be 'an'?


--
Regards,

--
Atsushi Torikoshi
NTT DATA Group Corporation




Re: pg_rewind WAL segments deletion pitfall

2023-10-30 Thread Alexander Kukushkin
Hi,

On Wed, 18 Oct 2023 at 08:50, torikoshia  wrote:

>
> I have very minor questions on the regression tests mainly regarding the
> consistency with other tests for pg_rewind:
>

Please find attached a new version of the patch. It addresses all your
comments.

Regards,
--
Alexander Kukushkin
From 08d2cac946b92f05d410d75ed026ebf6587b6671 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin 
Date: Mon, 30 Oct 2023 16:23:15 +0200
Subject: [PATCH] Be more picky with WAL segment deletion in pg_rewind

Make pg_rewind to be a bit wiser in terms of creating filemap:
preserve on the target all WAL segments that contain records between the
last common checkpoint and the point of divergence.

Co-authored-by: Polina Bungina 
---
 src/bin/pg_rewind/filemap.c   | 62 +-
 src/bin/pg_rewind/filemap.h   |  4 ++
 src/bin/pg_rewind/parsexlog.c | 22 +++
 src/bin/pg_rewind/pg_rewind.c |  3 +
 src/bin/pg_rewind/t/010_keep_recycled_wals.pl | 64 +++
 5 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100644 src/bin/pg_rewind/t/010_keep_recycled_wals.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index ecadd69dc5..ebaebfb149 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -64,6 +64,27 @@ static file_entry_t *lookup_filehash_entry(const char *path);
 static int	final_filemap_cmp(const void *a, const void *b);
 static bool check_file_excluded(const char *path, bool is_source);
 
+typedef struct skipwal_t {
+	const char *path;
+	uint32		status;
+} skipwal_t;
+
+#define SH_PREFIX		keepwalhash
+#define SH_ELEMENT_TYPE	skipwal_t
+#define SH_KEY_TYPE		const char *
+#define	SH_KEY			path
+#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define	SH_SCOPE		static inline
+#define SH_RAW_ALLOCATOR	pg_malloc0
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+static keepwalhash_hash *keepwalhash = NULL;
+
+static bool keepwalhash_entry_exists(const char *path);
+
 /*
  * Definition of one element part of an exclusion list, used to exclude
  * contents when rewinding.  "name" is the name of the file or path to
@@ -207,6 +228,35 @@ lookup_filehash_entry(const char *path)
 	return filehash_lookup(filehash, path);
 }
 
+/* Initialize a hash table to store WAL file names that must be kept */
+void
+keepwalhash_init(void)
+{
+	keepwalhash = keepwalhash_create(FILEHASH_INITIAL_SIZE, NULL);
+}
+
+/* Prevent a given file deletion during rewind */
+void
+insert_keepwalhash_entry(const char *path)
+{
+	skipwal_t   *entry;
+	bool		found;
+
+	/* Should only be called with keepwalhash initialized */
+	Assert(keepwalhash);
+
+	entry = keepwalhash_insert(keepwalhash, path, &found);
+
+	if (!found)
+		entry->path = pg_strdup(path);
+}
+
+static bool
+keepwalhash_entry_exists(const char *path)
+{
+	return keepwalhash_lookup(keepwalhash, path) != NULL;
+}
+
 /*
  * Callback for processing source file list.
  *
@@ -682,7 +732,16 @@ decide_file_action(file_entry_t *entry)
 	}
 	else if (entry->target_exists && !entry->source_exists)
 	{
-		/* File exists in target, but not source. Remove it. */
+		/* File exists in target, but not source. */
+
+		if (keepwalhash_entry_exists(path))
+		{
+			/* This is a WAL file that should be kept. */
+			pg_log_debug("Not removing %s because it is required for recovery", path);
+			return FILE_ACTION_NONE;
+		}
+
+		/* Otherwise remove an unexpected file. */
 		return FILE_ACTION_REMOVE;
 	}
 	else if (!entry->target_exists && !entry->source_exists)
@@ -818,7 +877,6 @@ decide_file_actions(void)
 	return filemap;
 }
 
-
 /*
  * Helper function for filemap hash table.
  */
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index 988d4590e0..f037448a0f 100644
--- a/src/bin/pg_rewind/filemap.h
+++ b/src/bin/pg_rewind/filemap.h
@@ -109,5 +109,9 @@ extern void process_target_wal_block_change(ForkNumber forknum,
 extern filemap_t *decide_file_actions(void);
 extern void calculate_totals(filemap_t *filemap);
 extern void print_filemap(filemap_t *filemap);
+extern void preserve_file(char *filepath);
+
+extern void keepwalhash_init(void);
+extern void insert_keepwalhash_entry(const char *path);
 
 #endif			/* FILEMAP_H */
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 0233ece88b..b89b65077d 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -176,6 +176,10 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
 	char	   *errormsg;
 	XLogPageReadPrivate private;
 
+	/* Track WAL segments opened while searching a checkpoint */
+	XLogSegNo	segno = 0;
+	TimeLineID	tli = 0;
+
 	/*
 	 * The given fork pointer points to the end of the last common record,
 	 * which is not necessarily the beginning of the next record, if the
@@ -217,6 +221,24 @@ findLastCheckpoint(const char *datadir, XLogRecPtr fo

Re: pg_rewind WAL segments deletion pitfall

2023-10-17 Thread torikoshia

Thanks for the patch.

I tested the v6 patch using the test script attached on [1], old primary 
has succeeded to become new standby.


I have very minor questions on the regression tests mainly regarding the 
consistency with other tests for pg_rewind:




+setup_cluster;
+create_standby;


Would it be better to add parentheses?
Also should we add "RewindTest::" for these function?



+primary_psql("create table t(a int)");
+primary_psql("insert into t values(0)");
+primary_psql("select pg_switch_wal()");

..

Should 'select', 'create', etc be capitalized?



my $false = "$^X -e 'exit(1)'";

I feel it's hard to understand what does this mean.
Isn't it better to add comments and describe this is for windows 
environments?



--
Regards,

--
Atsushi Torikoshi
NTT DATA Group Corporation




Re: pg_rewind WAL segments deletion pitfall

2023-09-13 Thread Alexander Kukushkin
Hi,

Please find attached v6.
Changes compared to v5:
1. use "perl -e 'exit(1)'" instead of "false" as archive_command, so it
also works on Windows
2. fixed the test name

Regards,
--
Alexander Kukushkin
From 3e1e6c9d968e9b829357b6eb0a7dfa366b550668 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin 
Date: Wed, 13 Sep 2023 09:17:15 +0200
Subject: [PATCH v6] Be more picky with WAL segment deletion in pg_rewind

Make pg_rewind to be a bit wiser in terms of creating filemap:
preserve on the target all WAL segments that contain records between the
last common checkpoint and the point of divergence.

Co-authored-by: Polina Bungina 
---
 src/bin/pg_rewind/filemap.c   | 62 +-
 src/bin/pg_rewind/filemap.h   |  4 ++
 src/bin/pg_rewind/parsexlog.c | 22 +++
 src/bin/pg_rewind/pg_rewind.c |  3 +
 src/bin/pg_rewind/t/010_keep_recycled_wals.pl | 63 +++
 5 files changed, 152 insertions(+), 2 deletions(-)
 create mode 100644 src/bin/pg_rewind/t/010_keep_recycled_wals.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index 58280d9abc..d44d716d82 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -64,6 +64,27 @@ static file_entry_t *lookup_filehash_entry(const char *path);
 static int	final_filemap_cmp(const void *a, const void *b);
 static bool check_file_excluded(const char *path, bool is_source);
 
+typedef struct skipwal_t {
+	const char *path;
+	uint32		status;
+} skipwal_t;
+
+#define SH_PREFIX		keepwalhash
+#define SH_ELEMENT_TYPE	skipwal_t
+#define SH_KEY_TYPE		const char *
+#define	SH_KEY			path
+#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define	SH_SCOPE		static inline
+#define SH_RAW_ALLOCATOR	pg_malloc0
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+static keepwalhash_hash *keepwalhash = NULL;
+
+static bool keepwalhash_entry_exists(const char *path);
+
 /*
  * Definition of one element part of an exclusion list, used to exclude
  * contents when rewinding.  "name" is the name of the file or path to
@@ -207,6 +228,35 @@ lookup_filehash_entry(const char *path)
 	return filehash_lookup(filehash, path);
 }
 
+/* Initialize a hash table to store WAL file names that must be kept */
+void
+keepwalhash_init(void)
+{
+	keepwalhash = keepwalhash_create(FILEHASH_INITIAL_SIZE, NULL);
+}
+
+/* Prevent a given file deletion during rewind */
+void
+insert_keepwalhash_entry(const char *path)
+{
+	skipwal_t   *entry;
+	bool		found;
+
+	/* Should only be called with keepwalhash initialized */
+	Assert(keepwalhash);
+
+	entry = keepwalhash_insert(keepwalhash, path, &found);
+
+	if (!found)
+		entry->path = pg_strdup(path);
+}
+
+static bool
+keepwalhash_entry_exists(const char *path)
+{
+	return keepwalhash_lookup(keepwalhash, path) != NULL;
+}
+
 /*
  * Callback for processing source file list.
  *
@@ -682,7 +732,16 @@ decide_file_action(file_entry_t *entry)
 	}
 	else if (entry->target_exists && !entry->source_exists)
 	{
-		/* File exists in target, but not source. Remove it. */
+		/* File exists in target, but not source. */
+
+		if (keepwalhash_entry_exists(path))
+		{
+			/* This is a WAL file that should be kept. */
+			pg_log_debug("Not removing %s because it is required for recovery", path);
+			return FILE_ACTION_NONE;
+		}
+
+		/* Otherwise remove an unexpected file. */
 		return FILE_ACTION_REMOVE;
 	}
 	else if (!entry->target_exists && !entry->source_exists)
@@ -818,7 +877,6 @@ decide_file_actions(void)
 	return filemap;
 }
 
-
 /*
  * Helper function for filemap hash table.
  */
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index 48f240dff1..9a0d4bbc54 100644
--- a/src/bin/pg_rewind/filemap.h
+++ b/src/bin/pg_rewind/filemap.h
@@ -109,5 +109,9 @@ extern void process_target_wal_block_change(ForkNumber forknum,
 extern filemap_t *decide_file_actions(void);
 extern void calculate_totals(filemap_t *filemap);
 extern void print_filemap(filemap_t *filemap);
+extern void preserve_file(char *filepath);
+
+extern void keepwalhash_init(void);
+extern void insert_keepwalhash_entry(const char *path);
 
 #endif			/* FILEMAP_H */
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 27782237d0..4c7f9bef14 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -176,6 +176,10 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
 	char	   *errormsg;
 	XLogPageReadPrivate private;
 
+	/* Track WAL segments opened while searching a checkpoint */
+	XLogSegNo	segno = 0;
+	TimeLineID	tli = 0;
+
 	/*
 	 * The given fork pointer points to the end of the last common record,
 	 * which is not necessarily the beginning of the next record, if the
@@ -217,6 +221,24 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
 		 LSN_FORMAT_ARGS(searchptr));
 		}
 
+		/* We are trying to 

Re: pg_rewind WAL segments deletion pitfall

2023-09-12 Thread Alexander Kukushkin
Hi,

Please find attached v5.
What changed:
1. Now we collect which files should be kept  in a separate hash table.
2. Decision whether to keep the file is made only when the file is actually
missing on the source. That is, remaining WAL files will be copied over as
it currently is, although it could be extremely inefficient and unnecessary.
3. Added TAP test that actually at least one file isn't removed.

Regards,
--
Alexander Kukushkin
From 3e1e6c9d968e9b829357b6eb0a7dfa366b550668 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin 
Date: Tue, 12 Sep 2023 14:09:47 +0200
Subject: [PATCH v5] Be more picky with WAL segment deletion in pg_rewind

Make pg_rewind to be a bit wiser in terms of creating filemap:
preserve on the target all WAL segments that contain records between the
last common checkpoint and the point of divergence.

Co-authored-by: Polina Bungina 
---
 src/bin/pg_rewind/filemap.c   | 62 ++-
 src/bin/pg_rewind/filemap.h   |  4 ++
 src/bin/pg_rewind/parsexlog.c | 22 +++
 src/bin/pg_rewind/pg_rewind.c |  3 +
 src/bin/pg_rewind/t/010_keep_recycled_wals.pl | 62 +++
 5 files changed, 151 insertions(+), 2 deletions(-)
 create mode 100644 src/bin/pg_rewind/t/010_keep_recycled_wals.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index 58280d9abc..d44d716d82 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -64,6 +64,27 @@ static file_entry_t *lookup_filehash_entry(const char *path);
 static int	final_filemap_cmp(const void *a, const void *b);
 static bool check_file_excluded(const char *path, bool is_source);
 
+typedef struct skipwal_t {
+	const char *path;
+	uint32		status;
+} skipwal_t;
+
+#define SH_PREFIX		keepwalhash
+#define SH_ELEMENT_TYPE	skipwal_t
+#define SH_KEY_TYPE		const char *
+#define	SH_KEY			path
+#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define	SH_SCOPE		static inline
+#define SH_RAW_ALLOCATOR	pg_malloc0
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+static keepwalhash_hash *keepwalhash = NULL;
+
+static bool keepwalhash_entry_exists(const char *path);
+
 /*
  * Definition of one element part of an exclusion list, used to exclude
  * contents when rewinding.  "name" is the name of the file or path to
@@ -207,6 +228,35 @@ lookup_filehash_entry(const char *path)
 	return filehash_lookup(filehash, path);
 }
 
+/* Initialize a hash table to store WAL file names that must be kept */
+void
+keepwalhash_init(void)
+{
+	keepwalhash = keepwalhash_create(FILEHASH_INITIAL_SIZE, NULL);
+}
+
+/* Prevent a given file deletion during rewind */
+void
+insert_keepwalhash_entry(const char *path)
+{
+	skipwal_t   *entry;
+	bool		found;
+
+	/* Should only be called with keepwalhash initialized */
+	Assert(keepwalhash);
+
+	entry = keepwalhash_insert(keepwalhash, path, &found);
+
+	if (!found)
+		entry->path = pg_strdup(path);
+}
+
+static bool
+keepwalhash_entry_exists(const char *path)
+{
+	return keepwalhash_lookup(keepwalhash, path) != NULL;
+}
+
 /*
  * Callback for processing source file list.
  *
@@ -682,7 +732,16 @@ decide_file_action(file_entry_t *entry)
 	}
 	else if (entry->target_exists && !entry->source_exists)
 	{
-		/* File exists in target, but not source. Remove it. */
+		/* File exists in target, but not source. */
+
+		if (keepwalhash_entry_exists(path))
+		{
+			/* This is a WAL file that should be kept. */
+			pg_log_debug("Not removing %s because it is required for recovery", path);
+			return FILE_ACTION_NONE;
+		}
+
+		/* Otherwise remove an unexpected file. */
 		return FILE_ACTION_REMOVE;
 	}
 	else if (!entry->target_exists && !entry->source_exists)
@@ -818,7 +877,6 @@ decide_file_actions(void)
 	return filemap;
 }
 
-
 /*
  * Helper function for filemap hash table.
  */
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index 48f240dff1..9a0d4bbc54 100644
--- a/src/bin/pg_rewind/filemap.h
+++ b/src/bin/pg_rewind/filemap.h
@@ -109,5 +109,9 @@ extern void process_target_wal_block_change(ForkNumber forknum,
 extern filemap_t *decide_file_actions(void);
 extern void calculate_totals(filemap_t *filemap);
 extern void print_filemap(filemap_t *filemap);
+extern void preserve_file(char *filepath);
+
+extern void keepwalhash_init(void);
+extern void insert_keepwalhash_entry(const char *path);
 
 #endif			/* FILEMAP_H */
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 27782237d0..4c7f9bef14 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -176,6 +176,10 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
 	char	   *errormsg;
 	XLogPageReadPrivate private;
 
+	/* Track WAL segments opened while searching a checkpoint */
+	XLogSegNo	segno = 0;
+	TimeLineID	tli = 0;
+
 	/*
 	 * The given fork pointer points to the end of the last common re

Re: pg_rewind WAL segments deletion pitfall

2023-08-29 Thread torikoshia

On 2023-08-24 09:45, Kyotaro Horiguchi wrote:

At Wed, 23 Aug 2023 13:44:52 +0200, Alexander Kukushkin
 wrote in
On Tue, 22 Aug 2023 at 07:32, Michael Paquier  
wrote:

> I don't like much this patch.  While it takes correctly advantage of
> the backward record read logic from SimpleXLogPageRead() able to
> handle correctly timeline jumps, it creates a hidden dependency in the
> code between the hash table from filemap.c and the page callback.
> Wouldn't it be simpler to build a list of the segment names using the
> information from WALOpenSegment and build this list in
> findLastCheckpoint()?

I think the first version of the patch more or less did that. Not
necessarily a list, but a hash table of WAL file names that we want to
keep. But Kyotaro Horiguchi didn't like it and suggested creating 
entries

in the filemap.c hash table instead.
But, I agree, doing it directly from the findLastCheckpoint() makes 
the

code easier to understand.

...

> +   /*
> +* Some entries (WAL segments) already have an action assigned
> +* (see SimpleXLogPageRead()).
> +*/
> +   if (entry->action == FILE_ACTION_UNDECIDED)
> +   entry->action = decide_file_action(entry);
>
> This change makes me a bit uneasy, per se my previous comment with the
> additional code dependencies.
>

We can revert to the original approach (see
v1-0001-pg_rewind-wal-deletion.patch from the very first email) if you 
like.


On the other hand, that approach brings in another source that
suggests the way that file should be handled. I still think that
entry->action should be the only source.


+1.
Imaging a case when we come to need decide how to treat files based on 
yet another factor, I feel that a single source of truth is better than 
creating a list or hash for each factor.



However, it seems I'm in the
minority here. So I'm not tied to that approach.


> I think that this scenario deserves a test case.  If one wants to
> emulate a delay in WAL archiving, it is possible to set
> archive_command to a command that we know will fail, for instance.
>

Yes, I totally agree, it is on our radar, but meanwhile please see the 
new

version, just to check if I correctly understood your idea.


Thanks for the patch.
I tested v4 patch using the script attached below thread and it has 
successfully finished.


https://www.postgresql.org/message-id/2e75ae22dce9a227c3d47fa6d0ed094a%40oss.nttdata.com


--
Regards,

--
Atsushi Torikoshi
NTT DATA Group Corporation




Re: pg_rewind WAL segments deletion pitfall

2023-08-23 Thread Kyotaro Horiguchi
At Wed, 23 Aug 2023 13:44:52 +0200, Alexander Kukushkin  
wrote in 
> On Tue, 22 Aug 2023 at 07:32, Michael Paquier  wrote:
> > I don't like much this patch.  While it takes correctly advantage of
> > the backward record read logic from SimpleXLogPageRead() able to
> > handle correctly timeline jumps, it creates a hidden dependency in the
> > code between the hash table from filemap.c and the page callback.
> > Wouldn't it be simpler to build a list of the segment names using the
> > information from WALOpenSegment and build this list in
> > findLastCheckpoint()?
> 
> I think the first version of the patch more or less did that. Not
> necessarily a list, but a hash table of WAL file names that we want to
> keep. But Kyotaro Horiguchi didn't like it and suggested creating entries
> in the filemap.c hash table instead.
> But, I agree, doing it directly from the findLastCheckpoint() makes the
> code easier to understand.
...
> > +   /*
> > +* Some entries (WAL segments) already have an action assigned
> > +* (see SimpleXLogPageRead()).
> > +*/
> > +   if (entry->action == FILE_ACTION_UNDECIDED)
> > +   entry->action = decide_file_action(entry);
> >
> > This change makes me a bit uneasy, per se my previous comment with the
> > additional code dependencies.
> >
> 
> We can revert to the original approach (see
> v1-0001-pg_rewind-wal-deletion.patch from the very first email) if you like.

On the other hand, that approach brings in another source that
suggests the way that file should be handled. I still think that
entry->action should be the only source. However, it seems I'm in the
minority here. So I'm not tied to that approach.

> > I think that this scenario deserves a test case.  If one wants to
> > emulate a delay in WAL archiving, it is possible to set
> > archive_command to a command that we know will fail, for instance.
> >
> 
> Yes, I totally agree, it is on our radar, but meanwhile please see the new
> version, just to check if I correctly understood your idea.

Agreed.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2023-08-23 Thread Alexander Kukushkin
Hi,



On Tue, 22 Aug 2023 at 07:32, Michael Paquier  wrote:

>
>
> I don't like much this patch.  While it takes correctly advantage of
> the backward record read logic from SimpleXLogPageRead() able to
> handle correctly timeline jumps, it creates a hidden dependency in the
> code between the hash table from filemap.c and the page callback.
> Wouldn't it be simpler to build a list of the segment names using the
> information from WALOpenSegment and build this list in
> findLastCheckpoint()?


I think the first version of the patch more or less did that. Not
necessarily a list, but a hash table of WAL file names that we want to
keep. But Kyotaro Horiguchi didn't like it and suggested creating entries
in the filemap.c hash table instead.
But, I agree, doing it directly from the findLastCheckpoint() makes the
code easier to understand.



>   Also, I am wondering if we should be smarter
> with any potential conflict handling between the source and the
> target, rather than just enforcing a FILE_ACTION_NONE for all these
> files.  In short, could it be better to copy the WAL file from the
> source if it exists there?
>

Before the switchpoint these files are supposed to be the same on the old
primary, new primary, and also in the archive. Also, if there is a
restore_command postgres will fetch the same file from the archive even if
it already exists in pg_wal, which effectively discards all pg_rewind
efforts on copying WAL files.


>
> +   /*
> +* Some entries (WAL segments) already have an action assigned
> +* (see SimpleXLogPageRead()).
> +*/
> +   if (entry->action == FILE_ACTION_UNDECIDED)
> +   entry->action = decide_file_action(entry);
>
> This change makes me a bit uneasy, per se my previous comment with the
> additional code dependencies.
>

We can revert to the original approach (see
v1-0001-pg_rewind-wal-deletion.patch from the very first email) if you like.


> I think that this scenario deserves a test case.  If one wants to
> emulate a delay in WAL archiving, it is possible to set
> archive_command to a command that we know will fail, for instance.
>

Yes, I totally agree, it is on our radar, but meanwhile please see the new
version, just to check if I correctly understood your idea.

Regards,
--
Alexander Kukushkin
From 1a8504419f7fafc04443c09d86807dccffc750f8 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin 
Date: Wed, 23 Aug 2023 13:40:47 +0200
Subject: [PATCH v4] Be more picky with WAL segment deletion in pg_rewind

Make pg_rewind to be a bit wiser in terms of creating filemap:
preserve on the target all WAL segments that contain records between the
last common checkpoint and the point of divergence.

Co-authored-by: Polina Bungina 
---
 src/bin/pg_rewind/filemap.c   | 19 ++-
 src/bin/pg_rewind/filemap.h   |  1 +
 src/bin/pg_rewind/parsexlog.c | 22 ++
 src/bin/pg_rewind/pg_rewind.c |  6 +++---
 4 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index bd5c598e20..a0c8dbc1b6 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -795,7 +795,12 @@ decide_file_actions(void)
 	filehash_start_iterate(filehash, &it);
 	while ((entry = filehash_iterate(filehash, &it)) != NULL)
 	{
-		entry->action = decide_file_action(entry);
+		/*
+		 * Some entries (WAL segments) already have an action assigned
+		 * (see findLastCheckpoint() function).
+		 */
+		if (entry->action == FILE_ACTION_UNDECIDED)
+			entry->action = decide_file_action(entry);
 	}
 
 	/*
@@ -818,6 +823,18 @@ decide_file_actions(void)
 	return filemap;
 }
 
+/*
+ * Prevent a given file deletion during rewind
+ */
+void
+preserve_file(char *filepath)
+{
+	file_entry_t *entry;
+
+	entry = insert_filehash_entry(filepath);
+	entry->action = FILE_ACTION_NONE;
+}
+
 
 /*
  * Helper function for filemap hash table.
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index 48f240dff1..47c7a7fd09 100644
--- a/src/bin/pg_rewind/filemap.h
+++ b/src/bin/pg_rewind/filemap.h
@@ -109,5 +109,6 @@ extern void process_target_wal_block_change(ForkNumber forknum,
 extern filemap_t *decide_file_actions(void);
 extern void calculate_totals(filemap_t *filemap);
 extern void print_filemap(filemap_t *filemap);
+extern void preserve_file(char *filepath);
 
 #endif			/* FILEMAP_H */
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 27782237d0..40c7073522 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -176,6 +176,10 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
 	char	   *errormsg;
 	XLogPageReadPrivate private;
 
+	/* Track WAL segments opened while searching a checkpoint */
+	XLogSegNo	segno = 0;
+	TimeLineID	tli = 0;
+
 	/*
 	 * The given fork pointer points to the end of the last common record,
 	 * which is not necessarily the beginning of the next record, if the
@@ -217,6 +2

Re: pg_rewind WAL segments deletion pitfall

2023-08-23 Thread torikoshia

On 2023-08-22 14:32, Michael Paquier wrote:
Thanks for your review!


On Fri, Aug 18, 2023 at 03:40:57PM +0900, torikoshia wrote:

Thanks for the patch, I've marked this as ready-for-committer.

BTW, this issue can be considered a bug, right?
I think it would be appropriate to provide backpatch.


Hmm, I agree that there is a good argument in back-patching as we have
the WAL files between the redo LSN and the divergence LSN, but
pg_rewind is not smart enough to keep them around.  If the archives of
the primary were not able to catch up, the old primary is as good as
kaput, and restore_command won't help here.


True.
I also imagine that in the typical failover scenario where the target 
cluster was shut down soon after the divergence and pg_rewind was 
executed without much time, we can avoid this kind of 'requested WAL 
segment has already removed' error by preventing pg_rewind from deleting 
necessary WALs.




I don't like much this patch.  While it takes correctly advantage of
the backward record read logic from SimpleXLogPageRead() able to
handle correctly timeline jumps, it creates a hidden dependency in the
code between the hash table from filemap.c and the page callback.
Wouldn't it be simpler to build a list of the segment names using the
information from WALOpenSegment and build this list in
findLastCheckpoint()?  Also, I am wondering if we should be smarter
with any potential conflict handling between the source and the
target, rather than just enforcing a FILE_ACTION_NONE for all these
files.  In short, could it be better to copy the WAL file from the
source if it exists there?

+   /*
+* Some entries (WAL segments) already have an action assigned
+* (see SimpleXLogPageRead()).
+*/
+   if (entry->action == FILE_ACTION_UNDECIDED)
+   entry->action = decide_file_action(entry);

This change makes me a bit uneasy, per se my previous comment with the
additional code dependencies.

I think that this scenario deserves a test case.  If one wants to
emulate a delay in WAL archiving, it is possible to set
archive_command to a command that we know will fail, for instance.
--
Michael


Bungina, are you going to respond to these comments?

--
Regards,

--
Atsushi Torikoshi
NTT DATA Group Corporation




Re: pg_rewind WAL segments deletion pitfall

2023-08-21 Thread Michael Paquier
On Fri, Aug 18, 2023 at 03:40:57PM +0900, torikoshia wrote:
> Thanks for the patch, I've marked this as ready-for-committer.
> 
> BTW, this issue can be considered a bug, right?
> I think it would be appropriate to provide backpatch.

Hmm, I agree that there is a good argument in back-patching as we have
the WAL files between the redo LSN and the divergence LSN, but
pg_rewind is not smart enough to keep them around.  If the archives of
the primary were not able to catch up, the old primary is as good as
kaput, and restore_command won't help here.

I don't like much this patch.  While it takes correctly advantage of
the backward record read logic from SimpleXLogPageRead() able to
handle correctly timeline jumps, it creates a hidden dependency in the
code between the hash table from filemap.c and the page callback.
Wouldn't it be simpler to build a list of the segment names using the
information from WALOpenSegment and build this list in
findLastCheckpoint()?  Also, I am wondering if we should be smarter
with any potential conflict handling between the source and the
target, rather than just enforcing a FILE_ACTION_NONE for all these
files.  In short, could it be better to copy the WAL file from the
source if it exists there?

+   /*
+* Some entries (WAL segments) already have an action assigned
+* (see SimpleXLogPageRead()).
+*/
+   if (entry->action == FILE_ACTION_UNDECIDED)
+   entry->action = decide_file_action(entry);

This change makes me a bit uneasy, per se my previous comment with the
additional code dependencies.

I think that this scenario deserves a test case.  If one wants to
emulate a delay in WAL archiving, it is possible to set
archive_command to a command that we know will fail, for instance.
--
Michael


signature.asc
Description: PGP signature


Re: pg_rewind WAL segments deletion pitfall

2023-08-17 Thread torikoshia

On 2022-09-29 17:18, Polina Bungina wrote:

I agree with your suggestions, so here is the updated version of
patch. Hope I haven't missed anything.


Thanks for the patch, I've marked this as ready-for-committer.

BTW, this issue can be considered a bug, right?
I think it would be appropriate to provide backpatch.

On 2023-06-29 18:42, torikoshia wrote:

On 2023-06-29 10:25, Kyotaro Horiguchi wrote:
Thanks for the comment!


At Wed, 28 Jun 2023 22:28:13 +0900, torikoshia
 wrote in


On 2022-09-29 17:18, Polina Bungina wrote:
> I agree with your suggestions, so here is the updated version of
> patch. Hope I haven't missed anything.
> Regards,
> Polina Bungina

Thanks for working on this!
It seems like we are also facing the same issue.


Thanks for looking this.


I tested the v3 patch under our condition, old primary has succeeded
to become new standby.


BTW when I used pg_rewind-removes-wal-segments-reproduce.sh attached
in [1], old primary also failed to become standby:

  FATAL: could not receive data from WAL stream: ERROR: requested WAL
  segment 00020007 has already been removed

However, I think this is not a problem: just adding restore_command
like below fixed the situation.

  echo "restore_command = '/bin/cp `pwd`/newarch/%f %p'" >>
  oldprim/postgresql.conf


I thought on the same line at first, but that's not the point
here.


Yes. I don't think adding restore_command solves the problem and
modification to prevent deleting necessary WAL like proposed
patch is necessary.

I added restore_command since
pg_rewind-removes-wal-segments-reproduce.sh failed to catch up
even after applying v3 patch and prevent pg_rewind from delete
WALs(*), because some necessary WALs were archived.

It's not a problem we are discussing here, but I wanted to get
the script to work to the point where old primary could
successfully catch up to new primary.

(*)Specifically, running the script without apply the patch,
recovery failed because 00010003 which has
already been removed. This file was deleted by pg_rewind as
we know.
OTHO without the restore_command, recovery failed because
00020007 has already been removed even after
applying the patch.


The problem we want ot address is that pg_rewind ultimately
removes certain crucial WAL files required for the new primary to
start, despite them being present previously.


I thought it's not "new primary", but "old primary".


In other words, that
restore_command works, but it only undoes what pg_rewind wrongly did,
resulting in unnecessary consupmtion of I/O and/or network bandwidth
that essentially serves no purpose.


As far as I tested using the script and the situation we are facing,
after promoting newprim necessary WAL(00010003..) were
not available and just adding restore_command did not solve the 
problem.



pg_rewind already has a feature that determines how each file should
be handled, but it is currently making wrong dicisions for WAL
files. The goal here is to rectify this behavior and ensure that
pg_rewind makes the right decisions.


+1


--
Regards,

--
Atsushi Torikoshi
NTT DATA CORPORATION




Re: pg_rewind WAL segments deletion pitfall

2023-06-29 Thread torikoshia

On 2023-06-29 10:25, Kyotaro Horiguchi wrote:
Thanks for the comment!


At Wed, 28 Jun 2023 22:28:13 +0900, torikoshia
 wrote in


On 2022-09-29 17:18, Polina Bungina wrote:
> I agree with your suggestions, so here is the updated version of
> patch. Hope I haven't missed anything.
> Regards,
> Polina Bungina

Thanks for working on this!
It seems like we are also facing the same issue.


Thanks for looking this.


I tested the v3 patch under our condition, old primary has succeeded
to become new standby.


BTW when I used pg_rewind-removes-wal-segments-reproduce.sh attached
in [1], old primary also failed to become standby:

  FATAL: could not receive data from WAL stream: ERROR: requested WAL
  segment 00020007 has already been removed

However, I think this is not a problem: just adding restore_command
like below fixed the situation.

  echo "restore_command = '/bin/cp `pwd`/newarch/%f %p'" >>
  oldprim/postgresql.conf


I thought on the same line at first, but that's not the point
here.


Yes. I don't think adding restore_command solves the problem and
modification to prevent deleting necessary WAL like proposed
patch is necessary.

I added restore_command since
pg_rewind-removes-wal-segments-reproduce.sh failed to catch up
even after applying v3 patch and prevent pg_rewind from delete
WALs(*), because some necessary WALs were archived.

It's not a problem we are discussing here, but I wanted to get
the script to work to the point where old primary could
successfully catch up to new primary.

(*)Specifically, running the script without apply the patch,
recovery failed because 00010003 which has
already been removed. This file was deleted by pg_rewind as
we know.
OTHO without the restore_command, recovery failed because
00020007 has already been removed even after
applying the patch.


The problem we want ot address is that pg_rewind ultimately
removes certain crucial WAL files required for the new primary to
start, despite them being present previously.


I thought it's not "new primary", but "old primary".


In other words, that
restore_command works, but it only undoes what pg_rewind wrongly did,
resulting in unnecessary consupmtion of I/O and/or network bandwidth
that essentially serves no purpose.


As far as I tested using the script and the situation we are facing,
after promoting newprim necessary WAL(00010003..) were
not available and just adding restore_command did not solve the problem.


pg_rewind already has a feature that determines how each file should
be handled, but it is currently making wrong dicisions for WAL
files. The goal here is to rectify this behavior and ensure that
pg_rewind makes the right decisions.


+1


--
Regards,

--
Atsushi Torikoshi
NTT DATA CORPORATION




Re: pg_rewind WAL segments deletion pitfall

2023-06-28 Thread Kyotaro Horiguchi
At Wed, 28 Jun 2023 22:28:13 +0900, torikoshia  
wrote in 
> 
> On 2022-09-29 17:18, Polina Bungina wrote:
> > I agree with your suggestions, so here is the updated version of
> > patch. Hope I haven't missed anything.
> > Regards,
> > Polina Bungina
> 
> Thanks for working on this!
> It seems like we are also facing the same issue.

Thanks for looking this.

> I tested the v3 patch under our condition, old primary has succeeded
> to become new standby.
> 
> 
> BTW when I used pg_rewind-removes-wal-segments-reproduce.sh attached
> in [1], old primary also failed to become standby:
> 
>   FATAL: could not receive data from WAL stream: ERROR: requested WAL
>   segment 00020007 has already been removed
> 
> However, I think this is not a problem: just adding restore_command
> like below fixed the situation.
> 
>   echo "restore_command = '/bin/cp `pwd`/newarch/%f %p'" >>
>   oldprim/postgresql.conf

I thought on the same line at first, but that's not the point
here. The problem we want ot address is that pg_rewind ultimately
removes certain crucial WAL files required for the new primary to
start, despite them being present previously. In other words, that
restore_command works, but it only undoes what pg_rewind wrongly did,
resulting in unnecessary consupmtion of I/O and/or network bandwidth
that essentially serves no purpose.

pg_rewind already has a feature that determines how each file should
be handled, but it is currently making wrong dicisions for WAL
files. The goal here is to rectify this behavior and ensure that
pg_rewind makes the right decisions.

> Attached modified reproduction script for reference.
> 
> [1]https://www.postgresql.org/message-id/CAFh8B%3DnNiFZOAPsv49gffxHBPzwmZ%3D6Msd4miMis87K%3Dd9rcRA%40mail.gmail.com

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2023-06-28 Thread torikoshia


On 2022-09-29 17:18, Polina Bungina wrote:

I agree with your suggestions, so here is the updated version of
patch. Hope I haven't missed anything.

Regards,
Polina Bungina


Thanks for working on this!
It seems like we are also facing the same issue.

I tested the v3 patch under our condition, old primary has succeeded to 
become new standby.



BTW when I used pg_rewind-removes-wal-segments-reproduce.sh attached in 
[1], old primary also failed to become standby:


  FATAL:  could not receive data from WAL stream: ERROR:  requested WAL 
segment 00020007 has already been removed


However, I think this is not a problem:  just adding restore_command 
like below fixed the situation.


  echo "restore_command = '/bin/cp `pwd`/newarch/%f %p'" >> 
oldprim/postgresql.conf


Attached modified reproduction script for reference.

[1]https://www.postgresql.org/message-id/CAFh8B%3DnNiFZOAPsv49gffxHBPzwmZ%3D6Msd4miMis87K%3Dd9rcRA%40mail.gmail.com


--
Regards,

--
Atsushi Torikoshi
NTT DATA CORPORATIONmkdir newarch oldarch
initdb -k -D oldprim
echo "archive_mode = 'on'">> oldprim/postgresql.conf
echo "archive_command = 'echo "archive %f" >&2; cp %p `pwd`/oldarch/%f'">> 
oldprim/postgresql.conf
pg_ctl -D oldprim -o '-p 5432' -l oldprim.log start
psql -p 5432 -c 'create table t(a int)'
pg_basebackup -D newprim -p 5432
echo "primary_conninfo='host=/tmp port=5432'">> newprim/postgresql.conf
echo "archive_command = 'echo "archive %f" >&2; cp %p `pwd`/newarch/%f'">> 
newprim/postgresql.conf
touch newprim/standby.signal
pg_ctl -D newprim -o '-p 5433' -l newprim.log start

# the last common checkpoint
psql -p 5432 -c 'checkpoint'

# old primary cannot archive any more
echo "archive_command = 'false'">> oldprim/postgresql.conf
pg_ctl -D oldprim reload
# advance WAL on the old primary; four WAL segments will never make it to the 
archive
for i in $(seq 1 4); do psql -p 5432 -c 'insert into t values(0); select 
pg_switch_wal();'; done

# record approx. diverging WAL segment
start_wal=`psql -p 5432 -Atc "select pg_walfile_name(pg_last_wal_replay_lsn() - 
(select setting from pg_settings where name = 'wal_segment_size')::int);"`
pg_ctl -D newprim promote

# old rprimary loses diverging WAL segment
for i in $(seq 1 4); do psql -p 5432 -c 'insert into t values(0); select 
pg_switch_wal();'; done
psql -p 5432 -c 'checkpoint;'
psql -p 5433 -c 'checkpoint;'

pg_ctl -D oldprim stop

# rewind the old primary, using its own archive
# pg_rewind -D oldprim --source-server='port=5433' # should fail
echo "restore_command = 'echo "restore %f" >&2; cp `pwd`/oldarch/%f %p'">> 
oldprim/postgresql.conf
pg_rewind -D oldprim --source-server='port=5433' -c

# advance WAL on the old primary; new primary loses the launching WAL seg
for i in $(seq 1 4); do psql -p 5433 -c 'insert into t values(0); select 
pg_switch_wal();'; done
psql -p 5433 -c 'checkpoint'
echo "primary_conninfo='host=/tmp port=5433'">> oldprim/postgresql.conf
touch oldprim/standby.signal
echo "restore_command = '/bin/cp `pwd`/newarch/%f %p'" >> 
oldprim/postgresql.conf

postgres -D oldprim  # fails with "WAL file has been removed"

## The alternative of copying-in
## echo "restore_command = 'echo "restore %f" >&2; cp `pwd`/newarch/%f %p'">> 
oldprim/postgresql.conf
#
## copy-in WAL files from new primary's archive to old primary
#(cd newarch;
#for f in `ls`; do
#  if [[ "$f" > "$start_wal" ]]; then echo copy $f; cp $f ../oldprim/pg_wal; fi
#done)
#
#postgres -D oldprim  # also fails with "requested WAL segment XXX has already 
been removed"


Re: pg_rewind WAL segments deletion pitfall

2022-09-29 Thread Polina Bungina
I agree with your suggestions, so here is the updated version of patch.
Hope I haven't missed anything.

Regards,
Polina Bungina


v3-0001-pg_rewind-wal-deletion.patch
Description: Binary data


Re: pg_rewind WAL segments deletion pitfall

2022-09-28 Thread Kyotaro Horiguchi
At Wed, 28 Sep 2022 10:09:05 +0200, Polina Bungina  wrote in 
> On Tue, Sep 27, 2022 at 9:50 AM Kyotaro Horiguchi 
> wrote:
> 
> > Regarding the the patch, pg_rewind starts reading segments from the
> > divergence point back to the nearest checkpoint, then moves foward
> > during rewinding. So, the fact that SimpleXLogPageRead have read a
> > segment suggests that the segment is required during the next startup.
> > So I don't think we need to move around the keepWalSeg flag.  All
> > files that are wanted while rewinding should be preserved
> > unconditionally.
> >
> 
> I am probably not getting this right but as far as I see SimpleXLogPageRead
> is called at most 3 times during pg_rewind run:
> 1. From readOneRecord to determine the end-of-WAL on the target by reading
> the last shutdown checkpoint record/minRecoveryPoint on it
> 2. From findLastCheckpoint to find last common checkpoint (here it
> indeed reads all the segments that are required during the startup, hence
> the keepWalSeg flag set to true)
> 3. From extractPageMap to extract all the pages modified after the fork
> (here we also read all the segments that should be kept but also the ones
> further, until the target's end record. Doesn't seem we should
> unconditionally preserve them all).
> Am I missing something?

No. You're right. I have to admit that I was confused at the time X(,
sorry for that.  Those extra files are I believe harmless but of
course it's preferable to avoid them. So the keepWalSeg is useful.

So the latest version become very similar to v1 in that the both have
keepWalSeg flag. The difference is the need of the file name hash.  I
still think that it's better if we don't need the additional file
hash.  If we move out the bare code in v2 added to
SimpleXLogPageRead(), then name it "preserve_file(char *filepath)",
the code become more easy to read.

+   if (private->keepWalSeg)
+   {
+   /* the caller told us to preserve this file for future 
use */
+   snprintf(xlogfpath, MAXPGPATH, XLOGDIR "/%s", 
xlogfname);
+   preserve_file(xlogfpath);
+   }

Instead, I think we should add a comment here:

@@ -192,6 +195,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, 
int tliIndex,
 
 >  private.tliIndex = tliIndex;
 >  private.restoreCommand = restoreCommand;
++  /*
++   * WAL files read during searching for the last checkpoint are required
++   * by the next startup recovery of the target cluster.
++  */
 >+ private.keepWalSeg = true;

What do you think about the above?

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2022-09-28 Thread Polina Bungina
On Tue, Sep 27, 2022 at 9:50 AM Kyotaro Horiguchi 
wrote:

> Regarding the the patch, pg_rewind starts reading segments from the
> divergence point back to the nearest checkpoint, then moves foward
> during rewinding. So, the fact that SimpleXLogPageRead have read a
> segment suggests that the segment is required during the next startup.
> So I don't think we need to move around the keepWalSeg flag.  All
> files that are wanted while rewinding should be preserved
> unconditionally.
>

I am probably not getting this right but as far as I see SimpleXLogPageRead
is called at most 3 times during pg_rewind run:
1. From readOneRecord to determine the end-of-WAL on the target by reading
the last shutdown checkpoint record/minRecoveryPoint on it
2. From findLastCheckpoint to find last common checkpoint (here it
indeed reads all the segments that are required during the startup, hence
the keepWalSeg flag set to true)
3. From extractPageMap to extract all the pages modified after the fork
(here we also read all the segments that should be kept but also the ones
further, until the target's end record. Doesn't seem we should
unconditionally preserve them all).
Am I missing something?



> +   /*
> +* Some entries (WAL segments) already have an action
> assigned
> +* (see SimpleXLogPageRead()).
> +*/
> +   if (entry->action == FILE_ACTION_NONE)
> +   continue;
> entry->action = decide_file_action(entry);

It might be more reasonable to call decide_file_action() when action
> is UNDECIDED.
>

Agree, will change this part.

Regards,
Polina Bungina


Re: pg_rewind WAL segments deletion pitfall

2022-09-27 Thread Kyotaro Horiguchi
At Thu, 1 Sep 2022 13:33:09 +0200, Polina Bungina  wrote in 
> Here is the new version of the patch that includes the changes you
> suggested. It is smaller now but I doubt if it is as easy to understand as
> it used to be.

pg_rewind works in two steps. First it constructs file map which
decides the action for each file, then second, it performs file
operations according to the file map. So, if we are going to do
something on some files, that action should be record that in the file
map, I think.

Regarding the the patch, pg_rewind starts reading segments from the
divergence point back to the nearest checkpoint, then moves foward
during rewinding. So, the fact that SimpleXLogPageRead have read a
segment suggests that the segment is required during the next startup.
So I don't think we need to move around the keepWalSeg flag.  All
files that are wanted while rewinding should be preserved
unconditionally.

It's annoying that the file path for file map and open(2) have
different top directory. But sharing the same path string between the
two seems rather ugly..

I feel uncomfortable to directly touch the internal of file_entry_t
outside filemap.c. I'd like to hide the internals in filemap.c, but
pg_rewind already does that..

+   /*
+* Some entries (WAL segments) already have an action assigned
+* (see SimpleXLogPageRead()).
+*/
+   if (entry->action == FILE_ACTION_NONE)
+   continue;
entry->action = decide_file_action(entry);

It might be more reasonable to call decide_file_action() when action
is UNDECIDED.

> The need of manipulations with the target’s pg_wal/archive_status directory
> is a question to discuss…
>
> At first glance it seems to be useless for .ready files: checkpointer
> process will anyway recreate them if archiving is enabled on the rewound
> old primary and we will finally have them in the archive. As for the .done
> files, it seems reasonable to follow the pg_basebackup logic and keep .done
> files together with the corresponding segments (those between the last
> common checkpoint and the point of divergence) to protect them from being
> archived once again.
> 
> But on the other hand it seems to be not that straightforward: imaging we
> have WAL segment X on the target along with X.done file and we decide to
> preserve them both (or we download it from archive and force .done file
> creation), while archive_mode was set to ‘always’ and the source (promoted
> replica) also still has WAL segment X and X.ready file. After pg_rewind we
> will end up with both X.ready and X.done, which seems to be not a good
> situation (but most likely not critical either).

Thanks for the thought. Yes, it's not so straight-forward. And, as you
mentioned, the worst result comes from not doing that is that some
already-archived segments are archived at next run, which is generally
harmless. So I think we're ok to ignore that in this patdh then create
other patch if we still want to do that.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2022-09-26 Thread Alexander Kukushkin
Hello Kyotaro,

any further thoughts on it?

Regards,
--
Alexander Kukushkin


Re: pg_rewind WAL segments deletion pitfall

2022-09-01 Thread Polina Bungina
Terribly sorry for misspelling your name and for the topposting!

Regards,
Polina Bungina


Re: pg_rewind WAL segments deletion pitfall

2022-09-01 Thread Polina Bungina
Hello Kayotaro,


Here is the new version of the patch that includes the changes you
suggested. It is smaller now but I doubt if it is as easy to understand as
it used to be.


The need of manipulations with the target’s pg_wal/archive_status directory
is a question to discuss…

At first glance it seems to be useless for .ready files: checkpointer
process will anyway recreate them if archiving is enabled on the rewound
old primary and we will finally have them in the archive. As for the .done
files, it seems reasonable to follow the pg_basebackup logic and keep .done
files together with the corresponding segments (those between the last
common checkpoint and the point of divergence) to protect them from being
archived once again.

But on the other hand it seems to be not that straightforward: imaging we
have WAL segment X on the target along with X.done file and we decide to
preserve them both (or we download it from archive and force .done file
creation), while archive_mode was set to ‘always’ and the source (promoted
replica) also still has WAL segment X and X.ready file. After pg_rewind we
will end up with both X.ready and X.done, which seems to be not a good
situation (but most likely not critical either).


Regards,

Polina Bungina


On Wed, Aug 31, 2022 at 7:30 AM Kyotaro Horiguchi 
wrote:

> At Tue, 30 Aug 2022 11:01:58 +0200, Alexander Kukushkin <
> cyberd...@gmail.com> wrote in
> > On Tue, 30 Aug 2022 at 10:27, Kyotaro Horiguchi  >
> > wrote:
> >
> > >
> > > Hmm. Doesn't it work to ignoring tli then? All segments that their
> > > segment number is equal to or larger than the checkpoint locaiton are
> > > preserved regardless of TLI?
> > >
> >
> > If we ignore TLI there is a chance that we may retain some unnecessary
> (or
> > just wrong) files.
>
> Right. I mean I don't think thats a problem and we can rely on
> postgres itself for later cleanup. Theoretically some out-of-range tli
> or segno files are left alone but they surely will be gone soon after
> the server starts.
>
> > > > Also, we need to take into account the divergency LSN. Files after
> it are
> > > > not required.
> > >
> > > They are removed at the later checkpoints. But also we can remove
> > > segments that are out of the range between the last common checkpoint
> > > and divergence point ignoring TLI.
> >
> >
> > Everything that is newer last_common_checkpoint_seg could be removed (but
> > it already happens automatically, because these files are missing on the
> > new primary).
> > WAL files that are older than last_common_checkpoint_seg could be either
> > removed or at least not copied from the new primary.
> ..
> > The current implementation relies on tracking WAL files being open while
> > searching for the last common checkpoint. It automatically starts from
> the
> > divergence_seg, automatically finishes at last_common_checkpoint_seg, and
> > last but not least, automatically handles timeline changes. I don't think
> > that manually written code that decides what to do from the WAL file name
> > (and also takes into account TLI) could be much simpler than the current
> > approach.
>
> Yeah, I know.  My expectation is taking the simplest way for the same
> effect. My concern was the additional hash. On second thought, I
> concluded that we should that on the existing filehash.
>
> We can just add a FILE_ACTION_NONE entry to the file hash from
> SimpleXLogPageRead.  Since this happens before decide_file_action()
> call, decide_file_action() should ignore the entries with
> FILE_ACTION_NONE. Also we need to call filehash_init() earlier.
>
> > Actually, since we start doing some additional "manipulations" with files
> > in pg_wal, we probably should do a symmetric action with files inside
> > pg_wal/archive_status
>
> In that sense, pg_rewind rather should place missing
> archive_status/*.done for segments including restored ones seen while
> finding checkpoint.  This is analogous of the behavior with
> pg_basebackup and pg_receivewal. Also we should add FILE_ACTION_NONE
> entries for .done files for segments read while finding checkpoint.
>
> What do you think about that?
>
> regards.
>
> --
> Kyotaro Horiguchi
> NTT Open Source Software Center
>


v2-0001-pg_rewind-wal-deletion.patch
Description: Binary data


Re: pg_rewind WAL segments deletion pitfall

2022-08-30 Thread Kyotaro Horiguchi
At Wed, 31 Aug 2022 14:30:31 +0900 (JST), Kyotaro Horiguchi 
 wrote in 
> What do you think about that?

By the way don't you add an CF entry for this?

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2022-08-30 Thread Kyotaro Horiguchi
At Tue, 30 Aug 2022 11:01:58 +0200, Alexander Kukushkin  
wrote in 
> On Tue, 30 Aug 2022 at 10:27, Kyotaro Horiguchi 
> wrote:
> 
> >
> > Hmm. Doesn't it work to ignoring tli then? All segments that their
> > segment number is equal to or larger than the checkpoint locaiton are
> > preserved regardless of TLI?
> >
> 
> If we ignore TLI there is a chance that we may retain some unnecessary (or
> just wrong) files.

Right. I mean I don't think thats a problem and we can rely on
postgres itself for later cleanup. Theoretically some out-of-range tli
or segno files are left alone but they surely will be gone soon after
the server starts.

> > > Also, we need to take into account the divergency LSN. Files after it are
> > > not required.
> >
> > They are removed at the later checkpoints. But also we can remove
> > segments that are out of the range between the last common checkpoint
> > and divergence point ignoring TLI.
> 
> 
> Everything that is newer last_common_checkpoint_seg could be removed (but
> it already happens automatically, because these files are missing on the
> new primary).
> WAL files that are older than last_common_checkpoint_seg could be either
> removed or at least not copied from the new primary.
..
> The current implementation relies on tracking WAL files being open while
> searching for the last common checkpoint. It automatically starts from the
> divergence_seg, automatically finishes at last_common_checkpoint_seg, and
> last but not least, automatically handles timeline changes. I don't think
> that manually written code that decides what to do from the WAL file name
> (and also takes into account TLI) could be much simpler than the current
> approach.

Yeah, I know.  My expectation is taking the simplest way for the same
effect. My concern was the additional hash. On second thought, I
concluded that we should that on the existing filehash.

We can just add a FILE_ACTION_NONE entry to the file hash from
SimpleXLogPageRead.  Since this happens before decide_file_action()
call, decide_file_action() should ignore the entries with
FILE_ACTION_NONE. Also we need to call filehash_init() earlier.

> Actually, since we start doing some additional "manipulations" with files
> in pg_wal, we probably should do a symmetric action with files inside
> pg_wal/archive_status

In that sense, pg_rewind rather should place missing
archive_status/*.done for segments including restored ones seen while
finding checkpoint.  This is analogous of the behavior with
pg_basebackup and pg_receivewal. Also we should add FILE_ACTION_NONE
entries for .done files for segments read while finding checkpoint.

What do you think about that?

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2022-08-30 Thread Alexander Kukushkin
On Tue, 30 Aug 2022 at 10:27, Kyotaro Horiguchi 
wrote:

>
> Hmm. Doesn't it work to ignoring tli then? All segments that their
> segment number is equal to or larger than the checkpoint locaiton are
> preserved regardless of TLI?
>

If we ignore TLI there is a chance that we may retain some unnecessary (or
just wrong) files.


>
> > Also, we need to take into account the divergency LSN. Files after it are
> > not required.
>
> They are removed at the later checkpoints. But also we can remove
> segments that are out of the range between the last common checkpoint
> and divergence point ignoring TLI.


Everything that is newer last_common_checkpoint_seg could be removed (but
it already happens automatically, because these files are missing on the
new primary).
WAL files that are older than last_common_checkpoint_seg could be either
removed or at least not copied from the new primary.



> the divergence point is also
> compared?
>
> > if (file_segno >= last_common_checkpoint_seg &&
> > file_segno <= divergence_seg)
> > ;
>

The current implementation relies on tracking WAL files being open while
searching for the last common checkpoint. It automatically starts from the
divergence_seg, automatically finishes at last_common_checkpoint_seg, and
last but not least, automatically handles timeline changes. I don't think
that manually written code that decides what to do from the WAL file name
(and also takes into account TLI) could be much simpler than the current
approach.


Actually, since we start doing some additional "manipulations" with files
in pg_wal, we probably should do a symmetric action with files inside
pg_wal/archive_status

Regards,
--
Alexander Kukushkin


Re: pg_rewind WAL segments deletion pitfall

2022-08-30 Thread Kyotaro Horiguchi
At Tue, 30 Aug 2022 10:03:07 +0200, Alexander Kukushkin  
wrote in 
> On Tue, 30 Aug 2022 at 09:51, Kyotaro Horiguchi 
> wrote:
> > And as I said in a mail I sent just before, the patch looks too
> > complex.  How about just comparing WAL file name aginst the last
> > common checkpoint's tli and lsn? We can tell filemap.c about the last
> > checkpoint and decide_file_action can compare the file name with it.
> >
> > It is sufficient to preserve WAL files if tli matches and the segment
> > number of the WAL file is equal to or later than the checkpoint
> > location.
> >
> 
> What if the last common checkpoint was on a previous timeline?
> I.e., standby was promoted to primary, the timeline changed from 1 to 2,
> and after that the node crashed _before_ the CHECKPOINT after promote has
> finished.
> The next node will advance the timeline from 2 to 3.
> In this case, the last common checkpoint will be on timeline 1, and the
> check becomes more complex because we will have to consider both timelines,
> 1 and 2.

Hmm. Doesn't it work to ignoring tli then? All segments that their
segment number is equal to or larger than the checkpoint locaiton are
preserved regardless of TLI?

> Also, we need to take into account the divergency LSN. Files after it are
> not required.

They are removed at the later checkpoints. But also we can remove
segments that are out of the range between the last common checkpoint
and divergence point ignoring TLI.  the divergence point is also
compared?

> if (file_segno >= last_common_checkpoint_seg &&
> file_segno <= divergence_seg)
> ;


regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2022-08-30 Thread Alexander Kukushkin
Hello Kyotaro,


On Tue, 30 Aug 2022 at 09:51, Kyotaro Horiguchi 
wrote:

>
>
> And as I said in a mail I sent just before, the patch looks too
> complex.  How about just comparing WAL file name aginst the last
> common checkpoint's tli and lsn? We can tell filemap.c about the last
> checkpoint and decide_file_action can compare the file name with it.
>
> It is sufficient to preserve WAL files if tli matches and the segment
> number of the WAL file is equal to or later than the checkpoint
> location.
>

What if the last common checkpoint was on a previous timeline?
I.e., standby was promoted to primary, the timeline changed from 1 to 2,
and after that the node crashed _before_ the CHECKPOINT after promote has
finished.
The next node will advance the timeline from 2 to 3.
In this case, the last common checkpoint will be on timeline 1, and the
check becomes more complex because we will have to consider both timelines,
1 and 2.

Also, we need to take into account the divergency LSN. Files after it are
not required.

Regards,
--
Alexander Kukushkin


Re: pg_rewind WAL segments deletion pitfall

2022-08-30 Thread Kyotaro Horiguchi
At Tue, 30 Aug 2022 08:49:27 +0200, Alexander Kukushkin  
wrote in 
> No, we are complaining exactly about WAL segments from the old timeline
> that are removed by pg_rewind.
> Those segments haven't been archived by the old primary and the new primary
> already recycled them.

Yeah, sorry for my thick skull but I finally got your point.

And as I said in a mail I sent just before, the patch looks too
complex.  How about just comparing WAL file name aginst the last
common checkpoint's tli and lsn? We can tell filemap.c about the last
checkpoint and decide_file_action can compare the file name with it.

It is sufficient to preserve WAL files if tli matches and the segment
number of the WAL file is equal to or later than the checkpoint
location.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2022-08-30 Thread Kyotaro Horiguchi
At Tue, 30 Aug 2022 14:50:26 +0900 (JST), Kyotaro Horiguchi 
 wrote in 
> IFAIS pg_rewind doesn't. -c option contrarily restores the all
> segments after the last (common) checkpoint and all of them are left
> alone after pg_rewind finishes.  postgres itself removes the WAL files
> after recovery. After-promotion cleanup and checkpoint revmoes the
> files on the previous timeline.
> 
> Before pg_rewind runs in the repro below, the old primary has the
> following segments.
> 
> TLI1:  2 8 9 A B C D
> 
> Just after pg_rewind finishes, the old primary has the following
> segments.
> 
> TLI1:  2 3   5 6 7
> TLI2:  4(and 0002.history)
> 
> pg_rewind copied 1-2 to 1-3 and 2-4 and history file from the new
1> primary, 1-4 to 1-7 from archive. After rewind finished, 1-4,1-8 to
> 1-D have been removed since the new primary didn't have them.
> 
> Recovery starts from 1-3 and promotes at 0/4_00. postgres removes
> 1-5 to 1-7 by post-promotion cleanup and removes 1-2 to 1-4 by a
> restartpoint. All of the segments are useless after the old primary
> promotes.
> 
> When the old primary starts, it uses 1-3 and 2-4 for recovery and
> fails to fetch 2-5 from the new primary.  But it is not an issue of
> pg_rewind at all.

Ah. I think I understand what you are mentioning. If the new primary
didn't have the segment 1-3 to 1-6, pg_rewind removes it.  The new
primary doesn't have it in pg_wal nor in archive. The old primary has
it in its archive.  So get out from the situation, we need to the
following *two* things before the old primary can start:

1. copy 1-3 to 1-6 from the archive of the *old* primary
2. copy 2-7 and later from the archive of the *new* primary

Since pg_rewind have copied in to the old primary's pg_wal, removing them just 
have users to perform the task duplicatedly, as you stated.

Okay, I completely understand the problem and convinced that it is
worth changing the behavior.

However, the proposed patch looks too complex to me.  It can be done
by just comparing xlog file name and the last checkpoint location and
TLI in decide_file_actions().

regards.

=
# killall -9 postgres
# rm -r oldprim newprim oldarch newarch oldprim.log newprim.log
mkdir newarch oldarch
initdb -k -D oldprim
echo "archive_mode = 'on'">> oldprim/postgresql.conf
echo "archive_command = 'echo "archive %f" >&2; cp %p `pwd`/oldarch/%f'">> 
oldprim/postgresql.conf
pg_ctl -D oldprim -o '-p 5432' -l oldprim.log start
psql -p 5432 -c 'create table t(a int)'
pg_basebackup -D newprim -p 5432
echo "primary_conninfo='host=/tmp port=5432'">> newprim/postgresql.conf
echo "archive_command = 'echo "archive %f" >&2; cp %p `pwd`/newarch/%f'">> 
newprim/postgresql.conf
touch newprim/standby.signal
pg_ctl -D newprim -o '-p 5433' -l newprim.log start

# the last common checkpoint
psql -p 5432 -c 'checkpoint'

# record approx. diverging WAL segment
start_wal=`psql -p 5433 -Atc "select pg_walfile_name(pg_last_wal_replay_lsn() - 
(select setting from pg_settings where name = 'wal_segment_size')::int);"`
for i in $(seq 1 5); do psql -p 5432 -c 'insert into t values(0); select 
pg_switch_wal();'; done
psql -p 5432 -c 'checkpoint'
pg_ctl -D newprim promote

# old rprimary loses diverging WAL segment
for i in $(seq 1 4); do psql -p 5432 -c 'insert into t values(0); select 
pg_switch_wal();'; done
psql -p 5432 -c 'checkpoint;'
psql -p 5433 -c 'checkpoint;'

# old primary cannot archive any more
echo "archive_command = 'false'">> oldprim/postgresql.conf
pg_ctl -D oldprim reload
pg_ctl -D oldprim stop

# rewind the old primary, using its own archive
# pg_rewind -D oldprim --source-server='port=5433' # should fail
echo "restore_command = 'echo "restore %f" >&2; cp `pwd`/oldarch/%f %p'">> 
oldprim/postgresql.conf
pg_rewind -D oldprim --source-server='port=5433' -c

# advance WAL on the old primary; new primary loses the launching WAL seg
for i in $(seq 1 4); do psql -p 5433 -c 'insert into t values(0); select 
pg_switch_wal();'; done
psql -p 5433 -c 'checkpoint'
echo "primary_conninfo='host=/tmp port=5433'">> oldprim/postgresql.conf
touch oldprim/standby.signal

 copy the missing file of the old timeline
## cp oldarch/0001000[3456] oldprim/pg_wal
## cp newarch/0002000* oldprim/pg_wal

postgres -D oldprim  # fails with "WAL file has been removed"


# The alternative of copying-in
# echo "restore_command = 'echo "restore %f" >&2; cp `pwd`/newarch/%f %p'">> 
oldprim/postgresql.conf

# copy-in WAL files from new primary's archive to old primary
(cd newarch;
for f in `ls`; do
  if [[ "$f" > "$start_wal" ]]; then echo copy $f; cp $f ../oldprim/pg_wal; fi
done)

postgres -D oldprim
=

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: pg_rewind WAL segments deletion pitfall

2022-08-29 Thread Alexander Kukushkin
>
>
> I did a slight modification of your script that reproduces a problem.
>
>
> 
>

It seems that formatting damaged the script, so I better attach it as a
file.

Regards,
--
Alexander Kukushkin


pg_rewind-removes-wal-segments-reproduce.sh
Description: application/shellscript


Re: pg_rewind WAL segments deletion pitfall

2022-08-29 Thread Alexander Kukushkin
Hello Kyotaro,

On Tue, 30 Aug 2022 at 07:50, Kyotaro Horiguchi 
wrote:



> So, if I understand you correctly, the issue you are complaining is
> not about the WAL segments on the old timeline but about those on the
> new timeline, which don't have a business with what pg_rewind does. As
> the same with the case of pg_basebackup, the missing segments need to
> be somehow copied from the new primary since the old primary never had
> the chance to have them before.
>

No, we are complaining exactly about WAL segments from the old timeline
that are removed by pg_rewind.
Those segments haven't been archived by the old primary and the new primary
already recycled them.




>
> Thus I don't follow this..
>

I did a slight modification of your script that reproduces a problem.



mkdir newarch oldarch
initdb -k -D oldprim
echo "archive_mode = 'on'">> oldprim/postgresql.conf
echo "archive_command = 'echo "archive %f" >&2; cp %p `pwd`/oldarch/%f'">>
oldprim/postgresql.conf
pg_ctl -D oldprim -o '-p 5432' -l oldprim.log start
psql -p 5432 -c 'create table t(a int)'
pg_basebackup -D newprim -p 5432
echo "primary_conninfo='host=/tmp port=5432'">> newprim/postgresql.conf
echo "archive_command = 'echo "archive %f" >&2; cp %p `pwd`/newarch/%f'">>
newprim/postgresql.conf
touch newprim/standby.signal
pg_ctl -D newprim -o '-p 5433' -l newprim.log start

# the last common checkpoint
psql -p 5432 -c 'checkpoint'

# old primary cannot archive any more
echo "archive_command = 'false'">> oldprim/postgresql.conf
pg_ctl -D oldprim reload
# advance WAL on the old primary; four WAL segments will never make it to
the archive
for i in $(seq 1 4); do psql -p 5432 -c 'insert into t values(0); select
pg_switch_wal();'; done

# record approx. diverging WAL segment
start_wal=`psql -p 5432 -Atc "select
pg_walfile_name(pg_last_wal_replay_lsn() - (select setting from pg_settings
where name = 'wal_segment_size')::int);"`
pg_ctl -D newprim promote

# old rprimary loses diverging WAL segment
for i in $(seq 1 4); do psql -p 5432 -c 'insert into t values(0); select
pg_switch_wal();'; done
psql -p 5432 -c 'checkpoint;'
psql -p 5433 -c 'checkpoint;'

pg_ctl -D oldprim stop

# rewind the old primary, using its own archive
# pg_rewind -D oldprim --source-server='port=5433' # should fail
echo "restore_command = 'echo "restore %f" >&2; cp `pwd`/oldarch/%f %p'">>
oldprim/postgresql.conf
pg_rewind -D oldprim --source-server='port=5433' -c

# advance WAL on the old primary; new primary loses the launching WAL seg
for i in $(seq 1 4); do psql -p 5433 -c 'insert into t values(0); select
pg_switch_wal();'; done
psql -p 5433 -c 'checkpoint'
echo "primary_conninfo='host=/tmp port=5433'">> oldprim/postgresql.conf
touch oldprim/standby.signal

postgres -D oldprim  # fails with "WAL file has been removed"

# The alternative of copying-in
# echo "restore_command = 'echo "restore %f" >&2; cp `pwd`/newarch/%f
%p'">> oldprim/postgresql.conf

# copy-in WAL files from new primary's archive to old primary
(cd newarch;
for f in `ls`; do
  if [[ "$f" > "$start_wal" ]]; then echo copy $f; cp $f ../oldprim/pg_wal;
fi
done)

postgres -D oldprim  # also fails with "requested WAL segment XXX has
already been removed"
===

Regards,
--
Alexander Kukushkin


Re: pg_rewind WAL segments deletion pitfall

2022-08-29 Thread Kyotaro Horiguchi
Hello, Alex.

At Fri, 26 Aug 2022 10:57:25 +0200, Alexander Kukushkin  
wrote in 
> On Fri, 26 Aug 2022 at 10:04, Kyotaro Horiguchi 
> wrote:
> > What I don't still understand is why pg_rewind doesn't work for the
> > old primary in that case. When archive_mode=on, the old primary has
> > the complete set of WAL files counting both pg_wal and its archive. So
> > as the same to the privious repro, pg_rewind -c ought to work (but it
> > uses its own archive this time).  In that sense the proposed solution
> > is still not needed in this case.
> >
> 
> The pg_rewind finishes successfully. But as a result it removes some files
> from pg_wal that are required to perform recovery because they are missing
> on the new primary.

IFAIS pg_rewind doesn't. -c option contrarily restores the all
segments after the last (common) checkpoint and all of them are left
alone after pg_rewind finishes.  postgres itself removes the WAL files
after recovery. After-promotion cleanup and checkpoint revmoes the
files on the previous timeline.

Before pg_rewind runs in the repro below, the old primary has the
following segments.

TLI1:  2 8 9 A B C D

Just after pg_rewind finishes, the old primary has the following
segments.

TLI1:  2 3   5 6 7
TLI2:  4(and 0002.history)

pg_rewind copied 1-2 to 1-3 and 2-4 and history file from the new
primary, 1-4 to 1-7 from archive. After rewind finished, 1-4,1-8 to
1-D have been removed since the new primary didn't have them.

Recovery starts from 1-3 and promotes at 0/4_00. postgres removes
1-5 to 1-7 by post-promotion cleanup and removes 1-2 to 1-4 by a
restartpoint. All of the segments are useless after the old primary
promotes.

When the old primary starts, it uses 1-3 and 2-4 for recovery and
fails to fetch 2-5 from the new primary.  But it is not an issue of
pg_rewind at all.

> > A bit harder situation comes after the server successfully rewound; if
> > the new primary goes so far that the old primary cannot connect. Even
> > in that case, you can copy-in the requried WAL files or configure
> > restore_command of the old pimary so that it finds required WAL files
> > there.
> >
> 
> Yes, we can do the backup of pg_wal before running pg_rewind, but it feels

So, if I understand you correctly, the issue you are complaining is
not about the WAL segments on the old timeline but about those on the
new timeline, which don't have a business with what pg_rewind does. As
the same with the case of pg_basebackup, the missing segments need to
be somehow copied from the new primary since the old primary never had
the chance to have them before.

> very ugly, because we will also have to clean this "backup" after a
> successful recovery.

What do you mean by the "backup" here? Concretely what WAL segments do
you feel need to remove, for example, in the repro case?  Or, could
you show your issue by something like the repro below?

> It would be much better if pg_rewind didn't remove WAL files between the
> last common checkpoint and diverged LSN in the first place.

Thus I don't follow this..

regards.


(Fixed a bug and slightly modified)

# killall -9 postgres
# rm -r oldprim newprim oldarch newarch oldprim.log newprim.log
mkdir newarch oldarch
initdb -k -D oldprim
echo "archive_mode = 'on'">> oldprim/postgresql.conf
echo "archive_command = 'echo "archive %f" >&2; cp %p `pwd`/oldarch/%f'">> 
oldprim/postgresql.conf
pg_ctl -D oldprim -o '-p 5432' -l oldprim.log start
psql -p 5432 -c 'create table t(a int)'
pg_basebackup -D newprim -p 5432
echo "primary_conninfo='host=/tmp port=5432'">> newprim/postgresql.conf
echo "archive_command = 'echo "archive %f" >&2; cp %p `pwd`/newarch/%f'">> 
newprim/postgresql.conf
touch newprim/standby.signal
pg_ctl -D newprim -o '-p 5433' -l newprim.log start

# the last common checkpoint
psql -p 5432 -c 'checkpoint'

# record approx. diverging WAL segment
start_wal=`psql -p 5433 -Atc "select pg_walfile_name(pg_last_wal_replay_lsn() - 
(select setting from pg_settings where name = 'wal_segment_size')::int);"`
psql -p 5432 -c 'insert into t values(0); select pg_switch_wal();'
pg_ctl -D newprim promote

# old rprimary loses diverging WAL segment
for i in $(seq 1 4); do psql -p 5432 -c 'insert into t values(0); select 
pg_switch_wal();'; done
psql -p 5432 -c 'checkpoint;'
psql -p 5433 -c 'checkpoint;'

# old primary cannot archive any more
echo "archive_command = 'false'">> oldprim/postgresql.conf
pg_ctl -D oldprim reload
pg_ctl -D oldprim stop

# rewind the old primary, using its own archive
# pg_rewind -D oldprim --source-server='port=5433' # should fail
echo "restore_command = 'echo "restore %f" >&2; cp `pwd`/oldarch/%f %p'">> 
oldprim/postgresql.conf
pg_rewind -D oldprim --source-server='port=5433' -c

# advance WAL on the old primary; new primary loses the launching WAL seg
for i in $(seq 1 4); do psql -p 5433 -c 'insert into t values(0); select 
pg_switch_wal();'; done
psql -p 5433 -c 'checkpoint'
echo "primary_conninfo='host=/tmp port=5433'">> ol

Re: pg_rewind WAL segments deletion pitfall

2022-08-26 Thread Alexander Kukushkin
Hello Kyotaro,


On Fri, 26 Aug 2022 at 10:04, Kyotaro Horiguchi 
wrote:

> > With archive_mode = always you can't reproduce it.
> > It is very rarely people set it to always in production due to the
> overhead.
> ...
> > The archive_mode has to be set to on and the archive_command should be
> > failing when you do pg_ctl -D oldprim stop
>
> Ah, I see.
>
> What I don't still understand is why pg_rewind doesn't work for the
> old primary in that case. When archive_mode=on, the old primary has
> the complete set of WAL files counting both pg_wal and its archive. So
> as the same to the privious repro, pg_rewind -c ought to work (but it
> uses its own archive this time).  In that sense the proposed solution
> is still not needed in this case.
>

The pg_rewind finishes successfully. But as a result it removes some files
from pg_wal that are required to perform recovery because they are missing
on the new primary.



>
> A bit harder situation comes after the server successfully rewound; if
> the new primary goes so far that the old primary cannot connect. Even
> in that case, you can copy-in the requried WAL files or configure
> restore_command of the old pimary so that it finds required WAL files
> there.
>

Yes, we can do the backup of pg_wal before running pg_rewind, but it feels
very ugly, because we will also have to clean this "backup" after a
successful recovery.
It would be much better if pg_rewind didn't remove WAL files between the
last common checkpoint and diverged LSN in the first place.

Regards,
--
Alexander Kukushkin


Re: pg_rewind WAL segments deletion pitfall

2022-08-26 Thread Kyotaro Horiguchi
(Moved to -hackers)

At Thu, 25 Aug 2022 10:34:40 +0200, Alexander Kukushkin  
wrote in 
> > # killall -9 postgres
> > # rm -r oldprim newprim oldarch newarch oldprim.log newprim.log
> > mkdir newarch oldarch
> > initdb -k -D oldprim
> > echo "archive_mode = 'always'">> oldprim/postgresql.conf
> >
> 
> With archive_mode = always you can't reproduce it.
> It is very rarely people set it to always in production due to the overhead.
...
> The archive_mode has to be set to on and the archive_command should be
> failing when you do pg_ctl -D oldprim stop

Ah, I see.

What I don't still understand is why pg_rewind doesn't work for the
old primary in that case. When archive_mode=on, the old primary has
the complete set of WAL files counting both pg_wal and its archive. So
as the same to the privious repro, pg_rewind -c ought to work (but it
uses its own archive this time).  In that sense the proposed solution
is still not needed in this case.

A bit harder situation comes after the server successfully rewound; if
the new primary goes so far that the old primary cannot connect. Even
in that case, you can copy-in the requried WAL files or configure
restore_command of the old pimary so that it finds required WAL files
there.

As the result the system in total doesn't lose a WAL file.

So.. I might still be missing something..


###
# killall -9 postgres
# rm -r oldprim newprim oldarch newarch oldprim.log newprim.log
mkdir newarch oldarch
initdb -k -D oldprim
echo "archive_mode = 'on'">> oldprim/postgresql.conf
echo "archive_command = 'cp %p `pwd`/oldarch/%f'">> oldprim/postgresql.conf
pg_ctl -D oldprim -o '-p 5432' -l oldprim.log start
psql -p 5432 -c 'create table t(a int)'
pg_basebackup -D newprim -p 5432
echo "primary_conninfo='host=/tmp port=5432'">> newprim/postgresql.conf
echo "archive_command = 'cp %p `pwd`/newarch/%f'">> newprim/postgresql.conf
touch newprim/standby.signal
pg_ctl -D newprim -o '-p 5433' -l newprim.log start

# the last common checkpoint
psql -p 5432 -c 'checkpoint'

# record approx. diverging WAL segment
start_wal=`psql -p 5433 -Atc 'select pg_walfile_name(pg_last_wal_replay_lsn() - 
(select setting from pg_settings where name = 'wal_segment_size')::int);
`
psql -p 5432 -c 'insert into t values(0); select pg_switch_wal();'
pg_ctl -D newprim promote
psql -p 5433 -c 'checkpoint'

# old rprimary loses diverging WAL segment
for i in $(seq 1 4); do psql -p 5432 -c 'insert into t values(0); select 
pg_switch_wal();'; done

# old primary cannot archive any more
echo "archive_command = 'false'">> oldprim/postgresql.conf
pg_ctl -D oldprim reload
pg_ctl -D oldprim stop

# rewind the old primary, using its own archive
# pg_rewind -D oldprim --source-server='port=5433' # should fail
echo "restore_command = 'cp `pwd`/oldarch/%f %p'">> oldprim/postgresql.conf
pg_rewind -D oldprim --source-server='port=5433' -c

# advance WAL on the old primary; new primary loses the launching WAL seg
for i in $(seq 1 4); do psql -p 5433 -c 'insert into t values(0); select 
pg_switch_wal();'; done
psql -p 5433 -c 'checkpoint'
echo "primary_conninfo='host=/tmp port=5433'">> oldprim/postgresql.conf
touch oldprim/standby.signal

postgres -D oldprim  # fails with "WAL file has been removed"

# The alternative of copying-in
# echo "restore_command = 'cp `pwd`/newarch/%f %p'">> oldprim/postgresql.conf

# copy-in WAL files from new primary's archive to old primary
(cd newarch;
for f in `ls`; do
  if [[ "$f" > "$start_wal" ]]; then echo copy $f; cp $f ../oldprim/pg_wal; fi
done)

postgres -D oldprim

-- 
Kyotaro Horiguchi
NTT Open Source Software Center