Hi hackers! Some time ago I've seen a hanging logical replication that was trying to send transaction commit after doing table pg_repack. I understand that those things do not mix well. Yet walsender was ignoring pg_terminate_backend() and I think this worth fixing. Can we add CHECK_FOR_INTERRUPTS(); somewhere in this backtrace? Full session is attaches as file.
#0 pfree (pointer=0x561850bbee40) at ./build/../src/backend/utils/mmgr/mcxt.c:1032 #1 0x00005617712530d6 in ReorderBufferReturnTupleBuf (tuple=<optimized out>, rb=<optimized out>) at ./build/../src/backend/replication/logical/reorderbuffer.c:469 #2 ReorderBufferReturnChange (rb=<optimized out>, change=0x561772456048) at ./build/../src/backend/replication/logical/reorderbuffer.c:398 #3 0x0000561771253da1 in ReorderBufferRestoreChanges (rb=rb@entry=0x561771c14e10, txn=0x561771c0b078, file=file@entry=0x561771c15168, segno=segno@entry=0x561771c15178) at ./build/../src/backend/replication/logical/reorderbuffer.c:2570 #4 0x00005617712553ba in ReorderBufferIterTXNNext (state=0x561771c15130, rb=0x561771c14e10) at ./build/../src/backend/replication/logical/reorderbuffer.c:1146 #5 ReorderBufferCommit (rb=0x561771c14e10, xid=xid@entry=2976347782, commit_lsn=79160378448744, end_lsn=<optimized out>, commit_time=commit_time@entry=686095734290578, origin_id=origin_id@entry=0, origin_lsn=0) at ./build/../src/backend/replication/logical/reorderbuffer.c:1523 #6 0x000056177124a30a in DecodeCommit (xid=2976347782, parsed=0x7ffc3cb4c240, buf=0x7ffc3cb4c400, ctx=0x561771b10850) at ./build/../src/backend/replication/logical/decode.c:640 #7 DecodeXactOp (ctx=0x561771b10850, buf=buf@entry=0x7ffc3cb4c400) at ./build/../src/backend/replication/logical/decode.c:248 #8 0x000056177124a6a9 in LogicalDecodingProcessRecord (ctx=0x561771b10850, record=0x561771b10ae8) at ./build/../src/backend/replication/logical/decode.c:117 #9 0x000056177125d1e5 in XLogSendLogical () at ./build/../src/backend/replication/walsender.c:2893 #10 0x000056177125f5f2 in WalSndLoop (send_data=send_data@entry=0x56177125d180 <XLogSendLogical>) at ./build/../src/backend/replication/walsender.c:2242 #11 0x0000561771260125 in StartLogicalReplication (cmd=<optimized out>) at ./build/../src/backend/replication/walsender.c:1179 #12 exec_replication_command (cmd_string=cmd_string@entry=0x561771abe590 "START_REPLICATION SLOT dttsjtaa66crdhbm015h LOGICAL 0/0 ( \"include-timestamp\" '1', \"include-types\" '1', \"include-xids\" '1', \"write-in-chunks\" '1', \"add-tables\" '/* sanitized */.claim_audit,public.__consu"...) at ./build/../src/backend/replication/walsender.c:1612 #13 0x00005617712b2334 in PostgresMain (argc=<optimized out>, argv=argv@entry=0x561771b2a438, dbname=<optimized out>, username=<optimized out>) at ./build/../src/backend/tcop/postgres.c:4267 #14 0x000056177123857c in BackendRun (port=0x561771b0d7a0, port=0x561771b0d7a0) at ./build/../src/backend/postmaster/postmaster.c:4484 #15 BackendStartup (port=0x561771b0d7a0) at ./build/../src/backend/postmaster/postmaster.c:4167 #16 ServerLoop () at ./build/../src/backend/postmaster/postmaster.c:1725 #17 0x000056177123954b in PostmasterMain (argc=9, argv=0x561771ab70e0) at ./build/../src/backend/postmaster/postmaster.c:1398 #18 0x0000561770fae8b6 in main (argc=9, argv=0x561771ab70e0) at ./build/../src/backend/main/main.c:228 What do you think? Thank you! Best regards, Andrey Borodin.
check_for_interrupts.diff
Description: Binary data
Time: 0.731 ms
sas-/* sanitized *//postgres M # select * from pg_replication_slots ;
slot_name | plugin | slot_type | datoid |
database | temporary | active | active_pid | xmin | catalog_xmin |
restart_lsn | confirmed_flush_lsn
------------------------------------+----------+-----------+--------+--------------+-----------+--------+------------+------------+--------------+---------------+---------------------
/* sanitized */ | [null] | physical | [null] | [null] | f | t
| 1719461 | 2980736032 | [null] | 4818/908C9B98 | [null]
vla_/* sanitized */_db_yandex_net | [null] | physical | [null] | [null]
| f | t | 1719460 | 2980736032 | [null] | 4818/908DE000
| [null]
dttsjtaa66crdhbm015h | wal2json | logical | 16441 | /*
sanitized */ | f | t | 3697390 | [null] | 2976347004 |
47F6/3FFE5DA0 | 47FE/F588E800
(3 rows)
Time: 0.454 ms
sas-/* sanitized *//postgres M # select pg_terminate_backend(3697390);
pg_terminate_backend
----------------------
t
(1 row)
Time: 0.189 ms
sas-/* sanitized *//postgres M # select * from pg_replication_slots ;
slot_name | plugin | slot_type | datoid |
database | temporary | active | active_pid | xmin | catalog_xmin |
restart_lsn | confirmed_flush_lsn
------------------------------------+----------+-----------+--------+--------------+-----------+--------+------------+------------+--------------+---------------+---------------------
man_/* sanitized */_db_yandex_net | [null] | physical | [null] | [null]
| f | t | 1719461 | 2980737826 | [null] | 4818/92EC5638
| [null]
vla_/* sanitized */_db_yandex_net | [null] | physical | [null] | [null]
| f | t | 1719460 | 2980737682 | [null] | 4818/92ED3C18
| [null]
dttsjtaa66crdhbm015h | wal2json | logical | 16441 | /*
sanitized */ | f | t | 3697390 | [null] | 2976347004 |
47F6/3FFE5DA0 | 47FE/F588E800
(3 rows)
Time: 0.371 ms
sas-/* sanitized *//postgres M # select pg_terminate_backend(3697390);
pg_terminate_backend
----------------------
t
(1 row)
Time: 0.165 ms
sas-/* sanitized *//postgres M # select * from pg_replication_slots ;
slot_name | plugin | slot_type | datoid |
database | temporary | active | active_pid | xmin | catalog_xmin |
restart_lsn | confirmed_flush_lsn
------------------------------------+----------+-----------+--------+--------------+-----------+--------+------------+------------+--------------+---------------+---------------------
man_/* sanitized */_db_yandex_net | [null] | physical | [null] | [null]
| f | t | 1719461 | 2980741177 | [null] | 4818/9A95E000
| [null]
vla_/* sanitized */_db_yandex_net | [null] | physical | [null] | [null]
| f | t | 1719460 | 2980741253 | [null] | 4818/9A978108
| [null]
dttsjtaa66crdhbm015h | wal2json | logical | 16441 | /*
sanitized */ | f | t | 3697390 | [null] | 2976347004 |
47F6/3FFE5DA0 | 47FE/F588E800
(3 rows)
Time: 0.578 ms
sas-/* sanitized *//postgres M # select * from pg_stat_activity where pid =
'3697390';
-[ RECORD 1
]----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
datid | 16441
datname | /* sanitized */
pid | 3697390
usesysid | 4819584
usename | data_transfer
application_name |
client_addr | 127.0.0.1
client_hostname | localhost
client_port | 45966
backend_start | 2021-09-28 11:24:28.812267+03
xact_start | [null]
query_start | 2021-09-28 11:39:24.415415+03
state_change | 2021-09-28 11:39:24.415438+03
wait_event_type | IO
wait_event | ReorderBufferRead
state | active
backend_xid | [null]
backend_xmin | [null]
query | START_REPLICATION SLOT dttsjtaa66crdhbm015h LOGICAL 0/0 (
"include-timestamp" '1', "include-types" '1', "include-xids" '1',
"write-in-chunks" '1', "add-tables" '/* sanitized */./* sanitized */,public./*
sanitized */', "filter-tables" 'public.repl_mon' )
backend_type | walsender
Time: 3.155 ms
sas-/* sanitized *//postgres M # select * from pg_stat_activity where pid =
'3697390';
-[ RECORD 1
]----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
datid | 16441
datname | /* sanitized */
pid | 3697390
usesysid | 4819584
usename | data_transfer
application_name |
client_addr | 127.0.0.1
client_hostname | localhost
client_port | 45966
backend_start | 2021-09-28 11:24:28.812267+03
xact_start | [null]
query_start | 2021-09-28 11:39:24.415415+03
state_change | 2021-09-28 11:39:24.415438+03
wait_event_type | IO
wait_event | ReorderBufferRead
state | active
backend_xid | [null]
backend_xmin | [null]
query | START_REPLICATION SLOT dttsjtaa66crdhbm015h LOGICAL 0/0 (
"include-timestamp" '1', "include-types" '1', "include-xids" '1',
"write-in-chunks" '1', "add-tables" '/* sanitized */./* sanitized */,public./*
sanitized */', "filter-tables" 'public.repl_mon' )
backend_type | walsender
Time: 2.931 ms
sas-/* sanitized *//postgres M # select
pg_drop_replication_slot('dttsjtaa66crdhbm015h');
ERROR: 55006: replication slot "dttsjtaa66crdhbm015h" is active for PID 3697390
LOCATION: ReplicationSlotAcquire, slot.c:399
Time: 0.309 ms
sas-/* sanitized *//postgres M # ^C
sas-/* sanitized *//postgres M # \q
[PROD]root@sas-/* sanitized */ ~ # bt -p 3697390
Tue Sep 28 12:16:16 2021
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
pfree (pointer=0x561850bbee40) at ./build/../src/backend/utils/mmgr/mcxt.c:1032
#0 pfree (pointer=0x561850bbee40) at
./build/../src/backend/utils/mmgr/mcxt.c:1032
#1 0x00005617712530d6 in ReorderBufferReturnTupleBuf (tuple=<optimized out>,
rb=<optimized out>) at
./build/../src/backend/replication/logical/reorderbuffer.c:469
#2 ReorderBufferReturnChange (rb=<optimized out>, change=0x561772456048) at
./build/../src/backend/replication/logical/reorderbuffer.c:398
#3 0x0000561771253da1 in ReorderBufferRestoreChanges
(rb=rb@entry=0x561771c14e10, txn=0x561771c0b078,
file=file@entry=0x561771c15168, segno=segno@entry=0x561771c15178) at
./build/../src/backend/replication/logical/reorderbuffer.c:2570
#4 0x00005617712553ba in ReorderBufferIterTXNNext (state=0x561771c15130,
rb=0x561771c14e10) at
./build/../src/backend/replication/logical/reorderbuffer.c:1146
#5 ReorderBufferCommit (rb=0x561771c14e10, xid=xid@entry=2976347782,
commit_lsn=79160378448744, end_lsn=<optimized out>,
commit_time=commit_time@entry=686095734290578, origin_id=origin_id@entry=0,
origin_lsn=0) at ./build/../src/backend/replication/logical/reorderbuffer.c:1523
#6 0x000056177124a30a in DecodeCommit (xid=2976347782, parsed=0x7ffc3cb4c240,
buf=0x7ffc3cb4c400, ctx=0x561771b10850) at
./build/../src/backend/replication/logical/decode.c:640
#7 DecodeXactOp (ctx=0x561771b10850, buf=buf@entry=0x7ffc3cb4c400) at
./build/../src/backend/replication/logical/decode.c:248
#8 0x000056177124a6a9 in LogicalDecodingProcessRecord (ctx=0x561771b10850,
record=0x561771b10ae8) at
./build/../src/backend/replication/logical/decode.c:117
#9 0x000056177125d1e5 in XLogSendLogical () at
./build/../src/backend/replication/walsender.c:2893
#10 0x000056177125f5f2 in WalSndLoop (send_data=send_data@entry=0x56177125d180
<XLogSendLogical>) at ./build/../src/backend/replication/walsender.c:2242
#11 0x0000561771260125 in StartLogicalReplication (cmd=<optimized out>) at
./build/../src/backend/replication/walsender.c:1179
#12 exec_replication_command (cmd_string=cmd_string@entry=0x561771abe590
"START_REPLICATION SLOT dttsjtaa66crdhbm015h LOGICAL 0/0 (
\"include-timestamp\" '1', \"include-types\" '1', \"include-xids\" '1',
\"write-in-chunks\" '1', \"add-tables\" '/* sanitized
*/.claim_audit,public.__consu"...) at
./build/../src/backend/replication/walsender.c:1612
#13 0x00005617712b2334 in PostgresMain (argc=<optimized out>,
argv=argv@entry=0x561771b2a438, dbname=<optimized out>, username=<optimized
out>) at ./build/../src/backend/tcop/postgres.c:4267
#14 0x000056177123857c in BackendRun (port=0x561771b0d7a0, port=0x561771b0d7a0)
at ./build/../src/backend/postmaster/postmaster.c:4484
#15 BackendStartup (port=0x561771b0d7a0) at
./build/../src/backend/postmaster/postmaster.c:4167
#16 ServerLoop () at ./build/../src/backend/postmaster/postmaster.c:1725
#17 0x000056177123954b in PostmasterMain (argc=9, argv=0x561771ab70e0) at
./build/../src/backend/postmaster/postmaster.c:1398
#18 0x0000561770fae8b6 in main (argc=9, argv=0x561771ab70e0) at
./build/../src/backend/main/main.c:228
