Re: [HACKERS] SIGSEGV in BRIN autosummarize

Justin Pryzby Sat, 14 Oct 2017 15:44:00 -0700

On Fri, Oct 13, 2017 at 10:57:32PM -0500, Justin Pryzby wrote:
> I don't have any reason to believe there's memory issue on the server, So I
> suppose this is just a "heads up" to early adopters until/in case it happens
> again and I can at least provide a stack trace.


I'm back; find stacktrace below.

> Today I see:
> < 2017-10-13 17:22:47.839 -04  >LOG:  server process (PID 32127) was 
> terminated by signal 11: Segmentation fault
> < 2017-10-13 17:22:47.839 -04  >DETAIL:  Failed process was running: 
> autovacuum: BRIN summarize public.gtt 747263

Is it a coincidence the server failed within 45m of yesterday's failure ?

postmaster[26500] general protection ip:84a177 sp:7ffd9b349b88 error:0 in 
postgres[400000+692000]
< 2017-10-14 18:05:36.432 -04  >DETAIL:  Failed process was running: 
autovacuum: BRIN summarize public.gtt 41087

> It looks like this table was being inserted into simultaneously by a python
> program using multiprocessing.  It looks like each subprocess was INSERTing
> into several tables, each of which has one BRIN index on timestamp column.

I should add:
These are insert-only child tables in a heirarchy (not PG10 partitions), being
inserted into directly (not via trigger/rule).

Also notice the vacuum process was interrupted, same as yesterday (think
goodness for full logs).  Our INSERT script is using python
multiprocessing.pool() with "maxtasksperchild=1", which I think means we load
one file and then exit the subprocess, and pool() creates a new subproc, which
starts a new PG session and transaction.  Which explains why autovacuum starts
processing the table only to be immediately interrupted.

postgres=# SELECT * FROM postgres_log_2017_10_14_1800 WHERE pid=26500 ORDER BY 
log_time DESC LIMIT 9;
log_time               | 2017-10-14 18:05:34.132-04
pid                    | 26500
session_id             | 59e289b4.6784
session_line           | 2
session_start_time     | 2017-10-14 18:03:32-04
error_severity         | ERROR
sql_state_code         | 57014
message                | canceling autovacuum task
context                | processing work entry for relation 
"gtt.public.cdrs_eric_ggsnpdprecord_2017_10_14_recordopeningtime_idx"
-----------------------+---------------------------------------------------------------------------------------------------------
log_time               | 2017-10-14 18:05:32.925-04
pid                    | 26500
session_id             | 59e289b4.6784
session_line           | 1
session_start_time     | 2017-10-14 18:03:32-04
error_severity         | ERROR
sql_state_code         | 57014
message                | canceling autovacuum task
context                | automatic analyze of table 
"gtt.public.cdrs_eric_egsnpdprecord_2017_10_14"

gtt=# \dt+ *record_2017_10_14
 public | cdrs_eric_egsnpdprecord_2017_10_14 | table | gtt   | 1642 MB | 
 public | cdrs_eric_ggsnpdprecord_2017_10_14 | table | gtt   | 492 MB  | 

gtt=# \di+ *_2017_10_14*_recordopeningtime_idx
 public | cdrs_eric_egsnpdprecord_2017_10_14_recordopeningtime_idx | index | 
gtt   | cdrs_eric_egsnpdprecord_2017_10_14 | 72 kB | 
 public | cdrs_eric_ggsnpdprecord_2017_10_14_recordopeningtime_idx | index | 
gtt   | cdrs_eric_ggsnpdprecord_2017_10_14 | 48 kB | 

Due to a .."behavioral deficiency" in the loader for those tables, the crashed
backend causes the loader to get stuck, so the tables should be untouched since
the crash, should it be desirable to inspect them.

#0  pfree (pointer=0x298c740) at mcxt.c:954
        context = 0x7474617261763a20
#1  0x00000000006a52e9 in perform_work_item (workitem=0x7f8ad1f94824) at 
autovacuum.c:2676
        cur_datname = 0x298c740 "no 1 :vartype 1184 :vartypmod -1 :varcollid 0 
:varlevelsup 0 :varnoold 1 :varoattno 1 :location 146} {CONST :consttype 1184 
:consttypmod -1 :constcollid 0 :constlen 8 :constbyval true :constisnull fal"...
        cur_nspname = 0x298c728 "s ({VAR :varno 1 :varattno 1 :vartype 1184 
:vartypmod -1 :varcollid 0 :varlevelsup 0 :varnoold 1 :varoattno 1 :location 
146} {CONST :consttype 1184 :consttypmod -1 :constcollid 0 :constlen 8 
:constbyv"...
        cur_relname = 0x298cd68 "cdrs_eric_msc_sms_2017_10_14_startofcharge_idx"
        __func__ = "perform_work_item"
#2  0x00000000006a6fd9 in do_autovacuum () at autovacuum.c:2533
        workitem = 0x7f8ad1f94824
        classRel = 0x7f89c26d0e58
        tuple = <value optimized out>
        relScan = <value optimized out>
        dbForm = <value optimized out>
        table_oids = <value optimized out>
        orphan_oids = 0x0
        ctl = {num_partitions = 0, ssize = 0, dsize = 0, max_dsize = 0, ffactor 
= 0, keysize = 4, entrysize = 80, hash = 0, match = 0, keycopy = 0, alloc = 0, 
hcxt = 0x0, hctl = 0x0}
        table_toast_map = 0x29c8188
        cell = 0x0
        shared = 0x298ce18
        dbentry = 0x298d0a0
        bstrategy = 0x2a61c18
        key = {sk_flags = 0, sk_attno = 16, sk_strategy = 3, sk_subtype = 0, 
sk_collation = 100, sk_func = {fn_addr = 0x750430 <chareq>, fn_oid = 61, 
fn_nargs = 2, fn_strict = 1 '\001', fn_retset = 0 '\000', 
            fn_stats = 2 '\002', fn_extra = 0x0, fn_mcxt = 0x28f3608, fn_expr = 
0x0}, sk_argument = 116}
        pg_class_desc = 0x29b66a8
        effective_multixact_freeze_max_age = 400000000
        did_vacuum = <value optimized out>
        found_concurrent_worker = <value optimized out>
        i = <value optimized out>
        __func__ = "do_autovacuum"
#3  0x00000000006a7dc8 in AutoVacWorkerMain (argc=<value optimized out>, 
argv=<value optimized out>) at autovacuum.c:1694
        dbname = 
"gtt\000\000\000\000\000\325I\001\327;\000\000\000\204g\000\000\000\000\000\000E\352\200\327;\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\374\024\316",
 '\000' <repeats 12 times>
        local_sigjmp_buf = {{__jmpbuf = {0, -5392960675600994449, 42895168, 
1508018581, 26499, 1508015202, 5391772750539845487, -5392961587993972881}, 
__mask_was_saved = 1, __saved_mask = {__val = {
                18446744066192964103, 13184024, 1, 13182664, 0, 26499, 
1508015202, 0, 257010229664, 1, 0, 140727207371280, 257014425560, 4, 0, 4}}}}
        dbid = 16400
        __func__ = "AutoVacWorkerMain"
#4  0x00000000006a7e96 in StartAutoVacWorker () at autovacuum.c:1499
        worker_pid = 0
        __func__ = "StartAutoVacWorker"
#5  0x00000000006b595a in StartAutovacuumWorker (postgres_signal_arg=<value 
optimized out>) at postmaster.c:5414
        bn = 0x28e8740
#6  sigusr1_handler (postgres_signal_arg=<value optimized out>) at 
postmaster.c:5111
        save_errno = 4
        __func__ = "sigusr1_handler"
#7  <signal handler called>
No symbol table info available.
#8  0x0000003bd74e1603 in __select_nocancel () at 
../sysdeps/unix/syscall-template.S:82
No locals.
#9  0x00000000006b6c96 in ServerLoop (argc=<value optimized out>, argv=<value 
optimized out>) at postmaster.c:1717
        timeout = {tv_sec = 59, tv_usec = 686521}
        rmask = {fds_bits = {120, 0 <repeats 15 times>}}
        selres = <value optimized out>
        now = <value optimized out>
        readmask = {fds_bits = {120, 0 <repeats 15 times>}}
        nSockets = 7
        last_lockfile_recheck_time = 1508018581
        last_touch_time = 1508015202
#10 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at 
postmaster.c:1361
        opt = <value optimized out>
        status = <value optimized out>
        userDoption = <value optimized out>
        listen_addr_saved = <value optimized out>
        i = <value optimized out>
        output_config_variable = <value optimized out>
        __func__ = "PostmasterMain"
#11 0x0000000000638450 in main (argc=3, argv=0x28c03a0) at main.c:228


-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] SIGSEGV in BRIN autosummarize

Reply via email to