On Fri, Aug 15, 2014 at 3:40 AM, Andres Freund <and...@2ndquadrant.com> wrote: > On 2014-08-14 14:37:22 -0400, Robert Haas wrote: >> On Thu, Aug 14, 2014 at 2:21 PM, Andres Freund <and...@2ndquadrant.com> >> wrote: >> > On 2014-08-14 14:19:13 -0400, Robert Haas wrote: >> >> That's about the idea. However, what you've got there is actually >> >> unsafe, because shmem->counter++ is not an atomic operation. It reads >> >> the counter (possibly even as two separate 4-byte loads if the counter >> >> is an 8-byte value), increments it inside the CPU, and then writes the >> >> resulting value back to memory. If two backends do this concurrently, >> >> one of the updates might be lost. >> > >> > All these are only written by one backend, so it should be safe. Note >> > that that coding pattern, just without memory barriers, is all over >> > pgstat.c >> >> Ah, OK. If there's a separate slot for each backend, I agree that it's safe. >> >> We should probably add barriers to pgstat.c, too. > > Yea, definitely. I think this is rather borked on "weaker" > architectures. It's just that the consequences of an out of date/torn > value are rather low, so it's unlikely to be noticed. > > Imo we should encapsulate the changecount modifications/checks somehow > instead of repeating the barriers, Asserts, comments et al everywhere.
So what about applying the attached patch first, which adds the macros to load and store the changecount with the memory barries, and changes pgstat.c use them. Maybe this patch needs to be back-patch to at least 9.4? After applying the patch, I will rebase the pg_last_xact_insert_timestamp patch and post it again. Regards, -- Fujii Masao
*** a/src/backend/postmaster/pgstat.c --- b/src/backend/postmaster/pgstat.c *************** *** 2563,2569 **** pgstat_bestart(void) beentry = MyBEEntry; do { ! beentry->st_changecount++; } while ((beentry->st_changecount & 1) == 0); beentry->st_procpid = MyProcPid; --- 2563,2569 ---- beentry = MyBEEntry; do { ! pgstat_increment_changecount_before(beentry); } while ((beentry->st_changecount & 1) == 0); beentry->st_procpid = MyProcPid; *************** *** 2588,2595 **** pgstat_bestart(void) beentry->st_appname[NAMEDATALEN - 1] = '\0'; beentry->st_activity[pgstat_track_activity_query_size - 1] = '\0'; ! beentry->st_changecount++; ! Assert((beentry->st_changecount & 1) == 0); /* Update app name to current GUC setting */ if (application_name) --- 2588,2594 ---- beentry->st_appname[NAMEDATALEN - 1] = '\0'; beentry->st_activity[pgstat_track_activity_query_size - 1] = '\0'; ! pgstat_increment_changecount_after(beentry); /* Update app name to current GUC setting */ if (application_name) *************** *** 2624,2635 **** pgstat_beshutdown_hook(int code, Datum arg) * before and after. We use a volatile pointer here to ensure the * compiler doesn't try to get cute. */ ! beentry->st_changecount++; beentry->st_procpid = 0; /* mark invalid */ ! beentry->st_changecount++; ! Assert((beentry->st_changecount & 1) == 0); } --- 2623,2633 ---- * before and after. We use a volatile pointer here to ensure the * compiler doesn't try to get cute. */ ! pgstat_increment_changecount_before(beentry); beentry->st_procpid = 0; /* mark invalid */ ! pgstat_increment_changecount_after(beentry); } *************** *** 2666,2672 **** pgstat_report_activity(BackendState state, const char *cmd_str) * non-disabled state. As our final update, change the state and * clear fields we will not be updating anymore. */ ! beentry->st_changecount++; beentry->st_state = STATE_DISABLED; beentry->st_state_start_timestamp = 0; beentry->st_activity[0] = '\0'; --- 2664,2670 ---- * non-disabled state. As our final update, change the state and * clear fields we will not be updating anymore. */ ! pgstat_increment_changecount_before(beentry); beentry->st_state = STATE_DISABLED; beentry->st_state_start_timestamp = 0; beentry->st_activity[0] = '\0'; *************** *** 2674,2681 **** pgstat_report_activity(BackendState state, const char *cmd_str) /* st_xact_start_timestamp and st_waiting are also disabled */ beentry->st_xact_start_timestamp = 0; beentry->st_waiting = false; ! beentry->st_changecount++; ! Assert((beentry->st_changecount & 1) == 0); } return; } --- 2672,2678 ---- /* st_xact_start_timestamp and st_waiting are also disabled */ beentry->st_xact_start_timestamp = 0; beentry->st_waiting = false; ! pgstat_increment_changecount_after(beentry); } return; } *************** *** 2695,2701 **** pgstat_report_activity(BackendState state, const char *cmd_str) /* * Now update the status entry */ ! beentry->st_changecount++; beentry->st_state = state; beentry->st_state_start_timestamp = current_timestamp; --- 2692,2698 ---- /* * Now update the status entry */ ! pgstat_increment_changecount_before(beentry); beentry->st_state = state; beentry->st_state_start_timestamp = current_timestamp; *************** *** 2707,2714 **** pgstat_report_activity(BackendState state, const char *cmd_str) beentry->st_activity_start_timestamp = start_timestamp; } ! beentry->st_changecount++; ! Assert((beentry->st_changecount & 1) == 0); } /* ---------- --- 2704,2710 ---- beentry->st_activity_start_timestamp = start_timestamp; } ! pgstat_increment_changecount_after(beentry); } /* ---------- *************** *** 2734,2746 **** pgstat_report_appname(const char *appname) * st_changecount before and after. We use a volatile pointer here to * ensure the compiler doesn't try to get cute. */ ! beentry->st_changecount++; memcpy((char *) beentry->st_appname, appname, len); beentry->st_appname[len] = '\0'; ! beentry->st_changecount++; ! Assert((beentry->st_changecount & 1) == 0); } /* --- 2730,2741 ---- * st_changecount before and after. We use a volatile pointer here to * ensure the compiler doesn't try to get cute. */ ! pgstat_increment_changecount_before(beentry); memcpy((char *) beentry->st_appname, appname, len); beentry->st_appname[len] = '\0'; ! pgstat_increment_changecount_after(beentry); } /* *************** *** 2760,2769 **** pgstat_report_xact_timestamp(TimestampTz tstamp) * st_changecount before and after. We use a volatile pointer here to * ensure the compiler doesn't try to get cute. */ ! beentry->st_changecount++; beentry->st_xact_start_timestamp = tstamp; ! beentry->st_changecount++; ! Assert((beentry->st_changecount & 1) == 0); } /* ---------- --- 2755,2763 ---- * st_changecount before and after. We use a volatile pointer here to * ensure the compiler doesn't try to get cute. */ ! pgstat_increment_changecount_before(beentry); beentry->st_xact_start_timestamp = tstamp; ! pgstat_increment_changecount_after(beentry); } /* ---------- *************** *** 2839,2845 **** pgstat_read_current_status(void) */ for (;;) { ! int save_changecount = beentry->st_changecount; localentry->backendStatus.st_procpid = beentry->st_procpid; if (localentry->backendStatus.st_procpid > 0) --- 2833,2842 ---- */ for (;;) { ! int before_changecount; ! int after_changecount; ! ! pgstat_save_changecount_before(beentry, before_changecount); localentry->backendStatus.st_procpid = beentry->st_procpid; if (localentry->backendStatus.st_procpid > 0) *************** *** 2856,2863 **** pgstat_read_current_status(void) localentry->backendStatus.st_activity = localactivity; } ! if (save_changecount == beentry->st_changecount && ! (save_changecount & 1) == 0) break; /* Make sure we can break out of loop if stuck... */ --- 2853,2861 ---- localentry->backendStatus.st_activity = localactivity; } ! pgstat_save_changecount_after(beentry, after_changecount); ! if (before_changecount == after_changecount && ! (before_changecount & 1) == 0) break; /* Make sure we can break out of loop if stuck... */ *************** *** 2927,2938 **** pgstat_get_backend_current_activity(int pid, bool checkUser) for (;;) { ! int save_changecount = vbeentry->st_changecount; found = (vbeentry->st_procpid == pid); ! if (save_changecount == vbeentry->st_changecount && ! (save_changecount & 1) == 0) break; /* Make sure we can break out of loop if stuck... */ --- 2925,2941 ---- for (;;) { ! int before_changecount; ! int after_changecount; ! ! pgstat_save_changecount_before(vbeentry, before_changecount); found = (vbeentry->st_procpid == pid); ! pgstat_save_changecount_after(vbeentry, after_changecount); ! ! if (before_changecount == after_changecount && ! (before_changecount & 1) == 0) break; /* Make sure we can break out of loop if stuck... */ *** a/src/include/pgstat.h --- b/src/include/pgstat.h *************** *** 16,21 **** --- 16,22 ---- #include "libpq/pqcomm.h" #include "portability/instr_time.h" #include "postmaster/pgarch.h" + #include "storage/barrier.h" #include "utils/hsearch.h" #include "utils/relcache.h" *************** *** 714,719 **** typedef struct PgBackendStatus --- 715,726 ---- * st_changecount again. If the value hasn't changed, and if it's even, * the copy is valid; otherwise start over. This makes updates cheap * while reads are potentially expensive, but that's the tradeoff we want. + * + * The above protocol needs the memory barriers to ensure that + * the apparent order of execution is as it desires. Otherwise, + * for example, the CPU might rearrange the code so that st_changecount + * is incremented twice before the modification on a machine with + * weak memory ordering. This surprising result can lead to bugs. */ int st_changecount; *************** *** 745,750 **** typedef struct PgBackendStatus --- 752,794 ---- char *st_activity; } PgBackendStatus; + /* + * Macros to load and store st_changecount with the memory barriers. + * + * pgstat_increment_changecount_before() and + * pgstat_increment_changecount_after() need to be called before and after + * PgBackendStatus entries are modified, respectively. This makes sure that + * st_changecount is incremented around the modification. + * + * Also pgstat_save_changecount_before() and pgstat_save_changecount_after() + * need to be called before and after PgBackendStatus entries are copied into + * private memory, respectively. + */ + #define pgstat_increment_changecount_before(beentry) \ + do { \ + beentry->st_changecount++; \ + pg_write_barrier(); \ + } while (0) + + #define pgstat_increment_changecount_after(beentry) \ + do { \ + pg_write_barrier(); \ + beentry->st_changecount++; \ + Assert((beentry->st_changecount & 1) == 0); \ + } while (0) + + #define pgstat_save_changecount_before(beentry, save_changecount) \ + do { \ + save_changecount = beentry->st_changecount; \ + pg_read_barrier(); \ + } while (0) + + #define pgstat_save_changecount_after(beentry, save_changecount) \ + do { \ + pg_read_barrier(); \ + save_changecount = beentry->st_changecount; \ + } while (0) + /* ---------- * LocalPgBackendStatus *
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers