Hi everyone, I propose a patch that would allow optional "cleaning" of queries tracked in pg_stat_statements, compressing the result and making it more readable.
The default behavior is that when the same query is run with different parameter values, it's actually stored as two separate queries (the string do differ). A small example - when you run "pgbench -S" you'll get queries like this SELECT abalance FROM pgbench_accounts WHERE aid = 12433 SELECT abalance FROM pgbench_accounts WHERE aid = 2322 SELECT abalance FROM pgbench_accounts WHERE aid = 52492 and so on, and each one is listed separately in the pg_stat_statements. This often pollutes the pg_stat_statements. The patch implements a simple "cleaning" that replaces the parameter values with generic strings - e.g. numbers are turned to ":n", so the queries mentioned above are turned to SELECT abalance FROM pgbench_accounts WHERE aid = :n and thus tracked as a single query in pg_stat_statements. The patch provides an enum GUC (pg_stat_statements.clean) with three options - none, basic and aggressive. The default option is "none", the "basic" performs the basic value replacement (described above) and "aggressive" performs some additional cleaning - for example replaces multiple spaces with a single one etc. The parsing is intentionally very simple and cleans the query in a single pass. Currently handles three literal types: a) string (basic, C-style escaped, Unicode-escaped, $-espaced) b) numeric (although 1.925e-3 is replaced by :n-:n) c) boolean (true/false) There is probably room for improvement (e.g. handling UESCAPE). Tomas
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c new file mode 100644 index 8dc3054..a3ace20 *** a/contrib/pg_stat_statements/pg_stat_statements.c --- b/contrib/pg_stat_statements/pg_stat_statements.c *************** static const struct config_enum_entry tr *** 143,152 **** --- 143,167 ---- {NULL, 0, false} }; + typedef enum + { + PGSS_CLEAR_NONE, /* no query cleaning at all */ + PGSS_CLEAR_BASIC, /* basic parameter value replacement */ + PGSS_CLEAR_AGGRESSIVE /* more replacements (spaces, comments) */ + } PGSSClearLevel; + + static const struct config_enum_entry clear_options[] = { + {"none", PGSS_CLEAR_NONE, false}, + {"basic", PGSS_CLEAR_BASIC, false}, + {"aggressive", PGSS_CLEAR_AGGRESSIVE, false}, + {NULL, 0, false} + }; + static int pgss_max; /* max # statements to track */ static int pgss_track; /* tracking level */ static bool pgss_track_utility; /* whether to track utility commands */ static bool pgss_save; /* whether to save stats across shutdown */ + static int pgss_clean; /* whether to clean query parameter values */ #define pgss_enabled() \ *************** static Size pgss_memsize(void); *** 183,189 **** static pgssEntry *entry_alloc(pgssHashKey *key); static void entry_dealloc(void); static void entry_reset(void); ! /* * Module load callback --- 198,204 ---- static pgssEntry *entry_alloc(pgssHashKey *key); static void entry_dealloc(void); static void entry_reset(void); ! static char * pgss_clean_query(const char * query); /* * Module load callback *************** _PG_init(void) *** 252,257 **** --- 267,284 ---- NULL, NULL); + DefineCustomEnumVariable("pg_stat_statements.clean", + "Clean the queries (remove parameter values).", + NULL, + &pgss_clean, + PGSS_CLEAR_NONE, + clear_options, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + EmitWarningsOnPlaceholders("pg_stat_statements"); /* *************** pgss_ExecutorFinish(QueryDesc *queryDesc *** 583,588 **** --- 610,618 ---- static void pgss_ExecutorEnd(QueryDesc *queryDesc) { + + char * query; + if (queryDesc->totaltime && pgss_enabled()) { /* *************** pgss_ExecutorEnd(QueryDesc *queryDesc) *** 590,597 **** * levels of hook all do this.) */ InstrEndLoop(queryDesc->totaltime); ! pgss_store(queryDesc->sourceText, queryDesc->totaltime->total, queryDesc->estate->es_processed, &queryDesc->totaltime->bufusage); --- 620,633 ---- * levels of hook all do this.) */ InstrEndLoop(queryDesc->totaltime); + + if (pgss_clean != PGSS_CLEAR_NONE) { + query = pgss_clean_query(queryDesc->sourceText); + } else { + query = (char*)queryDesc->sourceText; + } ! pgss_store(query, queryDesc->totaltime->total, queryDesc->estate->es_processed, &queryDesc->totaltime->bufusage); *************** entry_reset(void) *** 1040,1042 **** --- 1076,1343 ---- LWLockRelease(pgss->lock); } + + /* + * Clear the query, so that queries that differ only by parameter + * values are considered equal (and represented by a single row in + * pg_stat_statements). + * + * This parsing is just very basic and performs just these steps + * + * a) replaces string values with :s + * b) replaces numeric values with :n + * c) replaces boolean values with :b + * d) replaces multiple whitespaces (whatever isspace() considers + * a space with a single space) + * + * This surely is very simple and does not help with differently + * formatted queries - e.g. those two queries + * + * SELECT ((1)); + * SELECT ( (1) ); + * + * are considered different because the function produces this: + * + * SELECT ((:n)); + * SELECT ( (:n) ); + * + * The basic assumption is the validity of the query, but this + * is executed after successful execution so it's fine. + * + * Possible enhancements (in no special order), mostly aiming to + * normalize the queries further: + * + * a) replace multiple values in the IN clause with a single one + * b) remove comments (not sure about this) + * c) many others ... + * + */ + + /* those are just helper methods for parsing ... */ + static bool string_start(const char * query, int idx, int len); + static int string_find_end(const char * query, int idx, int len); + + static bool number_start(const char * query, int idx, int len); + static int number_find_end(const char * query, int idx, int len); + + static bool boolean_start(const char * query, int idx, int len); + static int boolean_find_end(const char * query, int idx, int len); + + /* + * Does the actual cleaning and returns the cleaned version. + */ + static + char * pgss_clean_query(const char * query) { + + int i, idx = 0; + + /* buffer for the new query */ + char * new_query = (char*)palloc(pgss->query_size+1); + + /* length of the original query */ + int len = strlen(query); + + /* end when the whole query is processed or when the output query + * is full (whichever comes first) */ + i = 0; + while ((i < len) && ((idx + 4) < pgss->query_size)) { + + /* check what we're dealing with */ + if (string_start(query, i, len)) { + + /* string - replace it with :s and find the end */ + i = string_find_end(query, i, len); + + new_query[idx++] = ':'; + new_query[idx++] = 's'; + + } else if (number_start(query, i, len)) { + + /* number - replace it with :n and find the end */ + i = number_find_end(query, i, len); + + new_query[idx++] = ':'; + new_query[idx++] = 'n'; + + } else if (boolean_start(query, i, len)) { + + /* number - replace it with :n and find the end */ + i = boolean_find_end(query, i, len); + + new_query[idx++] = ':'; + new_query[idx++] = 'b'; + + } else if (isspace(query[i]) + && (pgss_clean == PGSS_CLEAR_AGGRESSIVE)) { + + /* just a whitespace - check if there was a previous space */ + if ((idx > 0) && (new_query[idx-1] != ' ')) { + new_query[idx++] = ' '; + } + + } else if (query[i] == ';') { + + /* a semi-colon, so terminate the processing */ + break; + + } else { + + /* other string */ + new_query[idx++] = query[i]; + } + + i++; + + } + + /* terminate the query and remove the trailing spaces */ + new_query[idx] = '\0'; + while ((idx > 0) && (new_query[idx-1] == ' ')) { + new_query[--idx] = '\0'; + } + + return new_query; + + } + + /* + * Implementation of the helper methods. + */ + static + bool string_start(const char * query, int idx, int len) { + /* supports basic escape styles (plain, C-style, Unicode and dollar)*/ + return (query[idx] == '\'') || (query[idx] == '$'); + } + + static + int string_find_end(const char * query, int idx, int len) { + + bool isEscaped = false; + + /* is it C-style escape or a dollar escape? */ + if (query[idx] == '\'') { + + /* C-style escape */ + int endIdx = idx + 1; + + /* find the end of the string */ + while (endIdx < len) { + + if (query[endIdx] == '\\') { + isEscaped = !isEscaped; + } else if ((query[endIdx] == '\'') && (! isEscaped)) { + /* this is the actual end of string, so return it */ + return endIdx; + } + + endIdx++; + + } + + } else if (query[idx] == '$') { + + char * end; + char * tag; + int len; + + /* dollar escape - lets see what's the tag */ + end = strchr(&query[idx+1], '$'); + + /* copy the tag */ + len = end - &query[idx] + 2; + tag = (char*)palloc(len); + memcpy(tag, &query[idx], len-1); + tag[len-1] = '\0'; + + /* what's the next occurence of the tag */ + end = strstr(&query[idx+len], tag); + + return (idx + len - 2 + end - &query[idx]); + + } + + elog(DEBUG1, "end of string starting at %d not found", idx); + + return len; + + } + + static + bool number_start(const char * query, int idx, int len) { + /* numbers start with a digit or a dot */ + return (isdigit(query[idx]) || (query[idx] == '.')); + } + + static + int number_find_end(const char * query, int idx, int len) { + + int endIdx = idx; + + /* find the end of the string */ + while (idx < len) { + + if (! isalnum(query[endIdx])) { + /* the number actually ends at the previous character */ + return (endIdx-1); + } + + endIdx++; + + } + + elog(DEBUG1, "end of number starting at %d not found", idx); + + return len; + + } + + static bool boolean_start_true(const char * query, int idx, int len) { + + char buff[4]; + int i; + + /* true */ + if (len - idx >= 4) { + strncpy(buff, &query[idx], 4); + for (i = 0; i < 4; i++) { + buff[i] = tolower(buff[i]); + } + if (strncmp(buff, "true", 4) == 0) { + return true; + } + } + + return false; + } + + static bool boolean_start_false(const char * query, int idx, int len) { + + char buff[5]; + int i; + + /* true */ + if (len - idx >= 5) { + strncpy(buff, &query[idx], 5); + for (i = 0; i < 5; i++) { + buff[i] = tolower(buff[i]); + } + if (strncmp(buff, "false", 5) == 0) { + return true; + } + } + + return false; + } + + static bool boolean_start(const char * query, int idx, int len) { + + return (boolean_start_true(query, idx, len) + || boolean_start_false(query, idx, len)); + + } + + static int boolean_find_end(const char * query, int idx, int len) { + /* this should be called only when there's a boolean, so add + * 4 (true) or 5 (false) */ + return idx + (boolean_start_true(query, idx, len) ? 4 : 5); + } diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml new file mode 100644 index 5a0230c..d3e4842 *** a/doc/src/sgml/pgstatstatements.sgml --- b/doc/src/sgml/pgstatstatements.sgml *************** *** 264,269 **** --- 264,290 ---- </para> </listitem> </varlistentry> + + <varlistentry> + <term> + <varname>pg_stat_statements.clean</varname> (<type>enum</type>) + </term> + + <listitem> + <para> + <varname>pg_stat_statements.clean</varname> controls how are the + stataments preprocessed. + Specify <literal>basic</> to replace parameter values with common + values (<literal>:n</> for numbers, <literal>:s</> for strings and + <literal>:b</> for booleans), <literal>aggressive</> to further + clean the queries (remove line endings, replace multiple spaces + with a single one), or <literal>none</> to disable. + The default value is <literal>none</>. + Only superusers can change this setting. + </para> + </listitem> + </varlistentry> + </variablelist> <para>
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers