Hi everyone,

I propose a patch that would allow optional "cleaning" of queries
tracked in pg_stat_statements, compressing the result and making it more
readable.

The default behavior is that when the same query is run with different
parameter values, it's actually stored as two separate queries (the
string do differ).

A small example - when you run "pgbench -S" you'll get queries like
this

   SELECT abalance FROM pgbench_accounts WHERE aid = 12433
   SELECT abalance FROM pgbench_accounts WHERE aid = 2322
   SELECT abalance FROM pgbench_accounts WHERE aid = 52492

and so on, and each one is listed separately in the pg_stat_statements.
This often pollutes the pg_stat_statements.

The patch implements a simple "cleaning" that replaces the parameter
values with generic strings - e.g. numbers are turned to ":n", so the
queries mentioned above are turned to

   SELECT abalance FROM pgbench_accounts WHERE aid = :n

and thus tracked as a single query in pg_stat_statements.

The patch provides an enum GUC (pg_stat_statements.clean) with three
options - none, basic and aggressive. The default option is "none", the
"basic" performs the basic value replacement (described above) and
"aggressive" performs some additional cleaning - for example replaces
multiple spaces with a single one etc.

The parsing is intentionally very simple and cleans the query in a
single pass. Currently handles three literal types:

 a) string (basic, C-style escaped, Unicode-escaped, $-espaced)
 b) numeric (although 1.925e-3 is replaced by :n-:n)
 c) boolean (true/false)

There is probably room for improvement (e.g. handling UESCAPE).


Tomas
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c 
b/contrib/pg_stat_statements/pg_stat_statements.c
new file mode 100644
index 8dc3054..a3ace20
*** a/contrib/pg_stat_statements/pg_stat_statements.c
--- b/contrib/pg_stat_statements/pg_stat_statements.c
*************** static const struct config_enum_entry tr
*** 143,152 ****
--- 143,167 ----
        {NULL, 0, false}
  };
  
+ typedef enum
+ {
+       PGSS_CLEAR_NONE,                        /* no query cleaning at all */
+       PGSS_CLEAR_BASIC,                       /* basic parameter value 
replacement  */
+       PGSS_CLEAR_AGGRESSIVE           /* more replacements (spaces, comments) 
*/
+ }     PGSSClearLevel;
+ 
+ static const struct config_enum_entry clear_options[] = {
+       {"none", PGSS_CLEAR_NONE, false},
+       {"basic", PGSS_CLEAR_BASIC, false},
+       {"aggressive", PGSS_CLEAR_AGGRESSIVE, false},
+       {NULL, 0, false}
+ };
+ 
  static int    pgss_max;                       /* max # statements to track */
  static int    pgss_track;                     /* tracking level */
  static bool pgss_track_utility; /* whether to track utility commands */
  static bool pgss_save;                        /* whether to save stats across 
shutdown */
+ static int    pgss_clean;             /* whether to clean query parameter 
values */
  
  
  #define pgss_enabled() \
*************** static Size pgss_memsize(void);
*** 183,189 ****
  static pgssEntry *entry_alloc(pgssHashKey *key);
  static void entry_dealloc(void);
  static void entry_reset(void);
! 
  
  /*
   * Module load callback
--- 198,204 ----
  static pgssEntry *entry_alloc(pgssHashKey *key);
  static void entry_dealloc(void);
  static void entry_reset(void);
! static char * pgss_clean_query(const char * query);
  
  /*
   * Module load callback
*************** _PG_init(void)
*** 252,257 ****
--- 267,284 ----
                                                         NULL,
                                                         NULL);
  
+     DefineCustomEnumVariable("pg_stat_statements.clean",
+                                                        "Clean the queries 
(remove parameter values).",
+                                                        NULL,
+                                                        &pgss_clean,
+                                                        PGSS_CLEAR_NONE,
+                                                        clear_options,
+                                                        PGC_SIGHUP,
+                                                        0,
+                                                        NULL,
+                                                        NULL,
+                                                        NULL);
+ 
        EmitWarningsOnPlaceholders("pg_stat_statements");
  
        /*
*************** pgss_ExecutorFinish(QueryDesc *queryDesc
*** 583,588 ****
--- 610,618 ----
  static void
  pgss_ExecutorEnd(QueryDesc *queryDesc)
  {
+     
+       char * query;
+     
        if (queryDesc->totaltime && pgss_enabled())
        {
                /*
*************** pgss_ExecutorEnd(QueryDesc *queryDesc)
*** 590,597 ****
                 * levels of hook all do this.)
                 */
                InstrEndLoop(queryDesc->totaltime);
  
!               pgss_store(queryDesc->sourceText,
                                   queryDesc->totaltime->total,
                                   queryDesc->estate->es_processed,
                                   &queryDesc->totaltime->bufusage);
--- 620,633 ----
                 * levels of hook all do this.)
                 */
                InstrEndLoop(queryDesc->totaltime);
+         
+               if (pgss_clean != PGSS_CLEAR_NONE) {
+                       query = pgss_clean_query(queryDesc->sourceText);
+               } else {
+                       query = (char*)queryDesc->sourceText;
+               }
  
!               pgss_store(query,
                                   queryDesc->totaltime->total,
                                   queryDesc->estate->es_processed,
                                   &queryDesc->totaltime->bufusage);
*************** entry_reset(void)
*** 1040,1042 ****
--- 1076,1343 ----
  
        LWLockRelease(pgss->lock);
  }
+ 
+ /*
+  * Clear the query, so that queries that differ only by parameter
+  * values are considered equal (and represented by a single row in
+  * pg_stat_statements).
+  * 
+  * This parsing is just very basic and performs just these steps
+  * 
+  *  a) replaces string values with :s
+  *  b) replaces numeric values with :n
+  *  c) replaces boolean values with :b
+  *  d) replaces multiple whitespaces (whatever isspace() considers
+  *     a space with a single space)
+  * 
+  * This surely is very simple and does not help with differently
+  * formatted queries - e.g. those two queries
+  * 
+  *   SELECT ((1));
+  *   SELECT ( (1) );
+  * 
+  * are considered different because the function produces this:
+  * 
+  *   SELECT ((:n));
+  *   SELECT ( (:n) );
+  * 
+  * The basic assumption is the validity of the query, but this
+  * is executed after successful execution so it's fine.
+  * 
+  * Possible enhancements (in no special order), mostly aiming to
+  * normalize the queries further:
+  * 
+  *  a) replace multiple values in the IN clause with a single one
+  *  b) remove comments (not sure about this)
+  *  c) many others ...
+  * 
+  */
+ 
+ /* those are just helper methods for parsing ... */
+ static bool string_start(const char * query, int idx, int len);
+ static int  string_find_end(const char * query, int idx, int len);
+ 
+ static bool number_start(const char * query, int idx, int len);
+ static int  number_find_end(const char * query, int idx, int len);
+ 
+ static bool boolean_start(const char * query, int idx, int len);
+ static int  boolean_find_end(const char * query, int idx, int len);
+ 
+ /*
+  * Does the actual cleaning and returns the cleaned version.
+  */
+ static
+ char * pgss_clean_query(const char * query) {
+       
+       int i, idx = 0;
+       
+       /* buffer for the new query */
+       char * new_query = (char*)palloc(pgss->query_size+1);
+       
+       /* length of the original query */
+       int len = strlen(query);
+       
+       /* end when the whole query is processed or when the output query
+        * is full (whichever comes first) */
+       i = 0;
+       while ((i < len) && ((idx + 4) < pgss->query_size)) {
+               
+               /* check what we're dealing with */
+               if (string_start(query, i, len)) {
+                       
+                       /* string - replace it with :s and find the end */
+                       i = string_find_end(query, i, len);
+                       
+                       new_query[idx++] = ':';
+                       new_query[idx++] = 's';
+                       
+               } else if (number_start(query, i, len)) {
+                       
+                       /* number - replace it with :n and find the end */
+                       i = number_find_end(query, i, len);
+                       
+                       new_query[idx++] = ':';
+                       new_query[idx++] = 'n';
+                       
+               } else if (boolean_start(query, i, len)) {
+                       
+                       /* number - replace it with :n and find the end */
+                       i = boolean_find_end(query, i, len);
+                       
+                       new_query[idx++] = ':';
+                       new_query[idx++] = 'b';
+                       
+               } else if (isspace(query[i])
+                          && (pgss_clean == PGSS_CLEAR_AGGRESSIVE)) {
+                       
+                       /* just a whitespace - check if there was a previous 
space */
+                       if ((idx > 0) && (new_query[idx-1] != ' ')) {
+                               new_query[idx++] = ' ';
+                       }
+                       
+               } else if (query[i] == ';') {
+                       
+                       /* a semi-colon, so terminate the processing */
+                       break;
+                       
+               } else {
+                       
+                       /* other string */
+                       new_query[idx++] = query[i];
+               }
+               
+               i++;
+               
+       }
+       
+       /* terminate the query and remove the trailing spaces */
+       new_query[idx] = '\0';
+       while ((idx > 0) && (new_query[idx-1] == ' ')) {
+               new_query[--idx] = '\0';
+       }
+       
+       return new_query;
+       
+ }
+ 
+ /*
+  * Implementation of the helper methods.
+  */
+ static
+ bool string_start(const char * query, int idx, int len) {
+       /* supports basic escape styles (plain, C-style, Unicode and dollar)*/
+       return (query[idx] == '\'') || (query[idx] == '$');
+ }
+ 
+ static
+ int  string_find_end(const char * query, int idx, int len) {
+       
+       bool isEscaped = false;
+       
+       /* is it C-style escape or a dollar escape? */
+       if (query[idx] == '\'') {
+               
+               /* C-style escape */
+               int  endIdx = idx + 1;
+               
+               /* find the end of the string */
+               while (endIdx < len) {
+                       
+                       if (query[endIdx] == '\\') {
+                               isEscaped = !isEscaped;
+                       } else if ((query[endIdx] == '\'') && (! isEscaped)) {
+                               /* this is the actual end of string, so return 
it */
+                               return endIdx;
+                       }
+                       
+                       endIdx++;
+                       
+               }
+               
+       } else if (query[idx] == '$') {
+               
+               char * end;
+               char * tag;
+               int len;
+               
+               /* dollar escape - lets see what's the tag */
+               end = strchr(&query[idx+1], '$');
+               
+               /* copy the tag */
+               len = end - &query[idx] + 2;
+               tag = (char*)palloc(len);
+               memcpy(tag, &query[idx], len-1);
+               tag[len-1] = '\0';
+               
+               /* what's the next occurence of the tag */
+               end = strstr(&query[idx+len], tag);
+               
+               return (idx + len - 2 + end - &query[idx]);
+               
+       }
+       
+       elog(DEBUG1, "end of string starting at %d not found", idx);
+       
+       return len;
+       
+ }
+ 
+ static
+ bool number_start(const char * query, int idx, int len) {
+       /* numbers start with a digit or a dot */
+       return (isdigit(query[idx]) || (query[idx] == '.'));
+ }
+ 
+ static
+ int  number_find_end(const char * query, int idx, int len) {
+       
+       int  endIdx = idx;
+       
+       /* find the end of the string */
+       while (idx < len) {
+               
+               if (! isalnum(query[endIdx])) {
+                       /* the number actually ends at the previous character */
+                       return (endIdx-1);
+               }
+               
+               endIdx++;
+               
+       }
+       
+       elog(DEBUG1, "end of number starting at %d not found", idx);
+       
+       return len;
+       
+ }
+ 
+ static bool boolean_start_true(const char * query, int idx, int len) {
+       
+       char buff[4];
+       int  i;
+       
+       /* true */
+       if (len - idx >= 4) {
+               strncpy(buff, &query[idx], 4);
+               for (i = 0; i < 4; i++) {
+                       buff[i] = tolower(buff[i]);
+               }
+               if (strncmp(buff, "true", 4) == 0) {
+                       return true;
+               }
+       }
+       
+       return false;
+ }
+ 
+ static bool boolean_start_false(const char * query, int idx, int len) {
+       
+       char buff[5];
+       int  i;
+       
+       /* true */
+       if (len - idx >= 5) {
+               strncpy(buff, &query[idx], 5);
+               for (i = 0; i < 5; i++) {
+                       buff[i] = tolower(buff[i]);
+               }
+               if (strncmp(buff, "false", 5) == 0) {
+                       return true;
+               }
+       }
+       
+       return false;
+ }
+ 
+ static bool boolean_start(const char * query, int idx, int len) {
+       
+       return (boolean_start_true(query, idx, len)
+                || boolean_start_false(query, idx, len));
+       
+ }
+ 
+ static int  boolean_find_end(const char * query, int idx, int len) {
+       /* this should be called only when there's a boolean, so add
+        * 4 (true) or 5 (false) */
+       return idx + (boolean_start_true(query, idx, len) ? 4 : 5);
+ }
diff --git a/doc/src/sgml/pgstatstatements.sgml 
b/doc/src/sgml/pgstatstatements.sgml
new file mode 100644
index 5a0230c..d3e4842
*** a/doc/src/sgml/pgstatstatements.sgml
--- b/doc/src/sgml/pgstatstatements.sgml
***************
*** 264,269 ****
--- 264,290 ----
       </para>
      </listitem>
     </varlistentry>
+ 
+    <varlistentry>
+     <term>
+      <varname>pg_stat_statements.clean</varname> (<type>enum</type>)
+     </term>
+ 
+     <listitem>
+      <para>
+       <varname>pg_stat_statements.clean</varname> controls how are the
+       stataments preprocessed.
+       Specify <literal>basic</> to replace parameter values with common
+       values (<literal>:n</> for numbers, <literal>:s</> for strings and
+       <literal>:b</> for booleans), <literal>aggressive</> to further
+       clean the queries (remove line endings, replace multiple spaces
+       with a single one), or <literal>none</> to disable.
+       The default value is <literal>none</>.
+       Only superusers can change this setting.
+      </para>
+     </listitem>
+    </varlistentry>
+ 
    </variablelist>
  
    <para>
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to