Edenhill has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/79745


Change subject: Added support for escaping troublesome characters in tag content.
......................................................................

Added support for escaping troublesome characters in tag content.

Change-Id: I7888bea7fde6d6d70826c73c83cda3d0d3e1e59c
---
M varnishkafka.c
M varnishkafka.conf.example
M varnishkafka.h
3 files changed, 328 insertions(+), 73 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/software/varnish/varnishkafka refs/changes/45/79745/1

diff --git a/varnishkafka.c b/varnishkafka.c
index 308ddeb..d166bc0 100644
--- a/varnishkafka.c
+++ b/varnishkafka.c
@@ -114,12 +114,13 @@
 
        _DBG("%i/%i formats:", conf.fmt_cnt, conf.fmt_size);
        for (i = 0 ; i < conf.fmt_cnt ; i++) {
-               _DBG(" #%-3i  fmt %i (%c)  var \"%s\", def \"%.*s\"",
+               _DBG(" #%-3i  fmt %i (%c)  var \"%s\", def (%i)\"%.*s\"%s",
                     i,
                     conf.fmt[i].id,
-                    isprint(conf.fmt[i].id) ? (char)conf.fmt[i].id : 0,
+                    isprint(conf.fmt[i].id) ? (char)conf.fmt[i].id : ' ',
                     conf.fmt[i].var ? : "",
-                    conf.fmt[i].deflen, conf.fmt[i].def);
+                    conf.fmt[i].deflen, conf.fmt[i].deflen, conf.fmt[i].def,
+                    conf.fmt[i].flags & FMT_F_ESCAPE ? ", escape" : "");
        }
 }
 
@@ -157,6 +158,7 @@
 static int format_add (int fmtr,
                       const char *var, ssize_t varlen,
                       const char *def, ssize_t deflen,
+                      int flags,
                       char *errstr, size_t errstr_size) {
        struct fmt *fmt;
 
@@ -167,9 +169,9 @@
 
        fmt = &conf.fmt[conf.fmt_cnt];
 
-       fmt->id = fmtr;
-       fmt->idx = conf.fmt_cnt;
-
+       fmt->id    = fmtr;
+       fmt->idx   = conf.fmt_cnt;
+       fmt->flags = flags;
        if (var) {
                if (varlen == -1)
                        varlen = strlen(var);
@@ -237,17 +239,29 @@
 }
                     
 
+
+static inline void match_assign0 (const struct tag *tag, struct logline *lp,
+                                 const char *ptr, int len);
+static void match_assign (const struct tag *tag, struct logline *lp,
+                         const char *ptr, int len);
+
+
+
+
 /**
- * Assign 'PTR' of size 'LEN' as a match for 'TAG' in logline 'LP'.
- *
- * 'PTR' must be a pointer to persistent memory:
- *   - either VSL shared memory (original VSL tag payload)
- *   - or to a buffer allocated in the 'LP' scratch buffer.
+ * Returns true if 'ptr' is within 'lp's scratch pad, else false.
  */
-#define MATCH_ASSIGN(TAG,LP,PTR,LEN) do {                      \
-               (LP)->match[(TAG)->fmt->idx].ptr = (PTR);       \
-               (LP)->match[(TAG)->fmt->idx].len = (LEN);       \
-       } while (0)
+static inline int is_scratch_ptr (const struct logline *lp, const char *ptr) {
+       return (lp->scratch <= ptr && ptr < lp->scratch + sizeof(lp->scratch));
+}
+
+/**
+ * Rewinds (deallocates) the last allocation by 'len' bytes.
+ */
+static inline void scratch_rewind (struct logline *lp, int len) {
+       assert(lp->sof >= len);
+       lp->sof -= len;
+}
 
 
 /**
@@ -284,9 +298,86 @@
 
        memcpy(dst, src, len);
 
-       MATCH_ASSIGN(tag, lp, dst, len);
+       match_assign(tag, lp, dst, len);
 
        return len;
+}
+
+
+/**
+ * Writes 'src' of 'len' bytes to scratch buffer, escaping
+ * all unprintable characters as well as the ones defined in 'map' below.
+ * Returns -1 on error.
+ */
+static inline int scratch_write_escaped (const struct tag *tag,
+                                        struct logline *lp,
+                                        const char *src, int len) {
+       static const char *map[256] = {
+               ['\t'] = "\\t",
+               ['\n'] = "\\n",
+               ['\r'] = "\\r",
+               ['\v'] = "\\v",
+               ['\f'] = "\\f",
+               ['"']  = "\\\"",
+               [' ']  = "\\ ",
+       };
+       char *dst;
+       char *dstend;
+       char *d;
+       const char *s, *srcend = src + len;
+
+       /* Allocate initial space for escaped string. */
+       if (unlikely((dst = scratch_alloc(tag, lp, len + 10)) == NULL))
+               return -1;
+
+       dstend = dst + len + 10;
+
+       s = src;
+       d = dst;
+       while (s < srcend) {
+               int outlen = 1;
+               const char *out;
+               char tmp[6];
+
+               if (unlikely((out = map[(int)*s]) != NULL)) {
+                       /* Escape from 'map' */
+                       outlen = 2;
+
+               } else if (unlikely(!isprint(*s))) {
+                       /* Escape non-printables as \<octal> */
+                       sprintf(tmp, "\%04o", (int)*s);
+                       out = tmp;
+                       outlen = 5;
+
+               } else {
+                       /* No escaping */
+                       out = s;
+               }
+
+               /* Increase scratch pad if necessary. */
+               if (unlikely((d + outlen >= dstend))) {
+                       if (unlikely(!scratch_alloc(tag, lp, outlen + 20)))
+                               return -1;
+                       dstend += outlen + 20;
+               }
+
+               if (likely(outlen == 1))
+                       *(d++) = *out;
+               else {
+                       memcpy(d, out, outlen);
+                       d += outlen;
+               }
+
+               s++;
+       }
+
+       /* Rewind scratch pad to reclaim unused memory. */
+       scratch_rewind(lp, (int)(dstend-d));
+
+       /* Assign new matched string */
+       match_assign0(tag, lp, dst, (int)(d-dst));
+
+       return 0;
 }
 
 /**
@@ -311,7 +402,7 @@
        vsnprintf(dst, r+1, fmt, ap);
        va_end(ap);
 
-       MATCH_ASSIGN(tag, lp, dst, r);
+       match_assign(tag, lp, dst, r);
 
        return r;
 }
@@ -322,10 +413,64 @@
 
 
 
+
+static inline void match_assign0 (const struct tag *tag, struct logline *lp,
+                                 const char *ptr, int len) {
+       lp->match[tag->fmt->idx].ptr = ptr;
+       lp->match[tag->fmt->idx].len = len;
+}
+
+
+/**
+ * Assign 'PTR' of size 'LEN' as a match for 'TAG' in logline 'LP'.
+ *
+ * 'PTR' must be a pointer to persistent memory:
+ *   - either VSL shared memory (original VSL tag payload)
+ *   - or to a buffer allocated in the 'LP' scratch buffer.
+ */
+static void match_assign (const struct tag *tag, struct logline *lp,
+                         const char *ptr, int len) {
+
+       if (unlikely(tag->fmt->flags & FMT_F_ESCAPE)) {
+               /* If 'ptr' is in the scratch pad; rewind the scratch pad
+                * since we'll be re-writing the string escaped. */
+               if (is_scratch_ptr(lp, ptr)) {
+                       ptr = strndupa(ptr, len);
+                       scratch_rewind(lp, len);
+               }
+               scratch_write_escaped(tag, lp, ptr, len);
+
+       } else {
+               match_assign0(tag, lp, ptr, len);
+       }
+}
+
+
+
 static char *strnchr (const char *s, int len, int c) {
        const char *end = s + len;
        while (s < end) {
                if (*s == c)
+                       return (char *)s;
+               s++;
+       }
+
+       return NULL;
+}
+
+
+/**
+ * Looks for any matching character from 'match' in 's' and returns
+ * a pointer to the first match, or NULL if none of 'match' matched 's'.
+ */
+static char *strnchrs (const char *s, int len, const char *match) {
+       const char *end = s + len;
+       char map[256] = {};
+       while (*match)
+               map[(int)*(match++)] = 1;
+       
+       while (s < end) {
+               if (map[(int)*s])
                        return (char *)s;
                s++;
        }
@@ -393,7 +538,7 @@
        if (slen == deflen && !strncmp(s, "default", slen))
                column_get(2, ' ', ptr, len, &s, &slen);
 
-       MATCH_ASSIGN(tag, lp, s, slen);
+       match_assign(tag, lp, s, slen);
 
        return 0;
 }      
@@ -406,7 +551,7 @@
        if ((qs = strnchr(ptr, len, '?')))
                slen = (int)(qs - ptr);
 
-       MATCH_ASSIGN(tag, lp, ptr, slen);
+       match_assign(tag, lp, ptr, slen);
        return slen;
 }
 
@@ -420,7 +565,7 @@
 
        slen = len - (int)(qs - ptr);
 
-       MATCH_ASSIGN(tag, lp, qs, slen);
+       match_assign(tag, lp, qs, slen);
        return slen;
 }
 
@@ -447,7 +592,7 @@
 
        tlen = strftime(dst, timelen, timefmt, &tm);
 
-       MATCH_ASSIGN(tag, lp, dst, tlen);
+       match_assign(tag, lp, dst, tlen);
 
        return tlen;
 }
@@ -486,12 +631,12 @@
 static int parse_hitmiss (const struct tag *tag, struct logline *lp,
                          const char *ptr, int len) {
        if (len == 3 && !strncmp(ptr, "hit", 3)) {
-               MATCH_ASSIGN(tag, lp, ptr, len);
+               match_assign(tag, lp, ptr, len);
                return len;
        } else if (len == 4 &&
                 (!strncmp(ptr, "miss", 4) ||
                  !strncmp(ptr, "pass", 4))) {
-               MATCH_ASSIGN(tag, lp, "miss", 4);
+               match_assign(tag, lp, "miss", 4);
                return 4;
        }
 
@@ -503,7 +648,7 @@
        if ((len == 3 && !strncmp(ptr, "hit", 3)) ||
            (len == 4 && (!strncmp(ptr, "miss", 4) ||
                          !strncmp(ptr, "pass", 4)))) {
-               MATCH_ASSIGN(tag, lp, ptr, len);
+               match_assign(tag, lp, ptr, len);
                return len;
        }
        return 0;
@@ -718,6 +863,7 @@
                int deflen = -1;
                int fmtid;
                int i;
+               int flags = 0;
 
                if (*s != '%') {
                        s++;
@@ -730,7 +876,7 @@
                        if (format_add(0,
                                       NULL, 0,
                                       t, (int)(s - t),
-                                      errstr, errstr_size) == -1)
+                                      0, errstr, errstr_size) == -1)
                                return -1;
 
                begin = s;
@@ -738,11 +884,23 @@
 
                /* Parse '{VAR}X': '*s' will be set to X, and 'var' to VAR.
                 * varnishkafka also adds the following features:
-                *  VAR?DEF,   where DEF is a default value, in this mode
+                *
+                *  VAR?DEF    where DEF is a default value, in this mode
                 *             VAR can be empty, and {?DEF} may be applied to
                 *             any formatter.
                 *             I.e.: %{Content-type?text/html}o
                 *                   %{?no-user}u
+                *
+                *  VAR!OPTION Various formatting options, see below.
+                *
+                * Where OPTION is one of:
+                *  escape     Escape rogue characters in the value.
+                *             VAR can be empty and {!escape} may be applied to
+                *             any formatter.
+                *             I.e. %{User-Agent!escape}i
+                *                  %{?nouser!escape}u
+                *
+                * ?DEF and !OPTIONs can be combined.
                 */
                if (*s == '{') {
                        const char *a = s+1;
@@ -773,13 +931,53 @@
 
                        var = a;
 
-                       if ((q = strnchr(a, (int)(b-a), '?'))) {
-                               /* "VAR?DEF" */
-                               def = q+1;
-                               deflen = (int)(b - def);
+                       /* Check for ?DEF and !OPTIONs */
+                       if ((q = strnchrs(a, (int)(b-a), "?!"))) {
+                               const char *q2 = q;
+
                                varlen = (int)(q - a);
                                if (varlen == 0)
                                        var = NULL;
+
+                               /* Scan all ?DEF and !OPTIONs */
+                               do {
+                                       int qlen;
+
+                                       q++;
+
+                                       if ((q2 = strnchrs(q, (int)(b-q2-1),
+                                                          "?!")))
+                                               qlen = (int)(q2-q);
+                                       else
+                                               qlen = (int)(b-q);
+
+                                       switch (*(q-1))
+                                       {
+                                       case '?':
+                                               def = q;
+                                               deflen = qlen;
+                                               break;
+                                       case '!':
+                                               if (!strncasecmp(q, "escape",
+                                                                qlen))
+                                                       flags |= FMT_F_ESCAPE;
+                                               else {
+                                                       snprintf(errstr,
+                                                                errstr_size,
+                                                                "Unknown "
+                                                                "formatter "
+                                                                "option "
+                                                                "\"%.*s\" at "
+                                                                "\"%.*s...\"",
+                                                                qlen, q,
+                                                                30, a);
+                                                       return -1;
+                                               }
+                                               break;
+                                       }
+
+                               } while ((q = q2));
+
                        } else
                                varlen = (int)(b-a);                    
 
@@ -797,7 +995,7 @@
                        def = map[(int)*s].def;
 
                /* Add formatter to ordered list of formatters */
-               if ((fmtid = format_add(*s, var, varlen, def, deflen,
+               if ((fmtid = format_add(*s, var, varlen, def, deflen, flags,
                                        errstr, errstr_size)) == -1)
                        return -1;
 
@@ -841,7 +1039,7 @@
         *      ^---^  add this part as verbatim string */
        if (s > t)
                if (format_add(0, NULL, 0,
-                              t, (int)(s - t),
+                              t, (int)(s - t), 0,
                               errstr, errstr_size) == -1)
                        return -1;
 
@@ -1069,8 +1267,9 @@
                        
                } else {
                        /* Fallback to verbatim field. */
-                       MATCH_ASSIGN(tag, lp, ptr2, len2);
+                       match_assign(tag, lp, ptr2, len2);
                }
+
        }
 
        /* Request end: render the match string. */
diff --git a/varnishkafka.conf.example b/varnishkafka.conf.example
index 81b6717..ca85998 100644
--- a/varnishkafka.conf.example
+++ b/varnishkafka.conf.example
@@ -1,26 +1,80 @@
-######################################################################
-#                                                                   #
-#              varnishkafka configuration file                      #
-#                                                                   #
-#                                                                   #
-######################################################################
-#                                                                   #
-# Format:                                                           #
-# <property-name> = <value>                                         #
-#                                                                   #
-# boolean properties:                                               #
-#   >0, "true", "yes", "on" - interpreted as true                   #
-#  everything else          - interpreted as false                  #
-#                                                                   #
-######################################################################
+#######################################################################
+#                                                                     #
+#                varnishkafka configuration file                      #
+#                                                                     #
+#                                                                     #
+#######################################################################
+#                                                                     #
+# Syntax:                                                             #
+# <property-name> = <value>                                           #
+#                                                                     #
+# Boolean property values:                                            #
+#   >0, "true", "yes", "on" - interpreted as true                     #
+#  everything else          - interpreted as false                    #
+#                                                                     #
+#######################################################################
+                                                                      #
+                                                                      #
+                                                                      #
+#######################################################################
+#                                                                     #
+# Varnish log formatting                                              #
+#                                                                     #
+# One of:                                                             #
+#  format.string   - ASCII string output (format string)              #
+#  format.json     - JSON output (n:v, n2:v2, format string)          #
+#  format.avro     - AVRO output (schema file)                        #
+#  format.protobuf - Google Protocol Buffer output (true)             #
+#  format.nlv      - Binary Name Length Value (field specifiers)      #
+#                                                                     #
+#                                                                     #
+#                                                                     #
+#                                                                     #
+#  %X                                                                 #
+#   where 'X' is one of the standard varnishncsa(1) formatters.       #
+#   Example: %u                                                       #
+#                                                                     #
+#                                                                     #
+#  %{VAR}X                                                            #
+#    Name-Value tokens where X is 'x', 'i' or 'o' and 'VAR' is the    #
+#    Name to extract the value for.                                   #
+#    Example: %{User-Agent}i                                          #
+#                                                                     #
+#                                                                     #
+#  %{?DEFAULT!OPTION!OPTION..}X                                       #
+#    where 'X' is any formatter,                                      #
+#                                                                     #
+#    'DEFAULT' is the default string to use if no tag was matched,    #
+#     the default default string is "-".                              #
+#                                                                     #
+#    'OPTION' is one of the formatting options:                       #
+#        escape - escape non-printable characters to \<octalcode>     #
+#                 and \t\n\r\v\f " to their canonical                 #
+#                 backslashed notations (\t\n\r\v\f\"\ ).             #
+#                                                                     #
+#    This syntax can be combined with %{VAR}X.                        #
+#    Example: %{User-Agent?Mozilla!escape}i                           #
+#             %{?nouser}u                                             #
+#             %{!escape}q                                             #
+#                                                                     #
+#                                                                     #
+#  Non %-prefixed strings are copied verbatim to the                  #
+#  output log string.                                                 #
+#    Example: "User: %u;"   would render "User: snaps;"               #
+#                                                                     #
+#                                                                     #
+#######################################################################
+format.string = %l        %n        %t        %{Varnish:time_firstbyte}x        %h        %{Varnish:handling}x/%s        %b        %m        http://%{Host}i%U%q        -        %{Content-Type}o        %{Referer}i        %{X-Forwarded-For}i        %{User-agent!escape}i        %{Accept-Language}i        %{X-Analytics}o
+
+
 
 # Where to output varnish log lines:
 #  kafka  - (default) send to kafka broker
 #  stdout - just print to stdout (behave like varnishncsa)
 output = kafka
+output = stdout
 
-# Varnish log formatting
-format = %l    %n      %t      %{Varnish:time_firstbyte}x      %h      %{Varnish:handling}x/%s %b      %m      http://%{Host}i%U%q     -       %{Content-Type}o        %{Referer}i     %{X-Forwarded-For}i     %{User-agent}i  %{Accept-Language}i     %{X-Analytics}o
+
 
 # Start for sequence number (%n)
 # Either a number, or the string "time" which will set it to the current
@@ -38,7 +92,7 @@
 #
 
 # varnishkafka log level (1 = emergencies .. 7 = debug)
-log.level = 6
+log.level = 7
 
 # specify log output
 log.stderr = true
@@ -55,16 +109,16 @@
 #vaktl.path = /var/run/varnishkafka.vaktl
 
 
-######################################################################
-#                                                                   #
-# Standard varnish VSL command line arguments                       #
-#                                                                   #
-# Format:                                                            #
-#  varnish.arg.<c> = <value>, where <c> is a command line option.    #
-#                                                                   #
-# See varnishncsa(1) and varnishlog(1) for valid options.           #
-#                                                                   #
-######################################################################
+#######################################################################
+#                                                                     #
+# Standard varnish VSL command line arguments                         #
+#                                                                     #
+# Syntax:                                                             #
+#  varnish.arg.<c> = <value>, where <c> is a command line option.     #
+#                                                                     #
+# See varnishncsa(1) and varnishlog(1) for valid options.             #
+#                                                                     #
+#######################################################################
 
 # -m tag:regex
 varnish.arg.m = RxRequest:^(?!PURGE$)
@@ -78,18 +132,18 @@
 #varnish.arg.n = frontend
 
 
-######################################################################
-#                                                                   #
-# Kafka configuration                                                #
-#                                                                   #
-# For the full range of Kafka handle and topic configuration         #
-# properties, see:                                                  #
-#  https://github.com/edenhill/librdkafka/blob/0.8-wip/rdkafka.h     #
-#                                                                   #
-# And the Apache Kafka configuration reference:                             #
-#  http://kafka.apache.org/08/configuration.html                     #
-#                                                                   #
-######################################################################
+#######################################################################
+#                                                                     #
+# Kafka configuration                                                 #
+#                                                                     #
+# For the full range of Kafka handle and topic configuration          #
+# properties, see:                                                    #
+#  https://github.com/edenhill/librdkafka/blob/0.8-wip/rdkafka.h      #
+#                                                                     #
+# And the Apache Kafka configuration reference:                       #
+#  http://kafka.apache.org/08/configuration.html                      #
+#                                                                     #
+#######################################################################
 
 # Initial list of kafka brokers
 metadata.broker.list = localhost:9091
diff --git a/varnishkafka.h b/varnishkafka.h
index 82b4c8d..f0ab0fc 100644
--- a/varnishkafka.h
+++ b/varnishkafka.h
@@ -93,6 +93,8 @@
        const char *var;  /* variable name  (for %{..}x,i,o) */
        const char *def;  /* default string, typically "-" */
        int   deflen;
+       int   flags;
+#define FMT_F_ESCAPE    0x1 /* Escape the value string */
 };
 
 

-- 
To view, visit https://gerrit.wikimedia.org/r/79745
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7888bea7fde6d6d70826c73c83cda3d0d3e1e59c
Gerrit-PatchSet: 1
Gerrit-Project: operations/software/varnish/varnishkafka
Gerrit-Branch: master
Gerrit-Owner: Edenhill <mag...@edenhill.se>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to