Edenhill has uploaded a new change for review. https://gerrit.wikimedia.org/r/79745
Change subject: Added support for escaping troublesome characters in tag content. ...................................................................... Added support for escaping troublesome characters in tag content. Change-Id: I7888bea7fde6d6d70826c73c83cda3d0d3e1e59c --- M varnishkafka.c M varnishkafka.conf.example M varnishkafka.h 3 files changed, 328 insertions(+), 73 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/software/varnish/varnishkafka refs/changes/45/79745/1 diff --git a/varnishkafka.c b/varnishkafka.c index 308ddeb..d166bc0 100644 --- a/varnishkafka.c +++ b/varnishkafka.c @@ -114,12 +114,13 @@ _DBG("%i/%i formats:", conf.fmt_cnt, conf.fmt_size); for (i = 0 ; i < conf.fmt_cnt ; i++) { - _DBG(" #%-3i fmt %i (%c) var \"%s\", def \"%.*s\"", + _DBG(" #%-3i fmt %i (%c) var \"%s\", def (%i)\"%.*s\"%s", i, conf.fmt[i].id, - isprint(conf.fmt[i].id) ? (char)conf.fmt[i].id : 0, + isprint(conf.fmt[i].id) ? (char)conf.fmt[i].id : ' ', conf.fmt[i].var ? : "", - conf.fmt[i].deflen, conf.fmt[i].def); + conf.fmt[i].deflen, conf.fmt[i].deflen, conf.fmt[i].def, + conf.fmt[i].flags & FMT_F_ESCAPE ? ", escape" : ""); } } @@ -157,6 +158,7 @@ static int format_add (int fmtr, const char *var, ssize_t varlen, const char *def, ssize_t deflen, + int flags, char *errstr, size_t errstr_size) { struct fmt *fmt; @@ -167,9 +169,9 @@ fmt = &conf.fmt[conf.fmt_cnt]; - fmt->id = fmtr; - fmt->idx = conf.fmt_cnt; - + fmt->id = fmtr; + fmt->idx = conf.fmt_cnt; + fmt->flags = flags; if (var) { if (varlen == -1) varlen = strlen(var); @@ -237,17 +239,29 @@ } + +static inline void match_assign0 (const struct tag *tag, struct logline *lp, + const char *ptr, int len); +static void match_assign (const struct tag *tag, struct logline *lp, + const char *ptr, int len); + + + + /** - * Assign 'PTR' of size 'LEN' as a match for 'TAG' in logline 'LP'. - * - * 'PTR' must be a pointer to persistent memory: - * - either VSL shared memory (original VSL tag payload) - * - or to a buffer allocated in the 'LP' scratch buffer. + * Returns true if 'ptr' is within 'lp's scratch pad, else false. */ -#define MATCH_ASSIGN(TAG,LP,PTR,LEN) do { \ - (LP)->match[(TAG)->fmt->idx].ptr = (PTR); \ - (LP)->match[(TAG)->fmt->idx].len = (LEN); \ - } while (0) +static inline int is_scratch_ptr (const struct logline *lp, const char *ptr) { + return (lp->scratch <= ptr && ptr < lp->scratch + sizeof(lp->scratch)); +} + +/** + * Rewinds (deallocates) the last allocation by 'len' bytes. + */ +static inline void scratch_rewind (struct logline *lp, int len) { + assert(lp->sof >= len); + lp->sof -= len; +} /** @@ -284,9 +298,86 @@ memcpy(dst, src, len); - MATCH_ASSIGN(tag, lp, dst, len); + match_assign(tag, lp, dst, len); return len; +} + + +/** + * Writes 'src' of 'len' bytes to scratch buffer, escaping + * all unprintable characters as well as the ones defined in 'map' below. + * Returns -1 on error. + */ +static inline int scratch_write_escaped (const struct tag *tag, + struct logline *lp, + const char *src, int len) { + static const char *map[256] = { + ['\t'] = "\\t", + ['\n'] = "\\n", + ['\r'] = "\\r", + ['\v'] = "\\v", + ['\f'] = "\\f", + ['"'] = "\\\"", + [' '] = "\\ ", + }; + char *dst; + char *dstend; + char *d; + const char *s, *srcend = src + len; + + /* Allocate initial space for escaped string. */ + if (unlikely((dst = scratch_alloc(tag, lp, len + 10)) == NULL)) + return -1; + + dstend = dst + len + 10; + + s = src; + d = dst; + while (s < srcend) { + int outlen = 1; + const char *out; + char tmp[6]; + + if (unlikely((out = map[(int)*s]) != NULL)) { + /* Escape from 'map' */ + outlen = 2; + + } else if (unlikely(!isprint(*s))) { + /* Escape non-printables as \<octal> */ + sprintf(tmp, "\%04o", (int)*s); + out = tmp; + outlen = 5; + + } else { + /* No escaping */ + out = s; + } + + /* Increase scratch pad if necessary. */ + if (unlikely((d + outlen >= dstend))) { + if (unlikely(!scratch_alloc(tag, lp, outlen + 20))) + return -1; + dstend += outlen + 20; + } + + if (likely(outlen == 1)) + *(d++) = *out; + else { + memcpy(d, out, outlen); + d += outlen; + } + + s++; + } + + /* Rewind scratch pad to reclaim unused memory. */ + scratch_rewind(lp, (int)(dstend-d)); + + /* Assign new matched string */ + match_assign0(tag, lp, dst, (int)(d-dst)); + + return 0; } /** @@ -311,7 +402,7 @@ vsnprintf(dst, r+1, fmt, ap); va_end(ap); - MATCH_ASSIGN(tag, lp, dst, r); + match_assign(tag, lp, dst, r); return r; } @@ -322,10 +413,64 @@ + +static inline void match_assign0 (const struct tag *tag, struct logline *lp, + const char *ptr, int len) { + lp->match[tag->fmt->idx].ptr = ptr; + lp->match[tag->fmt->idx].len = len; +} + + +/** + * Assign 'PTR' of size 'LEN' as a match for 'TAG' in logline 'LP'. + * + * 'PTR' must be a pointer to persistent memory: + * - either VSL shared memory (original VSL tag payload) + * - or to a buffer allocated in the 'LP' scratch buffer. + */ +static void match_assign (const struct tag *tag, struct logline *lp, + const char *ptr, int len) { + + if (unlikely(tag->fmt->flags & FMT_F_ESCAPE)) { + /* If 'ptr' is in the scratch pad; rewind the scratch pad + * since we'll be re-writing the string escaped. */ + if (is_scratch_ptr(lp, ptr)) { + ptr = strndupa(ptr, len); + scratch_rewind(lp, len); + } + scratch_write_escaped(tag, lp, ptr, len); + + } else { + match_assign0(tag, lp, ptr, len); + } +} + + + static char *strnchr (const char *s, int len, int c) { const char *end = s + len; while (s < end) { if (*s == c) + return (char *)s; + s++; + } + + return NULL; +} + + +/** + * Looks for any matching character from 'match' in 's' and returns + * a pointer to the first match, or NULL if none of 'match' matched 's'. + */ +static char *strnchrs (const char *s, int len, const char *match) { + const char *end = s + len; + char map[256] = {}; + while (*match) + map[(int)*(match++)] = 1; + + while (s < end) { + if (map[(int)*s]) return (char *)s; s++; } @@ -393,7 +538,7 @@ if (slen == deflen && !strncmp(s, "default", slen)) column_get(2, ' ', ptr, len, &s, &slen); - MATCH_ASSIGN(tag, lp, s, slen); + match_assign(tag, lp, s, slen); return 0; } @@ -406,7 +551,7 @@ if ((qs = strnchr(ptr, len, '?'))) slen = (int)(qs - ptr); - MATCH_ASSIGN(tag, lp, ptr, slen); + match_assign(tag, lp, ptr, slen); return slen; } @@ -420,7 +565,7 @@ slen = len - (int)(qs - ptr); - MATCH_ASSIGN(tag, lp, qs, slen); + match_assign(tag, lp, qs, slen); return slen; } @@ -447,7 +592,7 @@ tlen = strftime(dst, timelen, timefmt, &tm); - MATCH_ASSIGN(tag, lp, dst, tlen); + match_assign(tag, lp, dst, tlen); return tlen; } @@ -486,12 +631,12 @@ static int parse_hitmiss (const struct tag *tag, struct logline *lp, const char *ptr, int len) { if (len == 3 && !strncmp(ptr, "hit", 3)) { - MATCH_ASSIGN(tag, lp, ptr, len); + match_assign(tag, lp, ptr, len); return len; } else if (len == 4 && (!strncmp(ptr, "miss", 4) || !strncmp(ptr, "pass", 4))) { - MATCH_ASSIGN(tag, lp, "miss", 4); + match_assign(tag, lp, "miss", 4); return 4; } @@ -503,7 +648,7 @@ if ((len == 3 && !strncmp(ptr, "hit", 3)) || (len == 4 && (!strncmp(ptr, "miss", 4) || !strncmp(ptr, "pass", 4)))) { - MATCH_ASSIGN(tag, lp, ptr, len); + match_assign(tag, lp, ptr, len); return len; } return 0; @@ -718,6 +863,7 @@ int deflen = -1; int fmtid; int i; + int flags = 0; if (*s != '%') { s++; @@ -730,7 +876,7 @@ if (format_add(0, NULL, 0, t, (int)(s - t), - errstr, errstr_size) == -1) + 0, errstr, errstr_size) == -1) return -1; begin = s; @@ -738,11 +884,23 @@ /* Parse '{VAR}X': '*s' will be set to X, and 'var' to VAR. * varnishkafka also adds the following features: - * VAR?DEF, where DEF is a default value, in this mode + * + * VAR?DEF where DEF is a default value, in this mode * VAR can be empty, and {?DEF} may be applied to * any formatter. * I.e.: %{Content-type?text/html}o * %{?no-user}u + * + * VAR!OPTION Various formatting options, see below. + * + * Where OPTION is one of: + * escape Escape rogue characters in the value. + * VAR can be empty and {!escape} may be applied to + * any formatter. + * I.e. %{User-Agent!escape}i + * %{?nouser!escape}u + * + * ?DEF and !OPTIONs can be combined. */ if (*s == '{') { const char *a = s+1; @@ -773,13 +931,53 @@ var = a; - if ((q = strnchr(a, (int)(b-a), '?'))) { - /* "VAR?DEF" */ - def = q+1; - deflen = (int)(b - def); + /* Check for ?DEF and !OPTIONs */ + if ((q = strnchrs(a, (int)(b-a), "?!"))) { + const char *q2 = q; + varlen = (int)(q - a); if (varlen == 0) var = NULL; + + /* Scan all ?DEF and !OPTIONs */ + do { + int qlen; + + q++; + + if ((q2 = strnchrs(q, (int)(b-q2-1), + "?!"))) + qlen = (int)(q2-q); + else + qlen = (int)(b-q); + + switch (*(q-1)) + { + case '?': + def = q; + deflen = qlen; + break; + case '!': + if (!strncasecmp(q, "escape", + qlen)) + flags |= FMT_F_ESCAPE; + else { + snprintf(errstr, + errstr_size, + "Unknown " + "formatter " + "option " + "\"%.*s\" at " + "\"%.*s...\"", + qlen, q, + 30, a); + return -1; + } + break; + } + + } while ((q = q2)); + } else varlen = (int)(b-a); @@ -797,7 +995,7 @@ def = map[(int)*s].def; /* Add formatter to ordered list of formatters */ - if ((fmtid = format_add(*s, var, varlen, def, deflen, + if ((fmtid = format_add(*s, var, varlen, def, deflen, flags, errstr, errstr_size)) == -1) return -1; @@ -841,7 +1039,7 @@ * ^---^ add this part as verbatim string */ if (s > t) if (format_add(0, NULL, 0, - t, (int)(s - t), + t, (int)(s - t), 0, errstr, errstr_size) == -1) return -1; @@ -1069,8 +1267,9 @@ } else { /* Fallback to verbatim field. */ - MATCH_ASSIGN(tag, lp, ptr2, len2); + match_assign(tag, lp, ptr2, len2); } + } /* Request end: render the match string. */ diff --git a/varnishkafka.conf.example b/varnishkafka.conf.example index 81b6717..ca85998 100644 --- a/varnishkafka.conf.example +++ b/varnishkafka.conf.example @@ -1,26 +1,80 @@ -###################################################################### -# # -# varnishkafka configuration file # -# # -# # -###################################################################### -# # -# Format: # -# <property-name> = <value> # -# # -# boolean properties: # -# >0, "true", "yes", "on" - interpreted as true # -# everything else - interpreted as false # -# # -###################################################################### +####################################################################### +# # +# varnishkafka configuration file # +# # +# # +####################################################################### +# # +# Syntax: # +# <property-name> = <value> # +# # +# Boolean property values: # +# >0, "true", "yes", "on" - interpreted as true # +# everything else - interpreted as false # +# # +####################################################################### + # + # + # +####################################################################### +# # +# Varnish log formatting # +# # +# One of: # +# format.string - ASCII string output (format string) # +# format.json - JSON output (n:v, n2:v2, format string) # +# format.avro - AVRO output (schema file) # +# format.protobuf - Google Protocol Buffer output (true) # +# format.nlv - Binary Name Length Value (field specifiers) # +# # +# # +# # +# # +# %X # +# where 'X' is one of the standard varnishncsa(1) formatters. # +# Example: %u # +# # +# # +# %{VAR}X # +# Name-Value tokens where X is 'x', 'i' or 'o' and 'VAR' is the # +# Name to extract the value for. # +# Example: %{User-Agent}i # +# # +# # +# %{?DEFAULT!OPTION!OPTION..}X # +# where 'X' is any formatter, # +# # +# 'DEFAULT' is the default string to use if no tag was matched, # +# the default default string is "-". # +# # +# 'OPTION' is one of the formatting options: # +# escape - escape non-printable characters to \<octalcode> # +# and \t\n\r\v\f " to their canonical # +# backslashed notations (\t\n\r\v\f\"\ ). # +# # +# This syntax can be combined with %{VAR}X. # +# Example: %{User-Agent?Mozilla!escape}i # +# %{?nouser}u # +# %{!escape}q # +# # +# # +# Non %-prefixed strings are copied verbatim to the # +# output log string. # +# Example: "User: %u;" would render "User: snaps;" # +# # +# # +####################################################################### +format.string = %l %n %t %{Varnish:time_firstbyte}x %h %{Varnish:handling}x/%s %b %m http://%{Host}i%U%q - %{Content-Type}o %{Referer}i %{X-Forwarded-For}i %{User-agent!escape}i %{Accept-Language}i %{X-Analytics}o + + # Where to output varnish log lines: # kafka - (default) send to kafka broker # stdout - just print to stdout (behave like varnishncsa) output = kafka +output = stdout -# Varnish log formatting -format = %l %n %t %{Varnish:time_firstbyte}x %h %{Varnish:handling}x/%s %b %m http://%{Host}i%U%q - %{Content-Type}o %{Referer}i %{X-Forwarded-For}i %{User-agent}i %{Accept-Language}i %{X-Analytics}o + # Start for sequence number (%n) # Either a number, or the string "time" which will set it to the current @@ -38,7 +92,7 @@ # # varnishkafka log level (1 = emergencies .. 7 = debug) -log.level = 6 +log.level = 7 # specify log output log.stderr = true @@ -55,16 +109,16 @@ #vaktl.path = /var/run/varnishkafka.vaktl -###################################################################### -# # -# Standard varnish VSL command line arguments # -# # -# Format: # -# varnish.arg.<c> = <value>, where <c> is a command line option. # -# # -# See varnishncsa(1) and varnishlog(1) for valid options. # -# # -###################################################################### +####################################################################### +# # +# Standard varnish VSL command line arguments # +# # +# Syntax: # +# varnish.arg.<c> = <value>, where <c> is a command line option. # +# # +# See varnishncsa(1) and varnishlog(1) for valid options. # +# # +####################################################################### # -m tag:regex varnish.arg.m = RxRequest:^(?!PURGE$) @@ -78,18 +132,18 @@ #varnish.arg.n = frontend -###################################################################### -# # -# Kafka configuration # -# # -# For the full range of Kafka handle and topic configuration # -# properties, see: # -# https://github.com/edenhill/librdkafka/blob/0.8-wip/rdkafka.h # -# # -# And the Apache Kafka configuration reference: # -# http://kafka.apache.org/08/configuration.html # -# # -###################################################################### +####################################################################### +# # +# Kafka configuration # +# # +# For the full range of Kafka handle and topic configuration # +# properties, see: # +# https://github.com/edenhill/librdkafka/blob/0.8-wip/rdkafka.h # +# # +# And the Apache Kafka configuration reference: # +# http://kafka.apache.org/08/configuration.html # +# # +####################################################################### # Initial list of kafka brokers metadata.broker.list = localhost:9091 diff --git a/varnishkafka.h b/varnishkafka.h index 82b4c8d..f0ab0fc 100644 --- a/varnishkafka.h +++ b/varnishkafka.h @@ -93,6 +93,8 @@ const char *var; /* variable name (for %{..}x,i,o) */ const char *def; /* default string, typically "-" */ int deflen; + int flags; +#define FMT_F_ESCAPE 0x1 /* Escape the value string */ }; -- To view, visit https://gerrit.wikimedia.org/r/79745 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7888bea7fde6d6d70826c73c83cda3d0d3e1e59c Gerrit-PatchSet: 1 Gerrit-Project: operations/software/varnish/varnishkafka Gerrit-Branch: master Gerrit-Owner: Edenhill <mag...@edenhill.se> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits