BBlack has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391216 )
Change subject: [WIP] normalize_path: fully normalize MW+RB URL paths
......................................................................

[WIP] normalize_path: fully normalize MW+RB URL paths

Haven't even compiled this yet, but I think it's right in theory

Bug: T127387
Change-Id: I5887c54a9295e6a344911621b2963c9df9ad4e24
---
M modules/varnish/templates/normalize_path.inc.vcl.erb
1 file changed, 158 insertions(+), 78 deletions(-)

git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/16/391216/1

diff --git a/modules/varnish/templates/normalize_path.inc.vcl.erb b/modules/varnish/templates/normalize_path.inc.vcl.erb
index 98e4242..1c5895d 100644
--- a/modules/varnish/templates/normalize_path.inc.vcl.erb
+++ b/modules/varnish/templates/normalize_path.inc.vcl.erb
@@ -1,92 +1,172 @@
 C{
+
+/*******************************************************************************
+ * URL-Path (not query!) normalization:
+ *
+ * For MediaWiki's purposes, the 256 possible byte values can be divided into
+ * two sets named Always-Decode and Always-Encode, which means every path has
+ * exactly one canonical encoding:
+ *
+ * Always-Decode:
+ *   Unreserved Set (RFC 3986): 0-9 A-Z a-z - . _ ~
+ *   MW-specific from wfUrlencode: ! $ ( ) * , : ; / @
+ * Always-Encode:
+ *   Unprintables and space: 0x00-0x20 0x7F-0xFF
+ *   MediaWiki disallowed title chars: # < > [ ] | { }
+ *   Observed canonical encodes: " % & ' + = \ ^ ` ?
+ *
+ * Additional notes:
+ * -----------------
+ * Canonical form for percent-encoding hex digits is uppercase.
+ * "Observed canonical encodes" - tested live WP titles containing these
+ * characters; MW's rel=canonical uses the percent-encoded form for them.
+ * Won't ever actually get encoded:
+ *   Space ( ) - can't be transmitted in an HTTP request line anyway
+ *   Question mark (?) - starts the query part, used to delimit the path below
+ *   Literal percent (%) - only encoded when not followed by two hex digits
+ *
+ * RestBase:
+ * ---------
+ * Believed to use the MW encoding rules above, but with a special exception
+ * for the forward-slash: we can neither encode nor decode either form of the
+ * forward-slash for RB; each must be preserved as-is. This is because RB
+ * needs forward-slashes from MediaWiki titles to stay in %2F form, but still
+ * needs its own functional path-delimiting slashes unencoded.
+ ******************************************************************************/
+
+#include <assert.h>
+#include <inttypes.h>
 #include <string.h>

-/* DIY hexadecimal conversion, since it is simple enough for a fixed
- * width, and all the relevant standard C library functions promise to
- * malfunction if the locale is set to anything other than "C"
- */
-#define NP_HEX_DIGIT(c) ( \
-	(c) >= '0' && (c) <= '9' ? (c) - '0' : ( \
-	(c) >= 'A' && (c) <= 'F' ? (c) - 'A' + 0x0a : ( \
-	(c) >= 'a' && (c) <= 'f' ? (c) - 'a' + 0x0a : -1 ) ) )
-#define NP_IS_HEX(c) (NP_HEX_DIGIT(c) != -1)
-#define NP_HEXCHAR(c1, c2) (char)( (NP_HEX_DIGIT(c1) << 4) | NP_HEX_DIGIT(c2) )
+static const uintptr_t decoder_ring[256] = {
+    // 0x00-0x1F (all unprintable)
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    //  ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+    0,1,0,0,1,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
+    //@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,
+    //` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ <DEL>
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,
+    // 0x80-0xFF (all unprintable)
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};

-void raw_normalize_path(const struct vrt_ctx *ctx, const int doslash) {
-	/* Rewrite the path part of the URL, replacing unnecessarily escaped
-	 * punctuation with the actual characters. The character list is from
-	 * MediaWiki's wfUrlencode(), so the URLs produced here will be the same as
-	 * the ones produced by MediaWiki in href attributes. Doing this reduces
-	 * cache fragmentation and fixes T29935, i.e. stale cache entries due to
-	 * MediaWiki purging only the wfUrlencode'd version of the URL.
-	 */
-	const char * url = VRT_r_req_url(ctx);
-	size_t i, outPos;
-	const size_t urlLength = strlen(url);
-	// index for the last position %XX can start at:
-	const size_t lastConvertIdx = urlLength > 2 ? urlLength - 3 : 0;
-	char c;
-	int dirty = 0;
-
-	/* Allocate destination memory from the stack using the C99
-	 * variable-length automatic feature. We know the length in advance
-	 * because this function can only shorten the input string.
-	 */
-	char destBuffer[urlLength + 1];
-	if (url) {
-		for (i = 0, outPos = 0; i < urlLength; i++) {
-			if (i <= lastConvertIdx && url[i] == '%' && NP_IS_HEX(url[i+1]) && NP_IS_HEX(url[i+2])) {
-				c = NP_HEXCHAR(url[i+1], url[i+2]);
-				if (c == ';'
-				 || c == '@'
-				 || c == '$'
-				 || c == '!'
-				 || c == '*'
-				 || c == '('
-				 || c == ')'
-				 || c == ','
-				 || c == ':'
-				 || (doslash && c == '/'))
-				{
-					destBuffer[outPos++] = c;
-					dirty = 1;
-					i += 2;
-				} else {
-					destBuffer[outPos++] = url[i];
-				}
-			} else if (url[i] == '?') {
-				/* Reached the query part. Just copy the rest of the URL
-				 * to the destination.
-				 */
-				memcpy(destBuffer + outPos, url + i, sizeof(char) * (urlLength - i));
-				outPos += urlLength - i;
-				i = urlLength;
-			} else {
-				destBuffer[outPos++] = url[i];
-			}
-		}
-		destBuffer[outPos] = '\0';
-
-		/* Set req.url. This will copy our stack buffer into the workspace.
-		 * VRT_l_req_url() is varadic, and concatenates its arguments. The
-		 * vrt_magic_string_end marks the end of the list.
-		 */
-		if (dirty) {
-			VRT_l_req_url(ctx, destBuffer, vrt_magic_string_end);
-		}
-	}
-}
+static const uintptr_t hex_finder[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    //  ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ...
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+    //@ A B C D E F G ...
+    0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    //` a b c d e f g ...
+    0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};

-#undef NP_IS_HEX
-#undef NP_HEX_DIGIT
-#undef NP_HEXCHAR
+static const uintptr_t hex_dec[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0,
+    0,10,11,12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,10,11,12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+static const char hex_enc[16] = {
+    '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'
+};
+
+#define DO_DECODE(_c) (decoder_ring[(uint8_t)(_c)])
+#define IS_HEX(_c) (hex_finder[(uint8_t)(_c)])
+#define HEX_DECODE(_c1,_c2) \
+    (hex_dec[(uint8_t)(_c1)] << 4 | hex_dec[(uint8_t)(_c2)])
+#define HEX_ENC_TOP(_c) (hex_enc[((uint8_t)(_c)) >> 4])
+#define HEX_ENC_BOTTOM(_c) (hex_enc[((uint8_t)(_c)) & 0xF])
+
+void raw_normalize_path(const struct vrt_ctx *ctx, const int doslash) {
+    const char* vrt_url = VRT_r_req_url(ctx);
+    assert(vrt_url);
+    const size_t in_url_len = strlen(vrt_url);
+
+    // Copy input URL to new array with double-NUL termination (this allows
+    // us to not worry about running off the end when checking for %XX hex
+    // encodings):
+    char in_url[in_url_len + 2];
+    memcpy(in_url, vrt_url, in_url_len + 1);
+    in_url[in_url_len + 1] = '\0';
+
+    // worst-case is every input character requires encoding:
+    char out_url[(in_url_len * 3) + 1];
+
+    const char* in_p = in_url;
+    const char* in_end = in_url + in_url_len; // points at the first NUL
+    char* out_p = out_url;
+    unsigned dirty = 0;
+
+    while (1) {
+        const char c = *in_p++;
+        if (c == '\0') {
+            *out_p = '\0';
+            break;
+        }
+        else if (c == '?') {
+            // Reached the query part: copy it verbatim, including the NUL
+            if (dirty) {
+                *out_p++ = '?';
+                memcpy(out_p, in_p, in_end - in_p + 1);
+            }
+            break;
+        }
+        else if (c == '%' && IS_HEX(*in_p) && IS_HEX(*(in_p + 1))) {
+            const char x1 = *in_p++;
+            const char x2 = *in_p++;
+            const char x = HEX_DECODE(x1, x2);
+            if (DO_DECODE(x) && (doslash || x != '/')) {
+                dirty = 1;
+                *out_p++ = x;
+            } else {
+                // stays encoded: re-emit the hex digits in canonical
+                // uppercase form, marking dirty only if that changed them
+                const char u1 = HEX_ENC_TOP(x);
+                const char u2 = HEX_ENC_BOTTOM(x);
+                if (u1 != x1 || u2 != x2)
+                    dirty = 1;
+                *out_p++ = '%';
+                *out_p++ = u1;
+                *out_p++ = u2;
+            }
+        }
+        else if (DO_DECODE(c)) {
+            *out_p++ = c;
+        }
+        else {
+            dirty = 1;
+            *out_p++ = '%';
+            *out_p++ = HEX_ENC_TOP(c);
+            *out_p++ = HEX_ENC_BOTTOM(c);
+        }
+    }
+
+    /* Set req.url. This will copy our stack buffer into the workspace.
+     * VRT_l_req_url() is variadic, and concatenates its arguments. The
+     * vrt_magic_string_end marks the end of the list.
+     */
+    if (dirty) {
+        VRT_l_req_url(ctx, out_url, vrt_magic_string_end);
+    }
+}
+
+#undef DO_DECODE
+#undef IS_HEX
+#undef HEX_DECODE
+#undef HEX_ENC_TOP
+#undef HEX_ENC_BOTTOM
 }C

 sub normalize_mediawiki_path {
-	C{ raw_normalize_path(ctx, 1); }C
+	C{ raw_normalize_path(ctx, 1); }C
 }

 sub normalize_rest_path {
-	C{ raw_normalize_path(ctx, 0); }C
+	C{ raw_normalize_path(ctx, 0); }C
 }
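P.S. for anyone who wants to sanity-check the canonicalization rules above
outside of Varnish: below is a minimal standalone sketch, not part of the
change. The always_decode()/hexval()/normalize() names are made up for
illustration, the strchr()-based set test stands in for the patch's 256-entry
lookup tables, and the dirty-tracking / VRT plumbing is omitted.

/* normalize_sketch.c - simplified illustration of the canonical-form rules.
 * Build: cc -std=c99 -o normalize_sketch normalize_sketch.c
 */
#include <stdio.h>
#include <string.h>

/* Always-Decode set: RFC 3986 unreserved plus the MW wfUrlencode extras.
 * Explicit ranges instead of isalnum() keep this locale-independent. */
static int always_decode(unsigned char c) {
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') ||
           (c >= 'a' && c <= 'z') ||
           (c != '\0' && strchr("-._~!$()*,:;/@", c) != NULL);
}

static int hexval(unsigned char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    return -1;
}

/* Normalize the path part of "in" into "out", which must hold at least
 * 3 * strlen(in) + 1 bytes. doslash=0 preserves both slash forms (RB). */
static void normalize(const char *in, char *out, int doslash) {
    static const char hx[] = "0123456789ABCDEF";
    while (*in) {
        const unsigned char c = (unsigned char)*in++;
        if (c == '?') {             /* query part is copied verbatim */
            strcpy(out, in - 1);
            return;
        }
        int h1 = -1, h2 = -1;
        if (c == '%') {
            h1 = hexval((unsigned char)in[0]);
            if (h1 >= 0)
                h2 = hexval((unsigned char)in[1]);
        }
        if (h1 >= 0 && h2 >= 0) {   /* a complete %XX escape */
            const unsigned char x = (unsigned char)(h1 << 4 | h2);
            in += 2;
            if (always_decode(x) && (doslash || x != '/')) {
                *out++ = (char)x;   /* canonical form is the literal char */
            } else {                /* stays encoded, uppercase hex digits */
                *out++ = '%'; *out++ = hx[x >> 4]; *out++ = hx[x & 0xF];
            }
        } else if (always_decode(c)) {
            *out++ = (char)c;
        } else {                    /* Always-Encode set, incl. a bare '%' */
            *out++ = '%'; *out++ = hx[c >> 4]; *out++ = hx[c & 0xF];
        }
    }
    *out = '\0';
}

int main(void) {
    char buf[256];
    normalize("/wiki/Foo%3abar%7c%2fbaz", buf, 1);
    printf("%s\n", buf);    /* -> /wiki/Foo:bar%7C/baz */
    normalize("/api/rest_v1/page/html/Foo%2Fbar", buf, 0);
    printf("%s\n", buf);    /* -> /api/rest_v1/page/html/Foo%2Fbar */
    return 0;
}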
--
To view, visit https://gerrit.wikimedia.org/r/391216
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5887c54a9295e6a344911621b2963c9df9ad4e24
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: BBlack <bbl...@wikimedia.org>