dgaudet 98/03/12 23:11:03
Modified: src/main util_uri.c
Log:
Deal with the performance problem in parse_uri_components(). This new
version is over two orders of magnitude faster based on timing trials
requesting the test page from mod_test_util_uri. It's 50% faster overall
when doing a zb /index.html with the default index.html.
I'm still resisting the urge to hardcode i386 assembly language with
a C fallback for the rest of the world ;)
Revision Changes Path
1.9 +205 -191 apache-1.3/src/main/util_uri.c
Index: util_uri.c
===================================================================
RCS file: /export/home/cvs/apache-1.3/src/main/util_uri.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- util_uri.c 1998/03/07 21:39:28 1.8
+++ util_uri.c 1998/03/13 07:11:02 1.9
@@ -197,182 +197,15 @@
return ret;
}
-
-
-/* This will serve as the basis for an optimized parse_uri_components, sorry
- * about the if 0
+/* The regex version of parse_uri_components has the advantage that it is
+ * relatively easy to understand and extend. But it has the disadvantage
+ * that the regexes are complex enough that regex libraries really
+ * don't do a great job with them performancewise.
+ *
+ * The default is a hand coded scanner that is two orders of magnitude
+ * faster.
*/
-
-#if 0
-/* parse_uri_components():
- * Parse a given URI, fill in all supplied fields of a uri_components
- * structure. This eliminates the necessity of extracting host, port,
- * path, query info repeatedly in the modules.
- * Side effects:
- * - fills in fields of uri_components *uptr
- * - none on any of the r->* fields
- */
-API_EXPORT(int) parse_uri_components(pool *p, const char *uri, uri_components *uptr)
-{
- const char *s;
- int ret = HTTP_OK;
-
- /* Initialize the structure. parse_uri() and parse_uri_components()
- * can be called more than once per request.
- */
- memset (uptr, '\0', sizeof(*uptr));
- uptr->is_initialized = 1;
-
- /* A proxy request contains a ':' early on (after the scheme),
- * but not as first character. RFC1738 allows [a-zA-Z0-9-+.]:
- */
- for (s = uri; s != '\0'; s++)
- if (!isalnum(*s) && *s != '+' && *s != '-' && *s != '.')
- break;
-
- if (s == uri || s[0] != ':' || s[1] == '\0') {
- /* not a full URL (not: scheme://host/path), so no proxy request: */
-
- /* Store path, without the optional "?query" argument: */
- uptr->path = getword (p, &uri, '?');
- if (uptr->path[0] == '\0') {
- uptr->path = NULL;
- }
-
- if (uri[0] != '\0') {
- uptr->query = pstrdup(p, uri);
- }
-
-#if defined(__EMX__) || defined(WIN32)
- /* Handle path translations for OS/2 and plug security hole.
- * This will prevent "http://www.wherever.com/..\..\/" from
- * returning a directory for the root drive.
- */
- for (s = uptr->path; (s = strchr(s, '\\')) != NULL; )
- *(char *)s = '/';
-#ifndef WIN32 /* for OS/2 only: */
- /* Fix OS/2 HPFS filename case problem. */
- uptr->path = strlwr(uptr->path);
-#endif
-#endif /* __EMX__ || WIN32 */
- }
- else {
- /* Yes, it is a proxy request. We've detected the scheme, now
- * we split the URI's components and mark what we've found:
- * - scheme
- * followed by "://", then:
- * - [ username [ ":" password ] "@" ]
- * - hostname
- * [ ":" port ]
- * [ "/" path ... [ "?" query ] ]
- */
-
- /* As per RFC1738:
- * The generic form of a URL is:
- * genericurl = scheme ":" schemepart
- *
- * the scheme is in lower case; interpreters should use case-ignore
- * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
- *
- * Extract the scheme:
- */
- s = uri;
- uptr->scheme = getword(p, &s, ':');
- if (uptr->scheme[0] == '\0') {
- uptr->scheme = NULL;
- }
-
- /* URL schemeparts for ip based protocols:
- *
- * ip-schemepart = "//" login [ "/" urlpath ]
- *
- * login = [ user [ ":" password ] "@" ] hostport
- * hostport = host [ ":" port ]
- * host = hostname | hostnumber
- * hostname = *[ domainlabel "." ] toplabel
- * domainlabel = alphadigit | alphadigit *[ alphadigit | "-" ] alphadigit
- * toplabel = alpha | alpha *[ alphadigit | "-" ] alphadigit
- * alphadigit = alpha | digit
- * hostnumber = digits "." digits "." digits "." digits
- * port = digits
- * user = *[ uchar | ";" | "?" | "&" | "=" ]
- * password = *[ uchar | ";" | "?" | "&" | "=" ]
- * urlpath = *xchar
- */
- /* if IP-schemepart follows, extract host, port etc. */
- if (s[0] == '/' && s[1] == '/') {
- char *tmp;
-
- s += 2;
- if ((tmp = strchr(s, '/')) != NULL) {
- /* In the request_rec structure, the uri is not
- * separated into path & query for proxy requests.
- * But here, we want maximum knowledge about the request,
- * so we still split them. */
- uptr->path = getword_nc(p, &tmp, '?');
- if (uptr->path[0] == '\0') {
- uptr->path = NULL;
- }
-
- if (tmp[0] != '\0') {
- uptr->query = pstrdup(p, tmp);
- }
- }
- else {
- /* the request is just http://hostname - no trailing slash.
- * Provide one:
- */
- uptr->path = "/";
- }
-
- uptr->hostname = getword (p, &s, '/');
- if (uptr->hostname[0] == '\0') {
- uptr->hostname = NULL;
- }
-
- /* disintegrate "user:password@hostname" */
- /* NOTE: using reverse search here because user:password might
- * contain a '@' as well (ftp login: user=ftp : password=user@host)
- */
- if ((tmp = strrchr(uptr->hostname, '@')) != NULL) {
- uptr->user = uptr->hostname;
- *tmp++ = '\0';
- uptr->hostname = tmp;
-
- /* disintegrate "user:password" */
- if ((tmp = strchr(uptr->user, ':')) != NULL) {
- *tmp++ = '\0';
- uptr->password = tmp;
- }
- }
-
- /* disintegrate "host:port" */
- if ((tmp = strchr(uptr->hostname, ':')) != NULL) {
- *tmp++ = '\0';
- uptr->port_str = tmp;
- uptr->port = (unsigned short) strtol(tmp, &tmp, 10);
- /* Catch possible problem: http://www.apache.org:80@@@/dist/ */
- if (*tmp != '\0')
- ret = HTTP_BAD_REQUEST;
- }
-
- /* Strip any trailing dots in hostname */
- tmp = &uptr->hostname[strlen(uptr->hostname)-1];
- for (; *tmp == '.' && tmp > uptr->hostname; --tmp)
- *tmp = '\0';
-
- /* This name hasn't been looked up yet */
- uptr->dns_looked_up = 0;
- }
- /* If the ip-schemepart doesn't start with "//", deny: */
- else
- ret = HTTP_BAD_REQUEST;
-
- }
-
- return ret;
-}
-#endif
+#ifdef UTIL_URI_REGEX
static regex_t re_uri;
static regex_t re_hostpart;
@@ -382,6 +215,14 @@
int ret;
const char *re_str;
+ memset(uri_delims, 0, sizeof(uri_delims));
+ uri_delims[':'] = T_COLON;
+ uri_delims['/'] = T_SLASH;
+ uri_delims['?'] = T_QUESTION;
+ uri_delims['#'] = T_HASH;
+ uri_delims['@'] = T_AT;
+ uri_delims['\0'] = T_NUL;
+
/* This is a modified version of the regex that appeared in
* draft-fielding-uri-syntax-01. It doesnt allow the uri to contain a
* scheme but no hostinfo or vice versa.
@@ -437,6 +278,7 @@
}
}
+
/* parse_uri_components():
* Parse a given URI, fill in all supplied fields of a uri_components
* structure. This eliminates the necessity of extracting host, port,
@@ -535,24 +377,196 @@
}
}
-#if defined(__EMX__) || defined(WIN32)
- /* Handle path translations for OS/2 and plug security hole.
- * This will prevent "http://www.wherever.com/..\..\/" from
- * returning a directory for the root drive.
+ if (ret == 0)
+ ret = HTTP_OK;
+ return ret;
+}
+#else
+
+/* Here is the hand-optimized parse_uri_components(). There are some wild
+ * tricks we could pull in assembly language that we don't pull here... like we
+ * can do word-at-time scans for delimiter characters using the same technique
+ * that fast memchr()s use. But that would be way non-portable. -djg
+ */
+
+/* We have a table that we can index by character and it tells us if the
+ * character is one of the interesting delimiters. Note that we even get
+ * compares for NUL for free -- it's just another delimiter.
+ */
+
+#define T_COLON 0x01 /* ':' */
+#define T_SLASH 0x02 /* '/' */
+#define T_QUESTION 0x04 /* '?' */
+#define T_HASH 0x08 /* '#' */
+#define T_AT 0x10 /* '@' */
+#define T_NUL 0x80 /* '\0' */
+
+static unsigned char uri_delims[256];
+
+/* it works like this:
+ if ((uri_delims[ch] & NOTEND_foobar) == 0) {
+ then we're not at a delimiter for foobar
+ }
+*/
+
+/* Note that we optimize the scheme scanning here, we cheat and let the
+ * compiler know that it doesn't have to do the & masking.
+ */
+#define NOTEND_SCHEME (0xff)
+#define NOTEND_HOSTINFO (T_SLASH | T_QUESTION | T_HASH | T_NUL)
+#define NOTEND_PATH (T_QUESTION | T_HASH | T_NUL)
+
+void util_uri_init(void)
+{
+ memset(uri_delims, 0, sizeof(uri_delims));
+ uri_delims[':'] = T_COLON;
+ uri_delims['/'] = T_SLASH;
+ uri_delims['?'] = T_QUESTION;
+ uri_delims['#'] = T_HASH;
+ uri_delims['@'] = T_AT;
+ uri_delims['\0'] = T_NUL;
+}
+
+/* Since we know that the string we're duping is of exactly length l
+ * we don't need to go through the expensive (silly) pstrndup(). We
+ * can do much better on our own. This is worth another 50%
+ * improvement.
+ */
+static char *special_strdup(pool *p, const char *s, size_t l)
+{
+ char *d;
+
+ d = palloc(p, l + 1);
+ memcpy(d, s, l);
+ d[l] = '\0';
+ return d;
+}
+
+/* parse_uri_components():
+ * Parse a given URI, fill in all supplied fields of a uri_components
+ * structure. This eliminates the necessity of extracting host, port,
+ * path, query info repeatedly in the modules.
+ * Side effects:
+ * - fills in fields of uri_components *uptr
+ * - none on any of the r->* fields
+ */
+API_EXPORT(int) parse_uri_components(pool *p, const char *uri, uri_components *uptr)
+{
+ const char *s;
+ const char *s1;
+ const char *hostinfo;
+ char *endstr;
+ int port;
+
+ /* Initialize the structure. parse_uri() and parse_uri_components()
+ * can be called more than once per request.
*/
- {
- char *s;
+ memset (uptr, '\0', sizeof(*uptr));
+ uptr->is_initialized = 1;
- for (s = uptr->path; (s = strchr(s, '\\')) != NULL; )
- *s = '/';
+ /* We assume the processor has a branch predictor like most --
+ * it assumes forward branches are untaken and backwards are taken. That's
+ * the reason for the gotos. -djg
+ */
+ if (uri[0] == '/') {
+deal_with_path:
+ /* we expect uri to point to first character of path ... remember
+ * that the path could be empty -- http://foobar?query for example
+ */
+ s = uri;
+ while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) {
+ ++s;
+ }
+ if (s != uri) {
+ uptr->path = special_strdup(p, uri, s - uri);
+ }
+ if (*s == 0) {
+ return HTTP_OK;
+ }
+ if (*s == '?') {
+ ++s;
+ s1 = strchr(s, '#');
+ if (s1) {
+ uptr->fragment = pstrdup(p, s1 + 1);
+ uptr->query = special_strdup(p, s, s1 - s);
+ }
+ else {
+ uptr->query = pstrdup(p, s);
+ }
+ return HTTP_OK;
+ }
+ /* otherwise it's a fragment */
+ uptr->fragment = pstrdup(p, s + 1);
+ return HTTP_OK;
}
-#ifndef WIN32 /* for OS/2 only: */
- /* Fix OS/2 HPFS filename case problem. */
- str_tolower(uptr->path);
-#endif
-#endif /* __EMX__ || WIN32 */
- if (ret == 0)
- ret = HTTP_OK;
- return ret;
+ /* find the scheme: */
+ s = uri;
+ while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) {
+ ++s;
+ }
+ /* scheme must be non-empty and followed by :// */
+ if (s == uri || s[0] != ':' || s[1] != '/' || s[2] != '/') {
+ goto deal_with_path; /* backwards predicted taken! */
+ }
+
+ uptr->scheme = special_strdup(p, uri, s - uri);
+ s += 3;
+ hostinfo = s;
+ while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) {
+ ++s;
+ }
+ uri = s; /* whatever follows hostinfo is start of uri */
+ uptr->hostinfo = special_strdup(p, hostinfo, uri - hostinfo);
+
+ /* If there's a username:password@host:port, the @ we want is the last @...
+ * too bad there's no memrchr()... For the C purists, note that hostinfo
+ * is definitely not the first character of the original uri so therefore
+ * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C.
+ */
+ s = uri;
+ do {
+ --s;
+ } while (s >= hostinfo && *s != '@');
+ if (s < hostinfo) {
+ /* again we want the common case to be fall through */
+deal_with_host:
+ /* We expect hostinfo to point to the first character of
+ * the hostname. If there's a port it is the first colon.
+ */
+ s = memchr(hostinfo, ':', uri - hostinfo);
+ if (s == NULL) {
+ /* we expect the common case to have no port */
+ uptr->hostname = special_strdup(p, hostinfo, uri - hostinfo);
+ goto deal_with_path;
+ }
+ uptr->hostname = special_strdup(p, hostinfo, s - hostinfo);
+ ++s;
+ uptr->port_str = special_strdup(p, s, uri - s);
+ if (uri != s) {
+ port = strtol(uptr->port_str, &endstr, 10);
+ uptr->port = port;
+ if (*endstr == '\0' && uptr->port == port) {
+ goto deal_with_path;
+ }
+ /* Invalid characters after ':' found */
+ return HTTP_BAD_REQUEST;
+ }
+ uptr->port = default_port_for_scheme(uptr->scheme);
+ goto deal_with_path;
+ }
+
+ /* first colon delimits username:password */
+ s1 = memchr(hostinfo, ':', s - hostinfo);
+ if (s1) {
+ uptr->user = special_strdup(p, hostinfo, s1 - hostinfo);
+ ++s1;
+ uptr->password = special_strdup(p, s1, s - s1);
+ }
+ else {
+ uptr->user = special_strdup(p, hostinfo, s - hostinfo);
+ }
+ hostinfo = s + 1;
+ goto deal_with_host;
}
+#endif