Hi Apache developers,

the following patch fixes the behavior of mod_proxy_html of removing any <!doctype> tag from the beginning of HTML and XHTML documents (Thomas, Ewald, this is issue #19803 in our Mantis). This <!doctype> tag is needed by some browsers to render XHTML documents correctly.

Additionally it fixes the issue in XHTML documents that empty tags don't get terminated correctly unless "ProxyHTMLDoctype XHTML" is explicitly configured (Thomas, Ewald, this is issue #17643 in our Mantis).

The attached patch fixes the first issue by using the internalSubset hook of the libxml2 SAX parser to parse and output the <!doctype> tag. To address the second issue, the same hook is used to automatically detect whether the current document is an XHTML document or not. For an XHTML document, the XHTML style of closing empty tags is enabled for the current request. In my opinion, the suggested patch makes the directive ProxyHTMLDoctype more or less obsolete. Of course, it is kept for backwards compatibility.

The patch is based on httpd trunk, rev. 1579365.

Please let me know whether I should file an issue in Apache's Bugzilla, or whether that isn't needed in this case.

Regards,
Micha
Index: modules/filters/mod_proxy_html.c
===================================================================
--- modules/filters/mod_proxy_html.c	(Revision 1579365)
+++ modules/filters/mod_proxy_html.c	(Arbeitskopie)
@@ -101,6 +101,7 @@
 typedef struct {
     ap_filter_t *f;
     proxy_html_conf *cfg;
+    const char *etag;
     htmlParserCtxtPtr parser;
     apr_bucket_brigade *bb;
     char *buf;
@@ -280,6 +281,25 @@
     }
     AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1);
 }
+
+/* SAX internalSubset hook: re-emit the document's <!DOCTYPE>; set ctx->etag to xhtml_etag for XHTML DTDs. */
+static void pinternalSubset(void* ctxt, const xmlChar *name, const xmlChar *externalID, const xmlChar *sysID)
+{
+    saxctxt *ctx = (saxctxt *) ctxt;
+    if (!ctx || !name) return;
+    if (ctx->cfg->doctype && ctx->cfg->doctype[0]) return; /* a doctype is configured: emit nothing here */
+    ap_fprintf(ctx->f->next, ctx->bb, "<!DOCTYPE %s", (const char *)name);
+    if (externalID) {
+        if ((strcasecmp((const char*)name, "html") == 0) &&
+            (strncasecmp((const char *)externalID, "-//W3C//DTD XHTML ", 18) == 0))
+            ctx->etag = xhtml_etag;
+        ap_fprintf(ctx->f->next, ctx->bb, " PUBLIC \"%s\"", (const char *)externalID);
+    }
+    if (sysID) /* a system ID without a public ID needs the SYSTEM keyword */
+        ap_fprintf(ctx->f->next, ctx->bb, externalID ? " \"%s\"" : " SYSTEM \"%s\"", (const char *)sysID);
+    ap_fprintf(ctx->f->next, ctx->bb, ">\n");
+}
+
 static void pcdata(void *ctxt, const xmlChar *uchars, int length)
 {
     const char *chars = (const char*) uchars;
@@ -632,7 +652,7 @@
     }
     ctx->offset = 0;
     if (desc && desc->empty)
-        ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag);
+        ap_fputs(ctx->f->next, ctx->bb, ctx->etag);
     else
         ap_fputc(ctx->f->next, ctx->bb, '>');
 
@@ -833,6 +853,7 @@
         fctx->bb = apr_brigade_create(f->r->pool,
                                       f->r->connection->bucket_alloc);
         fctx->cfg = cfg;
+        fctx->etag = cfg->etag;
         apr_table_unset(f->r->headers_out, "Content-Length");
 
         if (cfg->interp)
@@ -1236,6 +1257,7 @@
     sax.characters = pcharacters;
     sax.comment = pcomment;
     sax.cdataBlock = pcdata;
+    sax.internalSubset = pinternalSubset;
     xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset);
     xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter);
     if (!xml2enc_charset) {

Reply via email to