[Zope-dev] Content Type Meta tag stripping in zope.pagetemplate

Miano Njoka Wed, 22 Feb 2012 07:29:21 -0800

Hello all,

I'm a fairly new Zope developer and came across a "bug" in my application:
<meta http-equiv="content-type" content="text/html;charset=UTF-8"
/> tags were being stripped out of ZPT templates. Is there a reason
for this? The stripping is done in the _prepare_html method of
zope.pagetemplate.pagetemplatefile.PageTemplateFile. My application
produces XHTML containing non-ASCII characters, which is then consumed by
other applications, so it needs to have the content type set in the
document itself in addition to the HTTP headers.


Secondly, the meta tag is found and stripped using a regular
expression, so simply changing the order of the attributes on the
<meta> tag would make the regex fail to match.

Attached is a patch that uses HTMLParser instead of a regex to find the
content-type meta tag. It stops parsing the HTML as soon as it
encounters the required meta tag.

Miano

Index: src/zope/pagetemplate/pagetemplatefile.py
===================================================================
--- src/zope/pagetemplate/pagetemplatefile.py	(revision 124430)
+++ src/zope/pagetemplate/pagetemplatefile.py	(working copy)
@@ -23,19 +23,49 @@
 import re
 import logging
 
+from HTMLParser import HTMLParser, HTMLParseError
+
 from zope.pagetemplate.pagetemplate import PageTemplate
 
 DEFAULT_ENCODING = "utf-8"
 
-meta_pattern = re.compile(
-    r'\s*<meta\s+http-equiv=["\']?Content-Type["\']?'
-    r'\s+content=["\']?([^;]+);\s*charset=([^"\']+)["\']?\s*/?\s*>\s*',
+meta_pattern = re.compile(r'\s*["\']?([^;]+);\s*charset=([^"\']+)',
     re.IGNORECASE)
 
+
 def package_home(gdict):
     filename = gdict["__file__"]
     return os.path.dirname(filename)
 
+
+class FoundMetaContentTypeTag(Exception):
+    def __init__(self, value):
+        self.parameter = value
+    def __str__(self):
+        return repr(self.parameter)
+
+
+class FindMetaContentTypeHTMLParser(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.content_type = None
+        self.encoding = DEFAULT_ENCODING
+
+    def handle_startendtag(self, tag, attrs):
+        if tag == "meta":
+            http_equiv = [a[1] for a in attrs if a[0] == "http-equiv"]
+            if http_equiv and http_equiv[0].lower() == "content-type":
+                content = [a[1] for a in attrs if a[0] == "content"]
+                if content:
+                    match = meta_pattern.search(content[0])
+                    if match is not None:
+                        self.content_type, self.encoding = match.groups()
+                raise FoundMetaContentTypeTag("Content Type Meta tag found")
+
+    def get_params(self):
+        return self.content_type, self.encoding
+
+
 class PageTemplateFile(PageTemplate):
     "Zope wrapper for filesystem Page Template using TAL, TALES, and METAL"
 
@@ -57,16 +87,16 @@
         return path
 
     def _prepare_html(self, text):
-        match = meta_pattern.search(text)
-        if match is not None:
-            type_, encoding = match.groups()
-            # TODO: Shouldn't <meta>/<?xml?> stripping
-            # be in PageTemplate.__call__()?
-            text = meta_pattern.sub("", text)
-        else:
-            type_ = None
-            encoding = DEFAULT_ENCODING
-        return unicode(text, encoding), type_
+        parser = FindMetaContentTypeHTMLParser()
+        content_type = None
+        encoding = DEFAULT_ENCODING
+        try:
+            parser.feed(text)
+        except FoundMetaContentTypeTag:
+            content_type, encoding = parser.get_params()
+        except HTMLParseError:
+            pass
+        return unicode(text, encoding), content_type
 
     def _read_file(self):
         __traceback_info__ = self.filename
Index: src/zope/pagetemplate/tests/test_ptfile.py
===================================================================
--- src/zope/pagetemplate/tests/test_ptfile.py	(revision 124430)
+++ src/zope/pagetemplate/tests/test_ptfile.py	(working copy)
@@ -161,7 +161,9 @@
         self.failUnlessEqual(rendered.strip(),
             u"<html><head><title>"
             u"\u0422\u0435\u0441\u0442"
-            u"</title></head></html>")
+            u'</title><meta http-equiv="Content-Type"'
+            u' content="text/html; charset=windows-1251" />'
+            u"</head></html>")
 
     def test_xhtml(self):
         pt = self.get_pt(
@@ -176,7 +178,9 @@
         self.failUnlessEqual(rendered.strip(),
             u"<html><head><title>"
             u"\u0422\u0435\u0441\u0442"
-            u"</title></head></html>")
+            u'</title><meta http-equiv="Content-Type"'
+            u' content="text/html; charset=windows-1251" />'
+            u"</head></html>")

_______________________________________________
Zope-Dev maillist  -  Zope-Dev@zope.org
https://mail.zope.org/mailman/listinfo/zope-dev
**  No cross posts or HTML encoding!  **
(Related lists -
 https://mail.zope.org/mailman/listinfo/zope-announce
 https://mail.zope.org/mailman/listinfo/zope )

[Zope-dev] Content Type Meta tag stripping in zope.pagetemplate

Reply via email to