Hi All,

 The WSGI specification says in the section on "Input and Error Streams":

The optional "size" argument to readline() is not supported, as it may be
complex for server authors to implement, and is not often used in practice.

But the current implementation of cgi.FieldStorage in the 2.4.4 branch and on Python 2.5 does call readline with the size argument. It has started to do this in response to the Python bug #1112549 - cgi.FieldStorage memory usage can spike in line-oriented ops. See http://sourceforge.net/tracker/index.php?func=detail&aid=1112549&group_id=5470&atid=105470

Since it is reasonable for a WSGI application to use cgi.FieldStorage I am wondering whether cgi.FieldStorage or the WSGI specification needs to changed in order to solve this incompatibility.

Originally I thought it was cgi.FieldStorage that needs to be changed, and hence tried to fix it by wrapping the input stream so that the readline method always uses the read method on the input stream. While this seems to work for me it introduces a level of complexity in the cgi.py file, and possible some other bugs, that makes me think that adding the size argument for readline into the WSGI specification isn't such bad idea after all.

There way be other ways of modifying cgi.FieldStorage to solve this but I can't see how at the moment.

For those that are interested , I have attached the patch but my main issue is where should this incompatibility be solved.

 Thanks
 Michael

Index: Lib/cgi.py
===================================================================
--- Lib/cgi.py	(revision 52033)
+++ Lib/cgi.py	(working copy)
@@ -374,7 +374,83 @@
         """Return printable representation."""
         return "MiniFieldStorage(%r, %r)" % (self.name, self.value)
 
+BUFFER_CHUNK = 1 << 16
 
+class BufferInputStream:
+
+    """
+    WSGI doesn't support any agruements to the readline method
+    """
+
+    def __init__(self, fp):
+        assert not isinstance(fp, BufferInputStream), \
+               "File pointer must not be an input stream"
+        self.fp = fp
+        self.fp_buffer = ''
+        self.lfp_buffer = 0
+
+    def tell(self):
+        # should still fail if self.fp.tell doesn't exist.
+        return self.fp.tell() - self.lfp_buffer
+
+    def seek(self, offset, whence = 0):
+        # reset the buffers since is it is basically to hard on my
+        # brain to calculate this.
+        self.fp.seek(0)
+        self.fp_buffer = ''
+        self.lfp_buffer = 0
+        return self.fp.seek(offset, whence)
+
+    def read(self, size = -1):
+        if size < 0 or size is None:
+            return self.fp_buffer + self.fp.read()
+
+        while self.lfp_buffer < size:
+            d = self.fp.read(BUFFER_CHUNK)
+            self.fp_buffer += d
+            self.lfp_buffer += len(d)
+            if not d:
+                break
+
+        d = self.fp_buffer[0:size]
+        self.fp_buffer = self.fp_buffer[size:]
+        self.lfp_buffer -= size
+        return d
+
+    def readline(self, size = -1):
+        """Read one line from the filehandle fp."""
+        while self.lfp_buffer < size:
+            index = self.fp_buffer.find('\n')
+            if index >= 0:
+                line = self.fp_buffer[0:index + 1]
+                self.fp_buffer = self.fp_buffer[index + 1:]
+                self.lfp_buffer = self.lfp_buffer - index - 1
+                return line
+
+            d = self.fp.read(BUFFER_CHUNK)
+            if not d:
+                break
+
+            self.fp_buffer += d
+            self.lfp_buffer += len(d)
+
+        index = self.fp_buffer.find('\n')
+        if index >= 0:
+            s = index + 1
+        elif size > 0:
+            s = size
+        else:
+            # defualt size - so call fp.readline() strainght
+            d = self.fp_buffer + self.fp.readline()
+            self.fp_buffer = ''
+            self.lfp_buffer = 0
+            return d
+        d = self.fp_buffer[0:s]
+        self.fp_buffer = self.fp_buffer[s:]
+        self.lfp_buffer -= s
+        return d
+
+
 class FieldStorage:
 
     """Store a sequence of fields, reading multipart/form-data.
@@ -472,7 +548,11 @@
                 headers['content-type'] = environ['CONTENT_TYPE']
             if 'CONTENT_LENGTH' in environ:
                 headers['content-length'] = environ['CONTENT_LENGTH']
-        self.fp = fp or sys.stdin
+        fp = fp or sys.stdin
+        if isinstance(fp, BufferInputStream):
+            self.fp = fp
+        else:
+            self.fp = BufferInputStream(fp)
         self.headers = headers
         self.outerboundary = outerboundary
 
@@ -699,7 +779,7 @@
     def read_lines_to_eof(self):
         """Internal: read lines until EOF."""
         while 1:
-            line = self.fp.readline(1<<16)
+            line = self.fp.read(1<<16)
             if not line:
                 self.done = -1
                 break
Index: Lib/test/output/test_cgi
===================================================================
--- Lib/test/output/test_cgi	(revision 52033)
+++ Lib/test/output/test_cgi	(working copy)
@@ -38,5 +38,6 @@
 Testing log
 Testing initlog 1
 Testing log 2
+Test FieldStorage BufferedInputStream
 Test FieldStorage methods that use readline
 Test basic FieldStorage multipart parsing
Index: Lib/test/test_cgi.py
===================================================================
--- Lib/test/test_cgi.py	(revision 52033)
+++ Lib/test/test_cgi.py	(working copy)
@@ -205,6 +205,25 @@
         cgi.initlog("%s", "Testing log 3")
         cgi.log("Testing log 4")
 
+    print "Test FieldStorage BufferedInputStream"
+    data = """first line
+second line
+
+we skipped the third line
+forth line
+fifth line"""
+    fp = cgi.BufferInputStream(StringIO(data))
+    verify(fp.read(1) == 'f')
+    verify(fp.tell() == 1)
+    verify(fp.readline(14) == 'irst line\n')
+    verify(fp.tell() == 11)
+    verify(fp.readline() == 'second line\n')
+    verify(fp.readline() == '\n')
+    fp.seek(5)
+    verify(fp.tell(), 5)
+    verify(fp.readline() == ' line\n')
+    verify(fp.read() == 'second line\n\nwe skipped the third line\nforth line\nfifth line')
+
     print "Test FieldStorage methods that use readline"
     # FieldStorage uses readline, which has the capacity to read all
     # contents of the input file into memory; we use readline's size argument
@@ -237,7 +256,8 @@
     # if we're not chunking properly, readline is only called twice
     # (by read_binary); if we are chunking properly, it will be called 5 times
     # as long as the chunksize is 1 << 16.
-    verify(f.numcalls > 2)
+    verify(f.numcalls == 0)
+    verify(f.tell() == 256 * 1024)
 
     print "Test basic FieldStorage multipart parsing"
     env = {'REQUEST_METHOD':'POST', 'CONTENT_TYPE':'multipart/form-data; boundary=---------------------------721837373350705526688164684', 'CONTENT_LENGTH':'558'}
@@ -261,7 +281,8 @@
  Add\x20
 -----------------------------721837373350705526688164684--
 """
-    fs = cgi.FieldStorage(fp=StringIO(postdata), environ=env)
+    fp = StringIO(postdata)
+    fs = cgi.FieldStorage(fp=fp, environ=env)
     verify(len(fs.list) == 4)
     expect = [{'name':'id', 'filename':None, 'value':'1234'},
               {'name':'title', 'filename':None, 'value':''},
@@ -271,5 +292,6 @@
         for k, exp in expect[x].items():
             got = getattr(fs.list[x], k)
             verify(got == exp)
+    verify(fp.tell() == len(postdata))
 
 main()
_______________________________________________
Web-SIG mailing list
Web-SIG@python.org
Web SIG: http://www.python.org/sigs/web-sig
Unsubscribe: 
http://mail.python.org/mailman/options/web-sig/archive%40mail-archive.com

Reply via email to