are you referring to memory of the python process or the number of
unclaimed objects still present?   psycopg2 buffers the full results
from the database in memory before fetchone() is ever called, which
forces the python process to grow to accommodate them.  a python
process generally stays large once it has grown.
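
if the psycopg2 buffering is the culprit, a server-side ("named")
cursor avoids it by streaming rows from the server in batches instead
of pulling the whole result set client-side before the first fetch.
a minimal sketch, assuming the test_db / test_table from the attached
script (the DSN and cursor name are placeholders):

----------------------------------------------------------------------------------
import psycopg2

conn = psycopg2.connect("dbname=test_db")   # placeholder DSN
# giving the cursor a name makes it server-side: rows come back in
# batches of itersize instead of being buffered in full on the client
cur = conn.cursor(name='streaming_cur')
cur.itersize = 2000
cur.execute("SELECT * FROM test_table")
for row in cur:
   pass   # handle one row at a time
cur.close()
conn.close()
----------------------------------------------------------------------------------

note that SA's ResultProxy uses a plain client-side cursor by default;
iirc the postgres dialect has a server_side_cursors flag on
create_engine that switches this behavior.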



On Feb 21, 2009, at 11:16 AM, Allen Bierbaum wrote:

> I spent some more time and came up with a completely standalone test
> application (attached; requires SA 0.5.2 and some database backend).
> I have tested it with Postgres and with SQLite, and both exhibit the
> behavior I was talking about: the memory keeps increasing and stays
> topped out even though all handles to the results have been cleared
> in Python.
>
> I am sure that I must be doing something very wrong, but I can't
> figure it out.  Can anyone point out my error?
>
> -Allen
>
> On Sat, Feb 21, 2009 at 7:52 AM, Allen Bierbaum <abierb...@gmail.com> wrote:
>> Hello all:
>>
>> We are running into a very strange memory leak issue.  Here are the  
>> details:
>>
>> - The code executes select statements directly on a table using SA
>> 0.5.2, Postgres 8.3, and Psycopg2, all running on Linux (Ubuntu 8.04)
>> - Querying a table with approximately 70000 records
>> - When we query the table, convert the results to a list, and then
>> free both the list and the results proxy, memory seems to stay used
>> - If we query the table with ever increasing size limits (limit
>> clause), the amount of memory consumed increases
>> - If we query with a limit of say 50000 and then perform smaller
>> queries, the smaller queries do not increase the amount of memory
>> consumed overall
>> - In other words, the amount of memory caps out in a way that looks
>> linear in the maximum number of table rows ever returned (a quick
>> check of whether Python ever returns freed memory to the OS is
>> sketched below)
>>
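>> To rule SA and psycopg2 out of that last point, a minimal check of
>> whether CPython itself returns freed memory to the OS (Linux-only,
>> since it reads /proc; the sizes are placeholders):
>>
>> ----------------------------------------------------------------------------------
>> import gc, os
>>
>> def rss_mb():
>>    # resident set size in MB, read from /proc/<pid>/status
>>    for line in open('/proc/%d/status' % os.getpid()):
>>       if line.startswith('VmRSS:'):
>>          return int(line.split()[1]) / 1024.0
>>
>> # distinct ~1 KB strings, roughly 100 MB in total
>> big = [str(i) * 200 for i in xrange(100000)]
>> print "after alloc: %.1f MB" % rss_mb()
>> del big
>> gc.collect()
>> print "after free:  %.1f MB" % rss_mb()   # often stays nearly as high
>> ----------------------------------------------------------------------------------
>>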
>> It looks like something in SA, Psycopg2, or PG is holding references
>> to the returned rows, but we have not been able to track it down.  I
>> am working on a completely standalone application to demonstrate the
>> issue.  Before I went much further, though, I wanted to post this
>> question and code sample (not standalone, but it shows the commands
>> we are using) in case anyone sees something fishy.
>>
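>> If anyone wants to chase the reference holders directly,
>> gc.get_referrers will list every object the collector sees pointing
>> at a row.  A sketch -- 'row' here is hypothetical, one RowProxy kept
>> from an earlier query:
>>
>> ----------------------------------------------------------------------------------
>> import gc
>>
>> # 'row' is a hypothetical RowProxy held from a previous query
>> for referrer in gc.get_referrers(row):
>>    # show the type of everything still referring to the row
>>    print type(referrer)
>> ----------------------------------------------------------------------------------
>>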
>> Any ideas?
>>
>> Thanks,
>> Allen
>>
>> ----------------------------------------------------------------------------------
>> import os, sys, gc
>> import psycopg2
>> import sqlalchemy as sa
>> import sqlalchemy.exc
>>
>> class DataProvider(object):
>>  def __init__(self, dbUri, tableName):
>>     self._dbUri       = dbUri
>>     self._dbEngine    = None
>>     self._metadata    = None
>>     self._table       = None
>>     self._tableName   = tableName
>>
>>  def initialize(self):
>>     self._dbEngine = sa.create_engine(self._dbUri)
>>     self._metadata = sa.MetaData()
>>     self._metadata.bind = self._dbEngine
>>
>>     # Build up the table metadata reflectively
>>     name_parts = self._tableName.split('.')
>>
>>     if len(name_parts) > 1:
>>        self._table = sa.Table(name_parts[1], self._metadata,
>>                               autoload = True, schema = name_parts[0])
>>     else:
>>        self._table = sa.Table(name_parts[0], self._metadata,
>>                               autoload = True)
>>
>>  def query(self, limit = None):
>>     s = self._table.select()
>>
>>     if limit is not None:
>>        s = s.limit(limit)
>>
>>     print "objs before: ", len(gc.get_objects())
>>     print "garbage: ", len(gc.garbage)
>>     result_proxy = s.execute()
>>     result_objects = [x for x in result_proxy]
>>     #print "num results: ", len(result_objects)
>>     result_proxy.close()
>>     # drop all of our references to the rows before measuring again
>>     result_objects = []
>>     results = []
>>     gc.collect(2)
>>     print "objs after: ", len(gc.get_objects())
>>     print "garbage: ", len(gc.garbage)
>>     return results
>>
>> # argv[0] is the script name; the URI and table name come after it
>> db_uri          = sys.argv[1]
>> collection_name = sys.argv[2]
>>
>> prov = DataProvider(db_uri, collection_name)
>> prov.initialize()
>>
>> limits = [10000, 20000, 30000, 40000, 50000, 60000,
>>           10000, 20000, 30000, 40000, 50000, 60000]
>> # Query
>> for x in range(10):
>>  limit = limits[x]
>>  print "----[ %s ] limit: %s ----" % (x, limit)
>>  gc.collect(2)
>>  printMemoryUsage()   # helper defined in the attached standalone script
>>  features = prov.query(limit = limit)
>>  features = None
>>  gc.collect(2)
>>  printMemoryUsage()   # helper defined in the attached standalone script
>>
>> ----------------------------------------------------------------------------------
>>
>
> import os, sys, gc
> import psycopg2
>
> import sqlalchemy as sa
> import sqlalchemy.exc
>
> def getMemoryUsage(VmKey):
>   proc_status = '/proc/%d/status' % os.getpid()
>   scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
>            'KB': 1024.0, 'MB': 1024.0*1024.0}
>   v = open(proc_status).read()
>
>   # get VmKey line e.g. 'VmRSS:  9999  kB\n ...'
>   i = v.index(VmKey)
>   v = v[i:].split(None, 3)  # whitespace
>   if len(v) < 3:
>      return 0.0  # invalid format?
>   # convert Vm value to bytes
>   return float(v[1]) * scale[v[2]]
>
> def memory():
>    return getMemoryUsage('VmSize:') / (1024*1024)
>
> def resident():
>    return getMemoryUsage('VmRSS:') / (1024*1024)
>
> def printMemoryUsage():
>    print "mem: ", memory(), " res: ", resident()
>
>
> def createTestData(dbUri, tableName):
>   """  Called to create temporary data if needed. """
>   engine = sa.create_engine(dbUri)
>   metadata = sa.MetaData()
>   metadata.bind = engine
>
>   test_table = sa.Table(tableName, metadata,
>      sa.Column('id', sa.Integer, primary_key = True),
>      sa.Column('text1', sa.String),
>      sa.Column('text2', sa.String))
>
>   metadata.create_all(engine)
>
>   s = test_table.select().limit(1)
>   have_some = len(s.execute().fetchall()) > 0
>
>   if not have_some:
>      print "Creating data"
>      num_rows = 70000
>      for x in xrange(num_rows):
>         if (x%1000) == 0:
>            print "row: ", x
>         ins = test_table.insert()
>         ins.execute(text1 = 'a'*1000, text2 = 'b'*1000)
>
> class DataProvider(object):
>   def __init__(self, dbUri, tableName):
>      self._dbEngine = sa.create_engine(dbUri)
>      self._metadata = sa.MetaData()
>      self._metadata.bind = self._dbEngine
>      self._table = sa.Table(tableName, self._metadata, autoload = True)
>
>   def query(self, limit = None):
>      s = self._table.select()
>
>      if limit is not None:
>         s = s.limit(limit)
>
>      print "objs before: ", len(gc.get_objects())
>      print "garbage: ", len(gc.garbage)
>      result_proxy = s.execute()
>      # pull every row into memory, then drop all of our references so
>      # anything still alive is being held by SA or the DBAPI
>      result_objects = [x for x in result_proxy]
>      #print "num results: ", len(result_objects)
>      result_proxy.close()
>      result_objects = []
>      results = []
>      gc.collect(2)
>      print "objs after: ", len(gc.get_objects())
>      print "garbage: ", len(gc.garbage)
>      return results
>
> #db_uri    = "postgres://localhost/test_db"
> db_uri    = "sqlite:///test_db.sqlite"
> tablename = "test_table"
>
> createTestData(db_uri, tablename)
> prov = DataProvider(db_uri, tablename)
> #prov = PgDataProvider(dsn, host, database, username, collection_name)
>
> limits = [10000, 20000, 30000, 40000, 50000, 60000,
>          10000, 20000, 30000, 40000, 50000, 60000]
> # Query
> for x in range(10):
>   limit = limits[x]
>   print "----[ %s ] limit: %s ----" % (x, limit)
>   gc.collect(2)
>   printMemoryUsage()
>   features = prov.query(limit = limit)
>   features = None
>   gc.collect(2)
>   printMemoryUsage()

