I spent some more time and came up with a completely standalone test
application (attached; it requires SA 0.5.2 and some database
backend).  I have tested it with postgres and with sqlite, and both of
them exhibit the behavior I was talking about: memory keeps increasing
and stays topped out even though all handles to the results have been
cleared on the Python side.
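
As a sanity check on the "handles have been cleared" part, the kind of
check I have in mind is sketched below (the helper name is made up;
'RowProxy' is the class name of SA 0.5 result rows to look for):

import gc

def count_instances(type_name):
   # Count live objects whose class name matches, e.g. 'RowProxy'.
   return sum(1 for obj in gc.get_objects()
              if type(obj).__name__ == type_name)

# Check before and after clearing the result list:
print "live result rows: ", count_instances('RowProxy')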

I am sure that I must be doing something very wrong, but I can't
figure it out.  Can anyone point out my error?

-Allen

On Sat, Feb 21, 2009 at 7:52 AM, Allen Bierbaum <abierb...@gmail.com> wrote:
> Hello all:
>
> We are running into a very strange memory leak issue.  Here are the details:
>
> - The code executes select statements directly on a table using SA
> 0.5.2, Postgres 8.3, Psycopg2, all running on Linux (Ubuntu 8.04)
> - Querying a table with approximately 70000 records
> - When we query the table, convert the results to a list, and then
> free both the list and the results proxy, memory seems to stay used
> - If we query the table with ever increasing size limits (limit
> clause), the amount of memory consumed increases
> - If we query with a limit of say 50000 and then perform smaller
> queries, the smaller queries do not increase the amount of memory
> consumed overall
> - In other words, the amount of memory caps out in a way that looks
> like it is linear with the maximum number of table rows ever returned
>
> It looks like something in SA, Psycopg2, or PG is holding references
> to the returned rows but we have not been able to track it down.  I am
> working on a completely standalone application to demonstrate the
> issue.  Before I went much further though I wanted to post this
> question and code sample (not standalone, but shows the commands we
> are using) in case anyone sees something fishy.
>
> Any ideas?
>
> Thanks,
> Allen
>
> ----------------------------------------------------------------------------------
> import os, sys, gc
> import psycopg2
> import sqlalchemy as sa
> import sqlalchemy.exc
>
> class DataProvider(object):
>   def __init__(self, dbUri, tableName):
>      self._dbUri       = dbUri
>      self._dbEngine    = None
>      self._metadata    = None
>      self._table       = None
>      self._tableName   = tableName
>
>   def initialize(self):
>      self._dbEngine = sa.create_engine(self._dbUri)
>      self._metadata = sa.MetaData()
>      self._metadata.bind = self._dbEngine
>
>      # Build up the table metadata reflectively
>      name_parts = self._tableName.split('.')
>
>      if(len(name_parts) > 1):
>         self._table = sa.Table(name_parts[1], self._metadata, autoload = True,
>                                schema = name_parts[0])
>      else:
>         self._table = sa.Table(name_parts[0], self._metadata, autoload = True)
>
>   def query(self, limit = None):
>      s = self._table.select()
>
>      if limit is not None:
>         s = s.limit(limit)
>
>      print "objs before: ", len(gc.get_objects())
>      print "garbage: ", len(gc.garbage)
>      result_proxy = s.execute()
>      result_objects = [x for x in result_proxy]
>      #print "num results: ", len(result_objects)
>      result_proxy.close()
>      result_objects = []
>      results = []
>      gc.collect(2)
>      print "objs after: ", len(gc.get_objects())
>      print "garbage: ", len(gc.garbage)
>      return results
>
> db_uri          = sys.argv[1]
> collection_name = sys.argv[2]
>
> prov = DataProvider(db_uri, collection_name)
> prov.initialize()
>
> limits = [10000, 20000, 30000, 40000, 50000, 60000,
>            10000, 20000, 30000, 40000, 50000, 60000]
> # Query
> for x in range(10):
>   limit = limits[x]
>   print "----[ %s ] limit: %s ----" % (x, limit)
>   gc.collect(2)
>   printMemoryInformation()
>   features = prov.query(limit = limit)
>   features = None
>   gc.collect(2)
>   printMemoryInformation()
>
> ----------------------------------------------------------------------------------
>

import os, sys, gc
import psycopg2

import sqlalchemy as sa
import sqlalchemy.exc

def getMemoryUsage(VmKey):
   proc_status = '/proc/%d/status' % os.getpid()
   scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
            'KB': 1024.0, 'MB': 1024.0*1024.0}
   v = open(proc_status).read()
   
   # get VmKey line e.g. 'VmRSS:  9999  kB\n ...'
   i = v.index(VmKey)
   v = v[i:].split(None, 3)  # whitespace
   if len(v) < 3:
      return 0.0  # invalid format?
   # convert Vm value to bytes
   return float(v[1]) * scale[v[2]]

def memory():
   return getMemoryUsage('VmSize:') / (1024*1024)

def resident():
   return getMemoryUsage('VmRSS:') / (1024*1024)

def printMemoryUsage():
   print "mem: ", memory(), " res: ", resident()


def createTestData(dbUri, tableName):
   """  Called to create temporary data if needed. """
   engine = sa.create_engine(dbUri)
   metadata = sa.MetaData()
   metadata.bind = engine

   test_table = sa.Table(tableName, metadata,
      sa.Column('id', sa.Integer, primary_key = True),
      sa.Column('text1', sa.String),
      sa.Column('text2', sa.String))

   metadata.create_all(engine)

   s = test_table.select().limit(1)
   have_some = len(s.execute().fetchall()) > 0
   
   if not have_some:
      print "Creating data"
      num_rows = 70000
      for x in xrange(num_rows):
         if (x%1000) == 0:
            print "row: ", x
         ins = test_table.insert()
         ins.execute(text1 = 'a'*1000, text2 = 'b'*1000)

class DataProvider(object):
   def __init__(self, dbUri, tableName):
      self._dbEngine = sa.create_engine(dbUri)
      self._metadata = sa.MetaData()
      self._metadata.bind = self._dbEngine
      self._table = sa.Table(tableName, self._metadata, autoload = True)

   def query(self, limit = None):
      s = self._table.select()

      if limit is not None:
         s = s.limit(limit)

      print "objs before: ", len(gc.get_objects())
      print "garbage: ", len(gc.garbage)
      # Materialize all rows, then drop every Python-level reference to
      # the rows and to the result proxy before collecting.
      result_proxy = s.execute()
      result_objects = [x for x in result_proxy]
      #print "num results: ", len(result_objects)
      result_proxy.close()
      result_objects = []
      results = []
      gc.collect(2)
      print "objs after: ", len(gc.get_objects())
      print "garbage: ", len(gc.garbage)
      return results

#db_uri    = "postgres://localhost/test_db"
db_uri    = "sqlite:///test_db.sqlite"
tablename = "test_table"

createTestData(db_uri, tablename)
prov = DataProvider(db_uri, tablename)
#prov = PgDataProvider(dsn, host, database, username, collection_name)

limits = [10000, 20000, 30000, 40000, 50000, 60000,
          10000, 20000, 30000, 40000, 50000, 60000]
# Query
for x in range(10):
   limit = limits[x]
   print "----[ %s ] limit: %s ----" % (x, limit)
   gc.collect(2)
   printMemoryUsage()
   features = prov.query(limit = limit)
   features = None
   gc.collect(2)
   printMemoryUsage()
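
For comparison, here is a sketch of the same fetch done with raw
psycopg2, bypassing SA entirely (this is not part of the attached
script; the DSN is just an example and the columns match the test
table created above).  It should show whether the memory high-water
mark appears without SQLAlchemy in the picture:

import gc
import psycopg2

def raw_fetch(dsn, limit):
   # Fetch the same rows without SQLAlchemy, then drop every
   # reference and force a full collection, as query() does above.
   conn = psycopg2.connect(dsn)
   cur = conn.cursor()
   cur.execute("SELECT id, text1, text2 FROM test_table LIMIT %s",
               (limit,))
   rows = cur.fetchall()
   num_rows = len(rows)
   rows = None
   cur.close()
   conn.close()
   gc.collect(2)
   return num_rows

# Example: raw_fetch("dbname=test_db", 50000), then check VmRSS again.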
