I spent some more time and came up with a completely standalone test application (attached; it requires SA 0.5.2 and some database backend). I have tested this with Postgres and with SQLite, and both of them exhibit the behavior I was talking about: the memory keeps increasing and stays topped out even though all handles to the results have been cleared in Python.
I am sure that I must be doing something very wrong, but I can't figure it out. Can anyone point out my error?

-Allen

On Sat, Feb 21, 2009 at 7:52 AM, Allen Bierbaum <abierb...@gmail.com> wrote:
> Hello all:
>
> We are running into a very strange memory leak issue. Here are the details:
>
> - The code executes select statements directly on a table using SA
>   0.5.2, Postgres 8.3, Psycopg2, all running on Linux (Ubuntu 8.04)
> - Querying a table with approximately 70000 records
> - When we query the table, convert the results to a list, and then
>   free both the list and the result proxy, memory seems to stay used
> - If we query the table with ever-increasing size limits (limit
>   clause), the amount of memory consumed increases
> - If we query with a limit of, say, 50000 and then perform smaller
>   queries, the smaller queries do not increase the amount of memory
>   consumed overall
> - In other words, the amount of memory caps out in a way that looks
>   linear with the maximum number of table rows ever returned
>
> It looks like something in SA, Psycopg2, or PG is holding references
> to the returned rows, but we have not been able to track it down. I am
> working on a completely standalone application to demonstrate the
> issue. Before I went much further, though, I wanted to post this
> question and a code sample (not standalone, but it shows the commands
> we are using) in case anyone sees something fishy.
>
> Any ideas?
>
> Thanks,
> Allen
>
> ----------------------------------------------------------------------------------
> import os, sys, gc
> import psycopg2
> import sqlalchemy as sa
> import sqlalchemy.exc
>
> class DataProvider(object):
>     def __init__(self, dbUri, tableName):
>         self._dbUri = dbUri
>         self._dbEngine = None
>         self._metadata = None
>         self._table = None
>         self._tableName = tableName
>
>     def initialize(self):
>         self._dbEngine = sa.create_engine(self._dbUri)
>         self._metadata = sa.MetaData()
>         self._metadata.bind = self._dbEngine
>
>         # Build up the table metadata reflectively
>         name_parts = self._tableName.split('.')
>
>         if len(name_parts) > 1:
>             self._table = sa.Table(name_parts[1], self._metadata, autoload=True,
>                                    schema=name_parts[0])
>         else:
>             self._table = sa.Table(name_parts[0], self._metadata, autoload=True)
>
>     def query(self, limit=None):
>         s = self._table.select()
>
>         if limit is not None:
>             s = s.limit(limit)
>
>         print "objs before: ", len(gc.get_objects())
>         print "garbage: ", len(gc.garbage)
>         result_proxy = s.execute()
>         result_objects = [x for x in result_proxy]
>         #print "num results: ", len(result_objects)
>         result_proxy.close()
>         result_objects = []
>         results = []
>         gc.collect(2)
>         print "objs after: ", len(gc.get_objects())
>         print "garbage: ", len(gc.garbage)
>         return results
>
> db_uri = sys.argv[1]
> collection_name = sys.argv[2]
>
> prov = DataProvider(db_uri, collection_name)
> prov.initialize()
>
> limits = [10000, 20000, 30000, 40000, 50000, 60000,
>           10000, 20000, 30000, 40000, 50000, 60000]
> # Query
> for x in range(10):
>     limit = limits[x]
>     print "----[ %s ] limit: %s ----" % (x, limit)
>     gc.collect(2)
>     printMemoryInformation()
>     features = prov.query(limit=limit)
>     features = None
>     gc.collect(2)
>     printMemoryInformation()
> ----------------------------------------------------------------------------------
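For comparison, the same high-water-mark pattern can be reproduced with no SQLAlchemy or database involved at all. Below is a minimal sketch, separate from the attached test; it assumes Linux (so VmRSS can be read from /proc), each fake "row" only approximates the ~2 KB rows in the test table, and whether freed memory is actually returned to the OS depends on the Python version and the C allocator, so exact numbers will vary:

----------------------------------------------------------------------------------
import os, gc

def resident_mb():
    # Parse VmRSS out of /proc/<pid>/status; assumes the value is in kB.
    for line in open('/proc/%d/status' % os.getpid()):
        if line.startswith('VmRSS:'):
            return int(line.split()[1]) / 1024.0
    return 0.0

for limit in [10000, 20000, 30000, 40000, 50000, 60000, 10000, 20000]:
    # Build, then immediately drop, a list of row-sized tuples.
    rows = [('a' * 1000, 'b' * 1000, x) for x in xrange(limit)]
    rows = None
    gc.collect()
    print "limit %6d -> resident: %.1f MB" % (limit, resident_mb())
----------------------------------------------------------------------------------

If resident memory here also climbs to the largest limit and stays there after the rows are dropped, that points at the interpreter's allocator keeping the freed memory for reuse rather than at SA, Psycopg2, or PG holding row references.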
import os, sys, gc
import psycopg2
import sqlalchemy as sa
import sqlalchemy.exc


def getMemoryUsage(VmKey):
    proc_status = '/proc/%d/status' % os.getpid()
    scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
             'KB': 1024.0, 'MB': 1024.0*1024.0}
    v = open(proc_status).read()
    # get VmKey line e.g. 'VmRSS: 9999 kB\n ...'
    i = v.index(VmKey)
    v = v[i:].split(None, 3)   # whitespace
    if len(v) < 3:
        return 0.0             # invalid format?
    # convert Vm value to bytes
    return float(v[1]) * scale[v[2]]

def memory():
    return getMemoryUsage('VmSize:') / (1024*1024)

def resident():
    return getMemoryUsage('VmRSS:') / (1024*1024)

def printMemoryUsage():
    print "mem: ", memory(), " res: ", resident()


def createTestData(dbUri, tableName):
    """ Called to create temporary data if needed. """
    engine = sa.create_engine(dbUri)
    metadata = sa.MetaData()
    metadata.bind = engine
    test_table = sa.Table(tableName, metadata,
                          sa.Column('id', sa.Integer, primary_key=True),
                          sa.Column('text1', sa.String),
                          sa.Column('text2', sa.String))
    metadata.create_all(engine)

    # Only populate the table if it is empty
    s = test_table.select().limit(1)
    have_some = len(s.execute().fetchall()) > 0
    if not have_some:
        print "Creating data"
        num_rows = 70000
        for x in xrange(num_rows):
            if (x % 1000) == 0:
                print "row: ", x
            ins = test_table.insert()
            ins.execute(text1='a'*1000, text2='b'*1000)


class DataProvider(object):
    def __init__(self, dbUri, tableName):
        self._dbEngine = sa.create_engine(dbUri)
        self._metadata = sa.MetaData()
        self._metadata.bind = self._dbEngine
        self._table = sa.Table(tableName, self._metadata, autoload=True)

    def query(self, limit=None):
        s = self._table.select()
        if limit is not None:
            s = s.limit(limit)

        print "objs before: ", len(gc.get_objects())
        print "garbage: ", len(gc.garbage)
        result_proxy = s.execute()
        result_objects = [x for x in result_proxy]
        #print "num results: ", len(result_objects)
        result_proxy.close()
        # Drop all references to the fetched rows before collecting
        result_objects = []
        results = []
        gc.collect(2)
        print "objs after: ", len(gc.get_objects())
        print "garbage: ", len(gc.garbage)
        return results


#db_uri = "postgres://localhost/test_db"
db_uri = "sqlite:///test_db.sqlite"
tablename = "test_table"

createTestData(db_uri, tablename)

prov = DataProvider(db_uri, tablename)
#prov = PgDataProvider(dsn, host, database, username, collection_name)

limits = [10000, 20000, 30000, 40000, 50000, 60000,
          10000, 20000, 30000, 40000, 50000, 60000]
# Query
for x in range(10):
    limit = limits[x]
    print "----[ %s ] limit: %s ----" % (x, limit)
    gc.collect(2)
    printMemoryUsage()
    features = prov.query(limit=limit)
    features = None
    gc.collect(2)
    printMemoryUsage()
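If part of the peak comes from psycopg2's default client-side cursor, which fetches the entire result set into memory before the first row is returned, one thing worth trying on the Postgres side is a server-side cursor. If I remember right, SA 0.5's postgres dialect accepts a server_side_cursors flag on create_engine. A sketch only, untested here; the URL and the iter_rows() helper are just illustrations:

----------------------------------------------------------------------------------
import sqlalchemy as sa

# server_side_cursors makes psycopg2 use named (server-side) cursors,
# so rows are streamed from Postgres in batches instead of the whole
# result set being buffered client-side.  (Postgres backend only.)
engine = sa.create_engine("postgres://localhost/test_db",
                          server_side_cursors=True)

def iter_rows(table, limit=None):
    # Hypothetical helper: yield rows one at a time so the caller never
    # holds the full result list.
    s = table.select()
    if limit is not None:
        s = s.limit(limit)
    result = engine.execute(s)
    try:
        for row in result:
            yield row
    finally:
        result.close()
----------------------------------------------------------------------------------

Note that this only lowers the peak if the rows are processed and dropped as they stream; the test above would still top out the same way, since it materializes every row into result_objects first.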