Are you referring to memory of the python process or number of unclaimed objects still present? psycopg2 will buffer the full results from the database in memory before fetchone() is ever called, which will force the python process to grow to accommodate it. A python process generally stays large once it grows.
On Feb 21, 2009, at 11:16 AM, Allen Bierbaum wrote: > I spent some more time and came up with a completely standalone test > application. (attached) (requires SA 0.5.2 and some database backend). > I have tested this with postgres and with sqlite and both of them > exhibit the behavior I was talking about. Namely the memory keeps > increasing and stays topped out even though all handles to the results > have been cleared in python. > > I am sure that I must be doing something very wrong, but I can't > figure it out. Can anyone point out my error? > > -Allen > > On Sat, Feb 21, 2009 at 7:52 AM, Allen Bierbaum > <abierb...@gmail.com> wrote: >> Hello all: >> >> We are running into a very strange memory leak issue. Here are the >> details: >> >> - The code executes select statements directly on a table using SA >> 0.5.2, Postgres 8.3, Psycopg2, all running on Linux (Ubuntu 8.04) >> - Querying a table with approximately 70000 records >> - When we query the table, convert the results to a list, and then >> free both the list and the results proxy, memory seems to stay used >> - If we query the table with ever increasing size limits (limit >> clause), the amount of memory consumed increases >> - If we query with a limit of say 50000 and then perform smaller >> queries, the smaller queries do not increase the amount of memory >> consumed overall >> - In other words, the amount of memory caps out in a way that looks >> like it is linear with the maximum number of table rows ever returned >> >> It looks like something in SA, Psycopg2, or PG is holding references >> to the returned rows but we have not been able to track it down. I >> am >> working on a completely standalone application to demonstrate the >> issue. Before I went much further though I wanted to post this >> question and code sample (not standalone, but shows the commands we >> are using) in case anyone sees something fishy. >> >> Any ideas? 
>> >> Thanks, >> Allen >> >> ---------------------------------------------------------------------------------- >> import os, sys, gc >> import psycopg2 >> import sqlalchemy as sa >> import sqlalchemy.exc >> >> class DataProvider(object): >> def __init__(self, dbUri, tableName): >> self._dbUri = dbUri >> self._dbEngine = None >> self._metadata = None >> self._table = None >> self._tableName = tableName >> >> def initialize(self): >> self._dbEngine = sa.create_engine(self._dbUri) >> self._metadata = sa.MetaData() >> self._metadata.bind = self._dbEngine >> >> # Build up the table metadata reflectively >> name_parts = self._tableName.split('.') >> >> if(len(name_parts) > 1): >> self._table = sa.Table(name_parts[1], self._metadata, >> autoload = True, >> schema = name_parts[0]) >> else: >> self._table = sa.Table(name_parts[0], self._metadata, >> autoload = True) >> >> def query(self, limit = None): >> s = self._table.select() >> >> if limit is not None: >> s = s.limit(limit) >> >> print "objs before: ", len(gc.get_objects()) >> print "garbage: ", len(gc.garbage) >> result_proxy = s.execute() >> result_objects = [x for x in result_proxy] >> #print "num results: ", len(result_objects) >> result_proxy.close() >> result_objects = [] >> results = [] >> gc.collect(2) >> print "objs after: ", len(gc.get_objects()) >> print "garbage: ", len(gc.garbage) >> return results >> >> db_uri = sys.argv[0] >> collection_name = sys.argv[1] >> >> prov = DataProvider(db_uri, collection_name) >> prov.initialize() >> >> limits = [10000, 20000, 30000, 40000, 50000, 60000, >> 10000, 20000, 30000, 40000, 50000, 60000] >> # Query >> for x in range(10): >> limit = limits[x] >> print "----[ %s ] limit: %s ----" % (x, limit) >> gc.collect(2) >> printMemoryInformation() >> features = prov.query(limit = limit) >> features = None >> gc.collect(2) >> printMemoryInformation() >> >> ---------------------------------------------------------------------------------- >> > > > > import os, sys, gc > import 
psycopg2 > > import sqlalchemy as sa > import sqlalchemy.exc > > def getMemoryUsage(VmKey): > proc_status = '/proc/%d/status' % os.getpid() > scale = {'kB': 1024.0, 'mB': 1024.0*1024.0, > 'KB': 1024.0, 'MB': 1024.0*1024.0} > v = open(proc_status).read() > > # get VmKey line e.g. 'VmRSS: 9999 kB\n ...' > i = v.index(VmKey) > v = v[i:].split(None, 3) # whitespace > if len(v) < 3: > return 0.0 # invalid format? > # convert Vm value to bytes > return float(v[1]) * scale[v[2]] > > def memory(): > return getMemoryUsage('VmSize:') / (1024*1024) > def resident(): > return getMemoryUsage('VmRSS:') / (1024*1024) > def printMemoryUsage(): > print "mem: ", memory(), " res: ", resident() > > > def createTestData(dbUri, tableName): > """ Called to create temporary data if needed. """ > engine = sa.create_engine(dbUri) > metadata = sa.MetaData() > metadata.bind = engine > > test_table = sa.Table(tableName, metadata, > sa.Column('id', sa.Integer, primary_key = True), > sa.Column('text1', sa.String), > sa.Column('text2', sa.String)) > > metadata.create_all(engine) > > s = test_table.select().limit(1) > have_some = len(s.execute().fetchall()) > 0 > > if not have_some: > print "Creating data" > num_rows = 70000 > for x in xrange(num_rows): > if (x%1000) == 0: > print "row: ", x > ins = test_table.insert() > ins.execute(text1 = 'a'*1000, text2 = 'b'*1000) > > class DataProvider(object): > def __init__(self, dbUri, tableName): > self._dbEngine = sa.create_engine(dbUri) > self._metadata = sa.MetaData() > self._metadata.bind = self._dbEngine > self._table = sa.Table(tableName, self._metadata, autoload = > True) > > def query(self, limit = None): > s = self._table.select() > > if limit is not None: > s = s.limit(limit) > > print "objs before: ", len(gc.get_objects()) > print "garbage: ", len(gc.garbage) > result_proxy = s.execute() > result_objects = [x for x in result_proxy] > #print "num results: ", len(result_objects) > result_proxy.close() > result_objects = [] > results = [] > 
gc.collect(2) > print "objs after: ", len(gc.get_objects()) > print "garbage: ", len(gc.garbage) > return results > > #db_uri = "postgres://localhost/test_db" > db_uri = "sqlite:///test_db.sqlite" > tablename = "test_table" > > createTestData(db_uri, tablename) > prov = DataProvider(db_uri, tablename) > #prov = PgDataProvider(dsn, host, database, username, collection_name) > > limits = [10000, 20000, 30000, 40000, 50000, 60000, > 10000, 20000, 30000, 40000, 50000, 60000] > # Query > for x in range(10): > limit = limits[x] > print "----[ %s ] limit: %s ----" % (x, limit) > gc.collect(2) > printMemoryUsage() > features = prov.query(limit = limit) > features = None > gc.collect(2) > printMemoryUsage() --~--~---------~--~----~------------~-------~--~----~ You received this message because you are subscribed to the Google Groups "sqlalchemy" group. To post to this group, send email to sqlalchemy@googlegroups.com To unsubscribe from this group, send email to sqlalchemy+unsubscr...@googlegroups.com For more options, visit this group at http://groups.google.com/group/sqlalchemy?hl=en -~----------~----~----~----~------~----~------~--~---