GK wrote: > Michael, > > Thank you for your response - it was very helpful for me. > > It turns out my main problem was that I was importing an order of > magnitude or so more data than I realized, but you were also right > about using flush(). > > You were also right about the overhead of creating extra indexes. In > the spirit of putting some data in a public space... > > Starting with an empty database, with a test dataset of 1200 values > (about 1150 unique insertions) and flushing after every insertion I > have the following timings: > > No extra indexes: 2:00 > Three extra indexes: 2:15 > > This is using SQLite with a flat file on a 1.8GHz laptop. The records > are each nearly 1Kb. There's an overhead of about 5 seconds for > reading the data, so most of the above time is loading the database. > > I haven't yet had time to judge how the performance varies with larger > datasets.
2:00 seems very high- is that 2 minutes? Below are two similar bulk table loads. The first uses the same insert-or-update methodology and only the relational layer (no ORM)- that clocks in at 1.25 seconds on my laptop. The second is an ORM implementation with a different duplicate detection methodology- that clocks in at 2.0 seconds. --- ## ## Relational version ## import os import time import random from sqlalchemy import * from sqlalchemy.exceptions import IntegrityError data_cols = ( 'sendadr', 'fromadr', 'toadr', 'sendnam', 'fromnam', 'tonam', 'subject', 'received', 'spam', 'folderid' ) chunk = lambda: '%x' % random.getrandbits(400) dataset = [dict((col, chunk()) for col in data_cols) for _ in xrange(1200)] dupes = random.sample(dataset, 50) db = '1krows.db' if os.path.exists(db): os.unlink(db) engine = create_engine('sqlite:///%s' % db) metadata = MetaData(engine) table = Table('t', metadata, Column('id', Integer, primary_key=True), Column('occurs', Integer, default=1), *(Column(col, Text) for col in data_cols)) table.append_constraint(UniqueConstraint(*data_cols)) metadata.create_all() table.insert().execute(dupes) assert table.select().count().scalar() == 50 start = time.time() insert = table.insert() update = (table.update(). where(and_(*((table.c[col] == bindparam(col)) for col in data_cols))). 
values({'occurs': table.c.occurs+1})) conn = engine.connect() tx = conn.begin() for row in dataset: try: conn.execute(insert, row) except IntegrityError: conn.execute(update, row) tx.commit() end = time.time() assert table.select().count().scalar() == 1200 assert select([func.count(table.c.id)], table.c.occurs==2).scalar() == 50 print "elapsed: %04f" % (end - start) ## ## ORM version ## import hashlib import os import time import random from sqlalchemy import * from sqlalchemy.orm import * data_cols = ( 'sendadr', 'fromadr', 'toadr', 'sendnam', 'fromnam', 'tonam', 'subject', 'received', 'spam', 'folderid' ) chunk = lambda: '%x' % random.getrandbits(400) dataset = [dict((col, chunk()) for col in data_cols) for _ in xrange(1200)] def hashrow(row): return hashlib.sha1( ','.join(row[c] for c in data_cols)).hexdigest() dupes = [] for row in random.sample(dataset, 50): dupe = row.copy() dupe['hash'] = hashrow(dupe) dupes.append(dupe) db = '1krows.db' if os.path.exists(db): os.unlink(db) engine = create_engine('sqlite:///%s' % db) metadata = MetaData(engine) table = Table('t', metadata, Column('id', Integer, primary_key=True), Column('occurs', Integer, default=1), Column('hash', String(40), unique=True), *(Column(col, Text) for col in data_cols)) metadata.create_all() table.insert().execute(dupes) assert table.select().count().scalar() == 50 class Email(object): def __init__(self, **kw): for key, value in kw.items(): setattr(self, key, value) def hashval(self): return hashrow(dict((col, getattr(self, col)) for col in data_cols)) mapper(Email, table) start = time.time() session = create_session() session.begin() data = [Email(**row) for row in dataset] chunk, remaining = [], [(e.hashval(), e) for e in data] while remaining: chunk, remaining = remaining[:100], remaining[100:] by_hash = dict(chunk) dupes = (session.query(Email). 
filter(Email.hash.in_(by_hash.keys()))).all() for dupe in dupes: dupe.occurs += 1 by_hash.pop(dupe.hash) for hashval, email in by_hash.items(): email.hash = hashval session.save(email) session.flush() session.commit() end = time.time() assert table.select().count().scalar() == 1200 assert select([func.count(table.c.id)], table.c.occurs==2).scalar() == 50 print "elapsed: %04f" % (end - start) --~--~---------~--~----~------------~-------~--~----~ You received this message because you are subscribed to the Google Groups "sqlalchemy" group. To post to this group, send email to sqlalchemy@googlegroups.com To unsubscribe from this group, send email to [EMAIL PROTECTED] For more options, visit this group at http://groups.google.com/group/sqlalchemy?hl=en -~----------~----~----~----~------~----~------~--~---