Klaas wrote:
22.2s  20m25s[3]

20 minutes to insert 1M keys?  You are doing something wrong.

Hi Mike.

I've put together some simplified test code, but the bsddb
module still takes about 11 minutes for 1M keys:

Number generator test for 1000000 number ranges
        with a maximum of 3 wildcard digits.
Wed May 17 22:18:17 2006 dictionary population started
Wed May 17 22:18:26 2006 dictionary population stopped, duration 8.6s
Wed May 17 22:18:27 2006 StorageBerkeleyDB population started
Wed May 17 22:29:32 2006 StorageBerkeleyDB population stopped, duration 665.6s
Wed May 17 22:29:33 2006 StorageSQLite population started
Wed May 17 22:30:38 2006 StorageSQLite population stopped, duration 65.5s

Test code is attached.

With BDBs it is crucial to insert keys in bytestring-sorted order.

For the bsddb test, I'm using a plain string.  (The module docs list a
string as the only datatype supported for both keys & values.)

Also, be sure to give it a decent amount of cache.

The bsddb.hashopen() factory seems to have a bug in this regard; if you
supply a cachesize argument, then it barfs:

...
  File "bsddb-test.py", line 67, in runtest
    db = bsddb.hashopen(None, flag='c', cachesize=8192)
  File "/usr/lib/python2.4/bsddb/__init__.py", line 288, in hashopen
    if cachesize is not None: d.set_cachesize(0, cachesize)
bsddb._db.DBInvalidArgError: (22, 'Invalid argument -- DB->set_cachesize: method not permitted when environment specified')


I'll file a bug report on this if it isn't already fixed.

Cheers,
Chris
import bsddb
import random
import time
import sqlite

# The import is unconditional, so the script fails at startup if the psyco
# package is not installed.
# NOTE(review): psyco.full() presumably turns on Psyco's specialising
# compiler for the whole program, so the timings printed by the tests would
# be for accelerated code -- confirm.
import psyco
psyco.full()

class WallClockTimer(object):
    '''Simple wall-clock timer for measuring tests.

    Creating an instance records the current time and prints a "started"
    line; stop() prints a "stopped" line that includes the elapsed seconds.
    '''
    def __init__(self, msg):
        '''Start timing.

        msg -- label printed in both the start and the stop line.
        '''
        self.start = time.time()
        self.msg = msg
        # A single parenthesised argument prints the same text whether
        # print is a statement or a function.
        print('%s %s started' % (time.ctime(self.start), self.msg))

    def stop(self):
        '''Print the stop line and return the elapsed time in seconds.'''
        end = time.time()
        duration = end - self.start
        print('%s %s stopped, duration %.1fs'
              % (time.ctime(end), self.msg, duration))
        # Returned so callers can use the measurement, not just read it.
        return duration

class NumberGen(object):
    '''Phone number generator for testing different methods of storage.

    The constructor fills self.number_hash with n distinct keys of the form
    (number_prefix, wildcard_digits); every value is None.
    '''
    def __init__(self, range_start, range_end, n, max_wildcards):
        '''Generate n unique keys and time how long that takes.

        range_start, range_end -- bounds passed to gen_number().
        n             -- number of distinct keys to generate.
        max_wildcards -- largest wildcard digit count (0..max_wildcards
                         are all candidates).

        The loop only ends once n distinct keys exist, so n must not
        exceed the number of distinct keys the range can produce.
        '''
        print('''Number generator test for %d number ranges
        with a maximum of %d wildcard digits.''' % (n, max_wildcards))

        wildcards = range(0, max_wildcards + 1)
        # generate n unique numbers and store as keys in number_hash
        timer = WallClockTimer('dictionary population')
        self.number_hash = {}

        # A colliding key simply overwrites itself, so looping until the
        # dict holds n entries yields exactly n unique keys.
        while len(self.number_hash) < n:
            wildcard_digits = self.gen_wildcard_digits(wildcards)
            num = self.gen_number(range_start, range_end)
            self.number_hash[self._make_key(num, wildcard_digits)] = None
        timer.stop()

    def _make_key(self, num, wildcard_digits):
        '''Return (num without its last wildcard_digits digits, wildcard_digits).'''
        if wildcard_digits > 0:
            # Floor division keeps the prefix an integer even when "/"
            # means true division.
            num //= 10 ** wildcard_digits
        return (num, wildcard_digits)

    def gen_wildcard_digits(self, wildcards):
        '''Pick one wildcard digit count at random from wildcards.'''
        return random.choice(wildcards)

    def gen_number(self, start, end):
        '''Return a random number drawn from start..end, truncated to an int.'''
        return int(random.uniform(start, end))

    def storage_test(self, StorageTestClass):
        '''Run StorageTestClass against the generated keys.

        StorageTestClass is called with self.number_hash; the object it
        returns is handed back to the caller.
        '''
        return StorageTestClass(self.number_hash)

class StorageTest(object):
    '''Base class for testing storage.  Derive a test
    class and provide your own runtest() method.

    Instantiating a subclass runs its runtest() once, wrapped in a
    WallClockTimer labelled with the subclass name.
    '''
    def __init__(self, number_hash):
        '''Time a single call of self.runtest(number_hash).'''
        timer = WallClockTimer('%s population' % type(self).__name__)
        self.runtest(number_hash)
        timer.stop()

    def runtest(self, number_hash):
        '''Store the keys of number_hash; subclasses must override this.'''
        # Fail with a clear message instead of an AttributeError when a
        # subclass forgets to supply runtest().
        raise NotImplementedError(
            '%s must provide its own runtest() method' % type(self).__name__)

class StorageBerkeleyDB(StorageTest):
    '''Storage test that inserts every key into a bsddb hash database.'''
    def runtest(self, number_hash):
        '''Insert each key of number_hash as the string "num:wildcard_digits".

        The database is opened with no file name and closed again before
        returning, even if an insert fails.
        '''
        try:
            db = bsddb.hashopen(None, flag='c', cachesize=8192)
        except bsddb.db.DBInvalidArgError:
            # hashopen() has been seen to reject cachesize with
            # "DB->set_cachesize: method not permitted when environment
            # specified"; retry with the default cache so the test can run.
            db = bsddb.hashopen(None, flag='c')
        try:
            # Iterate the dict itself rather than a copied list of its keys.
            for (num, wildcard_digits) in number_hash:
                key = '%d:%d' % (num, wildcard_digits)
                db[key] = None
        finally:
            db.close()

class StorageSQLite(StorageTest):
    '''Storage test that inserts every key into a ':memory:' SQLite table.'''
    def runtest(self, number_hash):
        '''Insert each (num, wildcard_digits) key as one row of "numbers".

        All rows are committed in one go after the last insert; the
        connection is closed before returning, even if an insert fails.
        '''
        db = sqlite.connect(':memory:')
        try:
            cursor = db.cursor()
            cursor.execute('CREATE TABLE numbers (num INTEGER, wd INTEGER)')
            q = """insert into numbers (num, wd) values (%d, %d)"""
            # Iterate the dict itself rather than a copied list of its keys.
            for (num, wildcard_digits) in number_hash:
                cursor.execute(q, num, wildcard_digits)
            db.commit()
        finally:
            db.close()


if __name__ == '__main__':

    # Parameters for the key generator; the names match the arguments of
    # NumberGen so the dict can be unpacked straight into the call.
    test_vals = dict(range_start=61880 * 10 ** 7,
                     range_end=61891 * 10 ** 7,
                     n=1 * 10 ** 4,
                     max_wildcards=3)

    ngen = NumberGen(**test_vals)

    # Run each storage backend against the same set of generated keys.
    for storage_class in (StorageBerkeleyDB, StorageSQLite):
        ngen.storage_test(storage_class)
-- 
http://mail.python.org/mailman/listinfo/python-list

Reply via email to