Hi!
I started moving the sqlite sack to operate on pkgKey (table index) instead
of pkgId (rpm hash value) quite a while ago - in fact this was one of the
first things I did in yum. While some changes are already upstream, some
of these changes are still pending.
The first patch changes a lot of SQL statements, adds a per-pkgKey
package cache that makes sure we load every package only once, and adds
per-pkgKey excludes.
The second patch adds a Least Recently Used (LRU) list library and uses it to
keep only a limited number of PRCOs in memory at a time.
The overall memory saving is 85MB (33%) for an F8 full install and 19MB
(20%) for a 1000 pkg install. Runtime is reduced by 10-20%.
The LRU lib is LGPLv2.1. If we want to use it we have to decide whether we
want to include it or package it for Fedora.
Florian
>From b059939a54bf9b8466aadbfaa6cf4a7dcc156ad3 Mon Sep 17 00:00:00 2001
From: Florian Festi <[EMAIL PROTECTED]>
Date: Tue, 11 Dec 2007 17:31:14 +0100
Subject: [PATCH] Move most operation to be based on pkgKey instead of pkgId to gain some speed.
Generate only one package object per package to avoid duplicates.
Load only minimal tags per default.
---
yum/sqlitesack.py | 149 ++++++++++++++++++++++++++++-------------------------
1 files changed, 78 insertions(+), 71 deletions(-)
diff --git a/yum/sqlitesack.py b/yum/sqlitesack.py
index 0c612fd..78797cc 100644
--- a/yum/sqlitesack.py
+++ b/yum/sqlitesack.py
@@ -220,15 +220,17 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
def __init__(self, packageClass):
# Just init as usual and create a dict to hold the databases
- yumRepo.YumPackageSack.__init__(self,packageClass)
+ yumRepo.YumPackageSack.__init__(self, packageClass)
self.primarydb = {}
self.filelistsdb = {}
self.otherdb = {}
self.excludes = {}
+ self._excludes = set() # of (repo, pkgKey)
self._search_cache = {
'provides' : { },
'requires' : { },
}
+ self._key2pkg = {}
@catchSqliteException
def __len__(self):
@@ -276,7 +278,7 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
if not self.excludes.has_key(obj.repo):
self.excludes[obj.repo] = {}
self.excludes[obj.repo][obj.pkgId] = 1
-
+ self._excludes.add( (obj.repo, obj.pkgKey) )
def _excluded(self, repo, pkgId):
if self.excludes.has_key(repo):
@@ -284,6 +286,22 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
return True
return False
+
+ def _pkgKeyExcluded(self, repo, pkgKey):
+ return (repo, pkgKey) in self._excludes
+
+ def _pkgExcluded(self, po):
+ return (po.repo, po.pkgKey) in self._excludes
+
+ def _packageByKey(self, repo, pkgKey):
+ if not self._key2pkg.has_key(repo):
+ self._key2pkg[repo] = {}
+ if not self._key2pkg[repo].has_key(pkgKey):
+ cur = self.primarydb[repo].cursor()
+ executeSQL(cur, "select pkgKey, pkgId, name, epoch, version, release from packages where pkgKey = ?", (pkgKey,))
+ po = self.pc(repo, cur.fetchone())
+ self._key2pkg[repo][pkgKey] = po
+ return self._key2pkg[repo][pkgKey]
def addDict(self, repo, datatype, dataobj, callback=None):
if self.added.has_key(repo):
@@ -344,19 +362,22 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
for (rep,cache) in self.filelistsdb.items():
cur = cache.cursor()
+ if glob:
+ dirname_check = ""
+ else:
+ dirname = os.path.dirname(name)
+ dirname_check = "dirname = '%s' and " % dirname
+
# grab the entries that are a single file in the
# filenames section, use sqlites globbing if it is a glob
- executeSQL(cur, "select packages.pkgId as pkgId from filelist, \
- packages where packages.pkgKey = filelist.pkgKey and \
- length(filelist.filetypes) = 1 and \
- filelist.dirname || ? || filelist.filenames \
- %s ?" % querytype, ('/', name))
+ executeSQL(cur, "select pkgKey from filelist where \
+ %s length(filetypes) = 1 and \
+ dirname || ? || filenames \
+ %s ?" % (dirname_check, querytype), ('/', name))
for ob in cur:
- if self._excluded(rep, ob['pkgId']):
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
continue
- pkg = self.getPackageDetails(ob['pkgId'])
- po = self.pc(rep, pkg)
- pkgs.append(po)
+ pkgs.append(self._packageByKey(rep, ob['pkgKey']))
def filelist_globber(dirname, filenames):
files = filenames.split('/')
@@ -367,27 +388,17 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
matches = filter(lambda x: name==x, fns)
return len(matches)
- if glob:
- dirname_check = ""
- else:
- dirname = os.path.dirname(name)
- dirname_check = "filelist.dirname = '%s' and " % dirname
-
cache.create_function("filelist_globber", 2, filelist_globber)
# for all the ones where filenames is multiple files,
# make the files up whole and use python's globbing method
- executeSQL(cur, "select packages.pkgId as pkgId \
- from filelist, packages where \
- %s length(filelist.filetypes) > 1 \
- and filelist_globber(filelist.dirname,filelist.filenames) \
- and packages.pkgKey = filelist.pkgKey " % dirname_check)
+ executeSQL(cur, "select pkgKey from filelist where \
+ %s length(filetypes) > 1 \
+ and filelist_globber(dirname,filenames)" % dirname_check)
for ob in cur:
- if self._excluded(rep, ob['pkgId']):
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
continue
- pkg = self.getPackageDetails(ob['pkgId'])
- po = self.pc(rep, pkg)
- pkgs.append(po)
+ pkgs.append(self._packageByKey(rep, ob['pkgKey']))
pkgs = misc.unique(pkgs)
return pkgs
@@ -399,7 +410,7 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
if len(fields) < 1:
return result
- basestring="select DISTINCT pkgId from packages where %s like '%%%s%%' " % (fields[0], searchstring)
+ basestring="select DISTINCT pkgKey from packages where %s like '%%%s%%' " % (fields[0], searchstring)
for f in fields[1:]:
basestring = "%s or %s like '%%%s%%' " % (basestring, f, searchstring)
@@ -408,11 +419,9 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
cur = cache.cursor()
executeSQL(cur, basestring)
for ob in cur:
- if self._excluded(rep, ob['pkgId']):
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
continue
- pkg = self.getPackageDetails(ob['pkgId'])
- result.append((self.pc(rep,pkg)))
-
+ result.append(self._packageByKey(rep, ob['pkgKey']))
return result
@catchSqliteException
@@ -424,7 +433,7 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
for (rep,cache) in self.primarydb.items():
cur = cache.cursor()
executeSQL(cur, "select packages.name as name,\
- packages.pkgId as pkgId,\
+ packages.pkgKey as pkgKey,\
packages.arch as arch, packages.epoch as epoch,\
packages.release as release, packages.version as version,\
obsoletes.name as oname, obsoletes.epoch as oepoch,\
@@ -434,7 +443,7 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
for ob in cur:
# If the package that is causing the obsoletes is excluded
# continue without processing the obsoletes
- if self._excluded(rep, ob['pkgId']):
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
continue
key = ( ob['name'],ob['arch'],
@@ -499,12 +508,9 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
if rpmUtils.miscutils.rangeCompare(req, val):
tmp.setdefault(x['pkgKey'], []).append(val)
for pkgKey, hits in tmp.iteritems():
- executeSQL(cur, "select * from packages where pkgKey=?",
- (pkgKey,))
- x = cur.fetchone()
- if self._excluded(rep,x['pkgId']):
+ if self._pkgKeyExcluded(rep, pkgKey):
continue
- result[self.pc(rep,x)] = hits
+ result[self._packageByKey(rep, pkgKey)] = hits
if prcotype != 'provides' or name[0] != '/':
self._search_cache[prcotype][req] = result
@@ -527,11 +533,11 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
# If it is a filename, search the primary.xml file info
for (rep,cache) in self.primarydb.items():
cur = cache.cursor()
- executeSQL(cur, "select DISTINCT packages.* from files,packages where files.name = ? and files.pkgKey = packages.pkgKey", (name,))
- for x in cur:
- if self._excluded(rep,x['pkgId']):
+ executeSQL(cur, "select DISTINCT pkgKey from files where name = ?", (name,))
+ for ob in cur:
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
continue
- result[self.pc(rep,x)] = [(name, None, None)]
+ result[self._packageByKey(rep, ob['pkgKey'])] = [(name, None, None)]
self._search_cache[prcotype][req] = result
return result
@@ -554,11 +560,11 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
results = []
for (rep,cache) in self.primarydb.items():
cur = cache.cursor()
- executeSQL(cur, "select DISTINCT packages.* from %s,packages where %s.name %s ? and %s.pkgKey=packages.pkgKey" % (prcotype,prcotype,querytype,prcotype), (name,))
- for x in cur:
- if self._excluded(rep, x['pkgId']):
+ executeSQL(cur, "select DISTINCT pkgKey from %s where name %s ?" % (prcotype,querytype), (name,))
+ for ob in cur:
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
continue
- results.append(self.pc(rep, x))
+ results.append(self._packageByKey(rep, ob['pkgKey']))
# If it's not a provides or a filename, we are done
if prcotype != "provides" or name[0] != '/':
@@ -568,11 +574,11 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
# If it is a filename, search the primary.xml file info
for (rep,cache) in self.primarydb.items():
cur = cache.cursor()
- executeSQL(cur, "select DISTINCT packages.* from files,packages where files.name %s ? and files.pkgKey = packages.pkgKey" % querytype, (name,))
- for x in cur:
- if self._excluded(rep,x['pkgId']):
+ executeSQL(cur, "select DISTINCT pkgKey from files where name %s ?" % querytype, (name,))
+ for ob in cur:
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
continue
- results.append(self.pc(rep,x))
+ results.append(self._packageByKey(rep, ob['pkgKey']))
matched = 0
globs = ['.*bin\/.*', '^\/etc\/.*', '^\/usr\/lib\/sendmail$']
@@ -689,11 +695,11 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
allpkg = []
for (rep,cache) in self.primarydb.items():
cur = cache.cursor()
- executeSQL(cur, "select pkgId,name,epoch,version,release,arch from packages where name=? and arch=?",naTup)
- for x in cur:
- if self._excluded(rep, x['pkgId']):
- continue
- allpkg.append(self.pc(rep,x))
+ executeSQL(cur, "select pkgKey from packages where name=? and arch=?",naTup)
+ for ob in cur:
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
+ continue
+ allpkg.append(self._packageByKey(rep, ob['pkgKey']))
# if we've got zilch then raise
if not allpkg:
@@ -711,11 +717,11 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
allpkg = []
for (rep,cache) in self.primarydb.items():
cur = cache.cursor()
- executeSQL(cur, "select pkgId,name,epoch,version,release,arch from packages where name=?", (name,))
- for x in cur:
- if self._excluded(rep, x['pkgId']):
- continue
- allpkg.append(self.pc(rep,x))
+ executeSQL(cur, "select pkgKey from packages where name=?", (name,))
+ for ob in cur:
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
+ continue
+ allpkg.append(self._packageByKey(rep, ob['pkgKey']))
# if we've got zilch then raise
if not allpkg:
@@ -741,11 +747,11 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
cur = db.cursor()
executeSQL(cur, query)
for pkg in cur:
- if self._excluded(rep, pkg['pkgId']):
+ if self._pkgKeyExcluded(rep, pkg['pkgKey']):
continue
if p in unmatched:
unmatched.remove(p)
- matchres.append(self.pc(rep, pkg))
+ matchres.append(self._packageByKey(rep, pkg['pkgKey']))
exactmatch = misc.unique(exactmatch)
matched = misc.unique(matched)
@@ -762,9 +768,11 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
if (repoid == None or repoid == repo.id):
cur = cache.cursor()
- executeSQL(cur, "select pkgId,name,epoch,version,release,arch from packages")
+ executeSQL(cur, "select pkgId, pkgKey, name,epoch,version,release,arch from packages")
for x in cur:
- returnList.append(self.pc(repo,x))
+ po = self.pc(repo,x)
+ self._key2pkg.setdefault(repo, {})[po.pkgKey] = po
+ returnList.append(po)
self.pkgobjlist = returnList
def returnPackages(self, repoid=None):
@@ -796,7 +804,7 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
return returnList
# make up our execute string
- q = "select * from packages WHERE"
+ q = "select pkgKey from packages WHERE"
for (col, var) in [('name', name), ('epoch', epoch), ('version', ver),
('arch', arch), ('release', rel)]:
if var:
@@ -808,12 +816,11 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
# Search all repositories
for (rep,cache) in self.primarydb.items():
cur = cache.cursor()
- #executeSQL(cur, "select * from packages WHERE name = %s AND epoch = %s AND version = %s AND release = %s AND arch = %s" , (name,epoch,ver,rel,arch))
executeSQL(cur, q)
- for x in cur:
- if self._excluded(rep, x['pkgId']):
+ for ob in cur:
+ if self._pkgKeyExcluded(rep, ob['pkgKey']):
continue
- returnList.append(self.pc(rep,x))
+ returnList.append(self._packageByKey(rep, ob['pkgKey']))
return returnList
@catchSqliteException
@@ -826,7 +833,7 @@ class YumSqlitePackageSack(yumRepo.YumPackageSack):
for (rep, cache) in self.primarydb.items():
cur = cache.cursor()
- myq = "select pkgId from packages where arch not in %s" % arch_query
+ myq = "select pkgId, pkgKey from packages where arch not in %s" % arch_query
executeSQL(cur, myq)
for row in cur:
obj = self.pc(rep,row)
@@ -864,7 +871,7 @@ def decodefiletypelist(filetypestring):
# Check against name, nameArch, nameVerRelArch, nameVer, nameVerRel,
# envra, nevra
PARSE_QUERY = """
-select pkgId, name, arch, epoch, version, release from packages
+select pkgKey from packages
where name %(op)s '%(q)s'
or name || '.' || arch %(op)s '%(q)s'
or name || '-' || version %(op)s '%(q)s'
--
1.5.3.3
>From ea49f2c885f6775e3afbbfd308c256752bfc6417 Mon Sep 17 00:00:00 2001
From: Florian Festi <[EMAIL PROTECTED]>
Date: Fri, 14 Dec 2007 11:00:24 +0100
Subject: [PATCH] Use Least Recently Used lists for PRCO to save memory
---
yum/depsolve.py | 4 +-
yum/lrucache.py | 738 +++++++++++++++++++++++++++++++++++++++++++++++++++++
yum/sqlitesack.py | 50 +++-
3 files changed, 777 insertions(+), 15 deletions(-)
create mode 100644 yum/lrucache.py
diff --git a/yum/depsolve.py b/yum/depsolve.py
index 18ddae4..b35d88f 100644
--- a/yum/depsolve.py
+++ b/yum/depsolve.py
@@ -696,7 +696,9 @@ class Depsolve(object):
p = pstats.Stats('yumprof')
p.strip_dirs()
p.sort_stats('time')
- p.print_stats(20)
+ p.print_stats(40)
+ p.sort_stats('cumulative')
+ p.print_stats(40)
return rc
def resolveDeps(self):
diff --git a/yum/lrucache.py b/yum/lrucache.py
new file mode 100644
index 0000000..c9321cc
--- /dev/null
+++ b/yum/lrucache.py
@@ -0,0 +1,738 @@
+# Copyright (c) 2002-2005 Bryce "Zooko" Wilcox-O'Hearn
+# portions Copyright (c) 2001 Autonomous Zone Industries
+# This file is licensed under the
+# GNU Lesser General Public License v2.1.
+# See the file COPYING or visit http://www.gnu.org/ for details.
+# http://zooko.com/repos/pyutil/pyutil/pyutil/
+"""
+This module offers three implementations of an LRUCache, which is a dict that
+drops items according to a Least-Recently-Used policy if the dict exceeds a
+fixed maximum size.
+
+Warning: if -O optimizations are not turned on then LRUCache performs
+extensive self-analysis in every function call, which can take minutes
+and minutes for a large cache. Turn on -O, or comment out assert
+self._assert_invariants()
+"""
+
+import operator
+
+#from assertutil import _assert, precondition, postcondition
+#from humanreadable import hr
+
+class LRUCache:
+ """
+ An efficient least-recently-used cache. It keeps an LRU queue, and when
+ the number of items in the cache reaches maxsize, it removes the least
+ recently used item.
+
+ "Looking" at an item, key, or value such as with "has_key()" makes that
+ item become the most recently used item.
+
+ You can also use "refresh()" to explicitly make an item become the most
+ recently used item.
+
+ Adding an item that is already in the dict *does* make it the most-
+ recently-used item although it does not change the state of the dict
+ itself.
+ """
+ class ItemIterator:
+ def __init__(self, c):
+ self.c = c
+ self.i = c.d[c.hs][2]
+ def __iter__(self):
+ return self
+ def next(self):
+ if self.i is self.c.ts:
+ raise StopIteration
+ k = self.i
+ precondition(self.c.d.has_key(k), "The iterated LRUCache doesn't have the next key. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", k, self.c)
+ (v, p, n,) = self.c.d[k]
+ self.i = n
+ return (k, v,)
+
+ class KeyIterator:
+ def __init__(self, c):
+ self.c = c
+ self.i = c.d[c.hs][2]
+ def __iter__(self):
+ return self
+ def next(self):
+ if self.i is self.c.ts:
+ raise StopIteration
+ k = self.i
+ precondition(self.c.d.has_key(k), "The iterated LRUCache doesn't have the next key. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", k, self.c)
+ (v, p, n,) = self.c.d[k]
+ self.i = n
+ return k
+
+ class ValIterator:
+ def __init__(self, c):
+ self.c = c
+ self.i = c.d[c.hs][2]
+ def __iter__(self):
+ return self
+ def next(self):
+ if self.i is self.c.ts:
+ raise StopIteration
+ precondition(self.c.d.has_key(self.i), "The iterated LRUCache doesn't have the next key. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", self.i, self.c)
+ (v, p, n,) = self.c.d[self.i]
+ self.i = n
+ return v
+
+ class Sentinel:
+ def __init__(self, msg):
+ self.msg = msg
+ def __repr__(self):
+ return "<%s %s>" % (self.__class__.__name__, self.msg,)
+
+ def __init__(self, initialdata={}, maxsize=128):
+ precondition(maxsize > 0)
+ self.m = maxsize+2 # The +2 is for the head and tail nodes.
+ self.d = {} # k: k, v: [v, prev, next,] # the dict
+ self.hs = LRUCache.Sentinel("hs")
+ self.ts = LRUCache.Sentinel("ts")
+ self.d[self.hs] = [None, self.hs, self.ts,] # This allows us to use sentinels as normal nodes.
+ self.d[self.ts] = [None, self.hs, self.ts,] # This allows us to use sentinels as normal nodes.
+ self.update(initialdata)
+
+ #assert self._assert_invariants()
+
+ def __repr_n__(self, n=None):
+ s = ["{",]
+ try:
+ iter = self.iteritems()
+ x = iter.next()
+ s.append(str(x[0])); s.append(": "); s.append(str(x[1]))
+ i = 1
+ while (n is None) or (i < n):
+ x = iter.next()
+ s.append(", "); s.append(str(x[0])); s.append(": "); s.append(str(x[1]))
+ except StopIteration:
+ pass
+ s.append("}")
+ return ''.join(s)
+
+ def __repr__(self):
+ return "<%s %s>" % (self.__class__.__name__, self.__repr_n__(),)
+
+ def __str__(self):
+ return "<%s %s>" % (self.__class__.__name__, self.__repr_n__(16),)
+
+ def _assert_invariants(self):
+ _assert(len(self.d) <= self.m, "Size is required to be <= maxsize.", len(self.d), self.m)
+ _assert((len(self.d) > 2) == (self.d[self.hs][2] is not self.ts) == (self.d[self.ts][1] is not self.hs), "Head and tail point to something other than each other if and only if there is at least one element in the dictionary.", self.hs, self.ts, len(self.d))
+ foundprevsentinel = 0
+ foundnextsentinel = 0
+ for (k, (v, p, n,)) in self.d.iteritems():
+ _assert(v not in (self.hs, self.ts,))
+ _assert(p is not self.ts, "A reference to the tail sentinel may not appear in prev.", k, v, p, n)
+ _assert(n is not self.hs, "A reference to the head sentinel may not appear in next.", k, v, p, n)
+ _assert(p in self.d, "Each prev is required to appear as a key in the dict.", k, v, p, n)
+ _assert(n in self.d, "Each next is required to appear as a key in the dict.", k, v, p, n)
+ if p is self.hs:
+ foundprevsentinel += 1
+ _assert(foundprevsentinel <= 2, "No more than two references to the head sentinel may appear as a prev.", k, v, p, n)
+ if n is self.ts:
+ foundnextsentinel += 1
+ _assert(foundnextsentinel <= 2, "No more than one reference to the tail sentinel may appear as a next.", k, v, p, n)
+ _assert(foundprevsentinel == 2, "A reference to the head sentinel is required appear as a prev (plus a self-referential reference).")
+ _assert(foundnextsentinel == 2, "A reference to the tail sentinel is required appear as a next (plus a self-referential reference).")
+
+ count = 0
+ for (k, v,) in self.iteritems():
+ _assert(k not in (self.hs, self.ts,))
+ count += 1
+ _assert(count == len(self.d)-2, count, len(self.d)) # -2 for the sentinels
+
+ return True
+
+ def freshen(self, k, strictkey=False):
+ #assert self._assert_invariants()
+
+ if not self.d.has_key(k):
+ if strictkey:
+ raise KeyError, k
+ return
+
+ node = self.d[k]
+
+ # relink
+ self.d[node[1]][2] = node[2]
+ self.d[node[2]][1] = node[1]
+
+ # move to front
+ hnode = self.d[self.hs]
+
+ node[1] = self.hs
+ node[2] = hnode[2]
+ hnode[2] = k
+ self.d[node[2]][1] = k
+
+ #assert self._assert_invariants()
+
+ def iteritems(self):
+ return LRUCache.ItemIterator(self)
+
+ def itervalues(self):
+ return LRUCache.ValIterator(self)
+
+ def iterkeys(self):
+ return self.__iter__()
+
+ def __iter__(self):
+ return LRUCache.KeyIterator(self)
+
+ def __getitem__(self, key, default=None, strictkey=True):
+ node = self.d.get(key)
+ if not node:
+ if strictkey:
+ raise KeyError, key
+ return default
+ self.freshen(key)
+ return node[0]
+
+ def __setitem__(self, k, v=None):
+ #assert self._assert_invariants()
+
+ node = self.d.get(k)
+ if node:
+ node[0] = v
+ self.freshen(k)
+ return
+
+ if len(self.d) == self.m:
+ # If this insert is going to increase the size of the cache to
+ # bigger than maxsize.
+ self.pop()
+
+ hnode = self.d[self.hs]
+ n = hnode[2]
+ self.d[k] = [v, self.hs, n,]
+ hnode[2] = k
+ self.d[n][1] = k
+
+ #assert self._assert_invariants()
+ return v
+
+ def __delitem__(self, key, default=None, strictkey=True):
+ """
+ @param strictkey: True if you want a KeyError in the case that
+ key is not there, False if you want a reference to default
+ in the case that key is not there
+ @param default: the object to return if key is not there; This
+ is ignored if strictkey.
+
+ @return: the value removed or default if there is not item by
+ that key and strictkey is False
+ """
+ #assert self._assert_invariants()
+ if self.d.has_key(key):
+ node = self.d[key]
+ # relink
+ self.d[node[1]][2] = node[2]
+ self.d[node[2]][1] = node[1]
+ del self.d[key]
+ #assert self._assert_invariants()
+ return node[0]
+ elif strictkey:
+ #assert self._assert_invariants()
+ raise KeyError, key
+ else:
+ #assert self._assert_invariants()
+ return default
+
+ def has_key(self, key):
+ #assert self._assert_invariants()
+ if self.d.has_key(key):
+ self.freshen(key)
+ #assert self._assert_invariants()
+ return True
+ else:
+ #assert self._assert_invariants()
+ return False
+
+ def clear(self):
+ #assert self._assert_invariants()
+ self.d.clear()
+ self.d[self.hs] = [None, self.hs, self.ts,] # This allows us to use sentinels as normal nodes.
+ self.d[self.ts] = [None, self.hs, self.ts,] # This allows us to use sentinels as normal nodes.
+ #assert self._assert_invariants()
+
+ def update(self, otherdict):
+ """
+ @return: self
+ """
+ #assert self._assert_invariants()
+
+ if len(otherdict) >= (self.m-2): # -2 for the sentinel nodes
+ # optimization
+ self.clear()
+ #assert self._assert_invariants()
+
+ i = otherdict.iteritems()
+ try:
+ while len(self.d) < self.m:
+ (k, v,) = i.next()
+ #assert self._assert_invariants()
+ self[k] = v
+ #assert self._assert_invariants()
+ return self
+ except StopIteration:
+ _assert(False, "Internal error -- this should never have happened since the while loop should have terminated first.")
+ return self
+
+ for (k, v,) in otherdict.iteritems():
+ #assert self._assert_invariants()
+ self[k] = v
+ #assert self._assert_invariants()
+
+ def pop(self):
+ #assert self._assert_invariants()
+ if len(self.d) < 2: # the +2 is for the sentinels
+ raise KeyError, 'popitem(): dictionary is empty'
+ k = self.d[self.ts][1]
+ self.remove(k)
+ #assert self._assert_invariants()
+ return k
+
+ def popitem(self):
+ #assert self._assert_invariants()
+ if len(self.d) < 2: # the +2 is for the sentinels
+ raise KeyError, 'popitem(): dictionary is empty'
+ k = self.d[self.ts][1]
+ val = self.remove(k)
+ #assert self._assert_invariants()
+ return (k, val,)
+
+ def keys_unsorted(self):
+ #assert self._assert_invariants()
+ t = self.d.copy()
+ del t[self.hs]
+ del t[self.ts]
+ #assert self._assert_invariants()
+ return t.keys()
+
+ def keys(self):
+ res = [None] * len(self)
+ i = 0
+ for k in self.iterkeys():
+ res[i] = k
+ i += 1
+ return res
+
+ def values_unsorted(self):
+ #assert self._assert_invariants()
+ t = self.d.copy()
+ del t[self.hs]
+ del t[self.ts]
+ #assert self._assert_invariants()
+ return map(operator.__getitem__, t.values(), [0]*len(t))
+
+ def values(self):
+ res = [None] * len(self)
+ i = 0
+ for v in self.itervalues():
+ res[i] = v
+ i += 1
+ return res
+
+ def items(self):
+ res = [None] * len(self)
+ i = 0
+ for it in self.iteritems():
+ res[i] = it
+ i += 1
+ return res
+
+ def __len__(self):
+ return len(self.d) - 2
+
+ def insert(self, key, val=None):
+ #assert self._assert_invariants()
+ result = self.__setitem__(key, val)
+ #assert self._assert_invariants()
+ return result
+
+ def setdefault(self, key, default=None):
+ #assert self._assert_invariants()
+ if not self.has_key(key):
+ self[key] = default
+ #assert self._assert_invariants()
+ return self[key]
+
+ def get(self, key, default=None):
+ return self.__getitem__(key, default, strictkey=False)
+
+ def remove(self, key, default=None, strictkey=True):
+ #assert self._assert_invariants()
+ result = self.__delitem__(key, default, strictkey)
+ #assert self._assert_invariants()
+ return result
+
+class SmallLRUCache(dict):
+ """
+ This one is faster than LRUCache for small sets. By experiment,
+ the cross-over point appears to be about 2^14 elements --
+ SmallLRUCache is faster if you have fewer than 2^14 elements,
+ LRUCache is faster if you have more than 2^14 elements
+
+ A simple least-recently-used cache. It keeps an LRU queue, and
+ when the number of items in the cache reaches maxsize, it removes
+ the least recently used item.
+
+ "Looking" at an item or a key such as with "has_key()" makes that
+ item become the most recently used item.
+
+ You can also use "refresh()" to explicitly make an item become the most
+ recently used item.
+
+ Adding an item that is already in the dict *does* make it the
+ most- recently-used item although it does not change the state of
+ the dict itself.
+ """
+ class ItemIterator:
+ def __init__(self, c):
+ self.c = c
+ self.i = 0
+ def __iter__(self):
+ return self
+ def next(self):
+ precondition(self.i <= len(self.c._lru), "The iterated SmallLRUCache doesn't have this many elements. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", i, self.c)
+ precondition(dict.has_key(self.c, self.c._lru[self.i]), "The iterated SmallLRUCache doesn't have this key. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", i, self.c._lru[self.i], self.c)
+ if self.i == len(self.c._lru):
+ raise StopIteration
+ k = self.i
+ self.i += 1
+ return (k, dict.__getitem__(self.c, k),)
+
+ class KeyIterator:
+ def __init__(self, c):
+ self.c = c
+ self.i = 0
+ def __iter__(self):
+ return self
+ def next(self):
+ precondition(self.i <= len(self.c._lru), "The iterated SmallLRUCache doesn't have this many elements. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", i, self.c)
+ precondition(dict.has_key(self.c, self.c._lru[self.i]), "The iterated SmallLRUCache doesn't have this key. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", i, self.c._lru[self.i], self.c)
+ if self.i == len(self.c._lru):
+ raise StopIteration
+ k = self.i
+ self.i += 1
+ return k
+
+ class ValueIterator:
+ def __init__(self, c):
+ self.c = c
+ self.i = 0
+ def __iter__(self):
+ return self
+ def next(self):
+ precondition(self.i <= len(self.c._lru), "The iterated SmallLRUCache doesn't have this many elements. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", i, self.c)
+ precondition(dict.has_key(self.c, self.c._lru[self.i]), "The iterated SmallLRUCache doesn't have this key. Most likely this is because someone altered the contents of the LRUCache while the iteration was in progress.", i, self.c._lru[self.i], self.c)
+ if self.i == len(self.c._lru):
+ raise StopIteration
+ k = self.i
+ self.i += 1
+ return dict.__getitem__(self.c, k)
+
+ def __init__(self, initialdata={}, maxsize=128):
+ dict.__init__(self, initialdata)
+ self._lru = initialdata.keys() # contains keys
+ self._maxsize = maxsize
+ over = len(self) - self._maxsize
+ if over > 0:
+ map(dict.__delitem__, [self]*over, self._lru[:over])
+ del self._lru[:over]
+ #assert self._assert_invariants()
+
+ def _assert_invariants(self):
+ _assert(len(self._lru) <= self._maxsize, "Size is required to be <= maxsize.")
+ _assert(len(filter(lambda x: dict.has_key(self, x), self._lru)) == len(self._lru), "Each key in self._lru is required to be in dict.", filter(lambda x: not dict.has_key(self, x), self._lru), len(self._lru), self._lru, len(self), self)
+ _assert(len(filter(lambda x: x in self._lru, self.keys())) == len(self), "Each key in dict is required to be in self._lru.", filter(lambda x: x not in self._lru, self.keys()), len(self._lru), self._lru, len(self), self)
+ _assert(len(self._lru) == len(self), "internal consistency", filter(lambda x: x not in self.keys(), self._lru), len(self._lru), self._lru, len(self), self)
+ _assert(len(self._lru) <= self._maxsize, "internal consistency", len(self._lru), self._lru, self._maxsize)
+ return True
+
+ def insert(self, key, item=None):
+ #assert self._assert_invariants()
+ result = self.__setitem__(key, item)
+ #assert self._assert_invariants()
+ return result
+
+ def setdefault(self, key, default=None):
+ #assert self._assert_invariants()
+ if not self.has_key(key):
+ self[key] = default
+ #assert self._assert_invariants()
+ return self[key]
+
+ def __setitem__(self, key, item=None):
+ #assert self._assert_invariants()
+ if dict.has_key(self, key):
+ self._lru.remove(key)
+ else:
+ if len(self._lru) == self._maxsize:
+ # If this insert is going to increase the size of the cache to bigger than maxsize:
+ killkey = self._lru.pop(0)
+ dict.__delitem__(self, killkey)
+ dict.__setitem__(self, key, item)
+ self._lru.append(key)
+ #assert self._assert_invariants()
+ return item
+
+ def remove(self, key, default=None, strictkey=True):
+ #assert self._assert_invariants()
+ result = self.__delitem__(key, default, strictkey)
+ #assert self._assert_invariants()
+ return result
+
+ def __delitem__(self, key, default=None, strictkey=True):
+ """
+ @param strictkey: True if you want a KeyError in the case that
+ key is not there, False if you want a reference to default
+ in the case that key is not there
+ @param default: the object to return if key is not there; This
+ is ignored if strictkey.
+
+ @return: the object removed or default if there is not item by
+ that key and strictkey is False
+ """
+ #assert self._assert_invariants()
+ if dict.has_key(self, key):
+ val = dict.__getitem__(self, key)
+ dict.__delitem__(self, key)
+ self._lru.remove(key)
+ #assert self._assert_invariants()
+ return val
+ elif strictkey:
+ #assert self._assert_invariants()
+ raise KeyError, key
+ else:
+ #assert self._assert_invariants()
+ return default
+
+ def clear(self):
+ #assert self._assert_invariants()
+ dict.clear(self)
+ self._lru = []
+ #assert self._assert_invariants()
+
+ def update(self, otherdict):
+ """
+ @return: self
+ """
+ #assert self._assert_invariants()
+ if len(otherdict) > self._maxsize:
+ # Handling this special case here makes it possible to implement the
+ # other more common cases faster below.
+ dict.clear(self)
+ self._lru = []
+ if self._maxsize > (len(otherdict) - self._maxsize):
+ dict.update(self, otherdict)
+ # NOTE(review): dict.popitem removes an *arbitrary* entry, not
+ # the least-recently-used one, and self._lru does not appear to
+ # be rebuilt on this branch before the merge code below runs.
+ # The mail archive has flattened the original indentation, so
+ # the intended control flow (where the `return self` sits)
+ # should be confirmed against the upstream pyutil lrucache.
+ while len(self) > self._maxsize:
+ dict.popitem(self)
+ else:
+ for k, v, in otherdict.iteritems():
+ if len(self) == self._maxsize:
+ break
+ dict.__setitem__(self, k, v)
+ self._lru = dict.keys(self)
+ #assert self._assert_invariants()
+ return self
+
+ # Generic path: refresh recency for keys we already hold, append the
+ # incoming keys at the MRU end, then evict from the LRU (front) end
+ # if we went over capacity.
+ for k in otherdict.iterkeys():
+ if dict.has_key(self, k):
+ self._lru.remove(k)
+ self._lru.extend(otherdict.keys())
+ dict.update(self, otherdict)
+
+ over = len(self) - self._maxsize
+ if over > 0:
+ map(dict.__delitem__, [self]*over, self._lru[:over])
+ del self._lru[:over]
+
+ #assert self._assert_invariants()
+ return self
+
+ def has_key(self, key):
+ # Membership test doubles as a "use": a hit promotes the key to the
+ # most-recently-used end of the list (list.remove is O(n)).
+ #assert self._assert_invariants()
+ if dict.has_key(self, key):
+ #assert key in self._lru, "key: %s, self._lru: %s" % tuple(map(hr, (key, self._lru,)))
+ self._lru.remove(key)
+ self._lru.append(key)
+ #assert self._assert_invariants()
+ return True
+ else:
+ #assert self._assert_invariants()
+ return False
+
+ def refresh(self, key, strictkey=True):
+ """
+ @param strictkey: raise a KeyError exception if key isn't present
+ """
+ # Promote key to most-recently-used without touching its value.
+ #assert self._assert_invariants()
+ if not dict.has_key(self, key):
+ if strictkey:
+ raise KeyError, key
+ return
+ self._lru.remove(key)
+ self._lru.append(key)
+
+ def popitem(self):
+ if not self._lru:
+ raise KeyError, 'popitem(): dictionary is empty'
+ # NOTE(review): self._lru[-1] is the *most* recently used key (appends
+ # go to the tail), so this evicts from the MRU end — the opposite of
+ # classic LRU eviction order. Confirm this is intended.
+ k = self._lru[-1]
+ obj = self.remove(k)
+ return (k, obj,)
+
+class LinkedListLRUCache:
+ """
+ This is slower and less featureful than LRUCache. It is included
+ here for comparison purposes.
+
+ Implementation of a length-limited O(1) LRU queue.
+ Built for and used by PyPE:
+ http://pype.sourceforge.net
+ original Copyright 2003 Josiah Carlson.
+ useful methods and _assert_invariant added by Zooko for testing and benchmarking purposes
+ """
+ # self.first is the LRU (eviction) end of the list; self.last is the MRU end.
+ class Node:
+ # me holds the (key, value) pair; prev/next link the recency list.
+ def __init__(self, prev, me):
+ self.prev = prev
+ self.me = me
+ self.next = None
+ def __init__(self, initialdata={}, maxsize=128):
+ # The shared {} default is safe here: it is only iterated, never mutated.
+ self._maxsize = max(maxsize, 1)
+ self.d = {}
+ self.first = None
+ self.last = None
+ for key, value in initialdata.iteritems():
+ self[key] = value
+ def clear(self):
+ self.d = {}
+ self.first = None
+ self.last = None
+ def update(self, otherdict):
+ for (k, v,) in otherdict.iteritems():
+ self[k] = v
+ def setdefault(self, key, default=None):
+ if not self.has_key(key):
+ self[key] = default
+ return self[key]
+ def _assert_invariants(self):
+ # Debug-only consistency check between the dict and the linked list.
+ def lliterkeys(self):
+ cur = self.first
+ while cur != None:
+ cur2 = cur.next
+ yield cur.me[0]
+ cur = cur2
+ def lllen(self):
+ # Ugh.
+ acc = 0
+ for x in lliterkeys(self):
+ acc += 1
+ return acc
+ def llhaskey(self, key):
+ # Ugh.
+ # NOTE(review): compares by identity ('is'), not equality — only
+ # valid when the exact same key objects are reused.
+ for x in lliterkeys(self):
+ if x is key:
+ return True
+ return False
+ for k in lliterkeys(self):
+ _assert(self.d.has_key(k), "Each key in the linked list is required to be in the dict.", k)
+ for k in self.d.iterkeys():
+ _assert(llhaskey(self, k), "Each key in the dict is required to be in the linked list.", k)
+ _assert(lllen(self) == len(self.d), "internal consistency", self, self.d)
+ _assert(len(self.d) <= self._maxsize, "Size is required to be <= maxsize.")
+ return True
+ def __contains__(self, obj):
+ return obj in self.d
+ def has_key(self, key):
+ return self.__contains__(key)
+ def __getitem__(self, obj):
+ # A hit re-inserts the (key, value) pair, moving it to the MRU end.
+ a = self.d[obj].me
+ self[a[0]] = a[1]
+ return a[1]
+ def get(self, key, default=None, strictkey=False):
+ if not self.has_key(key) and strictkey:
+ raise KeyError, key
+ if self.has_key(key):
+ return self.__getitem__(key)
+ else:
+ return default
+ def __setitem__(self, obj, val):
+ if obj in self.d:
+ del self[obj]
+ nobj = self.Node(self.last, (obj, val))
+ if self.first is None:
+ self.first = nobj
+ if self.last:
+ self.last.next = nobj
+ self.last = nobj
+ self.d[obj] = nobj
+ # Over capacity: unlink and drop the head (least recently used) node.
+ if len(self.d) > self._maxsize:
+ if self.first == self.last:
+ self.first = None
+ self.last = None
+ return
+ a = self.first
+ a.next.prev = None
+ self.first = a.next
+ a.next = None
+ del self.d[a.me[0]]
+ del a
+ def insert(self, key, item=None):
+ return self.__setitem__(key, item)
+ def __delitem__(self, obj, default=None, strictkey=True):
+ # Unlink the node, fixing up first/last as needed, then drop the
+ # dict entry and return the stored value.
+ if self.d.has_key(obj):
+ nobj = self.d[obj]
+ if nobj.prev:
+ nobj.prev.next = nobj.next
+ else:
+ self.first = nobj.next
+ if nobj.next:
+ nobj.next.prev = nobj.prev
+ else:
+ self.last = nobj.prev
+ val = self.d[obj]
+ del self.d[obj]
+ return val.me[1]
+ elif strictkey:
+ raise KeyError, obj
+ else:
+ return default
+ def remove(self, obj, default=None, strictkey=True):
+ return self.__delitem__(obj, default=default, strictkey=strictkey)
+ def __iter__(self):
+ # Iterates *values*, oldest (LRU) first.
+ cur = self.first
+ while cur != None:
+ cur2 = cur.next
+ yield cur.me[1]
+ cur = cur2
+ def iteritems(self):
+ cur = self.first
+ while cur != None:
+ cur2 = cur.next
+ yield cur.me
+ cur = cur2
+ def iterkeys(self):
+ return iter(self.d)
+ def itervalues(self):
+ for i,j in self.iteritems():
+ yield j
+ def values(self):
+ l = []
+ for v in self.itervalues():
+ l.append(v)
+ return l
+ def keys(self):
+ return self.d.keys()
+ def __len__(self):
+ return self.d.__len__()
+ def popitem(self):
+ # NOTE(review): pops the MRU entry (self.last), not the LRU one —
+ # mirrors the list-based cache's popitem above; confirm intended.
+ i = self.last.me
+ obj = self.remove(i[0])
+ return i
+
+# vim:ts=4:sw=4:showmatch:expandtab
diff --git a/yum/sqlitesack.py b/yum/sqlitesack.py
index 78797cc..a2e0823 100644
--- a/yum/sqlitesack.py
+++ b/yum/sqlitesack.py
@@ -32,6 +32,7 @@ import misc
from sqlutils import executeSQL
import rpmUtils.miscutils
import sqlutils
+from lrucache import SmallLRUCache
def catchSqliteException(func):
"""This decorator converts sqlite exceptions into RepoError"""
@@ -46,13 +47,35 @@ def catchSqliteException(func):
newFunc.__dict__.update(func.__dict__)
return newFunc
-class YumAvailablePackageSqlite(YumAvailablePackage, PackageObject, RpmBase):
+class SqlitePrcoDict(dict):
+ # Lazy stand-in for the old eagerly-filled prco dict: the four PRCO
+ # keys are always present but their stored values stay None; get() and
+ # [] delegate to the owning package, which fetches (and LRU-caches)
+ # the real lists from sqlite on demand.
+
+ def __init__(self, pkg):
+ dict.__init__(self)
+ for name in ('obsoletes', 'conflicts', 'requires', 'provides'):
+ self[name] = None
+ # NOTE(review): back-reference creates a reference cycle with the
+ # package object; relies on the cyclic GC.
+ self.pkg = pkg
+
+ def get(self, key, default=None):
+ if not self.has_key(key):
+ return default
+ return self.pkg._getPrco(key)
+
+ def __getitem__(self, key):
+ if not self.has_key(key):
+ raise KeyError
+ return self.pkg._getPrco(key)
+
+
+class YumAvailablePackageSqlite(YumAvailablePackage):
+
+ # NOTE(review): these caches are class-level, i.e. shared by every
+ # package from every repo, and keyed on the package object itself, so
+ # up to 10 packages' lists per PRCO type are kept alive at a time;
+ # this also assumes package objects are hashable — confirm.
+ prco_cache = { 'obsoletes': SmallLRUCache(maxsize=10),
+ 'conflicts': SmallLRUCache(maxsize=10),
+ 'requires': SmallLRUCache(maxsize=10),
+ 'provides': SmallLRUCache(maxsize=10) }
+
def __init__(self, repo, db_obj):
self._checksums = []
- self.prco = { 'obsoletes': (),
- 'conflicts': (),
- 'requires': (),
- 'provides': () }
+ self.prco = SqlitePrcoDict(self)
self._files = {}
self.sack = repo.sack
self.repoid = repo.id
@@ -199,20 +222,19 @@ class YumAvailablePackageSqlite(YumAvailablePackage, PackageObject, RpmBase):
return map(lambda x: x['fname'], cur)
@catchSqliteException
- def returnPrco(self, prcotype, printable=False):
- if isinstance(self.prco[prcotype], tuple):
+ def _getPrco(self, prcotype):
+ # Return the [(name, flags, (epoch, version, release)), ...] list for
+ # prcotype, loading it from sqlite on first use and memoizing it in
+ # the shared per-type SmallLRUCache, keyed on this package object.
+ result = self.prco_cache[prcotype].get(self)
+ if result is None:
+ # Cache miss: fetch the rows from the repo's primary db.
+ # NOTE(review): the query below is built with % interpolation;
+ # pkgKey is an internal integer key, but a parameterized query
+ # would be safer.
cache = self.sack.primarydb[self.repo]
cur = cache.cursor()
query = "select name, version, release, epoch, flags from %s "\
"where pkgKey = '%s'" % (prcotype, self.pkgKey)
executeSQL(cur, query)
- self.prco[prcotype] = [ ]
- for ob in cur:
- self.prco[prcotype].append((ob['name'], ob['flags'],
- (ob['epoch'], ob['version'],
- ob['release'])))
-
- return RpmBase.returnPrco(self, prcotype, printable)
+ result = [ (ob['name'], ob['flags'],
+ (ob['epoch'], ob['version'],
+ ob['release'])) for ob in cur ]
+ # Key the shared class-level cache on the package object itself.
+ self.prco_cache[prcotype][self] = result
+ return result
class YumSqlitePackageSack(yumRepo.YumPackageSack):
""" Implementation of a PackageSack that uses sqlite cache instead of fully
--
1.5.3.3
_______________________________________________
Yum-devel mailing list
[email protected]
https://lists.dulug.duke.edu/mailman/listinfo/yum-devel