Change cache modules to write md5 in cache entries, instead of mtime.
Since portage-2.2.27, the relevant cache modules have had the ability
to read cache entries containing either md5 or mtime, therefore this
change is backward-compatible with portage-2.2.27 and later.

Also fix the reconstruct_eclasses function to raise CacheCorruption
when the specified chf_type is md5 and the cache entry contains mtime
data, and optimize __getitem__ to skip reconstruct_eclasses calls when
the entry appears to have a different chf_type.

X-Gentoo-Bug: 568934
X-Gentoo-Bug-url: https://bugs.gentoo.org/show_bug.cgi?id=568934
---
[PATCH v4] adds some comments to clarify the purposes of the __getitem__
optimization and the _md5_deserializer function

 pym/portage/cache/anydbm.py    |  4 ++--
 pym/portage/cache/flat_hash.py |  4 ++--
 pym/portage/cache/sqlite.py    |  4 ++--
 pym/portage/cache/template.py  | 36 ++++++++++++++++++++++++++++++++----
 4 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/pym/portage/cache/anydbm.py b/pym/portage/cache/anydbm.py
index 80d24e5..88d85b0 100644
--- a/pym/portage/cache/anydbm.py
+++ b/pym/portage/cache/anydbm.py
@@ -36,8 +36,8 @@ from portage.cache import cache_errors
 
 class database(fs_template.FsBased):
 
-       validation_chf = 'mtime'
-       chf_types = ('mtime', 'md5')
+       validation_chf = 'md5'
+       chf_types = ('md5', 'mtime')
 
        autocommits = True
        cleanse_keys = True
diff --git a/pym/portage/cache/flat_hash.py b/pym/portage/cache/flat_hash.py
index cca0f10..3a899c0 100644
--- a/pym/portage/cache/flat_hash.py
+++ b/pym/portage/cache/flat_hash.py
@@ -163,5 +163,5 @@ class md5_database(database):
 
 
 class mtime_md5_database(database):
-       validation_chf = 'mtime'
-       chf_types = ('mtime', 'md5')
+       validation_chf = 'md5'
+       chf_types = ('md5', 'mtime')
diff --git a/pym/portage/cache/sqlite.py b/pym/portage/cache/sqlite.py
index 32e4076..69150f6 100644
--- a/pym/portage/cache/sqlite.py
+++ b/pym/portage/cache/sqlite.py
@@ -18,8 +18,8 @@ if sys.hexversion >= 0x3000000:
 
 class database(fs_template.FsBased):
 
-       validation_chf = 'mtime'
-       chf_types = ('mtime', 'md5')
+       validation_chf = 'md5'
+       chf_types = ('md5', 'mtime')
 
        autocommits = False
        synchronous = False
diff --git a/pym/portage/cache/template.py b/pym/portage/cache/template.py
index a7c6de0..8662d85 100644
--- a/pym/portage/cache/template.py
+++ b/pym/portage/cache/template.py
@@ -54,6 +54,15 @@ class database(object):
 
                if self.serialize_eclasses and "_eclasses_" in d:
                        for chf_type in chf_types:
+                               if '_%s_' % chf_type not in d:
+                                       # Skip the reconstruct_eclasses call, since it's
+                                       # a waste of time if it contains a different chf_type
+                                       # than the current one. In the past, it was possible
+                                       # for reconstruct_eclasses called with chf_type='md5'
+                                       # to "successfully" return invalid data here, because
+                                       # it was unable to distinguish between md5 data and
+                                       # mtime data.
+                                       continue
                                try:
                                        d["_eclasses_"] = reconstruct_eclasses(cpv, d["_eclasses_"],
                                                chf_type, paths=self.store_eclass_paths)
@@ -62,6 +71,9 @@ class database(object):
                                                raise
                                else:
                                        break
+                       else:
+                               raise cache_errors.CacheCorruption(cpv,
+                                       'entry does not contain a recognized chf_type')
 
                elif "_eclasses_" not in d:
                        d["_eclasses_"] = {}
@@ -310,6 +322,23 @@ def serialize_eclasses(eclass_dict, chf_type='mtime', paths=True):
                for k, v in sorted(eclass_dict.items(), key=_keysorter))
 
 
+def _md5_deserializer(md5):
+       """
+       Without this validation, it's possible for reconstruct_eclasses to
+       mistakenly interpret mtime data as md5 data, and return an invalid
+       data structure containing strings where ints are expected.
+       """
+       if len(md5) != 32:
+               raise ValueError('expected 32 hex digits')
+       return md5
+
+
+_chf_deserializers = {
+       'md5': _md5_deserializer,
+       'mtime': long,
+}
+
+
 def reconstruct_eclasses(cpv, eclass_string, chf_type='mtime', paths=True):
        """returns a dict when handed a string generated by serialize_eclasses"""
        eclasses = eclass_string.rstrip().lstrip().split("\t")
@@ -317,9 +346,7 @@ def reconstruct_eclasses(cpv, eclass_string, chf_type='mtime', paths=True):
                # occasionally this occurs in the fs backends.  they suck.
                return {}
 
-       converter = _unicode
-       if chf_type == 'mtime':
-               converter = long
+       converter = _chf_deserializers.get(chf_type, lambda x: x)
 
        if paths:
                if len(eclasses) % 3 != 0:
@@ -340,6 +367,7 @@ def reconstruct_eclasses(cpv, eclass_string, chf_type='mtime', paths=True):
                raise cache_errors.CacheCorruption(cpv,
                        "_eclasses_ was of invalid len %i" % len(eclasses))
        except ValueError:
-               raise cache_errors.CacheCorruption(cpv, "_eclasses_ mtime conversion to long failed")
+               raise cache_errors.CacheCorruption(cpv,
+                       "_eclasses_ not valid for chf_type {}".format(chf_type))
        del eclasses
        return d
-- 
2.7.4


Reply via email to