commit:     a4f06ab3cf7339100b2af2146ae90cbba8bac371
Author:     Daniel Robbins <drobbins <AT> funtoo <DOT> org>
AuthorDate: Sat Feb 20 23:11:46 2021 +0000
Commit:     Zac Medico <zmedico <AT> gentoo <DOT> org>
CommitDate: Mon Feb 22 11:48:41 2021 +0000
URL:        https://gitweb.gentoo.org/proj/portage.git/commit/?id=a4f06ab3

Add content-hash distfiles layout (bug 756778)

The content-hash layout is identical to the filename-hash layout,
except for these three differences:

1) A content digest is used instead of a filename digest.

2) The final element of the path returned from the get_path method
corresponds to the complete content digest. The path is a function
of the content digest alone.

3) Because the path is a function of content digest alone, the
get_filenames implementation cannot derive distfile names from
paths, so it instead yields DistfileName instances whose names are
equal to content digest values. The DistfileName documentation
discusses the resulting implications.

Motivations to use the content-hash layout instead of the
filename-hash layout may include:

1) Since the file path is independent of the file name, file
name collisions cannot occur. This makes the content-hash
layout suitable for storage of multiple types of files (not
only Gentoo distfiles). For example, it can be used to store
distfiles for multiple Linux distros within the same tree,
with automatic deduplication based on content digest. This
layout can be used to store and distribute practically anything
(including binary packages, for example).

2) Allows multiple revisions for the same distfile name. An
existing distfile can be updated, and if a user still has an
older copy of an ebuild repository (or an overlay), then the user
can successfully fetch the desired revision of the distfile as
long as it has not been purged from the mirror.

3) File integrity data is integrated into the layout itself,
making it very simple to verify the integrity of any file that
it contains. The only tool required is an implementation of
the chosen hash algorithm.

Bug: https://bugs.gentoo.org/756778
Signed-off-by: Zac Medico <zmedico <AT> gentoo.org>

 lib/portage/package/ebuild/fetch.py    | 97 ++++++++++++++++++++++++++++++++++
 lib/portage/tests/ebuild/test_fetch.py | 36 +++++++++++++
 2 files changed, 133 insertions(+)

diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
index af9edd91e..f0ae864ad 100644
--- a/lib/portage/package/ebuild/fetch.py
+++ b/lib/portage/package/ebuild/fetch.py
@@ -464,6 +464,97 @@ class FilenameHashLayout:
                return False
 
 
+class ContentHashLayout(FilenameHashLayout):
+       """
+       The content-hash layout is identical to the filename-hash layout,
+       except for these three differences:
+
+       1) A content digest is used instead of a filename digest.
+
+       2) The final element of the path returned from the get_path method
+       corresponds to the complete content digest. The path is a function
+       of the content digest alone.
+
+       3) Because the path is a function of content digest alone, the
+       get_filenames implementation cannot derive distfiles names from
+       paths, so it instead yields DistfileName instances whose names are
+       equal to content digest values. The DistfileName documentation
+       discusses resulting implications.
+
+       Motivations to use the content-hash layout instead of the
+       filename-hash layout may include:
+
+       1) Since the file path is independent of the file name, file
+       name collisions cannot occur. This makes the content-hash
+       layout suitable for storage of multiple types of files (not
+       only gentoo distfiles). For example, it can be used to store
+       distfiles for multiple linux distros within the same tree,
+       with automatic deduplication based on content digest. This
+       layout can be used to store and distribute practically anything
+       (including binary packages for example).
+
+       2) Allows multiple revisions for the same distfiles name. An
+       existing distfile can be updated, and if a user still has an
+       older copy of an ebuild repository (or an overlay), then a user
+       can successfully fetch a desired revision of the distfile as
+       long as it has not been purged from the mirror.
+
+       3) File integrity data is integrated into the layout itself,
+       making it very simple to verify the integrity of any file that
+       it contains. The only tool required is an implementation of
+       the chosen hash algorithm.
+       """
+
+       def get_path(self, filename):
+               """
+               For content-hash, the path is a function of the content digest alone.
+               The final element of the path returned from the get_path method
+               corresponds to the complete content digest.
+               """
+               fnhash = remaining = filename.digests[self.algo]
+               ret = ""
+               for c in self.cutoffs:
+                       assert c % 4 == 0
+                       c = c // 4
+                       ret += remaining[:c] + "/"
+                       remaining = remaining[c:]
+               return ret + fnhash
+
+       def get_filenames(self, distdir):
+               """
+               Yields DistfileName instances each with filename corresponding
+               to a digest value for self.algo, and which can be compared to
+               other DistfileName instances with their digests_equal method.
+               """
+               for filename in super(ContentHashLayout, self).get_filenames(distdir):
+                       yield DistfileName(
+                               filename, digests=dict([(self.algo, filename)])
+                       )
+
+       @staticmethod
+       def verify_args(args, filename=None):
+               """
+               If the filename argument is given, then supported hash
+               algorithms are constrained by digests available in the filename
+               digests attribute.
+
+               @param args: layout.conf entry args
+               @param filename: filename with digests attribute
+               @return: True if args are valid for available digest algorithms,
+                               and False otherwise
+               """
+               if len(args) != 3:
+                       return False
+               if filename is None:
+                       supported_algos = get_valid_checksum_keys()
+               else:
+                       supported_algos = filename.digests
+               algo = args[1].upper()
+               if algo not in supported_algos:
+                       return False
+               return FilenameHashLayout.verify_args(args)
+
+
 class MirrorLayoutConfig:
        """
        Class to read layout.conf from a mirror.
@@ -505,6 +596,8 @@ class MirrorLayoutConfig:
                        return FlatLayout.verify_args(val)
                elif val[0] == 'filename-hash':
                        return FilenameHashLayout.verify_args(val)
+               elif val[0] == 'content-hash':
+                       return ContentHashLayout.verify_args(val, filename=filename)
                return False
 
        def get_best_supported_layout(self, filename=None):
@@ -521,6 +614,8 @@ class MirrorLayoutConfig:
                                        return FlatLayout(*val[1:])
                                elif val[0] == 'filename-hash':
                                        return FilenameHashLayout(*val[1:])
+                               elif val[0] == 'content-hash':
+                                       return ContentHashLayout(*val[1:])
                # fallback
                return FlatLayout()
 
@@ -533,6 +628,8 @@ class MirrorLayoutConfig:
                                ret.append(FlatLayout(*val[1:]))
                        elif val[0] == 'filename-hash':
                                ret.append(FilenameHashLayout(*val[1:]))
+                       elif val[0] == 'content-hash':
+                               ret.append(ContentHashLayout(*val[1:]))
                if not ret:
                        ret.append(FlatLayout())
                return ret

diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py
index b88ae3efb..c195888cc 100644
--- a/lib/portage/tests/ebuild/test_fetch.py
+++ b/lib/portage/tests/ebuild/test_fetch.py
@@ -20,6 +20,7 @@ from portage.util._eventloop.global_event_loop import global_event_loop
 from portage.package.ebuild.config import config
 from portage.package.ebuild.digestgen import digestgen
 from portage.package.ebuild.fetch import (
+       ContentHashLayout,
        DistfileName,
        _download_suffix,
        fetch,
@@ -109,6 +110,11 @@ class EbuildFetchTestCase(TestCase):
                                "1=filename-hash BLAKE2B 8",
                                "0=flat",
                        ),
+                       (
+                               "[structure]",
+                               "0=content-hash SHA512 8:8:8",
+                               "1=flat",
+                       ),
                )
 
                fetchcommand = portage.util.shlex_split(playground.settings["FETCHCOMMAND"])
@@ -444,6 +450,35 @@ class EbuildFetchTestCase(TestCase):
                self.assertEqual(FilenameHashLayout('SHA1', '8:16:24').get_path('foo-1.tar.gz'),
                                '19/c3b6/37a94b/foo-1.tar.gz')
 
+       def test_content_hash_layout(self):
+               self.assertFalse(ContentHashLayout.verify_args(('content-hash',)))
+               self.assertTrue(ContentHashLayout.verify_args(('content-hash', 'SHA1', '8')))
+               self.assertFalse(ContentHashLayout.verify_args(('content-hash', 'INVALID-HASH', '8')))
+               self.assertTrue(ContentHashLayout.verify_args(('content-hash', 'SHA1', '4:8:12')))
+               self.assertFalse(ContentHashLayout.verify_args(('content-hash', 'SHA1', '3')))
+               self.assertFalse(ContentHashLayout.verify_args(('content-hash', 'SHA1', 'junk')))
+               self.assertFalse(ContentHashLayout.verify_args(('content-hash', 'SHA1', '4:8:junk')))
+
+               filename = DistfileName(
+                       'foo-1.tar.gz',
+                       digests=dict((algo, checksum_str(b'', hashname=algo)) for algo in MANIFEST2_HASH_DEFAULTS),
+               )
+
+               # Raise KeyError for a hash algorithm SHA1 which is not in MANIFEST2_HASH_DEFAULTS.
+               self.assertRaises(KeyError, ContentHashLayout('SHA1', '4').get_path, filename)
+
+               # Raise AttributeError for a plain string argument.
+               self.assertRaises(AttributeError, ContentHashLayout('SHA512', '4').get_path, str(filename))
+
+               self.assertEqual(ContentHashLayout('SHA512', '4').get_path(filename),
+                               'c/cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
+               self.assertEqual(ContentHashLayout('SHA512', '8').get_path(filename),
+                               'cf/cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
+               self.assertEqual(ContentHashLayout('SHA512', '8:16').get_path(filename),
+                               'cf/83e1/cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
+               self.assertEqual(ContentHashLayout('SHA512', '8:16:24').get_path(filename),
+                               'cf/83e1/357eef/cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
+
        def test_mirror_layout_config(self):
                mlc = MirrorLayoutConfig()
                self.assertEqual(mlc.serialize(), ())
@@ -521,6 +556,7 @@ class EbuildFetchTestCase(TestCase):
                        FilenameHashLayout('SHA1', '8'),
                        FilenameHashLayout('SHA1', '8:16'),
                        FilenameHashLayout('SHA1', '8:16:24'),
+                       ContentHashLayout('SHA512', '8:8:8'),
                )
 
                for layout in layouts:

Reply via email to