Title: [267579] trunk/Tools
Revision
267579
Author
jbed...@apple.com
Date
2020-09-25 10:55:03 -0700 (Fri, 25 Sep 2020)

Log Message

results.webkit.org: Use s3 for cold storage
https://bugs.webkit.org/show_bug.cgi?id=216662
<rdar://problem/69092010>

Rubber-stamped by Aakash Jain.

* Scripts/libraries/resultsdbpy/resultsdbpy/__init__.py:
* Scripts/libraries/resultsdbpy/resultsdbpy/model/archive_context.py:
(ArchiveContext.__init__): Differentiate between cache storage and long-term storage of archives.
(ArchiveContext.__enter__): Connect to the cold storage archive, if one is available.
(ArchiveContext.__exit__): Disconnect from the cold storage archive, if applicable.
(ArchiveContext.register): Save to cold storage instead of cache storage by default.
(ArchiveContext.find_archive):
* Scripts/libraries/resultsdbpy/resultsdbpy/model/model.py:
(Model.__init__): Pass S3 credentials to ArchiveContext.
* Scripts/libraries/resultsdbpy/resultsdbpy/model/s3_archiver.py: Added.
(S3Archiver):
(S3Archiver.Credentials):
(S3Archiver.__init__): Connect to S3 and configure our bucket.
(S3Archiver._cipher): Construct new AES cipher, if a key was provided.
(S3Archiver.__enter__): Create S3 resource, if one is not available.
(S3Archiver.__exit__): Tear down S3 resource.
(S3Archiver.save): Save an archive to S3 by its hash.
(S3Archiver.retrieve): Retrieve an archive from S3 by its hash.
* Scripts/libraries/resultsdbpy/setup.py: Add boto3.

Modified Paths

Added Paths

Diff

Modified: trunk/Tools/ChangeLog (267578 => 267579)


--- trunk/Tools/ChangeLog	2020-09-25 17:47:30 UTC (rev 267578)
+++ trunk/Tools/ChangeLog	2020-09-25 17:55:03 UTC (rev 267579)
@@ -1,3 +1,31 @@
+2020-09-25  Jonathan Bedard  <jbed...@apple.com>
+
+        results.webkit.org: Use s3 for cold storage
+        https://bugs.webkit.org/show_bug.cgi?id=216662
+        <rdar://problem/69092010>
+
+        Rubber-stamped by Aakash Jain.
+
+        * Scripts/libraries/resultsdbpy/resultsdbpy/__init__.py:
+        * Scripts/libraries/resultsdbpy/resultsdbpy/model/archive_context.py:
+        (ArchiveContext.__init__): Differentiate between cache storage and long-term storage of archives.
+        (ArchiveContext.__enter__): Connect to the cold storage archive, if one is available.
+        (ArchiveContext.__exit__): Disconnect from the cold storage archive, if applicable.
+        (ArchiveContext.register): Save to cold storage instead of cache storage by default.
+        (ArchiveContext.find_archive):
+        * Scripts/libraries/resultsdbpy/resultsdbpy/model/model.py:
+        (Model.__init__): Pass S3 credentials to ArchiveContext.
+        * Scripts/libraries/resultsdbpy/resultsdbpy/model/s3_archiver.py: Added.
+        (S3Archiver):
+        (S3Archiver.Credentials):
+        (S3Archiver.__init__): Connect to S3 and configure our bucket.
+        (S3Archiver._cipher): Construct new AES cipher, if a key was provided.
+        (S3Archiver.__enter__): Create S3 resource, if one is not available.
+        (S3Archiver.__exit__): Tear down S3 resource.
+        (S3Archiver.save): Save an archive to S3 by its hash.
+        (S3Archiver.retrieve): Retrieve an archive from S3 by its hash.
+        * Scripts/libraries/resultsdbpy/setup.py: Add boto3.
+
 2020-09-25  Youenn Fablet  <you...@apple.com>
 
         Implement default behavior for getUserMedia requests in case where there is no delegate set

Modified: trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/__init__.py (267578 => 267579)


--- trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/__init__.py	2020-09-25 17:47:30 UTC (rev 267578)
+++ trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/__init__.py	2020-09-25 17:55:03 UTC (rev 267579)
@@ -43,6 +43,6 @@
         "Please install webkitcorepy with `pip install webkitcorepy --extra-index-url <package index URL>`"
     )
 
-version = Version(1, 0, 2)
+version = Version(1, 1, 0)
 
 name = 'resultsdbpy'

Modified: trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/archive_context.py (267578 => 267579)


--- trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/archive_context.py	2020-09-25 17:47:30 UTC (rev 267578)
+++ trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/archive_context.py	2020-09-25 17:55:03 UTC (rev 267579)
@@ -22,20 +22,17 @@
 
 import calendar
 import io
-import json
 import time
 import zipfile
 
 from cassandra.cqlengine import columns
-from cassandra.cqlengine.models import Model
-from collections import OrderedDict
 from datetime import datetime
 from resultsdbpy.controller.commit import Commit
-from resultsdbpy.controller.configuration import Configuration
 from resultsdbpy.model.archiver import Archiver
 from resultsdbpy.model.cassandra_archiver import CassandraArchiver
 from resultsdbpy.model.commit_context import CommitContext
 from resultsdbpy.model.configuration_context import ClusteredByConfiguration
+from resultsdbpy.model.s3_archiver import S3Archiver
 from resultsdbpy.model.upload_context import UploadContext
 
 
@@ -81,11 +78,16 @@
         cls.assert_zipfile(archive)
         return zipfile.ZipFile(archive, mode='r')
 
-    def __init__(self, configuration_context, commit_context, ttl_seconds=None):
+    def __init__(self, configuration_context, commit_context, ttl_seconds=None, s3_credentials=None):
         self.configuration_context = configuration_context
         self.commit_context = commit_context
         self.cassandra = self.configuration_context.cassandra
-        self.archiver = CassandraArchiver(self.cassandra)
+        self.cached_archiver = CassandraArchiver(self.cassandra)
+        self.cold_archiver = S3Archiver(
+            bucket='{}-results-archive'.format(self.cassandra.keyspace.replace('_', '-')),
+            credentials=s3_credentials,
+            ttl_seconds=ttl_seconds,
+        ) if s3_credentials else None
         self.ttl_seconds = ttl_seconds
 
         with self:
@@ -95,12 +97,16 @@
     def __enter__(self):
         self.configuration_context.__enter__()
         self.commit_context.__enter__()
-        self.archiver.__enter__()
+        self.cached_archiver.__enter__()
+        if self.cold_archiver:
+            self.cold_archiver.__enter__()
 
     def __exit__(self, *args, **kwargs):
         self.commit_context.__exit__(*args, **kwargs)
         self.configuration_context.__exit__(*args, **kwargs)
-        self.archiver.__exit__(*args, **kwargs)
+        self.cached_archiver.__exit__(*args, **kwargs)
+        if self.cold_archiver:
+            self.cold_archiver.__exit__()
 
     def register(self, archive, configuration, commits, suite, timestamp=None):
         self.assert_zipfile(archive)
@@ -116,14 +122,17 @@
                 self.configuration_context.insert_row_with_configuration(
                     UploadContext.SuitesByConfiguration.__table_name__, configuration, suite=suite, branch=branch, ttl=ttl,
                 )
-                digest = self.archiver.save(archive, retain_for=ttl)
 
+                archiver_to_use = self.cold_archiver or self.cached_archiver
+                size = Archiver.archive_size(archive)
+                digest = archiver_to_use.save(archive, retain_for=ttl)
+
                 self.configuration_context.insert_row_with_configuration(
                     self.ArchiveMetaDataByCommit.__table_name__, configuration=configuration, suite=suite,
                     branch=branch, uuid=uuid, ttl=ttl,
                     sdk=configuration.sdk or '?', start_time=timestamp,
                     digest=digest,
-                    size=Archiver.archive_size(archive),
+                    size=size,
                 )
 
     def find_archive(
@@ -167,9 +176,19 @@
                         continue
 
                     if not archive_by_digest.get(value.get('digest')):
-                        archive = self.archiver.retrieve(value.get('digest'), value.get('size', None))
+                        archive = self.cached_archiver.retrieve(value.get('digest'), value.get('size', None))
                         if not archive:
-                            continue
+                            if not self.cold_archiver:
+                                continue
+
+                            archive = self.cold_archiver.retrieve(value.get('digest'), value.get('size', None))
+                            if not archive:
+                                continue
+
+                            # If we retrieved an archive from the cold_archiver, it's pretty likely that
+                            # the same archive will be retrieved in the near future. Cache the archive for 6 hours
+                            self.cached_archiver.save(archive, retain_for=60 * 60 * 6)
+
                         archive_by_digest[value.get('digest')] = archive
 
                     archive_by_digest.get(value.get('digest')).seek(0)

Modified: trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/model.py (267578 => 267579)


--- trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/model.py	2020-09-25 17:47:30 UTC (rev 267578)
+++ trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/model.py	2020-09-25 17:55:03 UTC (rev 267579)
@@ -45,7 +45,7 @@
         key = columns.Text(partition_key=True, required=True)
         value = columns.Text(required=True)
 
-    def __init__(self, redis, cassandra, repositories=[], default_ttl_seconds=TTL_YEAR * 5, archive_ttl_seconds=TTL_WEEK * 8, async_processing=False):
+    def __init__(self, redis, cassandra, repositories=[], default_ttl_seconds=TTL_YEAR * 5, archive_ttl_seconds=TTL_WEEK * 8, async_processing=False, s3_credentials=None):
         if default_ttl_seconds is not None and default_ttl_seconds < 4 * self.TTL_WEEK:
             raise ValueError('TTL must be at least 4 weeks')
         if archive_ttl_seconds is not None and archive_ttl_seconds < 2 * self.TTL_WEEK:
@@ -99,6 +99,7 @@
             configuration_context=self.configuration_context,
             commit_context=self.commit_context,
             ttl_seconds=self.archive_ttl_seconds,
+            s3_credentials=s3_credentials,
         )
 
     def healthy(self, writable=True):

Added: trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/s3_archiver.py (0 => 267579)


--- trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/s3_archiver.py	                        (rev 0)
+++ trunk/Tools/Scripts/libraries/resultsdbpy/resultsdbpy/model/s3_archiver.py	2020-09-25 17:55:03 UTC (rev 267579)
@@ -0,0 +1,136 @@
+# Copyright (C) 2020 Apple Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1.  Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+# 2.  Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import boto3
+import botocore
+import io
+import math
+import os
+import hashlib
+
+from Crypto.Cipher import AES
+from Crypto.Util.Padding import pad, unpad
+from resultsdbpy.model.archiver import Archiver
+
+
+class S3Archiver(Archiver):
+    class Credentials(object):
+        def __init__(
+            self,
+            region_name=None,
+            aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None,
+            key=None,
+        ):
+            self.region_name = region_name or 'us-west-2'
+            self.aws_access_key_id = aws_access_key_id
+            self.aws_secret_access_key = aws_secret_access_key
+            self.aws_session_token = aws_session_token
+            self.key = key
+
+        def __repr__(self):
+            return self.aws_access_key_id
+
+    def __init__(self, credentials, bucket='results-archive', ttl_seconds=None):
+        self.credentials = credentials
+        self.resources = None
+        self.bucket = bucket
+        self._count = 0
+
+        # Only modify the lifecycle of the bucket if we're editing the schema
+        if os.environ['CQLENG_ALLOW_SCHEMA_MANAGEMENT'] == '1':
+            client = boto3.client(
+                service_name='s3',
+                region_name=self.credentials.region_name,
+                aws_access_key_id=self.credentials.aws_access_key_id,
+                aws_secret_access_key=self.credentials.aws_secret_access_key,
+                aws_session_token=self.credentials.aws_session_token,
+            )
+
+            ttl_seconds = ttl_seconds or 60 * 60 * 24 * 365
+            ttl_days = int(math.ceil(ttl_seconds / (60 * 60 * 24)))
+
+            client.put_bucket_lifecycle_configuration(
+                Bucket=self.bucket,
+                LifecycleConfiguration=dict(
+                    Rules=[dict(
+                        Expiration=dict(Days=ttl_days),
+                        ID='ttl',
+                        Filter=dict(Prefix='archives/'),
+                        Status='Enabled',
+                    )],
+                ),
+            )
+
+    @property
+    def _cipher(self):
+        if not self.credentials.key:
+            return None
+        return AES.new(hashlib.sha256(self.credentials.key.encode()).digest(), AES.MODE_ECB)
+
+    def __enter__(self):
+        self._count += 1
+        if not self.resources:
+            self.resources = boto3.resource(
+                service_name='s3',
+                region_name=self.credentials.region_name,
+                aws_access_key_id=self.credentials.aws_access_key_id,
+                aws_secret_access_key=self.credentials.aws_secret_access_key,
+                aws_session_token=self.credentials.aws_session_token,
+            )
+
+    def __exit__(self, *args, **kwargs):
+        self._count -= 1
+        if self._count <= 0:
+            self.resources = None
+            self._count = 0
+
+    def save(self, archive, retain_for=None):
+        with self:
+            # S3 doesn't let us set custom expiration dates. This is a bit annoying because we really want
+            # resources to stay alive based on their commit time, rather than their report time, but the
+            # consequence will be resources kept around for a bit longer than we need access to them.
+            digest = self.archive_digest(archive)
+            cipher = self._cipher
+            if cipher:
+                archive = io.BytesIO(cipher.encrypt(pad(archive.read(), AES.block_size)))
+
+            self.resources.Bucket(self.bucket).upload_fileobj(archive, 'archives/{}'.format(digest))
+            return digest
+
+    def retrieve(self, digest, size=None):
+        with self:
+            try:
+                archive = io.BytesIO()
+                self.resources.Bucket(self.bucket).download_fileobj('archives/{}'.format(digest), archive)
+                archive.seek(0)
+
+                cipher = self._cipher
+                if cipher:
+                    archive = io.BytesIO(unpad(cipher.decrypt(archive.read()), AES.block_size))
+
+                if (size and self.archive_size(archive) != size) or digest != self.archive_digest(archive):
+                    raise RuntimeError('Retrieved archive does not match provided digest')
+                return archive
+            except botocore.exceptions.ClientError as e:
+                if e.response['Error']['Code'] == "404":
+                    return None
+                raise

Modified: trunk/Tools/Scripts/libraries/resultsdbpy/setup.py (267578 => 267579)


--- trunk/Tools/Scripts/libraries/resultsdbpy/setup.py	2020-09-25 17:47:30 UTC (rev 267578)
+++ trunk/Tools/Scripts/libraries/resultsdbpy/setup.py	2020-09-25 17:55:03 UTC (rev 267579)
@@ -59,6 +59,7 @@
         'resultsdbpy.view',
     ],
     install_requires=[
+        'boto3',
         'cassandra-driver',
         'fakeredis',
         'Flask',
@@ -65,6 +66,7 @@
         'Flask-Cors',
         'gunicorn',
         'lupa',
+        'pycryptodome',
         'redis',
         'xmltodict',
         'selenium',
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to