This is an automated email from the ASF dual-hosted git repository. tomaz pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/libcloud.git
commit 173ca2759fff547bc5e9de6ea3d22aff7584c6e5 Author: Tomaz Muraus <[email protected]> AuthorDate: Tue Sep 22 11:49:39 2020 +0200 Update scrape-ec2-sizes script so it uses larger chunk size since the file we are downloading is massive (multiple 100MBs). Also update the script and make sure we try to clean up the file in case locally cached file is corrupted or incomplete. --- contrib/scrape-ec2-sizes.py | 44 +++++++++++++++++++++++++++++++------------- tox.ini | 4 +++- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/contrib/scrape-ec2-sizes.py b/contrib/scrape-ec2-sizes.py index 7872ab8..c202903 100755 --- a/contrib/scrape-ec2-sizes.py +++ b/contrib/scrape-ec2-sizes.py @@ -28,6 +28,8 @@ Use it as following: import re import os import json +import shutil +import atexit import requests import ijson # pylint: disable=import-error @@ -195,22 +197,25 @@ REGION_DETAILS = { def download_json(): - response = requests.get(URL, stream=True) - try: + if os.path.isfile(FILEPATH): return open(FILEPATH, 'r') - except IOError: - with open(FILEPATH, 'wb') as fo: - for chunk in response.iter_content(chunk_size=2**20): - if chunk: - fo.write(chunk) + + # File not cached locally, download data and cache it + with requests.get(URL, stream=True) as response: + with open(FILEPATH, 'wb') as fp: + # NOTE: We use shutil.copyfileobj with large chunk size instead of + # response.iter_content with large chunk size since data we + # download is massive and copyfileobj is more efficient. 
+ shutil.copyfileobj(response.raw, fp, 10 * 1024 * 1024) + return open(FILEPATH, 'r') def get_json(): - try: - return open(FILEPATH, 'r') - except IOError: - return download_json() + if not os.path.isfile(FILEPATH): + return download_json(), False + + return open(FILEPATH, 'r'), True def filter_extras(extras): @@ -230,9 +235,22 @@ def parse(): for region_id in regions: regions[region_id]['instance_types'] = [] # Parse - json_file = get_json() + json_file, from_file = get_json() products_data = ijson.items(json_file, 'products') - products_data = next(products_data) + + try: + products_data = next(products_data) + except ijson.common.IncompleteJSONError as e: + # This likely indicates that the cached file is incomplete or corrupt so we delete it and re- + # download data + if from_file: + os.remove(FILEPATH) + json_file, from_file = get_json() + products_data = ijson.items(json_file, 'products') + products_data = next(products_data) + else: + raise e + for sku in products_data: if products_data[sku]['productFamily'] != "Compute Instance": continue diff --git a/tox.ini b/tox.ini index 7f5caf9..3a6d498 100644 --- a/tox.ini +++ b/tox.ini @@ -152,7 +152,9 @@ commands = python contrib/scrape-ec2-prices.py basepython: python3.7 deps = requests ijson -commands = bash -c 'python contrib/scrape-ec2-sizes.py > libcloud/compute/constants.py' +commands = + bash -c 'echo "Scraping EC2 sizes, this may take up to 5 minutes more since the actual JSON data we download and scrape is very large"' + bash -c 'python contrib/scrape-ec2-sizes.py > libcloud/compute/constants.py' [testenv:pylint] deps = -r{toxinidir}/requirements-tests.txt
