This is an automated email from the ASF dual-hosted git repository. tomaz pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/libcloud.git
commit 173ca2759fff547bc5e9de6ea3d22aff7584c6e5 Author: Tomaz Muraus <[email protected]> AuthorDate: Tue Sep 22 11:49:39 2020 +0200 Update scrape-ec2-sizes script so it uses larger chunk size since the file we are downloading is massive (multiple 100MBs). Also update the script and make sure we try to clean up the file in case locally cached file is corrupted or incomplete. --- contrib/scrape-ec2-sizes.py | 44 +++++++++++++++++++++++++++++++------------- tox.ini | 4 +++- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/contrib/scrape-ec2-sizes.py b/contrib/scrape-ec2-sizes.py index 7872ab8..c202903 100755 --- a/contrib/scrape-ec2-sizes.py +++ b/contrib/scrape-ec2-sizes.py @@ -28,6 +28,8 @@ Use it as following: import re import os import json +import shutil +import atexit import requests import ijson # pylint: disable=import-error @@ -195,22 +197,25 @@ REGION_DETAILS = { def download_json(): - response = requests.get(URL, stream=True) - try: + if os.path.isfile(FILEPATH): return open(FILEPATH, 'r') - except IOError: - with open(FILEPATH, 'wb') as fo: - for chunk in response.iter_content(chunk_size=2**20): - if chunk: - fo.write(chunk) + + # File not cached locally, download data and cache it + with requests.get(URL, stream=True) as response: + with open(FILEPATH, 'wb') as fp: + # NOTE: We use shutil.copyfileobj with large chunk size instead of + # response.iter_content with large chunk size since data we + # download is massive and copyfileobj is more efficient. 
+ shutil.copyfileobj(response.raw, fp, 10 * 1024 * 1024) + return open(FILEPATH, 'r') def get_json(): - try: - return open(FILEPATH, 'r') - except IOError: - return download_json() + if not os.path.isfile(FILEPATH): + return download_json(), False + + return open(FILEPATH, 'r'), True def filter_extras(extras): @@ -230,9 +235,22 @@ def parse(): for region_id in regions: regions[region_id]['instance_types'] = [] # Parse - json_file = get_json() + json_file, from_file = get_json() products_data = ijson.items(json_file, 'products') - products_data = next(products_data) + + try: + products_data = next(products_data) + except ijson.common.IncompleteJSONError as e: + # This likely indicates that the cached file is incomplete or corrupt so we delete it and re- + # download data + if from_file: + os.remove(FILEPATH) + json_file, from_file = get_json() + products_data = ijson.items(json_file, 'products') + products_data = next(products_data) + else: + raise e + for sku in products_data: if products_data[sku]['productFamily'] != "Compute Instance": continue diff --git a/tox.ini b/tox.ini index 7f5caf9..3a6d498 100644 --- a/tox.ini +++ b/tox.ini @@ -152,7 +152,9 @@ commands = python contrib/scrape-ec2-prices.py basepython: python3.7 deps = requests ijson -commands = bash -c 'python contrib/scrape-ec2-sizes.py > libcloud/compute/constants.py' +commands = + bash -c 'echo "Scraping EC2 sizes, this may take up to 5 minutes more since the actual JSON data we download and scrape is very large"' + bash -c 'python contrib/scrape-ec2-sizes.py > libcloud/compute/constants.py' [testenv:pylint] deps = -r{toxinidir}/requirements-tests.txt
