yangaws commented on a change in pull request #4091: [AIRFLOW-2524] Update SageMaker hook, operator and sensor for training, tuning and transform
URL: https://github.com/apache/incubator-airflow/pull/4091#discussion_r229909937
 
 

 ##########
 File path: airflow/contrib/hooks/sagemaker_hook.py
 ##########
 @@ -16,299 +16,793 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import copy
+import tarfile
+import tempfile
 import time
+import os
+import collections
+import functools
+from datetime import datetime
+
+import botocore.config
 from botocore.exceptions import ClientError
 
 from airflow.exceptions import AirflowException
 from airflow.contrib.hooks.aws_hook import AwsHook
 from airflow.hooks.S3_hook import S3Hook
 
 
+class LogState(object):
+    STARTING = 1
+    WAIT_IN_PROGRESS = 2
+    TAILING = 3
+    JOB_COMPLETE = 4
+    COMPLETE = 5
+
+
+# Position is a tuple that includes the last read timestamp and the number of items that were read
+# at that time. This is used to figure out which event to start with on the next read.
+Position = collections.namedtuple('Position', ['timestamp', 'skip'])
+
+
+def argmin(arr, f):
+    """Return the index, i, in arr that minimizes f(arr[i])"""
+    m = None
+    i = None
+    for idx, item in enumerate(arr):
+        if item is not None:
+            if m is None or f(item) < m:
+                m = f(item)
+                i = idx
+    return i
+
+
+def some(arr):
+    """Return True iff there is an element, a, of arr such that a is not 
None"""
+    return functools.reduce(lambda x, y: x or (y is not None), arr, False)
+
+
+def secondary_training_status_changed(current_job_description, 
prev_job_description):
+    """
+    Returns true if training job's secondary status message has changed.
+
+    :param current_job_description: Current job description, returned from 
DescribeTrainingJob call.
+    :type current_job_description: dict
+    :param prev_job_description: Previous job description, returned from 
DescribeTrainingJob call.
+    :type prev_job_description: dict
+
+    :return: Whether the secondary status message of a training job changed or 
not.
+    """
+    current_secondary_status_transitions = 
current_job_description.get('SecondaryStatusTransitions')
+    if current_secondary_status_transitions is None or 
len(current_secondary_status_transitions) == 0:
+        return False
+
+    prev_job_secondary_status_transitions = 
prev_job_description.get('SecondaryStatusTransitions') \
+        if prev_job_description is not None else None
+
+    last_message = prev_job_secondary_status_transitions[-1]['StatusMessage'] \
+        if prev_job_secondary_status_transitions is not None \
+        and len(prev_job_secondary_status_transitions) > 0 else ''
+
+    message = 
current_job_description['SecondaryStatusTransitions'][-1]['StatusMessage']
+
+    return message != last_message
+
+
+def secondary_training_status_message(job_description, prev_description):
+    """
+    Returns a string contains start time and the secondary training job status 
message.
+
+    :param job_description: Returned response from DescribeTrainingJob call
+    :type job_description: dict
+    :param prev_description: Previous job description from DescribeTrainingJob 
call
+    :type prev_description: dict
+
+    :return: Job status string to be printed.
+    """
+
+    if job_description is None or 
job_description.get('SecondaryStatusTransitions') is None\
+            or len(job_description.get('SecondaryStatusTransitions')) == 0:
+        return ''
+
+    prev_description_secondary_transitions = 
prev_description.get('SecondaryStatusTransitions')\
+        if prev_description is not None else None
+    prev_transitions_num = len(prev_description['SecondaryStatusTransitions'])\
+        if prev_description_secondary_transitions is not None else 0
+    current_transitions = job_description['SecondaryStatusTransitions']
+
+    transitions_to_print = current_transitions[-1:] if 
len(current_transitions) == prev_transitions_num else \
+        current_transitions[prev_transitions_num - len(current_transitions):]
+
+    status_strs = []
+    for transition in transitions_to_print:
+        message = transition['StatusMessage']
+        time_str = datetime.utcfromtimestamp(
+            
time.mktime(job_description['LastModifiedTime'].timetuple())).strftime('%Y-%m-%d
 %H:%M:%S')
+        status_strs.append('{} {} - {}'.format(time_str, transition['Status'], 
message))
+
+    return '\n'.join(status_strs)
+
+
 class SageMakerHook(AwsHook):
     """
     Interact with Amazon SageMaker.
-    sagemaker_conn_id is required for using
-    the config stored in db for training/tuning
     """
-    non_terminal_states = {'InProgress', 'Stopping', 'Stopped'}
+    non_terminal_states = {'InProgress', 'Stopping'}
+    endpoint_non_terminal_states = {'Creating', 'Updating', 'SystemUpdating',
+                                    'RollingBack', 'Deleting'}
     failed_states = {'Failed'}
 
     def __init__(self,
-                 sagemaker_conn_id=None,
-                 use_db_config=False,
-                 region_name=None,
-                 check_interval=5,
-                 max_ingestion_time=None,
                  *args, **kwargs):
         super(SageMakerHook, self).__init__(*args, **kwargs)
-        self.sagemaker_conn_id = sagemaker_conn_id
-        self.use_db_config = use_db_config
-        self.region_name = region_name
-        self.check_interval = check_interval
-        self.max_ingestion_time = max_ingestion_time
-        self.conn = self.get_conn()
+        self.s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
+
+    def expand_role(self, role):
+        """
+        Expand an IAM role name to an IAM role ARN. If role is already an IAM 
ARN,
+        no change is made.
+
+        :param role: IAM role name or ARN
+        :return: IAM role ARN
+        """
+        if '/' in role:
+            return role
+        else:
+            return self.get_iam_conn().get_role(RoleName=role)['Role']['Arn']
+
+    def tar_and_s3_upload(self, path, key, bucket):
+        """
+        Tar the local file or directory and upload to s3
 
-    def check_for_url(self, s3url):
+        :param path: local file or directory
+        :type path: str
+        :param key: s3 key
+        :type key: str
+        :param bucket: s3 bucket
+        :type bucket: str
+        :return: None
+        """
+        with tempfile.TemporaryFile() as temp_file:
+            if os.path.isdir(path):
+                files = [os.path.join(path, name) for name in os.listdir(path)]
+            else:
+                files = [path]
+            with tarfile.open(mode='w:gz', fileobj=temp_file) as tar_file:
+                for f in files:
+                    tar_file.add(f, arcname=os.path.basename(f))
+            temp_file.seek(0)
+            self.s3_hook.load_file_obj(temp_file, key, bucket, True)
+
+    def configure_s3_resources(self, config):
+        """
+        Extract the S3 operations from the configuration and execute them.
+
+        :param config: config of SageMaker operation
+        :type config: dict
+        :return: dict
         """
-        check if the s3url exists
+        s3_operations = config.pop('S3Operations', None)
+
+        if s3_operations is not None:
+            create_bucket_ops = s3_operations.get('S3CreateBucket')
+            upload_ops = s3_operations.get('S3Upload')
+            if create_bucket_ops:
+                for op in create_bucket_ops:
+                    self.s3_hook.create_bucket(bucket_name=op['Bucket'])
+            if upload_ops:
+                for op in upload_ops:
+                    if op['Tar']:
+                        self.tar_and_s3_upload(op['Path'], op['Key'],
+                                               op['Bucket'])
+                    else:
+                        self.s3_hook.load_file(op['Path'], op['Key'],
+                                               op['Bucket'])
+
+        return config
 
 Review comment:
   Yep, `config` is already updated in place. My bad.
   
   I've updated the method so that it no longer returns anything.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to