[systemml] branch master updated: [SYSTEMML-540] Added performance tests for ResNet200

niketanpansare Fri, 29 Mar 2019 10:27:03 -0700

This is an automated email from the ASF dual-hosted git repository.

niketanpansare pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git



The following commit(s) were added to refs/heads/master by this push:
     new 794c5a2  [SYSTEMML-540] Added performance tests for ResNet200
794c5a2 is described below

commit 794c5a232a3f462e2a85836dea55570f102e1682
Author: Niketan Pansare <npan...@us.ibm.com>
AuthorDate: Fri Mar 29 10:26:04 2019 -0700

    [SYSTEMML-540] Added performance tests for ResNet200
    
    These tests compare the effect of different eviction policies when
    training ResNet, and also perform a baseline comparison with Unified
    Memory, TF and TF-GPU.
---
 scripts/perftest/gpu_resnet_perftest/resnet.py | 282 +++++++++++++++++++++++++
 scripts/perftest/gpu_resnet_perftest/run.py    | 219 +++++++++++++++++++
 scripts/perftest/gpu_resnet_perftest/run.sh    |  72 +++++++
 3 files changed, 573 insertions(+)

diff --git a/scripts/perftest/gpu_resnet_perftest/resnet.py 
b/scripts/perftest/gpu_resnet_perftest/resnet.py
new file mode 100644
index 0000000..a2e8514
--- /dev/null
+++ b/scripts/perftest/gpu_resnet_perftest/resnet.py
@@ -0,0 +1,282 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+from __future__ import division
+
+import six
+from keras.models import Model
+from keras.layers import (
+    Input,
+    Activation,
+    Dense,
+    Flatten
+)
+from keras.layers.convolutional import (
+    Conv2D,
+    MaxPooling2D,
+    AveragePooling2D
+)
+from keras.layers.merge import add
+from keras.layers.normalization import BatchNormalization
+from keras.regularizers import l2
+from keras import backend as K
+
+
+def _bn_relu(input):
+    """Apply batch normalization followed by a relu activation.
+
+    The normalization runs along the module-level CHANNEL_AXIS, so that
+    global has to be assigned before this helper is called.
+    """
+    normalized = BatchNormalization(axis=CHANNEL_AXIS)(input)
+    relu = Activation("relu")
+    return relu(normalized)
+
+
+def _conv_bn_relu(**conv_params):
+    """Return a function that applies conv -> BN -> relu to a tensor.
+
+    `filters` and `kernel_size` are required keyword arguments. `strides`,
+    `kernel_initializer`, `padding` and `kernel_regularizer` are optional and
+    default to (1, 1), "he_normal", "same" and l2(1.e-4) respectively.
+    """
+    conv_kwargs = {
+        "filters": conv_params["filters"],
+        "kernel_size": conv_params["kernel_size"],
+        "strides": conv_params.get("strides", (1, 1)),
+        "kernel_initializer": conv_params.get("kernel_initializer", "he_normal"),
+        "padding": conv_params.get("padding", "same"),
+        "kernel_regularizer": conv_params.get("kernel_regularizer", l2(1.e-4)),
+    }
+
+    def f(input):
+        convolved = Conv2D(**conv_kwargs)(input)
+        return _bn_relu(convolved)
+
+    return f
+
+
+def _bn_relu_conv(**conv_params):
+    """Return a function that applies BN -> relu -> conv to a tensor.
+    This is an improved scheme proposed in http://arxiv.org/pdf/1603.05027v2.pdf
+
+    `filters` and `kernel_size` are required keyword arguments. `strides`,
+    `kernel_initializer`, `padding` and `kernel_regularizer` are optional and
+    default to (1, 1), "he_normal", "same" and l2(1.e-4) respectively.
+    """
+    conv_kwargs = {
+        "filters": conv_params["filters"],
+        "kernel_size": conv_params["kernel_size"],
+        "strides": conv_params.get("strides", (1, 1)),
+        "kernel_initializer": conv_params.get("kernel_initializer", "he_normal"),
+        "padding": conv_params.get("padding", "same"),
+        "kernel_regularizer": conv_params.get("kernel_regularizer", l2(1.e-4)),
+    }
+
+    def f(input):
+        pre_activated = _bn_relu(input)
+        return Conv2D(**conv_kwargs)(pre_activated)
+
+    return f
+
+
+def _shortcut(input, residual):
+    """Merge `input` and `residual` with an element-wise add.
+
+    When the two tensors differ in spatial size or channel count, `input` is
+    first passed through a strided 1 X 1 convolution so that it matches
+    `residual`; otherwise it is added unchanged.
+    """
+    in_shape = K.int_shape(input)
+    res_shape = K.int_shape(residual)
+    # The ratios should be whole numbers if the network architecture is
+    # correctly configured.
+    row_stride = int(round(in_shape[ROW_AXIS] / res_shape[ROW_AXIS]))
+    col_stride = int(round(in_shape[COL_AXIS] / res_shape[COL_AXIS]))
+    same_channels = in_shape[CHANNEL_AXIS] == res_shape[CHANNEL_AXIS]
+
+    # Identity shortcut: nothing to project.
+    if row_stride <= 1 and col_stride <= 1 and same_channels:
+        return add([input, residual])
+
+    # Projection shortcut: expand channels / stride down to the residual shape.
+    projection = Conv2D(filters=res_shape[CHANNEL_AXIS],
+                        kernel_size=(1, 1),
+                        strides=(row_stride, col_stride),
+                        padding="valid",
+                        kernel_initializer="he_normal",
+                        kernel_regularizer=l2(0.0001))(input)
+    return add([projection, residual])
+
+
+def _residual_block(block_function, filters, repetitions, is_first_layer=False):
+    """Return a function that chains `repetitions` blocks built by `block_function`.
+
+    The first block of every layer except the first layer uses strides (2, 2);
+    all other blocks use strides (1, 1).
+    """
+    def f(input):
+        tensor = input
+        for index in range(repetitions):
+            leads_layer = index == 0
+            strides = (2, 2) if (leads_layer and not is_first_layer) else (1, 1)
+            build = block_function(filters=filters, init_strides=strides,
+                                   is_first_block_of_first_layer=(is_first_layer and leads_layer))
+            tensor = build(tensor)
+        return tensor
+
+    return f
+
+
+def basic_block(filters, init_strides=(1, 1), is_first_block_of_first_layer=False):
+    """Basic block of two 3 X 3 convolutions, for resnets with layers <= 34.
+    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
+
+    Returns a function mapping an input tensor to the block output, i.e. the
+    second convolution merged with the input through `_shortcut`.
+    """
+    def f(input):
+        if not is_first_block_of_first_layer:
+            first = _bn_relu_conv(filters=filters, kernel_size=(3, 3),
+                                  strides=init_strides)(input)
+        else:
+            # don't repeat bn->relu since we just did bn->relu->maxpool
+            first = Conv2D(filters=filters, kernel_size=(3, 3),
+                           strides=init_strides,
+                           padding="same",
+                           kernel_initializer="he_normal",
+                           kernel_regularizer=l2(1e-4))(input)
+
+        second = _bn_relu_conv(filters=filters, kernel_size=(3, 3))
+        return _shortcut(input, second(first))
+
+    return f
+
+
+def bottleneck(filters, init_strides=(1, 1), is_first_block_of_first_layer=False):
+    """Bottleneck block (1 X 1 -> 3 X 3 -> 1 X 1) for > 34 layer resnet.
+    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
+
+    Returns:
+        A function mapping an input tensor to the block output; the final
+        conv layer has filters * 4 filters and is merged with the input
+        through `_shortcut`.
+    """
+    def f(input):
+        if not is_first_block_of_first_layer:
+            reduced = _bn_relu_conv(filters=filters, kernel_size=(1, 1),
+                                    strides=init_strides)(input)
+        else:
+            # don't repeat bn->relu since we just did bn->relu->maxpool
+            reduced = Conv2D(filters=filters, kernel_size=(1, 1),
+                             strides=init_strides,
+                             padding="same",
+                             kernel_initializer="he_normal",
+                             kernel_regularizer=l2(1e-4))(input)
+
+        middle = _bn_relu_conv(filters=filters, kernel_size=(3, 3))(reduced)
+        expanded = _bn_relu_conv(filters=filters * 4, kernel_size=(1, 1))(middle)
+        return _shortcut(input, expanded)
+
+    return f
+
+
+def _handle_dim_ordering():
+    """Set the module-level ROW_AXIS, COL_AXIS and CHANNEL_AXIS indices.
+
+    The indices refer to 4-D tensors whose axis 0 is the batch: with the
+    'channels_last' image data format they are rows=1, cols=2, channels=3,
+    otherwise channels=1, rows=2, cols=3.
+    """
+    global ROW_AXIS
+    global COL_AXIS
+    global CHANNEL_AXIS
+    # image_data_format() is the Keras 2 spelling of the legacy
+    # image_dim_ordering(); 'channels_last' corresponds to the old 'tf' value.
+    if K.image_data_format() == 'channels_last':
+        ROW_AXIS = 1
+        COL_AXIS = 2
+        CHANNEL_AXIS = 3
+    else:
+        CHANNEL_AXIS = 1
+        ROW_AXIS = 2
+        COL_AXIS = 3
+
+
+def _get_block(identifier):
+    """Resolve a block function given either the function or its name.
+
+    Non-string identifiers are returned unchanged. Strings are looked up in
+    this module's globals; a missing or falsy entry raises ValueError.
+    """
+    if not isinstance(identifier, six.string_types):
+        return identifier
+    block = globals().get(identifier)
+    if block:
+        return block
+    raise ValueError('Invalid {}'.format(identifier))
+
+
+class ResnetBuilder(object):
+    """Builds ResNet-like Keras models of various depths."""
+
+    @staticmethod
+    def build(input_shape, num_outputs, block_fn, repetitions):
+        """Builds a custom ResNet like architecture.
+
+        Args:
+            input_shape: The input shape in the form (nb_channels, nb_rows, nb_cols)
+            num_outputs: The number of outputs at final softmax layer
+            block_fn: The block function to use. This is either `basic_block` or `bottleneck`
+                (the function itself or its name as a string).
+                The original paper used basic_block for layers < 50
+            repetitions: Number of repetitions of various block units.
+                At each block unit, the number of filters are doubled and the input size is halved
+
+        Returns:
+            The keras `Model`.
+
+        Raises:
+            ValueError: If `input_shape` does not have exactly three entries.
+        """
+        _handle_dim_ordering()
+        if len(input_shape) != 3:
+            raise ValueError("Input shape should be a tuple (nb_channels, nb_rows, nb_cols)")
+
+        # Permute dimension order if necessary: callers always pass channels
+        # first, but a 'channels_last' backend expects (rows, cols, channels).
+        # image_data_format() is the Keras 2 spelling of image_dim_ordering().
+        if K.image_data_format() == 'channels_last':
+            input_shape = (input_shape[1], input_shape[2], input_shape[0])
+
+        # Load function from str if needed.
+        block_fn = _get_block(block_fn)
+
+        input = Input(shape=input_shape)
+        conv1 = _conv_bn_relu(filters=64, kernel_size=(7, 7), strides=(2, 2))(input)
+        pool1 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding="same")(conv1)
+
+        block = pool1
+        filters = 64
+        for i, r in enumerate(repetitions):
+            block = _residual_block(block_fn, filters=filters, repetitions=r,
+                                    is_first_layer=(i == 0))(block)
+            filters *= 2
+
+        # Last activation
+        block = _bn_relu(block)
+
+        # Classifier block: average-pool over the whole remaining feature map
+        block_shape = K.int_shape(block)
+        pool2 = AveragePooling2D(pool_size=(block_shape[ROW_AXIS], block_shape[COL_AXIS]),
+                                 strides=(1, 1))(block)
+        flatten1 = Flatten()(pool2)
+        dense = Dense(units=num_outputs, kernel_initializer="he_normal",
+                      activation="softmax")(flatten1)
+
+        model = Model(inputs=input, outputs=dense)
+        return model
+
+    @staticmethod
+    def build_resnet_18(input_shape, num_outputs):
+        return ResnetBuilder.build(input_shape, num_outputs, basic_block, [2, 2, 2, 2])
+
+    @staticmethod
+    def build_resnet_34(input_shape, num_outputs):
+        return ResnetBuilder.build(input_shape, num_outputs, basic_block, [3, 4, 6, 3])
+
+    @staticmethod
+    def build_resnet_50(input_shape, num_outputs):
+        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 4, 6, 3])
+
+    @staticmethod
+    def build_resnet_101(input_shape, num_outputs):
+        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 4, 23, 3])
+
+    @staticmethod
+    def build_resnet_152(input_shape, num_outputs):
+        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 8, 36, 3])
+
+    @staticmethod
+    def build_resnet_200(input_shape, num_outputs):
+        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 24, 36, 3])
+
+    @staticmethod
+    def build_resnet_1001(input_shape, num_outputs):
+        # TODO: From https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua
+        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [16, 64, 128, 256])
diff --git a/scripts/perftest/gpu_resnet_perftest/run.py 
b/scripts/perftest/gpu_resnet_perftest/run.py
new file mode 100644
index 0000000..eb7cc14
--- /dev/null
+++ b/scripts/perftest/gpu_resnet_perftest/run.py
@@ -0,0 +1,219 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import time, os, argparse, sys, math
+import numpy as np
+
+from pyspark import SparkContext
+sc = SparkContext()
+from pyspark.sql import SparkSession
+spark = SparkSession.builder.getOrCreate()
+
+parser = argparse.ArgumentParser("Testing deep networks for different batches")
+# (flag, keyword arguments) pairs, registered in the order listed here.
+cli_options = [
+    ('--network', dict(type=str, default='vgg16',
+                       choices=['vgg16', 'vgg19', 'resnet200', 'resnet1001', 'unet'])),
+    ('--allocator', dict(type=str, default='cuda',
+                         choices=['cuda', 'unified_memory'])),
+    ('--batch_size', dict(help='Batch size. Default: 64', type=int, default=64)),
+    ('--num_images', dict(help='Number of images. Default: 2048', type=int, default=2048)),
+    ('--eviction_policy', dict(help='Eviction policy. Default: align_memory', type=str,
+                               default='align_memory',
+                               choices=['align_memory', 'lru', 'fifo', 'min_evict', 'lfu', 'mru'])),
+    ('--framework', dict(help='The framework to use for running the benchmark. Default: systemml',
+                         type=str, default='systemml',
+                         choices=['systemml', 'tensorflow', 'systemml_force_gpu', 'tensorflow-gpu'])),
+    ('--num_channels', dict(help='Number of channels. Default: 3', type=int, default=3)),
+    ('--height', dict(help='Height. Default: 224', type=int, default=224)),
+    ('--width', dict(help='Width. Default: 224', type=int, default=224)),
+]
+for flag, options in cli_options:
+       parser.add_argument(flag, **options)
+args = parser.parse_args()
+
+#######################################################################
+# Required to ensure that TF only uses exactly 1 GPU if framework is tensorflow-gpu, else no gpu
+# The device order is the same in both cases, so it is set once.
+os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
+if args.framework == 'tensorflow-gpu':
+       os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+else:
+       # Disable tensorflow from grabbing the entire GPU memory
+       os.environ['CUDA_VISIBLE_DEVICES'] = ''
+#######################################################################
+
+# To discount the transfer time of batches, we use one randomly generated batch
+# and scale the number of epochs
+batch_size = args.batch_size
+if batch_size < 1:
+       raise ValueError('batch_size should be a positive integer, but found: ' + str(batch_size))
+# Round the number of images down to a whole number of batches
+num_images = args.num_images - int(args.num_images % batch_size)
+# Floor division keeps this an int on both Python 2 and Python 3; the value is
+# passed as the number of epochs to fit.
+n_batches_for_epoch = num_images // batch_size
+
+# Model-specific parameters
+num_classes = 1000
+input_shape = (args.num_channels, args.height, args.width)
+if args.network == 'unet' and input_shape != (1, 256, 256):
+       raise ValueError('Incorrect input shape for unet: ' + str(input_shape) + '. Supported input shape for unet: (1, 256, 256)' )
+num_pixels = input_shape[0]*input_shape[1]*input_shape[2]
+
+import keras
+from keras.utils import np_utils
+from keras import backend as K
+if args.framework.startswith('systemml'):
+       K.set_image_data_format('channels_first')
+import os 
+import numpy as np
+from keras.models import *
+from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Dropout, 
Cropping2D, concatenate # merge
+from keras.optimizers import *
+
+#####################################################################################
+# SystemML's CPU backend only supports double precision, so a double precision
+# comparison would have been preferable. TF 1.7 crashes with double precision
+# though, hence only single precision is tested.
+use_double_precision = False
+if use_double_precision:
+       K.set_floatx('float64')
+if args.framework == 'tensorflow-gpu':
+       import tensorflow as tf
+       from keras.backend.tensorflow_backend import set_session
+       tf_config = tf.ConfigProto()
+       gpu_options = tf_config.gpu_options
+       if args.allocator == 'unified_memory':
+               gpu_options.allow_growth = True
+       elif args.allocator == 'cuda':
+               gpu_options.per_process_gpu_memory_fraction = 0.9
+       set_session(tf.Session(config=tf_config))
+#####################################################################################
+
+error_occured = False
+print("Building model ... ")
+if args.network == 'vgg16':
+       model = keras.applications.vgg16.VGG16(weights='imagenet', classes=num_classes)
+elif args.network == 'vgg19':
+       model = keras.applications.vgg19.VGG19(weights='imagenet', classes=num_classes)
+elif args.network == 'resnet200':
+       import resnet
+       model = resnet.ResnetBuilder.build_resnet_200(input_shape, num_classes)
+elif args.network == 'resnet1001':
+       import resnet
+       model = resnet.ResnetBuilder.build_resnet_1001(input_shape, num_classes)
+elif args.network == 'unet':
+       def conv3x3(input, num_filters):
+               # Two stacked 3x3 relu convolutions with the same number of filters
+               conv = Conv2D(num_filters, 3, activation = 'relu', padding = 'same')(input)
+               conv = Conv2D(num_filters, 3, activation = 'relu', padding = 'same')(conv)
+               return conv
+       num_filters = [64, 128, 256, 512, 1024]
+       # NOTE(review): the input is always declared as (rows, cols, channels), even when
+       # the backend was switched to channels_first for systemml -- confirm this is intended.
+       model_input = Input((input_shape[1], input_shape[2], input_shape[0]))
+       x = model_input
+       side_inputs = []
+       # Contracting path
+       for i, n_filters in enumerate(num_filters):
+               # Apply max pooling for all except first down_conv
+               if i != 0:
+                       x = MaxPooling2D(pool_size=(2, 2))(x)
+               x = conv3x3(x, n_filters)
+               # Apply dropouts to only last 2 down_conv
+               if i >= len(num_filters)-2:
+                       x = Dropout(0.5)(x)
+               side_inputs.append(x)
+       # The deepest feature map starts the expanding path; the remaining
+       # side inputs are consumed deepest-first as skip connections.
+       x = side_inputs.pop()
+       for n_filters in reversed(num_filters[:-1]):
+               x = Conv2D(n_filters, 3, activation = 'relu', padding = 'same')(UpSampling2D(size = (2,2))(x))
+               x = concatenate([side_inputs.pop(), x])
+               x = conv3x3(x, n_filters)
+       conv1 = Conv2D(2, 3, activation = 'relu', padding = 'same')(x)
+       model_output = Conv2D(1, 1, activation = 'sigmoid')(conv1)
+       # Keras 2 keyword names (inputs/outputs), matching resnet.py
+       model = Model(inputs = model_input, outputs = model_output)
+else:
+       raise ValueError('Unsupported network:' + args.network)
+# Same optimizer for every network; only the loss differs for unet
+optimizer = keras.optimizers.SGD(lr=1e-6, momentum=0.95, decay=5e-4, nesterov=True)
+if args.network == 'unet':
+       model.compile(optimizer = optimizer, loss = 'mean_squared_error')
+else:
+       model.compile(loss='categorical_crossentropy', optimizer=optimizer)
+
+#------------------------------------------------------------------------------------------
+# Use this for baseline experiments:
+# Alternate way to avoid eviction is to perform multiple forward/backward pass, aggregate gradients and finally perform update.
+looped_minibatch = False
+local_batch_size = batch_size
+if looped_minibatch:
+       if args.network == 'resnet200':
+               local_batch_size = 16
+       else:
+               raise ValueError('looped_minibatch not yet implemented for ' + str(args.network))
+       # The batch is split into batch_size/local_batch_size parallel batches,
+       # so the batch size has to be a multiple of the local batch size.
+       if batch_size % local_batch_size != 0:
+               raise ValueError('batch size=' + str(batch_size) + ' should be multiple of local_batch_size = ' + str(local_batch_size))
+#------------------------------------------------------------------------------------------
+
+# Number of epochs passed to fit: one epoch per batch of the synthetic dataset,
+# since the same randomly generated batch is reused. Coerced to int because
+# true division would produce a float.
+num_epochs = int(n_batches_for_epoch)
+if args.framework.startswith('systemml'):
+       print("Initializing Keras2DML.")
+       from systemml.mllearn import Keras2DML
+       should_load_weights=False
+       sysml_model = Keras2DML(spark, model, load_keras_weights=should_load_weights, weights="tmp_weights1")
+       if looped_minibatch:
+               # systemml does not have a generator
+               sysml_model.set(train_algo="looped_minibatch", parallel_batches=batch_size//local_batch_size, test_algo="batch")
+               sysml_model.set(weight_parallel_batches=False)
+       else:
+               sysml_model.set(train_algo="batch", test_algo="batch")
+       sysml_model.set(perform_fused_backward_update=True)
+       sysml_model.setStatistics(True).setStatisticsMaxHeavyHitters(100)
+       # Since this script is used for measuring performance and not for printing script, inline the nn library
+       sysml_model.set(inline_nn_library=True)
+       # For apples-to-apples comparison, do not force set the allocated array to 0
+       sysml_model.setConfigProperty("sysml.gpu.force.memSetZero", "false")
+       # Use single GPU
+       sysml_model.setConfigProperty("sysml.gpu.availableGPUs", "0")
+       # Use user-specified allocator: cuda (default) or unified_memory
+       sysml_model.setConfigProperty("sysml.gpu.memory.allocator", args.allocator)
+       # Use user-specified eviction policy
+       sysml_model.setConfigProperty("sysml.gpu.eviction.policy", args.eviction_policy)
+       # Please consider allocating large enough JVM and using large CPU cache
+       sysml_model.setConfigProperty("sysml.gpu.eviction.shadow.bufferSize", "0.5")
+       sysml_model.setConfigProperty("sysml.caching.bufferSize", "1.0")
+       # Use user-specified precision
+       if not use_double_precision:
+               sysml_model.setConfigProperty("sysml.floating.point.precision", "single")
+       sysml_model.setGPU(True).setForceGPU(args.framework=='systemml_force_gpu')
+       # One random batch, flattened to (batch_size, num_pixels)
+       Xb = np.random.uniform(0,1,num_pixels*batch_size)
+       Xb = Xb.reshape((batch_size, num_pixels))
+       if args.network == 'unet':
+               yb = np.random.randint(5, size=num_pixels*batch_size).reshape((batch_size, num_pixels))
+               sysml_model.set(perform_one_hot_encoding=False)
+       else:
+               yb = np.random.randint(num_classes, size=batch_size)
+       from py4j.protocol import Py4JJavaError
+       start = time.time()
+       try:
+               print("Invoking fit")
+               sysml_model.fit(Xb, yb, batch_size=local_batch_size, epochs=num_epochs)
+               print("Done with fit")
+       except Py4JJavaError as e:
+               # A failed run is reported but not recorded in time.txt
+               error_occured = True
+               print("Execution failed: " + str(e))
+       except AttributeError as e1:
+               error_occured = True
+               print("Execution failed: " + str(e1))
+elif args.framework.startswith('tensorflow'):
+       # One random batch shaped (batch_size, rows, cols, channels)
+       tf_batch_shape = (batch_size, input_shape[1],input_shape[2], input_shape[0])
+       Xb = np.random.randint(256, size=num_pixels*batch_size).reshape(tf_batch_shape) + 1
+       if args.network == 'unet':
+               yb = np.random.randint(5, size=num_pixels*batch_size).reshape(tf_batch_shape)
+       else:
+               yb = np.random.randint(num_classes, size=batch_size)
+               yb = np_utils.to_categorical(yb, num_classes)
+       start = time.time()
+       model.fit(Xb, yb, batch_size=batch_size, epochs=num_epochs)
+# NOTE(review): the end timestamp is taken after clear_session(), so the session
+# teardown is part of the reported time -- confirm this is intended.
+K.clear_session()
+end = time.time()
+if not error_occured:
+       # Append one CSV row per successful run
+       row = [args.framework, args.network, 'synthetic_imagenet', '1', str(batch_size), '1',
+              str(num_images), str(end-start), args.eviction_policy, args.allocator]
+       with open('time.txt', 'a') as f:
+               f.write(','.join(row) + '\n')
diff --git a/scripts/perftest/gpu_resnet_perftest/run.sh 
b/scripts/perftest/gpu_resnet_perftest/run.sh
new file mode 100644
index 0000000..30187f1
--- /dev/null
+++ b/scripts/perftest/gpu_resnet_perftest/run.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#rm -rf time.txt logs
+#mkdir logs
+
+SPARK_HOME='/home/.../spark-2.3.0-bin-hadoop2.7'
+DRIVER_MEMORY='200g'
+
+# Runs run.py once per framework (tensorflow-gpu, tensorflow, systemml_force_gpu)
+# with the unified_memory allocator and the lru eviction policy.
+# Arguments: network num_images batch_size num_channels height width
+# Output of each run goes to a separate file under logs/.
+function compare_baseline {
+       local network=$1
+       local num_images=$2
+       local batch_size=$3
+       local num_channels=$4
+       local height=$5
+       local width=$6
+       local allocator='unified_memory'
+       local eviction_policy='lru'
+       local framework
+       # If logs/ is missing the redirection below fails and spark-submit is never started
+       mkdir -p logs
+       for framework in tensorflow-gpu tensorflow systemml_force_gpu
+       do
+               echo "Running ${framework}_${batch_size}_${network}_${num_images}_${eviction_policy}"
+               rm -rf tmp_weights1 scratch_space spark-warehouse &> /dev/null
+               "$SPARK_HOME"/bin/spark-submit --driver-memory "$DRIVER_MEMORY" run.py \
+                       --num_channels "$num_channels" --height "$height" --width "$width" \
+                       --num_images "$num_images" --eviction_policy "$eviction_policy" --network "$network" \
+                       --batch_size "$batch_size" --framework "$framework" --allocator "$allocator" \
+                       &> "logs/${framework}_${batch_size}_${network}_${num_images}_${eviction_policy}_${allocator}_${num_channels}_${height}_${width}.log"
+       done
+}
+
+# Runs run.py once per eviction policy (min_evict, align_memory, lru, lfu)
+# with the systemml_force_gpu framework and the cuda allocator.
+# Arguments: network num_images batch_size num_channels height width
+# Output of each run goes to a separate file under logs/.
+function compare_eviction_policy {
+       local network=$1
+       local num_images=$2
+       local batch_size=$3
+       local num_channels=$4
+       local height=$5
+       local width=$6
+       local framework='systemml_force_gpu'
+       local allocator='cuda'
+       local eviction_policy
+       # If logs/ is missing the redirection below fails and spark-submit is never started
+       mkdir -p logs
+       for eviction_policy in min_evict align_memory lru lfu
+       do
+               echo "Running ${framework}_${batch_size}_${network}_${num_images}_${eviction_policy}"
+               rm -rf tmp_weights1 scratch_space spark-warehouse &> /dev/null
+               "$SPARK_HOME"/bin/spark-submit --driver-memory "$DRIVER_MEMORY" run.py \
+                       --num_channels "$num_channels" --height "$height" --width "$width" \
+                       --num_images "$num_images" --eviction_policy "$eviction_policy" --network "$network" \
+                       --batch_size "$batch_size" --framework "$framework" --allocator "$allocator" \
+                       &> "logs/${framework}_${batch_size}_${network}_${num_images}_${eviction_policy}_${allocator}_${num_channels}_${height}_${width}.log"
+       done
+}
+
+# Both helpers take: network num_images batch_size num_channels height width
+
+# Experiment 1: Very Deep ResNet-200
+# (2 images of 3 x 1792 x 1792, batch size 1)
+compare_baseline resnet200 2 1 3 1792 1792
+compare_eviction_policy resnet200 2 1 3 1792 1792
+
+# Experiment 2: Pseudo in-memory ResNet-200
+# (15360 images of 3 x 224 x 224, swept over several batch sizes)
+for b in 32 96 64 48 16 4
+do
+       compare_baseline resnet200 15360 $b 3 224 224  
+       compare_eviction_policy resnet200 15360 $b 3 224 224
+done
\ No newline at end of file

[systemml] branch master updated: [SYSTEMML-540] Added performance tests for ResNet200

Reply via email to