chrishkchris commented on a change in pull request #468: Distributed module URL: https://github.com/apache/incubator-singa/pull/468#discussion_r310328731
########## File path: src/api/config.i ########## @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + + +// Pass in cmake configurations to swig +#define USE_CUDA 1 +#define USE_CUDNN 1 +#define USE_OPENCL 0 +#define USE_PYTHON 1 +#define USE_MKLDNN 1 +#define USE_JAVA 0 +#define CUDNN_VERSION 7401 + +// SINGA version +#define SINGA_MAJOR_VERSION 1 Review comment: I have trained the dist_new branch resnet (because resnet has batch norm) with cifar10 dataset using 1 GPU, and obtained 92.5% test accuracy in 100 Epochs with data augmentation. This suggests that the batch norm is in good condition (while the onnx interface of batchnorm may need to be considered but I am not sure) ``` ubuntu@ip-172-31-27-25:~/incubator-singa/examples/autograd$ python3 resnet_realdata.py Loading data file cifar-10-batches-py/data_batch_1 Loading data file cifar-10-batches-py/data_batch_2 Loading data file cifar-10-batches-py/data_batch_3 Loading data file cifar-10-batches-py/data_batch_4 Loading data file cifar-10-batches-py/data_batch_5 Loading data file cifar-10-batches-py/test_batch Start intialization............ 
Epoch=0: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 2927.551146, training accuracy = 0.338068 Test accuracy = 0.441306 Epoch=1: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 2110.360374, training accuracy = 0.511984 Test accuracy = 0.606571 Epoch=2: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 1658.897868, training accuracy = 0.623199 Test accuracy = 0.645232 Epoch=3: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.64it/s] Training loss = 1354.082412, training accuracy = 0.694442 Test accuracy = 0.731170 Epoch=4: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 1155.785529, training accuracy = 0.743478 Test accuracy = 0.761318 Epoch=5: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 1022.750388, training accuracy = 0.773668 Test accuracy = 0.741286 Epoch=6: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 945.400214, training accuracy = 0.790373 Test accuracy = 0.795072 Epoch=7: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 840.933215, training accuracy = 0.814441 Test accuracy = 0.810096 Epoch=8: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 765.215148, training accuracy = 0.830566 Test accuracy = 0.807091 Epoch=9: 
100%|███████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 701.153867, training accuracy = 0.845951 Test accuracy = 0.822316 Epoch=10: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 666.267428, training accuracy = 0.853073 Test accuracy = 0.851162 Epoch=11: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 606.699607, training accuracy = 0.866817 Test accuracy = 0.770232 Epoch=12: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 564.226388, training accuracy = 0.875760 Test accuracy = 0.811599 Epoch=13: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.58it/s] Training loss = 545.325170, training accuracy = 0.877621 Test accuracy = 0.856771 Epoch=14: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 497.799559, training accuracy = 0.889885 Test accuracy = 0.873798 Epoch=15: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 472.436701, training accuracy = 0.895927 Test accuracy = 0.864083 Epoch=16: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 442.924634, training accuracy = 0.901288 Test accuracy = 0.856070 Epoch=17: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 412.631328, training accuracy = 0.907810 Test accuracy = 0.868289 Epoch=18: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 391.663337, training accuracy = 0.911932 Test accuracy = 0.878906 Epoch=19: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 364.874849, training accuracy = 0.918774 Test accuracy = 0.874399 Epoch=20: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 352.975441, training accuracy = 0.920355 Test accuracy = 0.885517 Epoch=21: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 324.665358, training accuracy = 0.927157 Test accuracy = 0.878506 Epoch=22: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 314.736589, training accuracy = 0.929617 Test accuracy = 0.875701 Epoch=23: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 296.109193, training accuracy = 0.934399 Test accuracy = 0.878405 Epoch=24: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 276.580544, training accuracy = 0.937960 Test accuracy = 0.878906 Epoch=25: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.58it/s] Training loss = 267.193980, training accuracy = 0.939481 Test accuracy = 0.867488 Epoch=26: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 239.712160, training accuracy = 0.946063 Test accuracy = 0.886118 Epoch=27: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 238.206897, training accuracy = 0.946803 Test accuracy = 0.899139 Epoch=28: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.58it/s] Training loss = 229.405735, training accuracy = 0.947343 Test accuracy = 0.873698 Epoch=29: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 211.089237, training accuracy = 0.951865 Test accuracy = 0.896735 Epoch=30: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 198.430727, training accuracy = 0.954665 Test accuracy = 0.900040 Epoch=31: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 191.366123, training accuracy = 0.956086 Test accuracy = 0.899639 Epoch=32: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 182.107810, training accuracy = 0.959287 Test accuracy = 0.902544 Epoch=33: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 172.826398, training accuracy = 0.960767 Test accuracy = 0.893530 Epoch=34: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 169.613187, training accuracy = 0.961008 Test accuracy = 0.895933 Epoch=35: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 155.248689, training accuracy = 0.965709 Test accuracy = 0.896334 Epoch=36: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 153.824897, training accuracy = 0.965969 Test accuracy = 0.905549 Epoch=37: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.58it/s] Training loss = 146.810095, training accuracy = 0.966649 Test accuracy = 0.887921 Epoch=38: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 135.051945, training accuracy = 0.969390 Test accuracy = 0.899639 Epoch=39: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 133.820405, training accuracy = 0.969950 Test accuracy = 0.897436 Epoch=40: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 135.417706, training accuracy = 0.969230 Test accuracy = 0.914964 Epoch=41: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 120.854997, training accuracy = 0.972371 Test accuracy = 0.905449 Epoch=42: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 116.340978, training accuracy = 0.973211 Test accuracy = 0.905048 Epoch=43: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 110.309411, training accuracy = 0.975292 Test accuracy = 0.909455 Epoch=44: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 111.145267, training accuracy = 0.974912 Test accuracy = 0.914163 Epoch=45: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 101.818090, training accuracy = 0.977693 Test accuracy = 0.903546 Epoch=46: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 94.800353, training accuracy = 0.978513 Test accuracy = 0.910958 Epoch=47: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 98.355502, training accuracy = 0.977513 Test accuracy = 0.904347 Epoch=48: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 96.644517, training accuracy = 0.978333 Test accuracy = 0.907252 Epoch=49: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 93.228892, training accuracy = 0.979253 Test accuracy = 0.908353 Epoch=50: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 88.646000, training accuracy = 0.980034 Test accuracy = 0.909355 Epoch=51: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 85.061176, training accuracy = 0.981314 Test accuracy = 0.906450 Epoch=52: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 84.551198, training accuracy = 0.980994 Test accuracy = 0.913962 Epoch=53: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 80.467703, training accuracy = 0.982634 Test accuracy = 0.909555 Epoch=54: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.64it/s] Training loss = 75.081518, training accuracy = 0.983395 Test accuracy = 0.905449 Epoch=55: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.58it/s] Training loss = 74.390348, training accuracy = 0.983635 Test accuracy = 0.921975 Epoch=56: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 71.364901, training accuracy = 0.984235 Test accuracy = 0.908754 Epoch=57: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 63.377296, training accuracy = 0.986216 Test accuracy = 0.896835 Epoch=58: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 64.011915, training accuracy = 0.985715 Test accuracy = 0.906250 Epoch=59: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 60.711218, training accuracy = 0.986156 Test accuracy = 0.910757 Epoch=60: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 60.320302, training accuracy = 0.986516 Test accuracy = 0.906751 Epoch=61: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.58it/s] Training loss = 54.098782, training accuracy = 0.988016 Test accuracy = 0.890625 Epoch=62: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 64.635589, training accuracy = 0.986116 Test accuracy = 0.915966 Epoch=63: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 51.199125, training accuracy = 0.988516 Test accuracy = 0.905849 Epoch=64: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 55.281087, training accuracy = 0.988156 Test accuracy = 0.887620 Epoch=65: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 50.973392, training accuracy = 0.988596 Test accuracy = 0.919571 Epoch=66: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 52.769947, training accuracy = 0.988296 Test accuracy = 0.903746 Epoch=67: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 49.218939, training accuracy = 0.989437 Test accuracy = 0.885617 Epoch=68: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 54.984499, training accuracy = 0.987776 Test accuracy = 0.920573 Epoch=69: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 52.156214, training accuracy = 0.989277 Test accuracy = 0.911458 Epoch=70: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 44.693285, training accuracy = 0.990797 Test accuracy = 0.916266 Epoch=71: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 42.939453, training accuracy = 0.990557 Test accuracy = 0.916567 Epoch=72: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 44.035228, training accuracy = 0.990657 Test accuracy = 0.908554 Epoch=73: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.57it/s] Training loss = 46.019535, training accuracy = 0.989917 Test accuracy = 0.915064 Epoch=74: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.64it/s] Training loss = 38.398655, training accuracy = 0.991597 Test accuracy = 0.923478 Epoch=75: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.58it/s] Training loss = 44.667590, training accuracy = 0.990497 Test accuracy = 0.906450 Epoch=76: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 40.122385, training accuracy = 0.991357 Test accuracy = 0.913562 Epoch=77: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 40.387087, training accuracy = 0.991157 Test accuracy = 0.919571 Epoch=78: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 40.673833, training accuracy = 0.991177 Test accuracy = 0.916466 Epoch=79: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 37.217760, training accuracy = 0.991357 Test accuracy = 0.892929 Epoch=80: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 38.898771, training accuracy = 0.991797 Test accuracy = 0.921575 Epoch=81: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 36.469006, training accuracy = 0.992298 Test accuracy = 0.908153 Epoch=82: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 42.442582, training accuracy = 0.990977 Test accuracy = 0.919972 Epoch=83: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 38.075628, training accuracy = 0.991677 Test accuracy = 0.915966 Epoch=84: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 38.809648, training accuracy = 0.991417 Test accuracy = 0.920974 Epoch=85: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 35.128319, training accuracy = 0.992298 Test accuracy = 0.919772 Epoch=86: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 29.161311, training accuracy = 0.993638 Test accuracy = 0.918169 Epoch=87: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.58it/s] Training loss = 33.438135, training accuracy = 0.993138 Test accuracy = 0.917067 Epoch=88: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 30.297209, training accuracy = 0.993318 Test accuracy = 0.909856 Epoch=89: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.63it/s] Training loss = 29.246781, training accuracy = 0.993998 Test accuracy = 0.914463 Epoch=90: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 33.667621, training accuracy = 0.992718 Test accuracy = 0.913562 Epoch=91: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.59it/s] Training loss = 27.974487, training accuracy = 0.994018 Test accuracy = 0.917568 Epoch=92: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 28.299258, training accuracy = 0.993618 Test accuracy = 0.926583 Epoch=93: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 34.511342, training accuracy = 0.992177 Test accuracy = 0.919972 Epoch=94: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 30.856448, training accuracy = 0.993478 Test accuracy = 0.924179 Epoch=95: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 30.624649, training accuracy = 0.993198 Test accuracy = 0.927183 Epoch=96: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.60it/s] Training loss = 24.563530, training accuracy = 0.994378 Test accuracy = 0.922075 Epoch=97: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 24.466792, training accuracy = 0.994958 Test accuracy = 0.920573 Epoch=98: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.61it/s] Training loss = 28.265006, training accuracy = 0.994038 Test accuracy = 0.922276 Epoch=99: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1562/1562 [03:56<00:00, 6.62it/s] Training loss = 27.351641, training accuracy = 0.993638 Test accuracy = 0.925481 ubuntu@ip-172-31-27-25:~/incubator-singa/examples/autograd$ ``` The code used: ``` # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
# # the code is modified from # https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py try: import pickle except ImportError: import cPickle as pickle from singa import autograd from singa import tensor from singa import device from singa import opt import cv2 import numpy as np from tqdm import trange def load_dataset(filepath): print('Loading data file %s' % filepath) with open(filepath, 'rb') as fd: try: cifar10 = pickle.load(fd, encoding='latin1') except TypeError: cifar10 = pickle.load(fd) image = cifar10['data'].astype(dtype=np.uint8) image = image.reshape((-1, 3, 32, 32)) label = np.asarray(cifar10['labels'], dtype=np.uint8) label = label.reshape(label.size, 1) return image, label def load_train_data(dir_path='cifar-10-batches-py', num_batches=5): labels = [] batchsize = 10000 images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8) for did in range(1, num_batches + 1): fname_train_data = dir_path + "/data_batch_{}".format(did) image, label = load_dataset(fname_train_data) images[(did - 1) * batchsize:did * batchsize] = image labels.extend(label) images = np.array(images, dtype=np.float32) labels = np.array(labels, dtype=np.int32) return images, labels def load_test_data(dir_path='cifar-10-batches-py'): images, labels = load_dataset(dir_path + "/test_batch") return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32) def normalize_for_resnet(train_x, test_x): mean=[0.4914, 0.4822, 0.4465] std=[0.2023, 0.1994, 0.2010] train_x /= 255 test_x /= 255 for ch in range(0,2): train_x[:, ch, :, :] -= mean[ch] train_x[:, ch, :, :] /= std[ch] test_x[:, ch, :, :] -= mean[ch] test_x[:, ch, :, :] /= std[ch] return train_x, test_x def resize_dataset(x,IMG_SIZE): num_data = x.shape[0] dim = x.shape[1] X = np.zeros(shape=(num_data,dim,IMG_SIZE,IMG_SIZE), dtype=np.float32) for n in range(0,num_data): for d in range(0,dim): X[n, d, :, :] = cv2.resize(x[n , d, : ,:], (IMG_SIZE,IMG_SIZE)).astype(np.float32) return X def 
augmentation(x, batch_size): xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric') for data_num in range(0, batch_size): offset = np.random.randint(8, size=2) x[data_num,:,:,:] = xpad[data_num, :, offset[0]: offset[0] + 32, offset[1]: offset[1] + 32] if_flip = np.random.randint(2) if (if_flip): x[data_num, :, :, :] = x[data_num, :, :, ::-1] return x def accuracy(pred, target): y = np.argmax(pred, axis=1) t = np.argmax(target, axis=1) a = y == t return np.array(a, "int").sum() def to_categorical(y, num_classes): y = np.array(y, dtype="int") n = y.shape[0] categorical = np.zeros((n, num_classes)) for i in range(0,n): categorical[i, y[i]] = 1 categorical = categorical.astype(np.float32) return categorical if __name__ == '__main__': #load dataset #need to download with "/python3 incubator-singa/examples/cifar10/download_data.py py" train_x, train_y = load_train_data() test_x, test_y = load_test_data() train_x, test_x = normalize_for_resnet(train_x, test_x) from resnet import resnet50 model = resnet50(num_classes=10) print('Start intialization............') dev = device.create_cuda_gpu_on(0) max_epoch = 100 batch_size = 32 IMG_SIZE = 224 sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5) tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32) ty = tensor.Tensor((batch_size,), dev, tensor.int32) num_train_batch = train_x.shape[0] // batch_size num_test_batch = test_x.shape[0] // batch_size idx = np.arange(train_x.shape[0], dtype=np.int32) for epoch in range(max_epoch): np.random.shuffle(idx) #Training Phase autograd.training = True train_correct = 0 test_correct = 0 train_loss = 0 with trange(num_train_batch) as t: t.set_description('Epoch={}'.format(epoch)) for b in t: x = train_x[idx[b * batch_size: (b + 1) * batch_size]] x = augmentation(x, batch_size) x = resize_dataset(x,IMG_SIZE) y = train_y[idx[b * batch_size: (b + 1) * batch_size]] tx.copy_from_numpy(x) ty.copy_from_numpy(y) out = model(tx) loss = 
autograd.softmax_cross_entropy(out, ty) train_correct += accuracy(tensor.to_numpy(out), to_categorical(y, 10)) train_loss += tensor.to_numpy(loss)[0] for p, g in autograd.backward(loss): sgd.update(p, g) sgd.step() print('Training loss = %f, training accuracy = %f' % (train_loss, train_correct / (num_train_batch*(batch_size)))) #Evaulation Phase autograd.training = False for b in range(num_test_batch): x = test_x[b * batch_size: (b + 1) * batch_size] x = resize_dataset(x,IMG_SIZE) y = test_y[b * batch_size: (b + 1) * batch_size] tx.copy_from_numpy(x) ty.copy_from_numpy(y) out_test = model(tx) test_correct += accuracy(tensor.to_numpy(out_test), to_categorical(y, 10)) print('Test accuracy = %f' % (test_correct / (num_test_batch*(batch_size)))) ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services