cjolivier01 commented on a change in pull request #9933: [MXNET-23] Adding support to profile kvstore server during distributed training URL: https://github.com/apache/incubator-mxnet/pull/9933#discussion_r176945375
########## File path: src/kvstore/kvstore_dist_server.h ########## @@ -153,23 +157,82 @@ class KVStoreDistServer { void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) { CommandType recved_type = static_cast<CommandType>(recved.head); - if (recved_type == CommandType::kStopServer) { - exec_.Stop(); - } else if (recved_type == CommandType::kSyncMode) { - sync_mode_ = true; - } else if (recved_type == CommandType::kSetGradientCompression) { - gradient_compression_->DecodeParams(recved.body); - } else { - // this uses value 0 for message id from frontend - // let the main thread to execute ctrl, which is necessary for python - exec_.Exec([this, recved]() { - CHECK(controller_); - controller_(recved.head, recved.body); - }); + switch (recved_type) { + case CommandType::kStopServer: + exec_.Stop(); + break; + case CommandType::kSyncMode: + sync_mode_ = true; + break; + case CommandType::kSetGradientCompression: + gradient_compression_->DecodeParams(recved.body); + break; + case CommandType::kSetProfilerParams: + // last char is the type of profiler command + ProcessServerProfilerCommands(static_cast<KVStoreServerProfilerCommand> + (recved.body.back() - '0'), + recved.body); + break; + case CommandType::kController: + // this uses value 0 for message id from frontend + // let the main thread to execute ctrl, which is necessary for python + exec_.Exec([this, recved]() { + CHECK(controller_); + controller_(recved.head, recved.body); + }); + break; } app->Response(recved); } + void ProcessServerProfilerCommands(KVStoreServerProfilerCommand type, const std::string& body) { + switch (type) { + case KVStoreServerProfilerCommand::kSetConfig: + SetProfilerConfig(body.substr(0, body.size() - 1)); + break; + case KVStoreServerProfilerCommand::kState: + MXSetProfilerState(static_cast<int>(body.front() - '0')); + break; + case KVStoreServerProfilerCommand::kPause: + MXProfilePause(static_cast<int>(body.front() - '0')); + break; + case 
KVStoreServerProfilerCommand::kDump: + MXDumpProfile(static_cast<int>(body.front() - '0')); + break; + } + } + + void SetProfilerConfig(std::string params_str) { + std::vector<std::string> elems; + mxnet::kvstore::split(params_str, ',', std::back_inserter(elems)); + std::vector<const char*> ckeys; + std::vector<const char*> cvals; + ckeys.reserve(elems.size()); + cvals.reserve(elems.size()); + + for (int i=0; i < elems.size(); i++) { + std::vector<std::string> parts; + mxnet::kvstore::split(elems[i], ':', std::back_inserter(parts)); + CHECK(!parts[0].empty()) << "ProfilerConfig parameter is empty"; + CHECK(!parts[1].empty()) << "ProfilerConfig value is empty for parameter "<< parts[0]; + if (parts[0] == "filename") { + parts[1] = "rank" + std::to_string(ps::MyRank()) + "_" + parts[1]; + } + char* ckey = new char[parts[0].length() + 1]; Review comment: OK, can you then add another vector of unique_ptr to guarantee they’re freed? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services