jroesch commented on a change in pull request #8788:
URL: https://github.com/apache/tvm/pull/8788#discussion_r693113525



##########
File path: docs/langref/relay_op.rst
##########
@@ -215,7 +215,7 @@ This level support backpropagation of broadcast operators. 
It is temporary.
    tvm.relay.ndarray_size
    tvm.relay.layout_transform
    tvm.relay.device_copy
-   tvm.relay.annotation.on_device
+   tvm.relay.annotation.on_device_op

Review comment:
       What is the reason for renaming the op? 

##########
File path: include/tvm/runtime/logging.h
##########
@@ -409,6 +411,57 @@ inline bool DebugLoggingEnabled() {
   return state == 1;
 }
 
+/*!
+ * \brief Returns true if \p filename is mentioned in the environment
+ * variable \p TVM_LOG_DEBUG as enabled for 'verbose' logging at /p level.
+ * Levels go up to 9.
+ *
+ * To enable file \p foo.cc for level >= 2 and \p bar.cc for level >= 0 set:
+ *
+ * \code
+ * TVM_LOG_DEBUG="1;foo.cc=2;bar.cc=0;"
+ * \endcode
+ */
+bool VerboseLoggingEnabled(const char* filename, int level);

Review comment:
       Is there a reason that we need Verbose logging vs. using DLOG?

##########
File path: src/relay/op/annotation/annotation.cc
##########
@@ -36,18 +36,18 @@
 namespace tvm {
 namespace relay {
 
-// relay.annotation.on_device
+// relay.annotation.on_device_op
 TVM_REGISTER_NODE_TYPE(OnDeviceAttrs);
 
-TVM_REGISTER_GLOBAL("relay.op.annotation._make.on_device")

Review comment:
       Same as above Q.

##########
File path: src/relay/backend/interpreter.cc
##########
@@ -913,17 +917,14 @@ std::pair<IRModule, Map<String, IRModule>> 
Prepare(IRModule mod, Device device,
   // We only have one device-specific target.
   tec::TargetMap targets = {{device.device_type, target}};
 
-  // All calls to primitives will use the unique target.

Review comment:
       This is great. 

##########
File path: src/relay/backend/graph_executor_codegen.cc
##########
@@ -197,32 +200,30 @@ class GraphExecutorCodegen : public 
backend::MemoizedExprTranslator<std::vector<
   LoweredOutput Codegen(relay::Function func, String mod_name) {
     mod_name_ = mod_name;
 
-    // TODO(@jroesch): we need to split device planning and memory planning
-    // first we run device assignment, then we perform lowering, and then
-    // storage planning in ideal world.
-
-    memory_plan_ = GraphPlanMemory(func);
-
     // This first phase moves from implicit use of compile engine,
     // to instead explicitly lowering the incoming IRModule, and then
     // performing the preexisting graph executor code generation phase.
     IRModule mod = IRModule::FromExpr(func);
 
-    // Build a map from each operation to device.
-    tec::DeviceMap device_context_map;
-    for (const auto& it : memory_plan_->expr_to_storage_info) {
-      auto expr = it.first;
-      auto storage_info = it.second;
-      auto device_types = storage_info->device_types;
-      // CHECK_EQ(device_types.size(), 1);
-      tvm::Device dev;
-      dev.device_id = 0;
-      dev.device_type = device_types[0];
-      device_context_map.insert({expr, dev});
+    // Make explicit which device each expression should be executed on.

Review comment:
       Same Q as above, also can we put this code in some kind of helper?

##########
File path: src/relay/backend/te_compiler.cc
##########
@@ -518,54 +515,55 @@ class LowerTensorExprMutator : public ExprMutator {
     }
   }
 
-  Expr VisitExpr_(const CallNode* call) override {
-    Call expr = GetRef<Call>(call);
+  Expr VisitExpr_(const CallNode* call_node) override {
+    Device device = GetOnDeviceDevice(call_node);
+    if (device.device_type) {
+      PushDevice(device);
+      Expr arg = VisitExpr(call_node->args[0]);
+      PopDevice();
+      return Call(call_node->op, {arg}, call_node->attrs, 
call_node->type_args, call_node->span);
+    }
+
+    Call expr = GetRef<Call>(call_node);
 
     // Look for (indirect) calls to primitives.
-    Function prim_func = ResolveToPrimitive(call->op);
+    Function prim_func = ResolveToPrimitive(call_node->op);
     if (!prim_func.defined()) {
-      // Not a call to a primitive function.
-      if (const FunctionNode* fn = call->op.as<FunctionNode>()) {
+      // Not a call_node to a primitive function.
+      if (const FunctionNode* fn = call_node->op.as<FunctionNode>()) {
         this->process_fn_(GetRef<Function>(fn));
       }
-      return ExprMutator::VisitExpr_(call);
+      return ExprMutator::VisitExpr_(call_node);
     }
 
     // Find the desired target device.
     Target target;
     if (prim_func->GetAttr<String>(attr::kCompiler).defined()) {
       // The generic 'external device' target.
       target = Target("ext_dev");
-    } else if (device_context_map_.empty() && targets_.size() == 1) {
-      // The unique target.
-      target = GetTargetFromInteger(kDLCPU, targets_);
     } else {
-      // The target corresponding to the call expression's annotation.
-      auto itr = device_context_map_.find(expr);
-      ICHECK(itr != device_context_map_.end())
-          << "Could not find an entry in the device context map for " << expr
-          << "The memory planning was either not performed for this precise 
node, or there is "
-             "bug in the memory planner.";
-      target = GetTargetFromInteger(itr->second.device_type, targets_);
+      // The target corresponding to the call_node expression's annotation.
+      device = GetDevice(expr);
+      // TODO(mbs): device id
+      target = GetTargetFromInteger(device.device_type, targets_);

Review comment:
       This will become a lookup after we refactor the map?

##########
File path: src/relay/backend/graph_plan_memory.cc
##########
@@ -452,7 +469,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
   // all the storage resources available
   std::vector<StorageToken*> data_;
   /*! \brief internal prototype token map */
-  std::unordered_map<const ExprNode*, std::vector<StorageToken*> > prototype_;
+  std::unordered_map<const ExprNode*, std::vector<StorageToken*>> prototype_;
 };
 
 StaticMemoryPlan GraphPlanMemory(const Function& func) { return 
StorageAllocator().Plan(func); }

Review comment:
       A related question: should we do the array-of-structs vs. struct-of-arrays 
swap here? The memory plan data structure is actually kind of funky right now 
due to the fact that it maps ops to a list of storages per device. 

##########
File path: src/relay/transforms/device_annotation.cc
##########
@@ -320,191 +326,11 @@ class AnnotatationVisitor : private ExprVisitor {
   Map<Expr, Integer> annotations_;
 };
 
-/*
- * \brief Return device allocation map based on the post order traversed graph.

Review comment:
       Should we replace this with a similar updated comment?

##########
File path: src/relay/backend/vm/compiler.cc
##########
@@ -1041,13 +1013,31 @@ IRModule VMCompiler::OptimizeModule(IRModule mod, const 
TargetsMap& targets_arg,
 
   Array<Pass> pass_seqs = relay::backend::GetPassPrefix(targets, true);
 
+  Device default_device;

Review comment:
       Is it possible for us to compute this outside of OptimizeModule? 
Ideally, OptimizeModule should just build the optimization pipeline and have 
minimal logic. 

##########
File path: src/relay/backend/aot_executor_codegen.cc
##########
@@ -562,32 +558,35 @@ class AOTExecutorCodegen : public ExprVisitor {
         
use_unpacked_api_(target_host->GetAttr<Bool>("unpacked-api").value_or(Bool(false)))
 {}
 
   LoweredOutput Codegen(relay::Function func, String mod_name) {
-    auto aot_allocator = AOTOnDemandAllocator();
-    aot_allocator.Run(func);
-
-    // Pre-lowering storage map and memory plan
-    StorageMap initial_storage_map = aot_allocator.GetStorageMap();
-    StaticMemoryPlan memory_plan(initial_storage_map);
-
-    // Build a map from each operation to device.
-    tec::DeviceMap device_context_map;
-    for (const auto& it : memory_plan->expr_to_storage_info) {
-      auto expr = it.first;
-      auto storage_info = it.second;
-      auto device_types = storage_info->device_types;
-      // CHECK_EQ(device_types.size(), 1);
-      tvm::Device dev;
-      dev.device_id = 0;
-      dev.device_type = device_types[0];
-      device_context_map.insert({expr, dev});
-    }
-
     // This first phase moves from implicit use of compile engine,
     // to instead explicitly lowering the incoming IRModule, and then
     // performing the preexisting AOT executor code generation phase.
     IRModule mod = IRModule::FromExpr(func);
+
+    // Make explicit which device each expression should be executed on.

Review comment:
       This code will get folded away in future refactors as we flow the 
target/device information around?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@tvm.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to