tkonolige commented on code in PR #11000:
URL: https://github.com/apache/tvm/pull/11000#discussion_r852420219


##########
python/tvm/contrib/debugger/debug_executor.py:
##########
@@ -281,6 +282,42 @@ def run_individual(self, number, repeat=1, 
min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []
 
+    def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):
+        """Benchmark a single node in the serialized graph.

Review Comment:
   Can you specify that this does not do any data transfer and uses arrays that 
are already on the device?



##########
python/tvm/contrib/debugger/debug_executor.py:
##########
@@ -281,6 +282,42 @@ def run_individual(self, number, repeat=1, 
min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []
 
+    def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):
+        """Benchmark a single node in the serialized graph.
+
+        Parameters
+        ----------
+        index : int
+            The index of the node, see `self.debug_datum.get_graph_nodes`
+
+        number: int
+            The number of times to run the node to get a benchmark result.
+
+        repeat: int
+            The number of times to benchmark the nodes.

Review Comment:
   Can you use the same language as the `time_evaluator` docs here? Or just 
point to them.



##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -362,6 +396,33 @@ PackedFunc GraphExecutorDebug::GetFunction(const 
std::string& name,
       ICHECK_GE(min_repeat_ms, 0);
       *rv = this->RunIndividual(number, repeat, min_repeat_ms);
     });
+  } else if (name == "run_individual_node") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      int node_index = args[0];
+      int number = args[1];
+      int repeat = args[2];
+      int min_repeat_ms = args[3];
+      ICHECK_GE(node_index, 0);
+      ICHECK_LT(node_index, nodes_.size());
+      ICHECK_GT(number, 0);
+      ICHECK_GT(repeat, 0);
+      ICHECK_GE(min_repeat_ms, 0);
+      std::vector<std::vector<double>> results =
+          this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);
+
+      std::stringstream s;
+      s.precision(6);  // down to microseconds

Review Comment:
   This should use the maximum precision available and `std::fixed` to avoid 
any issues with rounding (which we've encountered in tests before).



##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -114,15 +94,69 @@ class GraphExecutorDebug : public GraphExecutor {
 
     std::ostringstream os;
     for (size_t index = 0; index < time_sec_per_op.size(); index++) {
-      os << time_sec_per_op[index] << ",";
+      double time = time_sec_per_op[index];
+      // To have good behavior when calculating total time, etc.
+      if (isnan(time)) {
+        time = 0;
+      }

Review Comment:
   I don't really understand how `time_sec_per_op` could become nan, but this 
is probably a good change regardless.



##########
python/tvm/contrib/debugger/debug_executor.py:
##########
@@ -281,6 +282,42 @@ def run_individual(self, number, repeat=1, 
min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []
 
+    def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):

Review Comment:
   Number should default to the same as `time_evaluator` (10 I think).



##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -114,15 +94,69 @@ class GraphExecutorDebug : public GraphExecutor {
 
     std::ostringstream os;
     for (size_t index = 0; index < time_sec_per_op.size(); index++) {
-      os << time_sec_per_op[index] << ",";
+      double time = time_sec_per_op[index];
+      // To have good behavior when calculating total time, etc.
+      if (isnan(time)) {
+        time = 0;
+      }
+      os << time << ",";
     }
     return os.str();
   }
 
+  std::vector<std::vector<double>> RunIndividualNode(int node_index, int 
number, int repeat,
+                                                     int min_repeat_ms) {
+    // warmup run
+    // GraphExecutor::Run();
+    std::string tkey = module_->type_key();
+
+    // results_in_seconds[a][b] is the bth index run of the ath index repeat
+    std::vector<std::vector<double>> results_in_seconds;
+
+    if (tkey == "rpc") {
+      LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
+    }
+
+    for (int i = 0; i < repeat; ++i) {
+      std::vector<Timer> op_timers;
+      double duration_ms = 0.0;
+
+      // Keep timing operations, upping number of repeats until we reach 
min_repeat_ms
+      do {
+        op_timers.clear();
+        if (duration_ms > 0.0) {
+          number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / 
number) + 1),
+                                             number * 1.618));  // 1.618 is 
chosen by random
+        }
+
+        std::chrono::time_point<std::chrono::high_resolution_clock, 
std::chrono::nanoseconds>
+            tbegin, tend;
+        tbegin = std::chrono::high_resolution_clock::now();

Review Comment:
   I know you just moved the code, but this should really use the timers 
interface. Or better yet, just call out to time_evaluator.



##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -362,6 +396,33 @@ PackedFunc GraphExecutorDebug::GetFunction(const 
std::string& name,
       ICHECK_GE(min_repeat_ms, 0);
       *rv = this->RunIndividual(number, repeat, min_repeat_ms);
     });
+  } else if (name == "run_individual_node") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      int node_index = args[0];
+      int number = args[1];
+      int repeat = args[2];
+      int min_repeat_ms = args[3];
+      ICHECK_GE(node_index, 0);
+      ICHECK_LT(node_index, nodes_.size());
+      ICHECK_GT(number, 0);
+      ICHECK_GT(repeat, 0);
+      ICHECK_GE(min_repeat_ms, 0);
+      std::vector<std::vector<double>> results =
+          this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);
+
+      std::stringstream s;
+      s.precision(6);  // down to microseconds
+
+      for (std::vector<double>& row : results) {
+        for (double cur : row) {
+          s << cur << ", ";
+        }
+        s << "\n";
+      }
+
+      // Have problems returning Integers and FloatImm so this is hack
+      *rv = s.str();

Review Comment:
   We really could use support for sending arrays of floats and ints over RPC. 
Note that `time_evaluator` just casts an array of `double` to `char*` and sends 
that. Not sure it is a better approach though.



##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -362,6 +396,33 @@ PackedFunc GraphExecutorDebug::GetFunction(const 
std::string& name,
       ICHECK_GE(min_repeat_ms, 0);
       *rv = this->RunIndividual(number, repeat, min_repeat_ms);
     });
+  } else if (name == "run_individual_node") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {

Review Comment:
   FYI, you can use `TypedPackedFunc` with a lambda to avoid manually unpacking 
the args.



##########
python/tvm/contrib/debugger/debug_executor.py:
##########
@@ -281,6 +282,42 @@ def run_individual(self, number, repeat=1, 
min_repeat_ms=0):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []
 
+    def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):
+        """Benchmark a single node in the serialized graph.
+
+        Parameters
+        ----------
+        index : int
+            The index of the node, see `self.debug_datum.get_graph_nodes`
+
+        number: int
+            The number of times to run the node to get a benchmark result.
+
+        repeat: int
+            The number of times to benchmark the nodes.
+
+        min_repeat_ms: int
+            The minimum consecutive runtime of the node for a benchmark result.
+
+        Returns
+        -------
+        A list of dimensions `number` x `repeat` each one the runtime of the 
node

Review Comment:
   I don't really understand this message. Also, would it make sense to return 
an array of `BenchmarkResult` to match `time_evaluator`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@tvm.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to