This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 4d2af992224b13006a94fde2c67acd004e138835
Author: Alexey Serbin <ale...@apache.org>
AuthorDate: Wed Jul 19 17:26:03 2023 -0700

    [tablet] add more details for operation's APPLY trace
    
    While troubleshooting a performance issue, I saw a trace like below
    where it wasn't possible to tell what exact part of the APPLY phase
    too very long.  This patch adds more tracing around BulkCheckPresence()
    and ApplyRowOperation(), so now it will be possible to tell between
    the two.  I also took the liberty to update other tracing messages,
    making their style more uniform.
    
      W0717 00:13:03.171553 34715 rpcz_store.cc:260] Trace:
      0716 23:58:44.240887 (+     0us) service_pool.cc:169] Inserting onto call 
queue
      0716 23:58:44.240902 (+    15us) service_pool.cc:228] Handling call
      0717 00:13:03.171190 (+858930288us) inbound_call.cc:171] Queueing success 
response
      Related trace 'op':
      0716 23:58:44.241244 (+     0us) write_op.cc:162] PREPARE: Starting.
      0716 23:58:44.241285 (+    41us) write_op.cc:383] Acquiring schema lock 
in shared mode
      0716 23:58:44.241286 (+     1us) write_op.cc:386] Acquired schema lock
      0716 23:58:44.241287 (+     1us) tablet.cc:541] Decoding operations
      0716 23:58:44.241315 (+    28us) write_op.cc:558] Acquiring the partition 
lock for write op
      0716 23:58:44.241316 (+     1us) write_op.cc:579] Partition lock acquired 
for write op
      0716 23:58:44.241316 (+     0us) tablet.cc:563] Acquiring locks for 1 
operations
      0716 23:58:44.241325 (+     9us) tablet.cc:579] Row locks acquired
      0716 23:58:44.241325 (+     0us) write_op.cc:227] PREPARE: Finished.
      0716 23:58:44.241349 (+    24us) write_op.cc:237] Start()
      0716 23:58:44.241351 (+     2us) write_op.cc:242] Timestamp: P: 
1689551924241331 usec, L: 0
      0716 23:58:44.241351 (+     0us) op_driver.cc:317] REPLICATION: Starting.
      0716 23:58:44.241403 (+    52us) log.cc:834] Serialized 3904 byte log 
entry
      0716 23:58:44.242207 (+   804us) op_driver.cc:430] REPLICATION: Finished.
      0716 23:58:44.242222 (+    15us) write_op.cc:266] APPLY: Starting.
      0717 00:13:03.171012 (+858928790us) tablet_metrics.cc:481] ProbeStats: 
bloom_lookups=5,key_file_lookups=0,delta_file_lookups=0,mrs_lookups=0
      0717 00:13:03.171024 (+    12us) write_op.cc:277] APPLY: Finished.
      0717 00:13:03.171062 (+    38us) log.cc:834] Serialized 27 byte log entry
      0717 00:13:03.171083 (+    21us) write_op.cc:440] Releasing partition, 
row and schema locks
      0717 00:13:03.171103 (+    20us) write_op.cc:405] Released schema lock
      0717 00:13:03.171104 (+     1us) write_op.cc:301] FINISH: Updating 
metrics.
    
    Change-Id: I9628a098f3bd01cb5854998829d159036c708103
    Reviewed-on: http://gerrit.cloudera.org:8080/20232
    Tested-by: Kudu Jenkins
    Reviewed-by: Abhishek Chennaka <achenn...@cloudera.com>
---
 src/kudu/tablet/ops/op_driver.cc      | 11 ++++++-----
 src/kudu/tablet/ops/participant_op.cc |  8 ++++----
 src/kudu/tablet/ops/write_op.cc       |  8 ++++----
 src/kudu/tablet/tablet.cc             | 15 ++++++++++-----
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/kudu/tablet/ops/op_driver.cc b/src/kudu/tablet/ops/op_driver.cc
index d717c00c1..73a7f7f21 100644
--- a/src/kudu/tablet/ops/op_driver.cc
+++ b/src/kudu/tablet/ops/op_driver.cc
@@ -21,6 +21,7 @@
 #include <memory>
 #include <mutex>
 #include <ostream>
+#include <type_traits>
 #include <utility>
 
 #include <glog/logging.h>
@@ -39,10 +40,10 @@
 #include "kudu/rpc/result_tracker.h"
 #include "kudu/rpc/rpc_header.pb.h"
 #include "kudu/tablet/mvcc.h"
-#include "kudu/tablet/tablet.h"
-#include "kudu/tablet/tablet_replica.h"
 #include "kudu/tablet/op_order_verifier.h"
 #include "kudu/tablet/ops/op_tracker.h"
+#include "kudu/tablet/tablet.h"
+#include "kudu/tablet/tablet_replica.h"
 #include "kudu/util/debug/trace_event.h"
 #include "kudu/util/logging.h"
 #include "kudu/util/pb_util.h"
@@ -314,7 +315,7 @@ Status OpDriver::Prepare() {
                         mutable_state()->consensus_round()->replicate_msg()));
       RETURN_NOT_OK(op_->Start());
       VLOG_WITH_PREFIX(4) << "Triggering consensus replication.";
-      TRACE("REPLICATION: Starting.");
+      TRACE("REPLICATION: starting");
       // Trigger consensus replication.
       {
         std::lock_guard<simple_spinlock> lock(lock_);
@@ -427,7 +428,7 @@ void OpDriver::ReplicationFinished(const Status& status) {
   }
 
   TRACE_COUNTER_INCREMENT("replication_time_us", 
replication_duration.ToMicroseconds());
-  TRACE("REPLICATION: Finished.");
+  TRACE("REPLICATION: finished");
 
   // If we have prepared and replicated, we're ready
   // to move ahead and apply this operation.
@@ -533,7 +534,7 @@ void OpDriver::ApplyTask() {
     // until now.earliest > prepare_latest. Only after this are the locks
     // released.
     if (mutable_state()->external_consistency_mode() == COMMIT_WAIT) {
-      TRACE("APPLY: Commit Wait.");
+      TRACE("APPLY: CommitWait");
       // If we can't commit wait and have already applied we might have 
consistency
       // issues if we still reply to the client that the operation was a 
success.
       // On the other hand we don't have rollbacks as of yet thus we can't 
undo the
diff --git a/src/kudu/tablet/ops/participant_op.cc 
b/src/kudu/tablet/ops/participant_op.cc
index 0bd305dc0..598b88043 100644
--- a/src/kudu/tablet/ops/participant_op.cc
+++ b/src/kudu/tablet/ops/participant_op.cc
@@ -160,7 +160,7 @@ void 
ParticipantOp::NewReplicateMsg(unique_ptr<ReplicateMsg>* replicate_msg) {
 
 Status ParticipantOp::Prepare() {
   TRACE_EVENT0("op", "ParticipantOp::Prepare");
-  TRACE("PREPARE: Starting.");
+  TRACE("PREPARE: starting");
   state_->AcquireTxnAndLock();
   RETURN_NOT_OK(state_->ValidateOp());
 
@@ -196,7 +196,7 @@ Status ParticipantOp::Prepare() {
       // Nothing to do in all other cases.
       break;
   }
-  TRACE("PREPARE: Finished.");
+  TRACE("PREPARE: finished");
   return Status::OK();
 }
 
@@ -262,12 +262,12 @@ Status ParticipantOpState::PerformOp(const 
consensus::OpId& op_id, Tablet* table
 
 Status ParticipantOp::Apply(CommitMsg** commit_msg) {
   TRACE_EVENT0("op", "ParticipantOp::Apply");
-  TRACE("APPLY: Starting.");
+  TRACE("APPLY: starting");
   state_->tablet_replica()->tablet()->StartApplying(state_.get());
   CHECK_OK(state_->PerformOp(state()->op_id(), 
state_->tablet_replica()->tablet()));
   *commit_msg = 
google::protobuf::Arena::CreateMessage<CommitMsg>(state_->pb_arena());
   (*commit_msg)->set_op_type(OperationType::PARTICIPANT_OP);
-  TRACE("APPLY: Finished.");
+  TRACE("APPLY: finished");
   return Status::OK();
 }
 
diff --git a/src/kudu/tablet/ops/write_op.cc b/src/kudu/tablet/ops/write_op.cc
index f2a62d3cd..343da89f7 100644
--- a/src/kudu/tablet/ops/write_op.cc
+++ b/src/kudu/tablet/ops/write_op.cc
@@ -166,7 +166,7 @@ Status WriteOp::Prepare() {
   TRACE_EVENT0("op", "WriteOp::Prepare");
 
   Tablet* tablet = state()->tablet_replica()->tablet();
-  TRACE(Substitute("PREPARE: Starting on tablet $0", tablet->tablet_id()));
+  TRACE(Substitute("PREPARE: starting on tablet $0", tablet->tablet_id()));
   // Decode everything first so that we give up if something major is wrong.
   Schema client_schema;
   RETURN_NOT_OK_PREPEND(SchemaFromPB(state_->request()->schema(), 
&client_schema),
@@ -243,7 +243,7 @@ Status WriteOp::Prepare() {
   }
   RETURN_NOT_OK(tablet->AcquireRowLocks(state()));
 
-  TRACE("PREPARE: Finished");
+  TRACE("PREPARE: finished");
   return Status::OK();
 }
 
@@ -283,7 +283,7 @@ void WriteOp::UpdatePerRowMetricsAndErrors() {
 // it seems pointless to return a Status!
 Status WriteOp::Apply(CommitMsg** commit_msg) {
   TRACE_EVENT0("op", "WriteOp::Apply");
-  TRACE("APPLY: Starting");
+  TRACE("APPLY: starting");
 
   if (PREDICT_FALSE(ANNOTATE_UNPROTECTED_READ(
       FLAGS_tablet_inject_latency_on_apply_write_op_ms) > 0)) {
@@ -294,7 +294,7 @@ Status WriteOp::Apply(CommitMsg** commit_msg) {
 
   Tablet* tablet = state()->tablet_replica()->tablet();
   RETURN_NOT_OK(tablet->ApplyRowOperations(state()));
-  TRACE("APPLY: Finished");
+  TRACE("APPLY: finished");
 
   UpdatePerRowMetricsAndErrors();
 
diff --git a/src/kudu/tablet/tablet.cc b/src/kudu/tablet/tablet.cc
index dc42c2820..7c17301fe 100644
--- a/src/kudu/tablet/tablet.cc
+++ b/src/kudu/tablet/tablet.cc
@@ -1363,28 +1363,33 @@ void Tablet::AbortTransaction(Txn* txn,  const OpId& 
op_id) {
 }
 
 Status Tablet::ApplyRowOperations(WriteOpState* op_state) {
-  int num_ops = op_state->row_ops().size();
-
+  const size_t num_ops = op_state->row_ops().size();
   StartApplying(op_state);
 
+  TRACE("starting BulkCheckPresence");
   IOContext io_context({ tablet_id() });
   RETURN_NOT_OK(BulkCheckPresence(&io_context, op_state));
+  TRACE("finished BulkCheckPresence");
 
+  TRACE("starting ApplyRowOperation cycle");
   // Actually apply the ops.
-  for (int op_idx = 0; op_idx < num_ops; op_idx++) {
+  for (size_t op_idx = 0; op_idx < num_ops; ++op_idx) {
     RowOp* row_op = op_state->row_ops()[op_idx];
-    if (row_op->has_result()) continue;
+    if (row_op->has_result()) {
+      continue;
+    }
     RETURN_NOT_OK(ApplyRowOperation(&io_context, op_state, row_op,
                                     op_state->mutable_op_stats(op_idx)));
     DCHECK(row_op->has_result());
   }
+  TRACE("finished ApplyRowOperation cycle");
 
   {
     std::lock_guard<rw_spinlock> l(last_rw_time_lock_);
     last_write_time_ = MonoTime::Now();
   }
 
-  if (metrics_ && num_ops > 0) {
+  if (metrics_ && PREDICT_TRUE(num_ops > 0)) {
     metrics_->AddProbeStats(op_state->mutable_op_stats(0), num_ops, 
op_state->arena());
   }
   return Status::OK();

Reply via email to