This is an automated email from the ASF dual-hosted git repository.

sudheerv pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/master by this push:
     new c1ed4a4  fix DNS spike issue for TCP_RETRY mode (#7307)
c1ed4a4 is described below

commit c1ed4a4a50ba4b9d32933c58c2c0eb53f3636076
Author: Xin Li <33378623+whut...@users.noreply.github.com>
AuthorDate: Mon May 10 11:04:06 2021 -0700

    fix DNS spike issue for TCP_RETRY mode (#7307)
    
    fix compiling Error
    
    rename metrics
    
    update the docs for the new config and metric
    
    Co-authored-by: xinli1 <xin...@linkedin.com>
---
 doc/admin-guide/files/records.config.en.rst        |  7 +++
 .../monitoring/statistics/core/dns.en.rst          | 12 +++++
 iocore/dns/DNS.cc                                  | 63 +++++++++++++++++++++-
 iocore/dns/P_DNSProcessor.h                        | 11 +++-
 mgmt/RecordsConfig.cc                              |  2 +
 5 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/doc/admin-guide/files/records.config.en.rst 
b/doc/admin-guide/files/records.config.en.rst
index 7ab3992..ac443fe 100644
--- a/doc/admin-guide/files/records.config.en.rst
+++ b/doc/admin-guide/files/records.config.en.rst
@@ -2602,6 +2602,13 @@ DNS
    ``2`` TCP_ONLY:  |TS| always talks to nameservers over TCP.
    ===== ======================================================================
 
+.. ts:cv:: CONFIG proxy.config.dns.max_tcp_continuous_failures INT 10
+
+   If DNS connection mode is TCP_RETRY, set the threshold of the continuous TCP
+   query failures count for the TCP connection, reset the TCP connection 
immediately
+   if the continuous TCP query failures conut over the threshold. If the 
threshold
+   is 0 (or less than 0) we close this feature.
+
 .. ts:cv:: CONFIG proxy.config.dns.max_dns_in_flight INT 2048
 
    Maximum inflight DNS queries made by |TS| at any given instant
diff --git a/doc/admin-guide/monitoring/statistics/core/dns.en.rst 
b/doc/admin-guide/monitoring/statistics/core/dns.en.rst
index 0e0f93e..fed97d0 100644
--- a/doc/admin-guide/monitoring/statistics/core/dns.en.rst
+++ b/doc/admin-guide/monitoring/statistics/core/dns.en.rst
@@ -35,6 +35,18 @@ DNS
 
    The number of DNS lookups currently in progress.
 
+.. ts:stat:: global proxy.process.dns.tcp_retries integer
+   :type: gauge
+   :ungathered:
+
+   The number of DNS query over TCP in TCP_RETRY connection mode.
+
+.. ts:stat:: global proxy.process.dns.tcp_reset integer
+   :type: gauge
+   :ungathered:
+
+   The number of resetting TCP connection in TCP_RETRY connection mode.
+
 .. ts:stat:: global proxy.process.dns.lookup_avg_time integer
    :type: derivative
    :units: milliseconds
diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc
index 74e247e..e7e9ebd 100644
--- a/iocore/dns/DNS.cc
+++ b/iocore/dns/DNS.cc
@@ -44,6 +44,7 @@ int dns_failover_number              = 
DEFAULT_FAILOVER_NUMBER;
 int dns_failover_period              = DEFAULT_FAILOVER_PERIOD;
 int dns_failover_try_period          = DEFAULT_FAILOVER_TRY_PERIOD;
 int dns_max_dns_in_flight            = MAX_DNS_IN_FLIGHT;
+int dns_max_tcp_continuous_failures  = MAX_DNS_TCP_CONTINUOUS_FAILURES;
 int dns_validate_qname               = 0;
 unsigned int dns_handler_initialized = 0;
 int dns_ns_rr                        = 0;
@@ -217,6 +218,7 @@ DNSProcessor::start(int, size_t stacksize)
   REC_EstablishStaticConfigInt32(dns_max_dns_in_flight, 
"proxy.config.dns.max_dns_in_flight");
   REC_EstablishStaticConfigInt32(dns_validate_qname, 
"proxy.config.dns.validate_query_name");
   REC_EstablishStaticConfigInt32(dns_ns_rr, 
"proxy.config.dns.round_robin_nameservers");
+  REC_EstablishStaticConfigInt32(dns_max_tcp_continuous_failures, 
"proxy.config.dns.max_tcp_continuous_failures");
   REC_ReadConfigStringAlloc(dns_ns_list, "proxy.config.dns.nameservers");
   REC_ReadConfigStringAlloc(dns_local_ipv4, "proxy.config.dns.local_ipv4");
   REC_ReadConfigStringAlloc(dns_local_ipv6, "proxy.config.dns.local_ipv6");
@@ -459,6 +461,17 @@ DNSHandler::open_cons(sockaddr const *target, bool failed, 
int icon)
 }
 
 /**
+ Close the old TCP connection and open a new one
+ */
+bool
+DNSHandler::reset_tcp_conn(int ndx)
+{
+  DNS_INCREMENT_DYN_STAT(dns_tcp_reset_stat);
+  tcpcon[ndx].close();
+  return open_con(&m_res->nsaddr_list[ndx].sa, true, ndx, true);
+}
+
+/**
   Open (and close) connections as necessary and also assures that the
   epoll fd struct is properly updated.
 
@@ -472,11 +485,12 @@ DNSHandler::open_cons(sockaddr const *target, bool 
failed, int icon)
   target != nullptr and icon > 0 :
       open connection to target.
 */
-void
+bool
 DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool 
over_tcp)
 {
   ip_port_text_buffer ip_text;
   PollDescriptor *pd = get_PollDescriptor(dnsProcessor.thread);
+  bool ret           = false;
 
   ink_assert(target != &ip.sa);
 
@@ -508,7 +522,7 @@ DNSHandler::open_con(sockaddr const *target, bool failed, 
int icon, bool over_tc
         failover();
       }
     }
-    return;
+    return false;
   } else {
     ns_down[icon] = 0;
     if (cur_con.eio.start(pd, &cur_con, EVENTIO_READ) < 0) {
@@ -517,7 +531,10 @@ DNSHandler::open_con(sockaddr const *target, bool failed, 
int icon, bool over_tc
       cur_con.num = icon;
       Debug("dns", "opening connection %s SUCCEEDED for %d", ip_text, icon);
     }
+    ret = true;
   }
+
+  return ret;
 }
 
 void
@@ -933,12 +950,32 @@ DNSHandler::recv_dns(int /* event ATS_UNUSED */, Event * 
/* e ATS_UNUSED */)
   }
 }
 
+void
+DNSHandler::check_and_reset_tcp_conn()
+{
+  for (int i = 0; i < n_con; i++) {
+    if (dns_max_tcp_continuous_failures > 0 && tcp_continuous_failures[i] >= 
dns_max_tcp_continuous_failures) {
+      // The times of continuous failures are more than the threshold, have to 
reset the connection
+      if (reset_tcp_conn(i)) {
+        // Reset the counter after the new TCP connection
+        // don't reset the counter if tcp conn reset failed, and we need 
continue to try to reset tcp conn next time
+        Warning("Reset tcp connection: nameserver = %d, failures = %d, 
threshold = %d", i, tcp_continuous_failures[i],
+                dns_max_tcp_continuous_failures);
+        tcp_continuous_failures[i] = 0;
+      }
+    }
+  }
+}
+
 /** Main event for the DNSHandler. Attempt to read from and write to named. */
 int
 DNSHandler::mainEvent(int event, Event *e)
 {
   recv_dns(event, e);
   if (dns_ns_rr) {
+    if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) {
+      check_and_reset_tcp_conn();
+    }
     ink_hrtime t = Thread::get_hrtime();
     if (t - last_primary_retry > DNS_PRIMARY_RETRY_PERIOD) {
       for (int i = 0; i < n_con; i++) {
@@ -1139,6 +1176,14 @@ write_dns_event(DNSHandler *h, DNSEntry *e, bool 
over_tcp)
   int s = socketManager.send(con_fd, buffer, r, 0);
   if (s != r) {
     Debug("dns", "send() failed: qname = %s, %d != %d, nameserver= %d", 
e->qname, s, r, h->name_server);
+
+    if (over_tcp) {
+      // add the counter for tcp connection failed
+      Debug("dns", "tcp query failed: name_server = %d, 
tcp_continuous_failures = %d", h->name_server,
+            h->tcp_continuous_failures[h->name_server]);
+      ++h->tcp_continuous_failures[h->name_server];
+    }
+
     // changed if condition from 'r < 0' to 's < 0' - 8/2001 pas
     if (s < 0) {
       if (dns_ns_rr) {
@@ -1150,6 +1195,13 @@ write_dns_event(DNSHandler *h, DNSEntry *e, bool 
over_tcp)
     return false;
   }
 
+  if (over_tcp && h->tcp_continuous_failures[h->name_server] > 0) {
+    // reset the counter for any tcp connection succeed
+    Debug("dns", "reset tcp_continuous_failures: name_server = %d, 
tcp_continuous_failures = %d", h->name_server,
+          h->tcp_continuous_failures[h->name_server]);
+    h->tcp_continuous_failures[h->name_server] = 0;
+  }
+
   e->written_flag      = true;
   e->which_ns          = h->name_server;
   e->once_written_flag = true;
@@ -1487,6 +1539,7 @@ dns_process(DNSHandler *handler, HostEnt *buf, int len)
   if (dns_conn_mode == DNS_CONN_MODE::TCP_RETRY && h->tc == 1) {
     Debug("dns", "Retrying DNS query over TCP for [%s]", e->qname);
     tcp_retry = true;
+    DNS_INCREMENT_DYN_STAT(dns_tcp_retries_stat);
     goto Lerror;
   }
 
@@ -1848,6 +1901,12 @@ ink_dns_init(ts::ModuleVersion v)
 
   RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.in_flight", 
RECD_INT, RECP_NON_PERSISTENT, (int)dns_in_flight_stat,
                      RecRawStatSyncSum);
+
+  RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.tcp_retries", 
RECD_INT, RECP_PERSISTENT, (int)dns_tcp_retries_stat,
+                     RecRawStatSyncSum);
+
+  RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.tcp_reset", 
RECD_INT, RECP_PERSISTENT, (int)dns_tcp_reset_stat,
+                     RecRawStatSyncSum);
 }
 
 #if TS_HAS_TESTS
diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h
index 3715ca3..4c16f32 100644
--- a/iocore/dns/P_DNSProcessor.h
+++ b/iocore/dns/P_DNSProcessor.h
@@ -30,6 +30,7 @@
 #define MAX_DNS_RETRIES 9
 #define DEFAULT_DNS_TIMEOUT 30
 #define MAX_DNS_IN_FLIGHT 2048
+#define MAX_DNS_TCP_CONTINUOUS_FAILURES 10
 #define DEFAULT_FAILOVER_NUMBER (DEFAULT_DNS_RETRIES + 1)
 #define DEFAULT_FAILOVER_PERIOD (DEFAULT_DNS_TIMEOUT + 30)
 // how many seconds before FAILOVER_PERIOD to try the primary with
@@ -49,6 +50,7 @@ extern int dns_failover_number;
 extern int dns_failover_period;
 extern int dns_failover_try_period;
 extern int dns_max_dns_in_flight;
+extern int dns_max_tcp_continuous_failures;
 extern unsigned int dns_sequence_number;
 
 //
@@ -85,6 +87,8 @@ enum DNS_Stats {
   dns_retries_stat,
   dns_max_retries_exceeded_stat,
   dns_in_flight_stat,
+  dns_tcp_retries_stat,
+  dns_tcp_reset_stat,
   DNS_Stat_Count
 };
 
@@ -173,6 +177,7 @@ struct DNSHandler : public Continuation {
   int ns_down[MAX_NAMED];
   int failover_number[MAX_NAMED];
   int failover_soon_number[MAX_NAMED];
+  int tcp_continuous_failures[MAX_NAMED];
   ink_hrtime crossed_failover_number[MAX_NAMED];
   ink_hrtime last_primary_retry  = 0;
   ink_hrtime last_primary_reopen = 0;
@@ -225,7 +230,7 @@ struct DNSHandler : public Continuation {
   int mainEvent(int event, Event *e);
 
   void open_cons(sockaddr const *addr, bool failed = false, int icon = 0);
-  void open_con(sockaddr const *addr, bool failed = false, int icon = 0, bool 
over_tcp = false);
+  bool open_con(sockaddr const *addr, bool failed = false, int icon = 0, bool 
over_tcp = false);
   void failover();
   void rr_failure(int ndx);
   void recover();
@@ -257,6 +262,9 @@ struct DNSHandler : public Continuation {
 private:
   // Check the IP address and switch to default if needed.
   void validate_ip();
+  // Check tcp connection for TCP_RETRY mode
+  void check_and_reset_tcp_conn();
+  bool reset_tcp_conn(int ndx);
 };
 
 /* --------------------------------------------------------------
@@ -295,6 +303,7 @@ DNSHandler::DNSHandler()
     failover_number[i]         = 0;
     failover_soon_number[i]    = 0;
     crossed_failover_number[i] = 0;
+    tcp_continuous_failures[i] = 0;
     ns_down[i]                 = 1;
     tcpcon[i].handler          = this;
     udpcon[i].handler          = this;
diff --git a/mgmt/RecordsConfig.cc b/mgmt/RecordsConfig.cc
index a2befd4..cca9165 100644
--- a/mgmt/RecordsConfig.cc
+++ b/mgmt/RecordsConfig.cc
@@ -882,6 +882,8 @@ static const RecordElement RecordsConfig[] =
   ,
   {RECT_CONFIG, "proxy.config.dns.max_dns_in_flight", RECD_INT, "2048", 
RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL}
   ,
+  {RECT_CONFIG, "proxy.config.dns.max_tcp_continuous_failures", RECD_INT, 
"10", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL}
+  ,
   {RECT_CONFIG, "proxy.config.dns.validate_query_name", RECD_INT, "0", 
RECU_DYNAMIC, RR_NULL, RECC_NULL, "[0-1]", RECA_NULL}
   ,
   {RECT_CONFIG, "proxy.config.dns.splitDNS.enabled", RECD_INT, "0", 
RECU_DYNAMIC, RR_NULL, RECC_INT, "[0-1]", RECA_NULL}

Reply via email to