This is an automated email from the ASF dual-hosted git repository. sudheerv pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/master by this push: new c1ed4a4 fix DNS spike issue for TCP_RETRY mode (#7307) c1ed4a4 is described below commit c1ed4a4a50ba4b9d32933c58c2c0eb53f3636076 Author: Xin Li <33378623+whut...@users.noreply.github.com> AuthorDate: Mon May 10 11:04:06 2021 -0700 fix DNS spike issue for TCP_RETRY mode (#7307) fix compiling Error rename metrics update the docs for the new config and metric Co-authored-by: xinli1 <xin...@linkedin.com> --- doc/admin-guide/files/records.config.en.rst | 7 +++ .../monitoring/statistics/core/dns.en.rst | 12 +++++ iocore/dns/DNS.cc | 63 +++++++++++++++++++++- iocore/dns/P_DNSProcessor.h | 11 +++- mgmt/RecordsConfig.cc | 2 + 5 files changed, 92 insertions(+), 3 deletions(-) diff --git a/doc/admin-guide/files/records.config.en.rst b/doc/admin-guide/files/records.config.en.rst index 7ab3992..ac443fe 100644 --- a/doc/admin-guide/files/records.config.en.rst +++ b/doc/admin-guide/files/records.config.en.rst @@ -2602,6 +2602,13 @@ DNS ``2`` TCP_ONLY: |TS| always talks to nameservers over TCP. ===== ====================================================================== +.. ts:cv:: CONFIG proxy.config.dns.max_tcp_continuous_failures INT 10 + + If DNS connection mode is TCP_RETRY, set the threshold of the continuous TCP + query failures count for the TCP connection, reset the TCP connection immediately + if the continuous TCP query failures conut over the threshold. If the threshold + is 0 (or less than 0) we close this feature. + .. ts:cv:: CONFIG proxy.config.dns.max_dns_in_flight INT 2048 Maximum inflight DNS queries made by |TS| at any given instant diff --git a/doc/admin-guide/monitoring/statistics/core/dns.en.rst b/doc/admin-guide/monitoring/statistics/core/dns.en.rst index 0e0f93e..fed97d0 100644 --- a/doc/admin-guide/monitoring/statistics/core/dns.en.rst +++ b/doc/admin-guide/monitoring/statistics/core/dns.en.rst @@ -35,6 +35,18 @@ DNS The number of DNS lookups currently in progress. +.. ts:stat:: global proxy.process.dns.tcp_retries integer + :type: gauge + :ungathered: + + The number of DNS query over TCP in TCP_RETRY connection mode. + +.. ts:stat:: global proxy.process.dns.tcp_reset integer + :type: gauge + :ungathered: + + The number of resetting TCP connection in TCP_RETRY connection mode. + .. ts:stat:: global proxy.process.dns.lookup_avg_time integer :type: derivative :units: milliseconds diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc index 74e247e..e7e9ebd 100644 --- a/iocore/dns/DNS.cc +++ b/iocore/dns/DNS.cc @@ -44,6 +44,7 @@ int dns_failover_number = DEFAULT_FAILOVER_NUMBER; int dns_failover_period = DEFAULT_FAILOVER_PERIOD; int dns_failover_try_period = DEFAULT_FAILOVER_TRY_PERIOD; int dns_max_dns_in_flight = MAX_DNS_IN_FLIGHT; +int dns_max_tcp_continuous_failures = MAX_DNS_TCP_CONTINUOUS_FAILURES; int dns_validate_qname = 0; unsigned int dns_handler_initialized = 0; int dns_ns_rr = 0; @@ -217,6 +218,7 @@ DNSProcessor::start(int, size_t stacksize) REC_EstablishStaticConfigInt32(dns_max_dns_in_flight, "proxy.config.dns.max_dns_in_flight"); REC_EstablishStaticConfigInt32(dns_validate_qname, "proxy.config.dns.validate_query_name"); REC_EstablishStaticConfigInt32(dns_ns_rr, "proxy.config.dns.round_robin_nameservers"); + REC_EstablishStaticConfigInt32(dns_max_tcp_continuous_failures, "proxy.config.dns.max_tcp_continuous_failures"); REC_ReadConfigStringAlloc(dns_ns_list, "proxy.config.dns.nameservers"); REC_ReadConfigStringAlloc(dns_local_ipv4, "proxy.config.dns.local_ipv4"); REC_ReadConfigStringAlloc(dns_local_ipv6, "proxy.config.dns.local_ipv6"); @@ -459,6 +461,17 @@ DNSHandler::open_cons(sockaddr const *target, bool failed, int icon) } /** + Close the old TCP connection and open a new one + */ +bool +DNSHandler::reset_tcp_conn(int ndx) +{ + DNS_INCREMENT_DYN_STAT(dns_tcp_reset_stat); + tcpcon[ndx].close(); + return open_con(&m_res->nsaddr_list[ndx].sa, true, ndx, true); +} + +/** Open (and close) connections as necessary and also assures that the epoll fd struct is properly updated. @@ -472,11 +485,12 @@ DNSHandler::open_cons(sockaddr const *target, bool failed, int icon) target != nullptr and icon > 0 : open connection to target. */ -void +bool DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool over_tcp) { ip_port_text_buffer ip_text; PollDescriptor *pd = get_PollDescriptor(dnsProcessor.thread); + bool ret = false; ink_assert(target != &ip.sa); @@ -508,7 +522,7 @@ DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool over_tc failover(); } } - return; + return false; } else { ns_down[icon] = 0; if (cur_con.eio.start(pd, &cur_con, EVENTIO_READ) < 0) { @@ -517,7 +531,10 @@ DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool over_tc cur_con.num = icon; Debug("dns", "opening connection %s SUCCEEDED for %d", ip_text, icon); } + ret = true; } + + return ret; } void @@ -933,12 +950,32 @@ DNSHandler::recv_dns(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */) } } +void +DNSHandler::check_and_reset_tcp_conn() +{ + for (int i = 0; i < n_con; i++) { + if (dns_max_tcp_continuous_failures > 0 && tcp_continuous_failures[i] >= dns_max_tcp_continuous_failures) { + // The times of continuous failures are more than the threshold, have to reset the connection + if (reset_tcp_conn(i)) { + // Reset the counter after the new TCP connection + // don't reset the counter if tcp conn reset failed, and we need continue to try to reset tcp conn next time + Warning("Reset tcp connection: nameserver = %d, failures = %d, threshold = %d", i, tcp_continuous_failures[i], + dns_max_tcp_continuous_failures); + tcp_continuous_failures[i] = 0; + } + } + } +} + /** Main event for the DNSHandler. Attempt to read from and write to named. */ int DNSHandler::mainEvent(int event, Event *e) { recv_dns(event, e); if (dns_ns_rr) { + if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) { + check_and_reset_tcp_conn(); + } ink_hrtime t = Thread::get_hrtime(); if (t - last_primary_retry > DNS_PRIMARY_RETRY_PERIOD) { for (int i = 0; i < n_con; i++) { @@ -1139,6 +1176,14 @@ write_dns_event(DNSHandler *h, DNSEntry *e, bool over_tcp) int s = socketManager.send(con_fd, buffer, r, 0); if (s != r) { Debug("dns", "send() failed: qname = %s, %d != %d, nameserver= %d", e->qname, s, r, h->name_server); + + if (over_tcp) { + // add the counter for tcp connection failed + Debug("dns", "tcp query failed: name_server = %d, tcp_continuous_failures = %d", h->name_server, + h->tcp_continuous_failures[h->name_server]); + ++h->tcp_continuous_failures[h->name_server]; + } + // changed if condition from 'r < 0' to 's < 0' - 8/2001 pas if (s < 0) { if (dns_ns_rr) { @@ -1150,6 +1195,13 @@ write_dns_event(DNSHandler *h, DNSEntry *e, bool over_tcp) return false; } + if (over_tcp && h->tcp_continuous_failures[h->name_server] > 0) { + // reset the counter for any tcp connection succeed + Debug("dns", "reset tcp_continuous_failures: name_server = %d, tcp_continuous_failures = %d", h->name_server, + h->tcp_continuous_failures[h->name_server]); + h->tcp_continuous_failures[h->name_server] = 0; + } + e->written_flag = true; e->which_ns = h->name_server; e->once_written_flag = true; @@ -1487,6 +1539,7 @@ dns_process(DNSHandler *handler, HostEnt *buf, int len) if (dns_conn_mode == DNS_CONN_MODE::TCP_RETRY && h->tc == 1) { Debug("dns", "Retrying DNS query over TCP for [%s]", e->qname); tcp_retry = true; + DNS_INCREMENT_DYN_STAT(dns_tcp_retries_stat); goto Lerror; } @@ -1848,6 +1901,12 @@ ink_dns_init(ts::ModuleVersion v) RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.in_flight", RECD_INT, RECP_NON_PERSISTENT, (int)dns_in_flight_stat, RecRawStatSyncSum); + + RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.tcp_retries", RECD_INT, RECP_PERSISTENT, (int)dns_tcp_retries_stat, + RecRawStatSyncSum); + + RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.tcp_reset", RECD_INT, RECP_PERSISTENT, (int)dns_tcp_reset_stat, + RecRawStatSyncSum); } #if TS_HAS_TESTS diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h index 3715ca3..4c16f32 100644 --- a/iocore/dns/P_DNSProcessor.h +++ b/iocore/dns/P_DNSProcessor.h @@ -30,6 +30,7 @@ #define MAX_DNS_RETRIES 9 #define DEFAULT_DNS_TIMEOUT 30 #define MAX_DNS_IN_FLIGHT 2048 +#define MAX_DNS_TCP_CONTINUOUS_FAILURES 10 #define DEFAULT_FAILOVER_NUMBER (DEFAULT_DNS_RETRIES + 1) #define DEFAULT_FAILOVER_PERIOD (DEFAULT_DNS_TIMEOUT + 30) // how many seconds before FAILOVER_PERIOD to try the primary with @@ -49,6 +50,7 @@ extern int dns_failover_number; extern int dns_failover_period; extern int dns_failover_try_period; extern int dns_max_dns_in_flight; +extern int dns_max_tcp_continuous_failures; extern unsigned int dns_sequence_number; // @@ -85,6 +87,8 @@ enum DNS_Stats { dns_retries_stat, dns_max_retries_exceeded_stat, dns_in_flight_stat, + dns_tcp_retries_stat, + dns_tcp_reset_stat, DNS_Stat_Count }; @@ -173,6 +177,7 @@ struct DNSHandler : public Continuation { int ns_down[MAX_NAMED]; int failover_number[MAX_NAMED]; int failover_soon_number[MAX_NAMED]; + int tcp_continuous_failures[MAX_NAMED]; ink_hrtime crossed_failover_number[MAX_NAMED]; ink_hrtime last_primary_retry = 0; ink_hrtime last_primary_reopen = 0; @@ -225,7 +230,7 @@ struct DNSHandler : public Continuation { int mainEvent(int event, Event *e); void open_cons(sockaddr const *addr, bool failed = false, int icon = 0); - void open_con(sockaddr const *addr, bool failed = false, int icon = 0, bool over_tcp = false); + bool open_con(sockaddr const *addr, bool failed = false, int icon = 0, bool over_tcp = false); void failover(); void rr_failure(int ndx); void recover(); @@ -257,6 +262,9 @@ struct DNSHandler : public Continuation { private: // Check the IP address and switch to default if needed. void validate_ip(); + // Check tcp connection for TCP_RETRY mode + void check_and_reset_tcp_conn(); + bool reset_tcp_conn(int ndx); }; /* -------------------------------------------------------------- @@ -295,6 +303,7 @@ DNSHandler::DNSHandler() failover_number[i] = 0; failover_soon_number[i] = 0; crossed_failover_number[i] = 0; + tcp_continuous_failures[i] = 0; ns_down[i] = 1; tcpcon[i].handler = this; udpcon[i].handler = this; diff --git a/mgmt/RecordsConfig.cc b/mgmt/RecordsConfig.cc index a2befd4..cca9165 100644 --- a/mgmt/RecordsConfig.cc +++ b/mgmt/RecordsConfig.cc @@ -882,6 +882,8 @@ static const RecordElement RecordsConfig[] = , {RECT_CONFIG, "proxy.config.dns.max_dns_in_flight", RECD_INT, "2048", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL} , + {RECT_CONFIG, "proxy.config.dns.max_tcp_continuous_failures", RECD_INT, "10", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL} + , {RECT_CONFIG, "proxy.config.dns.validate_query_name", RECD_INT, "0", RECU_DYNAMIC, RR_NULL, RECC_NULL, "[0-1]", RECA_NULL} , {RECT_CONFIG, "proxy.config.dns.splitDNS.enabled", RECD_INT, "0", RECU_DYNAMIC, RR_NULL, RECC_INT, "[0-1]", RECA_NULL}