[ https://issues.apache.org/jira/browse/TS-2193?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14726617#comment-14726617 ]
Brian Geffon edited comment on TS-2193 at 9/2/15 2:38 AM: ---------------------------------------------------------- The issue here is that ET_DNS threads do not have the thread local that is expected by the per-thread session pools. One easy way to resolve that is to reschedule it to at ET_NET thread. The issue is to make that work I need to take advantage of the "cookie" parameter of the Event class, so it'll require a small modification to UnixEThread to allow the user to specify the cookie and call back the handler later with the cookie instead of the Event pointer. Below is a patch that fixes the crash, [~amc] [~zwoop] please review. {code} diff --git a/iocore/eventsystem/I_Event.h b/iocore/eventsystem/I_Event.h index a61c49c..14056b0 100644 --- a/iocore/eventsystem/I_Event.h +++ b/iocore/eventsystem/I_Event.h @@ -212,6 +212,7 @@ public: unsigned int immediate : 1; unsigned int globally_allocated : 1; unsigned int in_heap : 4; + bool use_cookie_in_callback; int callback_event; ink_hrtime timeout_at; diff --git a/iocore/eventsystem/P_UnixEvent.h b/iocore/eventsystem/P_UnixEvent.h index a224443..c9e137b 100644 --- a/iocore/eventsystem/P_UnixEvent.h +++ b/iocore/eventsystem/P_UnixEvent.h @@ -32,6 +32,7 @@ Event::init(Continuation *c, ink_hrtime atimeout_at, ink_hrtime aperiod) period = aperiod; immediate = !period && !atimeout_at; cancelled = false; + use_cookie_in_callback = false; return this; } @@ -45,7 +46,7 @@ Event::free() TS_INLINE Event::Event() : ethread(0), in_the_prot_queue(false), in_the_priority_queue(false), immediate(false), globally_allocated(true), in_heap(false), - timeout_at(0), period(0) + use_cookie_in_callback(false), timeout_at(0), period(0) { } diff --git a/iocore/eventsystem/UnixEThread.cc b/iocore/eventsystem/UnixEThread.cc index fb3af43..ecc3267 100644 --- a/iocore/eventsystem/UnixEThread.cc +++ b/iocore/eventsystem/UnixEThread.cc @@ -125,7 +125,7 @@ EThread::process_event(Event *e, int calling_code) return; } Continuation *c_temp = e->continuation; - e->continuation->handleEvent(calling_code, e); + e->continuation->handleEvent(calling_code, e->use_cookie_in_callback ? e->cookie : e); ink_assert(!e->in_the_priority_queue); ink_assert(c_temp == e->continuation); MUTEX_RELEASE(lock); diff --git a/iocore/hostdb/HostDB.cc b/iocore/hostdb/HostDB.cc index 984d137..12cf32e 100644 --- a/iocore/hostdb/HostDB.cc +++ b/iocore/hostdb/HostDB.cc @@ -65,6 +65,7 @@ char hostdb_hostfile_path[PATH_NAME_MAX] = ""; int hostdb_sync_frequency = 120; int hostdb_srv_enabled = 0; int hostdb_disable_reverse_lookup = 0; +extern int dns_thread; ClassAllocator<HostDBContinuation> hostDBContAllocator("hostDBContAllocator"); @@ -581,18 +582,36 @@ HostDBContinuation::refresh_MD5() mutex = hostDB.lock_for_bucket((int)(fold_md5(md5.hash) % hostDB.buckets)); } +static void callBackCont(Continuation *cont, int event, HostDBInfo *r) { + if (dns_thread) { + /* + * TS-3882: Let's make sure we hop back to a net thread because at some point + * if you have per-thread server sessions it will attempt to access those pools + * that wont exist in a thread local on a DNS thread. + */ + Event *e = eventAllocator.alloc(); + e->callback_event = event; + e->cookie = static_cast<void*>(r); + e->init(cont, 0, 0); + e->use_cookie_in_callback = true; + eventProcessor.schedule(e, ET_NET); + } else { + cont->handleEvent(event, r); + } +} + static bool reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false) { if (r == NULL || r->is_srv != is_srv || r->failed()) { - cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); + callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); return false; } if (r->reverse_dns) { if (!r->hostname()) { ink_assert(!"missing hostname"); - cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); + callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); Warning("bogus entry deleted from HostDB: missing hostname"); hostDB.delete_block(r); return false; @@ -603,7 +622,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false) if (!r->is_srv && r->round_robin) { if (!r->rr()) { ink_assert(!"missing round-robin"); - cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); + callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); Warning("bogus entry deleted from HostDB: missing round-robin"); hostDB.delete_block(r); return false; @@ -612,7 +631,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false) Debug("hostdb", "RR of %d with %d good, 1st IP = %s", r->rr()->rrcount, r->rr()->good, ats_ip_ntop(r->ip(), ipb, sizeof ipb)); } - cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r); + callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r); if (!r->full) { Warning("bogus entry deleted from HostDB: none"); @@ -752,7 +771,7 @@ HostDBContinuation::insert(unsigned int attl) attl = HOST_DB_MAX_TTL; r->ip_timeout_interval = attl; r->ip_timestamp = hostdb_current_interval; - Debug("hostdb", "inserting for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", md5.host_len, md5.host_name, + Debug("hostdb", "inserting (%p) for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", r, md5.host_len, md5.host_name, folded_md5, bucket, r->ip_timestamp, r->ip_timeout_interval, attl); return r; } {code} I'm really not thrilled with having to modify the Event class and UnixEThread. was (Author: briang): The issue here is that ET_DNS threads do not have the thread local that is expected by the per-thread session pools. One easy way to resolve that is to reschedule it to at ET_NET thread. The issue is to make that work I need to take advantage of the "cookie" parameter of the Event class, so it'll require a small modification to UnixEThread to allow the user to specify the cookie and call back the handler later with the cookie instead of the Event pointer. Below is a patch that fixes the crash, [~amc] [~zwoop] please review. {code} diff --git a/iocore/eventsystem/I_Event.h b/iocore/eventsystem/I_Event.h index a61c49c..14056b0 100644 --- a/iocore/eventsystem/I_Event.h +++ b/iocore/eventsystem/I_Event.h @@ -212,6 +212,7 @@ public: unsigned int immediate : 1; unsigned int globally_allocated : 1; unsigned int in_heap : 4; + bool use_cookie_in_callback; int callback_event; ink_hrtime timeout_at; diff --git a/iocore/eventsystem/P_UnixEvent.h b/iocore/eventsystem/P_UnixEvent.h index a224443..c9e137b 100644 --- a/iocore/eventsystem/P_UnixEvent.h +++ b/iocore/eventsystem/P_UnixEvent.h @@ -32,6 +32,7 @@ Event::init(Continuation *c, ink_hrtime atimeout_at, ink_hrtime aperiod) period = aperiod; immediate = !period && !atimeout_at; cancelled = false; + use_cookie_in_callback = false; return this; } @@ -45,7 +46,7 @@ Event::free() TS_INLINE Event::Event() : ethread(0), in_the_prot_queue(false), in_the_priority_queue(false), immediate(false), globally_allocated(true), in_heap(false), - timeout_at(0), period(0) + use_cookie_in_callback(false), timeout_at(0), period(0) { } diff --git a/iocore/eventsystem/UnixEThread.cc b/iocore/eventsystem/UnixEThread.cc index fb3af43..ecc3267 100644 --- a/iocore/eventsystem/UnixEThread.cc +++ b/iocore/eventsystem/UnixEThread.cc @@ -125,7 +125,7 @@ EThread::process_event(Event *e, int calling_code) return; } Continuation *c_temp = e->continuation; - e->continuation->handleEvent(calling_code, e); + e->continuation->handleEvent(calling_code, e->use_cookie_in_callback ? e->cookie : e); ink_assert(!e->in_the_priority_queue); ink_assert(c_temp == e->continuation); MUTEX_RELEASE(lock); diff --git a/iocore/hostdb/HostDB.cc b/iocore/hostdb/HostDB.cc index 984d137..12cf32e 100644 --- a/iocore/hostdb/HostDB.cc +++ b/iocore/hostdb/HostDB.cc @@ -65,6 +65,7 @@ char hostdb_hostfile_path[PATH_NAME_MAX] = ""; int hostdb_sync_frequency = 120; int hostdb_srv_enabled = 0; int hostdb_disable_reverse_lookup = 0; +extern int dns_thread; ClassAllocator<HostDBContinuation> hostDBContAllocator("hostDBContAllocator"); @@ -581,18 +582,36 @@ HostDBContinuation::refresh_MD5() mutex = hostDB.lock_for_bucket((int)(fold_md5(md5.hash) % hostDB.buckets)); } +static void callBackCont(Continuation *cont, int event, HostDBInfo *r) { + if (dns_thread) { + /* + * TS-3882: Let's make sure we hop back to a net thread because at some point + * if you have per-thread server sessions it will attempt to access those pools + * that wont exist in a thread local on a DNS thread. + */ + Event *e = eventAllocator.alloc(); + e->callback_event = event; + e->cookie = static_cast<void*>(r); + e->init(cont, 0, 0); + e->use_cookie_in_callback = true; + eventProcessor.schedule(e, ET_NET); + } else { + cont->handleEvent(event, r); + } +} + static bool reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false) { if (r == NULL || r->is_srv != is_srv || r->failed()) { - cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); + callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); return false; } if (r->reverse_dns) { if (!r->hostname()) { ink_assert(!"missing hostname"); - cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); + callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); Warning("bogus entry deleted from HostDB: missing hostname"); hostDB.delete_block(r); return false; @@ -603,7 +622,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false) if (!r->is_srv && r->round_robin) { if (!r->rr()) { ink_assert(!"missing round-robin"); - cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); + callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL); Warning("bogus entry deleted from HostDB: missing round-robin"); hostDB.delete_block(r); return false; @@ -612,7 +631,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false) Debug("hostdb", "RR of %d with %d good, 1st IP = %s", r->rr()->rrcount, r->rr()->good, ats_ip_ntop(r->ip(), ipb, sizeof ipb)); } - cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r); + callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r); if (!r->full) { Warning("bogus entry deleted from HostDB: none"); @@ -752,7 +771,7 @@ HostDBContinuation::insert(unsigned int attl) attl = HOST_DB_MAX_TTL; r->ip_timeout_interval = attl; r->ip_timestamp = hostdb_current_interval; - Debug("hostdb", "inserting for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", md5.host_len, md5.host_name, + Debug("hostdb", "inserting (%p) for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", r, md5.host_len, md5.host_name, folded_md5, bucket, r->ip_timestamp, r->ip_timeout_interval, attl); return r; } {code} > Trafficserver 4.1 Crash with proxy.config.dns.dedicated_thread = 1 > ------------------------------------------------------------------ > > Key: TS-2193 > URL: https://issues.apache.org/jira/browse/TS-2193 > Project: Traffic Server > Issue Type: Bug > Components: DNS > Affects Versions: 4.1.2 > Reporter: Tommy Lee > Assignee: Brian Geffon > Labels: Crash > Fix For: 6.1.0 > > Attachments: bt-01.txt > > > Hi all, > I've tried to enable DNS Thread without luck. > When i set proxy.config.dns.dedicated_thread to 1, it crashes with the > information below. > The ATS is working in Forward Proxy mode. > Thanks in advance. > -------------------------------------- > traffic.out > NOTE: Traffic Server received Sig 11: Segmentation fault > /usr/local/cache-4.1/bin/traffic_server - STACK TRACE: > /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0)[0x2af714875cb0] > /usr/local/cache-4.1/bin/traffic_server(_Z16_acquire_sessionP13SessionBucketPK8sockaddrR7INK_MD5P6HttpSM+0x52)[0x51dac2] > /usr/local/cache-4.1/bin/traffic_server(_ZN18HttpSessionManager15acquire_sessionEP12ContinuationPK8sockaddrPKcP17HttpClientSessionP6HttpSM+0x3d1)[0x51e0f1] > /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM19do_http_server_openEb+0x30c)[0x53644c] > /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x6a0)[0x537560] > /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x57e)[0x53743e] > /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x57e)[0x53743e] > /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM27state_hostdb_reverse_lookupEiPv+0xb9)[0x526b99] > /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM12main_handlerEiPv+0xd8)[0x531be8] > /usr/local/cache-4.1/bin/traffic_server[0x5d7c8a] > /usr/local/cache-4.1/bin/traffic_server(_ZN18HostDBContinuation8dnsEventEiP7HostEnt+0x821)[0x5decd1] > /usr/local/cache-4.1/bin/traffic_server(_ZN8DNSEntry9postEventEiP5Event+0x44)[0x5f7a94] > /usr/local/cache-4.1/bin/traffic_server[0x5fd382] > /usr/local/cache-4.1/bin/traffic_server(_ZN10DNSHandler8recv_dnsEiP5Event+0x852)[0x5fee72] > /usr/local/cache-4.1/bin/traffic_server(_ZN10DNSHandler9mainEventEiP5Event+0x14)[0x5ffd94] > /usr/local/cache-4.1/bin/traffic_server(_ZN7EThread13process_eventEP5Eventi+0x91)[0x6b2a41] > /usr/local/cache-4.1/bin/traffic_server(_ZN7EThread7executeEv+0x514)[0x6b3534] > /usr/local/cache-4.1/bin/traffic_server[0x6b17ea] > /lib/x86_64-linux-gnu/libpthread.so.0(+0x7e9a)[0x2af71486de9a] > /lib/x86_64-linux-gnu/libc.so.6(clone+0x6d)[0x2af71558dccd] -- This message was sent by Atlassian JIRA (v6.3.4#6332)