https://fedorahosted.org/sssd/ticket/3131

I couldn't reproduce manually so I used the second patch as a by-code reproducer. If you apply the patch then sssd will try to resolve meta server twice simultaneously and triggering the problematic code path. You can search for RESOLV SRV DONE in logs.
From 11b6d2b7623208c7a3e148f4128e9f49c35a6ce1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20B=C5=99ezina?= <pbrez...@redhat.com>
Date: Wed, 24 Aug 2016 14:21:12 +0200
Subject: [PATCH 1/2] failover: proceed normally when no new server is found

Multiple failover requests come in same time, the first one will
result in collapsing the meta server but multiple resolution of
SRV records are triggered. The first one finishes normally but the
others won't find any new server thus ends with an error.

This patch makes failover to proceed normally even in such case.

Resolves:
https://fedorahosted.org/sssd/ticket/3131
---
 src/providers/fail_over.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/providers/fail_over.c b/src/providers/fail_over.c
index 8ab39f27f77e19e601855632196006a8dbbdf136..77084098831a312bc8629513ccfc2a91165241ba 100644
--- a/src/providers/fail_over.c
+++ b/src/providers/fail_over.c
@@ -1112,7 +1112,9 @@ fo_resolve_service_cont(struct tevent_req *subreq)
     ret = resolve_srv_recv(subreq, &state->server);
     talloc_zfree(subreq);
 
-    if (ret) {
+    /* We will proceed normally on ERR_SRV_DUPLICATES and if the server
+     * is already being resolved, we hook to that request. */
+    if (ret != EOK && ret != ERR_SRV_DUPLICATES) {
         tevent_req_error(req, ret);
         return;
     }
@@ -1398,11 +1400,23 @@ resolve_srv_done(struct tevent_req *subreq)
         }
 
         if (last_server == state->meta) {
-            /* SRV lookup returned only those servers
-             * that are already present. */
+            /* SRV lookup returned only those servers that are already present.
+             * This may happen only when an ongoing SRV resolution already
+             * exist. We will return server, but won't set any state. */
             DEBUG(SSSDBG_TRACE_FUNC, "SRV lookup did not return "
                                       "any new server.\n");
             ret = ERR_SRV_DUPLICATES;
+
+            /* Since no new server is returned, state->meta->next is NULL.
+             * We return last tried server if possible which is server
+             * from previous resolution of SRV record, and first server
+             * otherwise. */
+            if (state->service->last_tried_server != NULL) {
+                state->out = state->service->last_tried_server;
+                goto done;
+            }
+
+            state->out = state->service->server_list;
             goto done;
         }
 
@@ -1438,7 +1452,10 @@ resolve_srv_done(struct tevent_req *subreq)
     }
 
 done:
-    if (ret != EOK) {
+    if (ret == ERR_SRV_DUPLICATES) {
+        tevent_req_error(req, ret);
+        return;
+    } else if (ret != EOK) {
         state->out = state->meta;
         set_srv_data_status(state->meta->srv_data, SRV_RESOLVE_ERROR);
         tevent_req_error(req, ret);
-- 
2.1.0

From 4034fbb62417cf6ff0e13cb61a4401a488134a8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20B=C5=99ezina?= <pbrez...@redhat.com>
Date: Wed, 31 Aug 2016 13:01:31 +0200
Subject: [PATCH 2/2] debug

---
 src/providers/fail_over.c                  | 4 ++++
 src/providers/ldap/sdap_async_connection.c | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/src/providers/fail_over.c b/src/providers/fail_over.c
index 77084098831a312bc8629513ccfc2a91165241ba..f83b5a6eb8e1dc8c74a47153e20d290a89ea7027 100644
--- a/src/providers/fail_over.c
+++ b/src/providers/fail_over.c
@@ -1274,6 +1274,8 @@ resolve_srv_send(TALLOC_CTX *mem_ctx, struct tevent_context *ev,
     state->fo_ctx = ctx;
     state->meta = server->srv_data->meta;
 
+    DEBUG(SSSDBG_IMPORTANT_INFO, "RESOLV SRV SEND\n");
+
     status = get_srv_data_status(server->srv_data);
     DEBUG(SSSDBG_FUNC_DATA, "The status of SRV lookup is %s\n",
           str_srv_data_status(status));
@@ -1356,6 +1358,8 @@ resolve_srv_done(struct tevent_req *subreq)
     int ret;
     uint32_t ttl;
 
+    DEBUG(SSSDBG_IMPORTANT_INFO, "RESOLV SRV DONE\n");
+
     ret = state->fo_ctx->srv_recv_fn(state, subreq, &dns_domain, &ttl,
                                      &primary_servers, &num_primary_servers,
                                      &backup_servers, &num_backup_servers);
diff --git a/src/providers/ldap/sdap_async_connection.c b/src/providers/ldap/sdap_async_connection.c
index a8d4262b52c4b2d2810450d68794f00558ea5c2d..67604e3780a10d4456c346deed3f9b9dcfe64ae0 100644
--- a/src/providers/ldap/sdap_async_connection.c
+++ b/src/providers/ldap/sdap_async_connection.c
@@ -1509,6 +1509,9 @@ static int sdap_cli_resolve_next(struct tevent_req *req)
     subreq = be_resolve_server_send(state, state->ev,
                                     state->be, state->service->name,
                                     state->srv == NULL ? true : false);
+    subreq = be_resolve_server_send(state, state->ev,
+                                    state->be, state->service->name,
+                                    state->srv == NULL ? true : false);
     if (!subreq) {
         return ENOMEM;
     }
-- 
2.1.0

_______________________________________________
sssd-devel mailing list
sssd-devel@lists.fedorahosted.org
https://lists.fedorahosted.org/admin/lists/sssd-devel@lists.fedorahosted.org

Reply via email to