This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch branch_9_8
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_9_8 by this push:
     new 0cad3f7ebf8 SOLR-17306: fix replication problem on follower restart (#2918)
0cad3f7ebf8 is described below

commit 0cad3f7ebf881ea25becf0f759b3fe59626e41fa
Author: Martin Anzinger <132433648+ds-manzin...@users.noreply.github.com>
AuthorDate: Thu Dec 19 16:55:44 2024 +0100

    SOLR-17306: fix replication problem on follower restart (#2918)
    
    (cherry picked from commit 9cef6e390719cbd7b55085cfef98fcb053785f77)
---
 solr/CHANGES.txt                                   |   2 +
 .../java/org/apache/solr/handler/IndexFetcher.java |   6 +
 .../solr/handler/TestReplicationHandler.java       | 135 ++++++++++++++++++++-
 3 files changed, 142 insertions(+), 1 deletion(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index b6f30d7f998..08c21bfd7c0 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -117,6 +117,8 @@ Bug Fixes
 * SOLR-17595: Fix two issues in Solr CLI that prevent Solr from starting with the techproducts example and from
   correctly parsing arguments on Windows that start with -D and have multiple values separated by "," or spaces. (Christos Malliaridis)
 
+* SOLR-17306: fix replication problem on follower restart (Martin Anzinger and Peter Kroiss via Eric Pugh)
+
 Dependency Upgrades
 ---------------------
 * PR#2702: chore(deps): update io.netty:* to v4.1.114.final (solrbot)
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index 049af659b3c..061be7a9269 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -531,6 +531,12 @@ public class IndexFetcher {
             IndexDeletionPolicyWrapper.getCommitTimestamp(commit)); // nowarn
       }
 
+      // Leader's version is 0 and generation is 0 -  not open for replication
+      if (latestVersion == 0L && latestGeneration == 0L) {
+        log.info("Leader's version is 0 and generation is 0 -  not open for 
replication");
+        return IndexFetchResult.LEADER_IS_NOT_ACTIVE;
+      }
+
       if (latestVersion == 0L) {
         if (IndexDeletionPolicyWrapper.getCommitTimestamp(commit) != 0L) {
           // since we won't get the files for an empty index,
diff --git a/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java b/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java
index 930a5a2b11f..0625807fed8 100644
--- a/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java
+++ b/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java
@@ -118,7 +118,7 @@ public class TestReplicationHandler extends SolrTestCaseJ4 {
   public void setUp() throws Exception {
     super.setUp();
     systemSetPropertySolrDisableUrlAllowList("true");
-    //    System.setProperty("solr.directoryFactory", 
"solr.StandardDirectoryFactory");
+    System.setProperty("solr.directoryFactory", 
"solr.StandardDirectoryFactory");
     // For manual testing only
     // useFactory(null); // force an FS factory.
     leader = new SolrInstance(createTempDir("solr-instance").toFile(), 
"leader", null);
@@ -1800,6 +1800,139 @@ public class TestReplicationHandler extends SolrTestCaseJ4 {
     }
   }
 
+  @Test
+  public void doTestIndexFollowerAfterRestartWhenReplicationIsDisabled() 
throws Exception {
+    // failed before changes to IndexFetcher
+    testReplicationRestartFollower("disablereplication");
+  }
+
+  @Test
+  public void doTestIndexFollowerAfterRestartWhenReplicationIsEnabled() throws 
Exception {
+    testReplicationRestartFollower("enablereplication");
+  }
+
+  private void testReplicationRestartFollower(String replicationCmd) throws 
Exception {
+    useFactory(null);
+    try {
+      clearIndexWithReplication();
+      // change solrconfig having 'replicateAfter startup' option on leader
+      leader.copyConfigFile(CONF_DIR + "solrconfig-leader2.xml", 
"solrconfig.xml");
+
+      leaderJetty.stop();
+      final TimeOut waitForLeaderToShutdown =
+          new TimeOut(300, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+      waitForLeaderToShutdown.waitFor(
+          "Gave up after waiting an obscene amount of time for leader to shut 
down",
+          () -> leaderJetty.isStopped());
+
+      leaderJetty.start();
+      final TimeOut waitForLeaderToStart = new TimeOut(30, TimeUnit.SECONDS, 
TimeSource.NANO_TIME);
+      waitForLeaderToStart.waitFor(
+          "Gave up after waiting an obscene amount of time for leader to 
start",
+          () -> leaderJetty.isRunning());
+
+      // close and re-create leader client because its connection pool has 
stale connections
+      leaderClient.close();
+      leaderClient =
+          createNewSolrClient(buildUrl(leaderJetty.getLocalPort()), 
DEFAULT_TEST_CORENAME);
+
+      NamedList<Object> leaderQueryRsp = rQuery(0, "*:*", leaderClient);
+      SolrDocumentList leaderQueryResult = (SolrDocumentList) 
leaderQueryRsp.get("response");
+      assertEquals(0, numFound(leaderQueryRsp));
+
+      // get docs from follower and check if number is equal to leader
+      NamedList<Object> followerQueryRsp = rQuery(0, "*:*", followerClient);
+      SolrDocumentList followerQueryResult = (SolrDocumentList) 
followerQueryRsp.get("response");
+      assertEquals(0, numFound(followerQueryRsp));
+
+      // compare results
+      String cmp =
+          BaseDistributedSearchTestCase.compare(leaderQueryResult, 
followerQueryResult, 0, null);
+      assertNull(cmp);
+
+      nDocs--;
+      for (int i = 0; i < nDocs; i++) {
+        index(leaderClient, "id", i, "name", "name = " + i);
+      }
+
+      leaderClient.commit();
+
+      leaderQueryRsp = rQuery(nDocs, "*:*", leaderClient);
+      leaderQueryResult = (SolrDocumentList) leaderQueryRsp.get("response");
+      assertEquals(nDocs, numFound(leaderQueryRsp));
+
+      // get docs from follower and check if number is equal to leader
+      followerQueryRsp = rQuery(nDocs, "*:*", followerClient);
+      followerQueryResult = (SolrDocumentList) 
followerQueryRsp.get("response");
+      assertEquals(nDocs, numFound(followerQueryRsp));
+
+      // compare results
+      cmp = BaseDistributedSearchTestCase.compare(leaderQueryResult, 
followerQueryResult, 0, null);
+      assertNull(cmp);
+
+      String timesReplicatedString = 
getFollowerDetails("timesIndexReplicated");
+      String timesFailed;
+      Integer previousTimesFailed = null;
+      if (timesReplicatedString == null) {
+        timesFailed = "0";
+      } else {
+        int timesReplicated = Integer.parseInt(timesReplicatedString);
+        timesFailed = getFollowerDetails("timesFailed");
+        if (null == timesFailed) {
+          timesFailed = "0";
+        }
+
+        previousTimesFailed = Integer.parseInt(timesFailed);
+        // Sometimes replication will fail because leader's core is still 
loading; make sure there
+        // was one success
+        assertEquals(1, timesReplicated - previousTimesFailed);
+      }
+
+      followerJetty.stop();
+
+      invokeReplicationCommand(
+          buildUrl(leaderJetty.getLocalPort()) + "/" + DEFAULT_TEST_CORENAME, 
replicationCmd);
+
+      final TimeOut waitForFollowerToShutdown =
+          new TimeOut(300, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+      waitForFollowerToShutdown.waitFor(
+          "Gave up after waiting an obscene amount of time for leader to shut 
down",
+          () -> followerJetty.isStopped());
+
+      log.info("FOLLOWER START ********************************************");
+      followerJetty.start();
+
+      final TimeOut waitForFollowerToStart =
+          new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+      waitForFollowerToStart.waitFor(
+          "Gave up after waiting an obscene amount of time for leader to 
start",
+          () -> followerJetty.isRunning());
+
+      // poll interval on follower is 1 second, so we just sleep for a few 
seconds
+      Thread.sleep(3000);
+      followerClient.close();
+      followerClient =
+          createNewSolrClient(buildUrl(followerJetty.getLocalPort()), 
DEFAULT_TEST_CORENAME);
+      NamedList<Object> details = getDetails(followerClient);
+
+      leaderQueryRsp = rQuery(nDocs, "*:*", leaderClient);
+      leaderQueryResult = (SolrDocumentList) leaderQueryRsp.get("response");
+      assertEquals(nDocs, numFound(leaderQueryRsp));
+
+      // get docs from follower and check if number is equal to leader
+      followerQueryRsp = rQuery(nDocs, "*:*", followerClient);
+      followerQueryResult = (SolrDocumentList) 
followerQueryRsp.get("response");
+      assertEquals(nDocs, numFound(followerQueryRsp));
+
+      // compare results again
+      cmp = BaseDistributedSearchTestCase.compare(leaderQueryResult, 
followerQueryResult, 0, null);
+      assertNull(cmp);
+
+    } finally {
+      resetFactory();
+    }
+  }
+
   private void assertReplicationResponseSucceeded(NamedList<?> response) {
     assertNotNull("null response from server", response);
     assertNotNull("Expected replication response to have 'status' field", 
response.get("status"));

Reply via email to