This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch branch_9_8 in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9_8 by this push: new 0cad3f7ebf8 SOLR-17306: fix replication problem on follower restart (#2918) 0cad3f7ebf8 is described below commit 0cad3f7ebf881ea25becf0f759b3fe59626e41fa Author: Martin Anzinger <132433648+ds-manzin...@users.noreply.github.com> AuthorDate: Thu Dec 19 16:55:44 2024 +0100 SOLR-17306: fix replication problem on follower restart (#2918) (cherry picked from commit 9cef6e390719cbd7b55085cfef98fcb053785f77) --- solr/CHANGES.txt | 2 + .../java/org/apache/solr/handler/IndexFetcher.java | 6 + .../solr/handler/TestReplicationHandler.java | 135 ++++++++++++++++++++- 3 files changed, 142 insertions(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index b6f30d7f998..08c21bfd7c0 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -117,6 +117,8 @@ Bug Fixes * SOLR-17595: Fix two issues in Solr CLI that prevent Solr from starting with the techproducts example and from correctly parsing arguments on Windows that start with -D and have multiple values separated by "," or spaces. 
(Christos Malliaridis) +* SOLR-17306: fix replication problem on follower restart (Martin Anzinger and Peter Kroiss via Eric Pugh) + Dependency Upgrades --------------------- * PR#2702: chore(deps): update io.netty:* to v4.1.114.final (solrbot) diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java index 049af659b3c..061be7a9269 100644 --- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java +++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java @@ -531,6 +531,12 @@ public class IndexFetcher { IndexDeletionPolicyWrapper.getCommitTimestamp(commit)); // nowarn } + // Leader's version is 0 and generation is 0 - not open for replication + if (latestVersion == 0L && latestGeneration == 0L) { + log.info("Leader's version is 0 and generation is 0 - not open for replication"); + return IndexFetchResult.LEADER_IS_NOT_ACTIVE; + } + if (latestVersion == 0L) { if (IndexDeletionPolicyWrapper.getCommitTimestamp(commit) != 0L) { // since we won't get the files for an empty index, diff --git a/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java b/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java index 930a5a2b11f..0625807fed8 100644 --- a/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java +++ b/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java @@ -118,7 +118,7 @@ public class TestReplicationHandler extends SolrTestCaseJ4 { public void setUp() throws Exception { super.setUp(); systemSetPropertySolrDisableUrlAllowList("true"); - // System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); + System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); // For manual testing only // useFactory(null); // force an FS factory. 
leader = new SolrInstance(createTempDir("solr-instance").toFile(), "leader", null); @@ -1800,6 +1800,139 @@ public class TestReplicationHandler extends SolrTestCaseJ4 { } } + @Test + public void doTestIndexFollowerAfterRestartWhenReplicationIsDisabled() throws Exception { + // failed before changes to IndexFetcher + testReplicationRestartFollower("disablereplication"); + } + + @Test + public void doTestIndexFollowerAfterRestartWhenReplicationIsEnabled() throws Exception { + testReplicationRestartFollower("enablereplication"); + } + + private void testReplicationRestartFollower(String replicationCmd) throws Exception { + useFactory(null); + try { + clearIndexWithReplication(); + // change solrconfig having 'replicateAfter startup' option on leader + leader.copyConfigFile(CONF_DIR + "solrconfig-leader2.xml", "solrconfig.xml"); + + leaderJetty.stop(); + final TimeOut waitForLeaderToShutdown = + new TimeOut(300, TimeUnit.SECONDS, TimeSource.NANO_TIME); + waitForLeaderToShutdown.waitFor( + "Gave up after waiting an obscene amount of time for leader to shut down", + () -> leaderJetty.isStopped()); + + leaderJetty.start(); + final TimeOut waitForLeaderToStart = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME); + waitForLeaderToStart.waitFor( + "Gave up after waiting an obscene amount of time for leader to start", + () -> leaderJetty.isRunning()); + + // close and re-create leader client because its connection pool has stale connections + leaderClient.close(); + leaderClient = + createNewSolrClient(buildUrl(leaderJetty.getLocalPort()), DEFAULT_TEST_CORENAME); + + NamedList<Object> leaderQueryRsp = rQuery(0, "*:*", leaderClient); + SolrDocumentList leaderQueryResult = (SolrDocumentList) leaderQueryRsp.get("response"); + assertEquals(0, numFound(leaderQueryRsp)); + + // get docs from follower and check if number is equal to leader + NamedList<Object> followerQueryRsp = rQuery(0, "*:*", followerClient); + SolrDocumentList followerQueryResult = (SolrDocumentList) 
followerQueryRsp.get("response"); + assertEquals(0, numFound(followerQueryRsp)); + + // compare results + String cmp = + BaseDistributedSearchTestCase.compare(leaderQueryResult, followerQueryResult, 0, null); + assertNull(cmp); + + nDocs--; + for (int i = 0; i < nDocs; i++) { + index(leaderClient, "id", i, "name", "name = " + i); + } + + leaderClient.commit(); + + leaderQueryRsp = rQuery(nDocs, "*:*", leaderClient); + leaderQueryResult = (SolrDocumentList) leaderQueryRsp.get("response"); + assertEquals(nDocs, numFound(leaderQueryRsp)); + + // get docs from follower and check if number is equal to leader + followerQueryRsp = rQuery(nDocs, "*:*", followerClient); + followerQueryResult = (SolrDocumentList) followerQueryRsp.get("response"); + assertEquals(nDocs, numFound(followerQueryRsp)); + + // compare results + cmp = BaseDistributedSearchTestCase.compare(leaderQueryResult, followerQueryResult, 0, null); + assertNull(cmp); + + String timesReplicatedString = getFollowerDetails("timesIndexReplicated"); + String timesFailed; + Integer previousTimesFailed = null; + if (timesReplicatedString == null) { + timesFailed = "0"; + } else { + int timesReplicated = Integer.parseInt(timesReplicatedString); + timesFailed = getFollowerDetails("timesFailed"); + if (null == timesFailed) { + timesFailed = "0"; + } + + previousTimesFailed = Integer.parseInt(timesFailed); + // Sometimes replication will fail because leader's core is still loading; make sure there + // was one success + assertEquals(1, timesReplicated - previousTimesFailed); + } + + followerJetty.stop(); + + invokeReplicationCommand( + buildUrl(leaderJetty.getLocalPort()) + "/" + DEFAULT_TEST_CORENAME, replicationCmd); + + final TimeOut waitForFollowerToShutdown = + new TimeOut(300, TimeUnit.SECONDS, TimeSource.NANO_TIME); + waitForFollowerToShutdown.waitFor( + "Gave up after waiting an obscene amount of time for leader to shut down", + () -> followerJetty.isStopped()); + + log.info("FOLLOWER START 
********************************************"); + followerJetty.start(); + + final TimeOut waitForFollowerToStart = + new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME); + waitForFollowerToStart.waitFor( + "Gave up after waiting an obscene amount of time for leader to start", + () -> followerJetty.isRunning()); + + // poll interval on follower is 1 second, so we just sleep for a few seconds + Thread.sleep(3000); + followerClient.close(); + followerClient = + createNewSolrClient(buildUrl(followerJetty.getLocalPort()), DEFAULT_TEST_CORENAME); + NamedList<Object> details = getDetails(followerClient); + + leaderQueryRsp = rQuery(nDocs, "*:*", leaderClient); + leaderQueryResult = (SolrDocumentList) leaderQueryRsp.get("response"); + assertEquals(nDocs, numFound(leaderQueryRsp)); + + // get docs from follower and check if number is equal to leader + followerQueryRsp = rQuery(nDocs, "*:*", followerClient); + followerQueryResult = (SolrDocumentList) followerQueryRsp.get("response"); + assertEquals(nDocs, numFound(followerQueryRsp)); + + // compare results again + cmp = BaseDistributedSearchTestCase.compare(leaderQueryResult, followerQueryResult, 0, null); + assertNull(cmp); + + } finally { + resetFactory(); + } + } + private void assertReplicationResponseSucceeded(NamedList<?> response) { assertNotNull("null response from server", response); assertNotNull("Expected replication response to have 'status' field", response.get("status"));