This is an automated email from the ASF dual-hosted git repository. daim pushed a commit to branch DetailedGC/OAK-10199 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit ce8f274bbfed14b7e3ae43957a1bd329102ab951 Author: Rishabh Kumar <d...@adobe.com> AuthorDate: Wed Jun 28 02:06:50 2023 +0530 OAK-10199 : added logic to skip non garbage documents --- .../plugins/document/VersionGCRecommendations.java | 4 +- .../plugins/document/VersionGarbageCollector.java | 31 ++++++++----- .../document/VersionGarbageCollectorIT.java | 52 ++++++++++++++++++++-- 3 files changed, 71 insertions(+), 16 deletions(-) diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCRecommendations.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCRecommendations.java index c80399f005..0fd0766f5b 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCRecommendations.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCRecommendations.java @@ -118,7 +118,7 @@ public class VersionGCRecommendations { detailedGCTimestamp = (long) settings.get(SETTINGS_COLLECTION_DETAILED_GC_TIMESTAMP_PROP); oldestModifiedDocId = (String) settings.get(SETTINGS_COLLECTION_DETAILED_GC_DOCUMENT_ID_PROP); if (detailedGCTimestamp == 0) { - // it will only happens for the very first time, we run this detailedGC + // it will only happen for the very first time, we run this detailedGC log.info("No detailedGCTimestamp found, querying for the oldest modified candidate"); final NodeDocument doc = vgc.getOldestModifiedDoc(clock); if (doc == NULL) { @@ -129,7 +129,7 @@ public class VersionGCRecommendations { oldestModifiedDocId = MIN_ID_VALUE; log.info("detailedGCTimestamp found: {}", timestampToString(oldestModifiedDocTimeStamp)); } else { - oldestModifiedDocTimeStamp = detailedGCTimestamp - 1; + oldestModifiedDocTimeStamp = detailedGCTimestamp - 1L; } TimeInterval detailedGCTimeInternal = new TimeInterval(oldestModifiedDocTimeStamp, MAX_VALUE); diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java index eb25184f62..1e5b1129b9 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java @@ -77,6 +77,8 @@ import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.MODIFIED_I import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.SplitDocType.COMMIT_ROOT_ONLY; import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.SplitDocType.DEFAULT_LEAF; import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.SplitDocType.DEFAULT_NO_BRANCH; +import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.setDeleted; +import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.setModified; import static org.slf4j.helpers.MessageFormatter.arrayFormat; public class VersionGarbageCollector { @@ -614,15 +616,17 @@ public class VersionGarbageCollector { * * @param phases {@link GCPhases} * @param headRevision the current head revision of node store + * @param rec {@link VersionGCRecommendations} to recommend GC operation */ private void collectDetailedGarbage(final GCPhases phases, final RevisionVector headRevision, final VersionGCRecommendations rec) throws IOException { int docsTraversed = 0; boolean foundDoc = true; - final long oldestModifiedDocTimeStamp = rec.scopeDetailedGC.fromMs; + final long oldestModifiedMs = rec.scopeDetailedGC.fromMs; + long oldModifiedMs = oldestModifiedMs; final String oldestModifiedDocId = rec.detailedGCId; try (DetailedGC gc = new DetailedGC(headRevision, monitor, cancel)) { - long fromModified = oldestModifiedDocTimeStamp; + long fromModified = oldestModifiedMs; String fromId = oldestModifiedDocId; NodeDocument lastDoc = null; final long toModified = rec.scopeDetailedGC.toMs; @@ -630,7 +634,9 @@ public class VersionGarbageCollector { while (foundDoc && fromModified < toModified && docsTraversed <= PROGRESS_BATCH_SIZE) { // set foundDoc to false to allow exiting the while loop foundDoc = false; + lastDoc = null; Iterable<NodeDocument> itr = versionStore.getModifiedDocs(fromModified, toModified, 1000, fromId); + final Revision revision = nodeStore.newRevision(); try { for (NodeDocument doc : itr) { foundDoc = true; @@ -649,7 +655,7 @@ public class VersionGarbageCollector { lastDoc = doc; // collect the data to delete in next step if (phases.start(GCPhase.COLLECTING)) { - gc.collectGarbage(doc, phases); + gc.collectGarbage(doc, phases, revision); phases.stop(GCPhase.COLLECTING); } @@ -657,7 +663,7 @@ public class VersionGarbageCollector { if (modified == null) { monitor.warn("collectDetailGarbage : document has no _modified property : {}", doc.getId()); - } else if (SECONDS.toMillis(modified) < oldestModifiedDocTimeStamp) { + } else if (SECONDS.toMillis(modified) < oldestModifiedMs) { monitor.warn( "collectDetailGarbage : document has older _modified than query boundary : {} (from: {}, to: {})", modified, fromModified, toModified); @@ -669,26 +675,29 @@ public class VersionGarbageCollector { phases.stop(GCPhase.DETAILED_GC_CLEANUP); } if (lastDoc != null) { - fromModified = lastDoc.getModified() == null ? oldestModifiedDocTimeStamp : SECONDS.toMillis(lastDoc.getModified()); + fromModified = lastDoc.getModified() == null ? oldModifiedMs : SECONDS.toMillis(lastDoc.getModified()); fromId = lastDoc.getId(); } } finally { Utils.closeIfCloseable(itr); phases.stats.oldestModifiedDocTimeStamp = fromModified; - if (fromModified > (oldestModifiedDocTimeStamp + 1)) { + if (fromModified > (oldModifiedMs + 1)) { // we have moved ahead, now we can reset oldestModifiedId to min value + fromId = MIN_ID_VALUE; phases.stats.oldestModifiedDocId = MIN_ID_VALUE; } else { // there are still documents pending at oldest Modified timestamp, // save the last _id traversed to avoid re-fetching of ids phases.stats.oldestModifiedDocId = fromId; } + oldModifiedMs = fromModified - 1; } // if we are already at last document of current timeStamp, - // we need to reset fromId and check again + // we need to reset fromId & increment fromModified and check again if (!foundDoc && !Objects.equals(fromId, MIN_ID_VALUE)) { fromId = MIN_ID_VALUE; + fromModified = fromModified + SECONDS.toMillis(5); foundDoc = true; // to run while loop again } } @@ -806,14 +815,14 @@ public class VersionGarbageCollector { this.timer = Stopwatch.createUnstarted(); } - public void collectGarbage(final NodeDocument doc, final GCPhases phases) { + public void collectGarbage(final NodeDocument doc, final GCPhases phases, final Revision revision) { monitor.info("Collecting Detailed Garbage for doc [{}]", doc.getId()); final UpdateOp op = new UpdateOp(requireNonNull(doc.getId()), false); op.equals(MODIFIED_IN_SECS, doc.getModified()); - collectDeletedProperties(doc, phases, op); + collectDeletedProperties(doc, phases, op, revision); collectUnmergedBranchCommitDocument(doc, phases, op); collectOldRevisions(doc, phases, op); // only add if there are changes for this doc @@ -836,7 +845,7 @@ public class VersionGarbageCollector { } - private void collectDeletedProperties(final NodeDocument doc, final GCPhases phases, final UpdateOp updateOp) { + private void collectDeletedProperties(final NodeDocument doc, final GCPhases phases, final UpdateOp updateOp, final Revision revision) { // get Map of all properties along with their values if (phases.start(GCPhase.COLLECT_PROPS)) { @@ -855,6 +864,8 @@ public class VersionGarbageCollector { .filter(p -> !retainPropSet.contains(p)) .mapToInt(x -> { updateOp.remove(x); + setModified(updateOp,revision); + setDeleted(updateOp, revision, false); return 1;}) .sum(); diff --git a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java index 6ba8dd6f81..6d7382bac5 100644 --- a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java +++ b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java @@ -44,6 +44,7 @@ import static java.util.concurrent.TimeUnit.HOURS; import static java.util.concurrent.TimeUnit.MINUTES; import static org.apache.jackrabbit.oak.api.Type.STRING; import static org.apache.jackrabbit.oak.plugins.document.Collection.NODES; +import static org.apache.jackrabbit.oak.plugins.document.Collection.SETTINGS; import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.MIN_ID_VALUE; import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.NUM_REVS_THRESHOLD; import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.PREV_SPLIT_FACTOR; @@ -51,6 +52,9 @@ import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.SplitDocTy import static org.apache.jackrabbit.oak.plugins.document.NodeDocument.setModified; import static org.apache.jackrabbit.oak.plugins.document.Revision.newRevision; import static org.apache.jackrabbit.oak.plugins.document.TestUtils.NO_BINARY; +import static org.apache.jackrabbit.oak.plugins.document.VersionGarbageCollector.SETTINGS_COLLECTION_DETAILED_GC_DOCUMENT_ID_PROP; +import static org.apache.jackrabbit.oak.plugins.document.VersionGarbageCollector.SETTINGS_COLLECTION_DETAILED_GC_TIMESTAMP_PROP; +import static org.apache.jackrabbit.oak.plugins.document.VersionGarbageCollector.SETTINGS_COLLECTION_ID; import static org.apache.jackrabbit.oak.plugins.document.VersionGarbageCollector.VersionGCStats; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -288,7 +292,7 @@ public class VersionGarbageCollectorIT { stats = gc.gc(maxAge*2, HOURS); assertEquals(1, stats.deletedPropsGCCount); assertEquals(1, stats.updatedDetailedGCDocsCount); - assertEquals("1:/z", stats.oldestModifiedDocId); + assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId); //4. Check that a revived property (deleted and created again) does not get gc NodeBuilder b3 = store.getRoot().builder(); @@ -341,7 +345,7 @@ public class VersionGarbageCollectorIT { VersionGCStats stats = gc.gc(maxAge*2, HOURS); assertEquals(50_000, stats.deletedPropsGCCount); assertEquals(5_000, stats.updatedDetailedGCDocsCount); - assertNotEquals(MIN_ID_VALUE, stats.oldestModifiedDocId); + assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId); } @Test @@ -387,6 +391,46 @@ public class VersionGarbageCollectorIT { assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId); } + @Test + public void testGC_WithNoDeletedProps_And_MoreThan_10_000_DocWithDifferentRevision() throws Exception { + //1. Create nodes with properties + NodeBuilder b1 = store.getRoot().builder(); + for (int k = 0; k < 50; k ++) { + b1 = store.getRoot().builder(); + // Add property to node & save + for (int i = 0; i < 500; i++) { + for (int j = 0; j < 10; j++) { + b1.child(k + "z" + i).setProperty("prop" + j, "foo", STRING); + } + } + store.merge(b1, EmptyHook.INSTANCE, CommitInfo.EMPTY); + // increase the clock to create new revision for next batch + clock.waitUntil(Revision.getCurrentTimestamp() + (k * 5)); + } + store.merge(b1, EmptyHook.INSTANCE, CommitInfo.EMPTY); + + // enable the detailed gc flag + writeField(gc, "detailedGCEnabled", true, true); + long maxAge = 1; //hours + long delta = TimeUnit.MINUTES.toMillis(20); + + + store.runBackgroundOperations(); + //3. Check that deleted property does get collected post maxAge + clock.waitUntil(clock.getTime() + HOURS.toMillis(maxAge*2) + delta); + + for (int i = 0; i < 3 ; i++) { + + VersionGCStats stats = gc.gc(maxAge, HOURS); + String oldestModifiedDocId = stats.oldestModifiedDocId; + long oldestModifiedDocTimeStamp = stats.oldestModifiedDocTimeStamp; + + Document document = store.getDocumentStore().find(SETTINGS, SETTINGS_COLLECTION_ID); + assertEquals(document.get(SETTINGS_COLLECTION_DETAILED_GC_TIMESTAMP_PROP), oldestModifiedDocTimeStamp); + assertEquals(document.get(SETTINGS_COLLECTION_DETAILED_GC_DOCUMENT_ID_PROP), oldestModifiedDocId); + } + } + @Test public void testGCDeletedPropsAlreadyGCed() throws Exception { //1. Create nodes with properties @@ -420,7 +464,7 @@ public class VersionGarbageCollectorIT { stats = gc.gc(maxAge*2, HOURS); assertEquals(10, stats.deletedPropsGCCount); assertEquals(10, stats.updatedDetailedGCDocsCount); - assertNotEquals(MIN_ID_VALUE, stats.oldestModifiedDocId); + assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId); //3. now reCreate those properties again NodeBuilder b3 = store.getRoot().builder(); @@ -635,7 +679,7 @@ public class VersionGarbageCollectorIT { stats = gc.gc(maxAge*2, HOURS); assertEquals(0, stats.updatedDetailedGCDocsCount); assertEquals(0, stats.deletedPropsGCCount); - assertNotEquals(MIN_ID_VALUE, stats.oldestModifiedDocId); + assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId); } @Test