This is an automated email from the ASF dual-hosted git repository. daim pushed a commit to branch OAK-10199 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 98f7c07f6f802efc31baaeb5b5acf6f6540d030a Author: Rishabh Kumar <d...@adobe.com> AuthorDate: Mon Jun 19 23:04:33 2023 +0530 OAK-10199 : added check to include oldestId when running detailedGc very first time --- .../oak/plugins/document/VersionGCSupport.java | 20 +++++--- .../plugins/document/VersionGarbageCollector.java | 5 +- .../document/mongo/MongoVersionGCSupport.java | 18 ++++--- .../plugins/document/rdb/RDBVersionGCSupport.java | 22 +++++---- .../oak/plugins/document/VersionGCSupportTest.java | 39 ++++++++++++--- .../document/VersionGarbageCollectorIT.java | 57 ++++++++++++++++++++++ 6 files changed, 128 insertions(+), 33 deletions(-) diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java index db54553061..96fa2bbaea 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java @@ -74,21 +74,25 @@ public class VersionGCSupport { /** * Returns documents that have a {@link NodeDocument#MODIFIED_IN_SECS} value - * within the given range .The two passed modified timestamps are in milliseconds + * within the given range and are greater than given @{@link NodeDocument#ID}. + * <p> + * The two passed modified timestamps are in milliseconds * since the epoch and the implementation will convert them to seconds at * the granularity of the {@link NodeDocument#MODIFIED_IN_SECS} field and * then perform the comparison. + * <p/> * - * @param fromModified the lower bound modified timestamp (inclusive) - * @param toModified the upper bound modified timestamp (exclusive) - * @param limit the limit of documents to return - * @param fromId the lower bound {@link NodeDocument#ID} (exclusive) + * @param fromModified the lower bound modified timestamp (inclusive) + * @param toModified the upper bound modified timestamp (exclusive) + * @param limit the limit of documents to return + * @param fromId the lower bound {@link NodeDocument#ID} + * @param includeFromId boolean indicating whether {@code fromId} is inclusive or not * @return matching documents. */ public Iterable<NodeDocument> getModifiedDocs(final long fromModified, final long toModified, final int limit, - @NotNull final String fromId) { + @NotNull final String fromId, boolean includeFromId) { return StreamSupport - .stream(getSelectedDocuments(store, MODIFIED_IN_SECS, 1, fromId).spliterator(), false) + .stream(getSelectedDocuments(store, MODIFIED_IN_SECS, 1, includeFromId ? "\0"+fromId : fromId).spliterator(), false) .filter(input -> modifiedGreaterThanEquals(input, fromModified) && modifiedLessThan(input, toModified)) .sorted((o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2)) .limit(limit) @@ -193,7 +197,7 @@ public class VersionGCSupport { LOG.info("find oldest modified document"); try { - docs = getModifiedDocs(ts, now, 1, MIN_ID_VALUE); + docs = getModifiedDocs(ts, now, 1, MIN_ID_VALUE, false); if (docs.iterator().hasNext()) { return docs.iterator().next(); } diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java index f54299e3fd..0c7da0d4fa 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java @@ -615,6 +615,7 @@ public class VersionGarbageCollector { throws IOException { int docsTraversed = 0; boolean foundDoc = true; + boolean includeFromId = true; long oldestModifiedDocTimeStamp = rec.scopeDetailedGC.fromMs; String oldestModifiedDocId = rec.detailedGCId; try (DetailedGC gc = new DetailedGC(headRevision, monitor, cancel)) { @@ -624,7 +625,9 @@ public class VersionGarbageCollector { while (foundDoc && oldestModifiedDocTimeStamp < toModified && docsTraversed <= PROGRESS_BATCH_SIZE) { // set foundDoc to false to allow exiting the while loop foundDoc = false; - Iterable<NodeDocument> itr = versionStore.getModifiedDocs(oldestModifiedDocTimeStamp, toModified, 1000, oldestModifiedDocId); + Iterable<NodeDocument> itr = versionStore.getModifiedDocs(oldestModifiedDocTimeStamp, toModified, 1000, oldestModifiedDocId, includeFromId); + // set includeFromId to false for subsequent queries + includeFromId = false; try { for (NodeDocument doc : itr) { foundDoc = true; diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java index 9896857e36..ca9a8a955b 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java @@ -131,23 +131,27 @@ public class MongoVersionGCSupport extends VersionGCSupport { /** * Returns documents that have a {@link NodeDocument#MODIFIED_IN_SECS} value - * within the given range .The two passed modified timestamps are in milliseconds + * within the given range and are greater than given @{@link NodeDocument#ID}. + * <p> + * The two passed modified timestamps are in milliseconds * since the epoch and the implementation will convert them to seconds at * the granularity of the {@link NodeDocument#MODIFIED_IN_SECS} field and * then perform the comparison. + * <p/> * - * @param fromModified the lower bound modified timestamp (inclusive) - * @param toModified the upper bound modified timestamp (exclusive) - * @param limit the limit of documents to return - * @param fromId the lower bound {@link NodeDocument#ID} (exclusive) + * @param fromModified the lower bound modified timestamp (inclusive) + * @param toModified the upper bound modified timestamp (exclusive) + * @param limit the limit of documents to return + * @param fromId the lower bound {@link NodeDocument#ID} + * @param includeFromId boolean indicating whether {@code fromId} is inclusive or not * @return matching documents. */ @Override public Iterable<NodeDocument> getModifiedDocs(final long fromModified, final long toModified, final int limit, - @NotNull final String fromId) { + @NotNull final String fromId, boolean includeFromId) { // _modified >= fromModified && _modified < toModified && _id > fromId final Bson query = and(gte(MODIFIED_IN_SECS, getModifiedInSecs(fromModified)), - lt(MODIFIED_IN_SECS, getModifiedInSecs(toModified)), gt(ID, fromId)); + lt(MODIFIED_IN_SECS, getModifiedInSecs(toModified)), includeFromId ? gte(ID, fromId) :gt(ID, fromId)); // first sort by _modified and then by _id final Bson sort = and(eq(MODIFIED_IN_SECS, 1), eq(ID, 1)); diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/rdb/RDBVersionGCSupport.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/rdb/RDBVersionGCSupport.java index 7006c18683..efce4b8006 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/rdb/RDBVersionGCSupport.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/rdb/RDBVersionGCSupport.java @@ -96,27 +96,31 @@ public class RDBVersionGCSupport extends VersionGCSupport { /** * Returns documents that have a {@link NodeDocument#MODIFIED_IN_SECS} value - * within the given range .The two passed modified timestamps are in milliseconds + * within the given range and are greater than given @{@link NodeDocument#ID}. + * <p> + * The two passed modified timestamps are in milliseconds * since the epoch and the implementation will convert them to seconds at * the granularity of the {@link NodeDocument#MODIFIED_IN_SECS} field and * then perform the comparison. + * <p/> * - * @param fromModified the lower bound modified timestamp (inclusive) - * @param toModified the upper bound modified timestamp (exclusive) - * @param limit the limit of documents to return - * @param fromId the lower bound {@link NodeDocument#ID} (exclusive) + * @param fromModified the lower bound modified timestamp (inclusive) + * @param toModified the upper bound modified timestamp (exclusive) + * @param limit the limit of documents to return + * @param fromId the lower bound {@link NodeDocument#ID} + * @param includeFromId boolean indicating whether {@code fromId} is inclusive or not * @return matching documents. */ @Override public Iterable<NodeDocument> getModifiedDocs(final long fromModified, final long toModified, final int limit, - @NotNull final String fromId) { + @NotNull final String fromId, boolean includeFromId) { List<QueryCondition> conditions = of(new QueryCondition(MODIFIED_IN_SECS, "<", getModifiedInSecs(toModified)), new QueryCondition(MODIFIED_IN_SECS, ">=", getModifiedInSecs(fromModified)), - new QueryCondition(ID, ">", of(fromId))); + new QueryCondition(ID, includeFromId ? ">=" : ">", of(fromId))); if (MODE == 1) { return getIterator(EMPTY_KEY_PATTERN, conditions); } else { - return store.queryAsIterable(NODES, fromId, null, EMPTY_KEY_PATTERN, conditions, limit, of(MODIFIED_IN_SECS, ID)); + return store.queryAsIterable(NODES, null, null, EMPTY_KEY_PATTERN, conditions, limit, of(MODIFIED_IN_SECS, ID)); } } @@ -287,7 +291,7 @@ public class RDBVersionGCSupport extends VersionGCSupport { LOG.info("getOldestModifiedDoc() <- start"); Iterable<NodeDocument> modifiedDocs = null; try { - modifiedDocs = getModifiedDocs(0L, clock.getTime(), 1, MIN_ID_VALUE); + modifiedDocs = getModifiedDocs(0L, clock.getTime(), 1, MIN_ID_VALUE, false); doc = modifiedDocs.iterator().hasNext() ? modifiedDocs.iterator().next() : NULL; } catch (DocumentStoreException ex) { LOG.error("getOldestModifiedDoc()", ex); diff --git a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java index 0061771383..cff9511a66 100644 --- a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java +++ b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java @@ -33,6 +33,7 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import static java.lang.Long.MAX_VALUE; import static java.util.Comparator.comparing; import static java.util.List.of; import static java.util.Optional.ofNullable; @@ -206,8 +207,8 @@ public class VersionGCSupportTest { public void findModifiedDocsWhenModifiedIsDifferent() { long secs = 42; long offset = SECONDS.toMillis(secs); - List<UpdateOp> updateOps = new ArrayList<>(5_001); - for (int i = 0; i < 5_001; i++) { + List<UpdateOp> updateOps = new ArrayList<>(5_000); + for (int i = 0; i < 5_000; i++) { Revision r = new Revision(offset + (i * 5), 0, 1); String id = getIdFromPath("/x" + i); ids.add(id); @@ -223,9 +224,10 @@ public class VersionGCSupportTest { long oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L); assertEquals(40L, oldestModifiedDocTs); assertEquals("1:/x0", oldestModifiedDocId); + boolean includeFromId = true; for(int i = 0; i < 5; i++) { - Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), Long.MAX_VALUE, 1000, oldestModifiedDocId); + Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, includeFromId); assertTrue(isInOrder(modifiedDocs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2))); long count = stream(modifiedDocs.spliterator(), false).count(); assertEquals(1000, count); @@ -234,6 +236,7 @@ public class VersionGCSupportTest { } oldestModifiedDocId = oldestModifiedDoc.getId(); oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L); + includeFromId = false; } } @@ -249,7 +252,7 @@ public class VersionGCSupportTest { setModified(op, r); updateOps.add(op); } - // create 5_000 nodes + // create 5_001 nodes store.create(NODES, updateOps); NodeDocument oldestModifiedDoc = gcSupport.getOldestModifiedDoc(SIMPLE); @@ -257,9 +260,10 @@ public class VersionGCSupportTest { long oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L); assertEquals(40L, oldestModifiedDocTs); assertEquals("1:/x0", oldestModifiedDocId); + boolean includeFromId = true; for(int i = 0; i < 5; i++) { - Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), Long.MAX_VALUE, 1000, oldestModifiedDocId); + Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, includeFromId); assertTrue(isInOrder(modifiedDocs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2))); long count = stream(modifiedDocs.spliterator(), false).count(); assertEquals(1000, count); @@ -268,7 +272,21 @@ public class VersionGCSupportTest { } oldestModifiedDocId = oldestModifiedDoc.getId(); oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L); + includeFromId = false; } + + // fetch last remaining document now + Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, false); + assertEquals(1, stream(modifiedDocs.spliterator(), false).count()); + assertTrue(isInOrder(modifiedDocs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2))); + oldestModifiedDoc = modifiedDocs.iterator().next(); + oldestModifiedDocId = oldestModifiedDoc.getId(); + oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L); + + // all documents had been fetched, now we won't get any document + modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, false); + assertEquals(0, stream(modifiedDocs.spliterator(), false).count()); + } @Test @@ -291,10 +309,10 @@ public class VersionGCSupportTest { } // create 5_000 nodes store.create(NODES, updateOps); - + boolean includeFromId = true; for(int i = 0; i < 5; i++) { - Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), Long.MAX_VALUE, 1000, oldestModifiedDocId); + Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, includeFromId); assertTrue(isInOrder(modifiedDocs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2))); long count = stream(modifiedDocs.spliterator(), false).count(); assertEquals(1000, count); @@ -303,7 +321,12 @@ public class VersionGCSupportTest { } oldestModifiedDocId = oldestModifiedDoc.getId(); oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L); + includeFromId = false; } + + // all documents had been fetched, now we won't get any document + Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, false); + assertEquals(0, stream(modifiedDocs.spliterator(), false).count()); } private void assertPossiblyDeleted(long fromSeconds, long toSeconds, long num) { @@ -312,7 +335,7 @@ public class VersionGCSupportTest { } private void assertModified(long fromSeconds, long toSeconds, long num) { - Iterable<NodeDocument> docs = gcSupport.getModifiedDocs(SECONDS.toMillis(fromSeconds), SECONDS.toMillis(toSeconds), 10, MIN_ID_VALUE); + Iterable<NodeDocument> docs = gcSupport.getModifiedDocs(SECONDS.toMillis(fromSeconds), SECONDS.toMillis(toSeconds), 10, MIN_ID_VALUE, false); assertEquals(num, stream(docs.spliterator(), false).count()); assertTrue(isInOrder(docs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2))); } diff --git a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java index caa156a6d4..f6e8554252 100644 --- a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java +++ b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java @@ -34,6 +34,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; +import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.commons.lang3.reflect.FieldUtils.writeField; import static org.apache.jackrabbit.guava.common.collect.Iterables.filter; import static org.apache.jackrabbit.guava.common.collect.Iterables.size; @@ -83,6 +84,7 @@ import org.apache.jackrabbit.oak.stats.Clock; import org.jetbrains.annotations.NotNull; import org.junit.After; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @@ -337,6 +339,61 @@ public class VersionGarbageCollectorIT { assertEquals(50_000, stats.deletedPropsGCCount); } + + // Test when we have more than 1000 deleted properties with different revisions + @Test + @Ignore + public void testGCDeletedProps_2() throws Exception { + //1. Create nodes with properties + NodeBuilder b1 = null; + for (int k = 0; k < 50; k ++) { + b1 = store.getRoot().builder(); + // Add property to node & save + for (int i = 0; i < 100; i++) { + for (int j = 0; j < 10; j++) { + b1.child(k + "z" + i).setProperty("prop" + j, "foo", STRING); + } + } + store.merge(b1, EmptyHook.INSTANCE, CommitInfo.EMPTY); + // increase the clock to create new revision for next batch + clock.waitUntil(Revision.getCurrentTimestamp() + SECONDS.toMillis(k * 5)); + } + + // enable the detailed gc flag + writeField(gc, "detailedGCEnabled", true, true); + long maxAge = 1; //hours + long delta = TimeUnit.MINUTES.toMillis(10); + //1. Go past GC age and check no GC done as nothing deleted + clock.waitUntil(Revision.getCurrentTimestamp() + maxAge); + VersionGCStats stats = gc.gc(maxAge, HOURS); + assertEquals(0, stats.deletedPropsGCCount); + + //Remove property + NodeBuilder b2 = store.getRoot().builder(); + for (int k = 0; k < 50; k ++) { + for (int i = 0; i < 100; i++) { + for (int j = 0; j < 10; j++) { + b2.getChildNode(k + "z" + i).removeProperty("prop" + j); + } + } + } + store.merge(b2, EmptyHook.INSTANCE, CommitInfo.EMPTY); + + store.runBackgroundOperations(); + + //2. Check that a deleted property is not collected before maxAge + //Clock cannot move back (it moved forward in #1) so double the maxAge + clock.waitUntil(clock.getTime() + delta); + stats = gc.gc(maxAge*2, HOURS); + assertEquals(0, stats.deletedPropsGCCount); + + //3. Check that deleted property does get collected post maxAge + clock.waitUntil(clock.getTime() + HOURS.toMillis(maxAge*2) + delta); + + stats = gc.gc(maxAge*2, HOURS); + assertEquals(50_000, stats.deletedPropsGCCount); + + } private void gcSplitDocsInternal(String subNodeName) throws Exception { long maxAge = 1; //hrs