This is an automated email from the ASF dual-hosted git repository.

daim pushed a commit to branch OAK-10199
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git

commit 0c129bcf8bdb82668f474be47c791acb30185c88
Author: Rishabh Kumar <d...@adobe.com>
AuthorDate: Wed Jun 28 02:06:50 2023 +0530

    OAK-10199 : added logic to skip non garbage documents
---
 .../plugins/document/VersionGCRecommendations.java |  4 +-
 .../plugins/document/VersionGarbageCollector.java  | 31 ++++++++-----
 .../document/VersionGarbageCollectorIT.java        | 52 ++++++++++++++++++++--
 3 files changed, 71 insertions(+), 16 deletions(-)

diff --git 
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCRecommendations.java
 
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCRecommendations.java
index c80399f005..0fd0766f5b 100644
--- 
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCRecommendations.java
+++ 
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCRecommendations.java
@@ -118,7 +118,7 @@ public class VersionGCRecommendations {
         detailedGCTimestamp = (long) 
settings.get(SETTINGS_COLLECTION_DETAILED_GC_TIMESTAMP_PROP);
         oldestModifiedDocId = (String) 
settings.get(SETTINGS_COLLECTION_DETAILED_GC_DOCUMENT_ID_PROP);
         if (detailedGCTimestamp == 0) {
-            // it will only happens for the very first time, we run this 
detailedGC
+            // it will only happen for the very first time, we run this 
detailedGC
             log.info("No detailedGCTimestamp found, querying for the oldest 
modified candidate");
             final NodeDocument doc = vgc.getOldestModifiedDoc(clock);
             if (doc == NULL) {
@@ -129,7 +129,7 @@ public class VersionGCRecommendations {
             oldestModifiedDocId = MIN_ID_VALUE;
             log.info("detailedGCTimestamp found: {}", 
timestampToString(oldestModifiedDocTimeStamp));
         } else {
-            oldestModifiedDocTimeStamp = detailedGCTimestamp - 1;
+            oldestModifiedDocTimeStamp = detailedGCTimestamp - 1L;
         }
 
         TimeInterval detailedGCTimeInternal = new 
TimeInterval(oldestModifiedDocTimeStamp, MAX_VALUE);
diff --git 
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
 
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
index eb25184f62..1e5b1129b9 100644
--- 
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
+++ 
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
@@ -77,6 +77,8 @@ import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.MODIFIED_I
 import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.SplitDocType.COMMIT_ROOT_ONLY;
 import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.SplitDocType.DEFAULT_LEAF;
 import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.SplitDocType.DEFAULT_NO_BRANCH;
+import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.setDeleted;
+import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.setModified;
 import static org.slf4j.helpers.MessageFormatter.arrayFormat;
 
 public class VersionGarbageCollector {
@@ -614,15 +616,17 @@ public class VersionGarbageCollector {
          *
          * @param phases {@link GCPhases}
          * @param headRevision the current head revision of node store
+         * @param rec {@link VersionGCRecommendations} to recommend GC 
operation
          */
         private void collectDetailedGarbage(final GCPhases phases, final 
RevisionVector headRevision, final VersionGCRecommendations rec)
                 throws IOException {
             int docsTraversed = 0;
             boolean foundDoc = true;
-            final long oldestModifiedDocTimeStamp = rec.scopeDetailedGC.fromMs;
+            final long oldestModifiedMs = rec.scopeDetailedGC.fromMs;
+            long oldModifiedMs = oldestModifiedMs;
             final String oldestModifiedDocId = rec.detailedGCId;
             try (DetailedGC gc = new DetailedGC(headRevision, monitor, 
cancel)) {
-                long fromModified = oldestModifiedDocTimeStamp;
+                long fromModified = oldestModifiedMs;
                 String fromId = oldestModifiedDocId;
                 NodeDocument lastDoc = null;
                 final long toModified = rec.scopeDetailedGC.toMs;
@@ -630,7 +634,9 @@ public class VersionGarbageCollector {
                     while (foundDoc && fromModified < toModified && 
docsTraversed <= PROGRESS_BATCH_SIZE) {
                         // set foundDoc to false to allow exiting the while 
loop
                         foundDoc = false;
+                        lastDoc = null;
                         Iterable<NodeDocument> itr = 
versionStore.getModifiedDocs(fromModified, toModified, 1000, fromId);
+                        final Revision revision = nodeStore.newRevision();
                         try {
                             for (NodeDocument doc : itr) {
                                 foundDoc = true;
@@ -649,7 +655,7 @@ public class VersionGarbageCollector {
                                 lastDoc = doc;
                                 // collect the data to delete in next step
                                 if (phases.start(GCPhase.COLLECTING)) {
-                                    gc.collectGarbage(doc, phases);
+                                    gc.collectGarbage(doc, phases, revision);
                                     phases.stop(GCPhase.COLLECTING);
                                 }
 
@@ -657,7 +663,7 @@ public class VersionGarbageCollector {
                                 if (modified == null) {
                                     monitor.warn("collectDetailGarbage : 
document has no _modified property : {}",
                                             doc.getId());
-                                } else if (SECONDS.toMillis(modified) < 
oldestModifiedDocTimeStamp) {
+                                } else if (SECONDS.toMillis(modified) < 
oldestModifiedMs) {
                                     monitor.warn(
                                             "collectDetailGarbage : document 
has older _modified than query boundary : {} (from: {}, to: {})",
                                             modified, fromModified, 
toModified);
@@ -669,26 +675,29 @@ public class VersionGarbageCollector {
                                 phases.stop(GCPhase.DETAILED_GC_CLEANUP);
                             }
                             if (lastDoc != null) {
-                                fromModified = lastDoc.getModified() == null ? 
oldestModifiedDocTimeStamp : SECONDS.toMillis(lastDoc.getModified());
+                                fromModified = lastDoc.getModified() == null ? 
oldModifiedMs : SECONDS.toMillis(lastDoc.getModified());
                                 fromId = lastDoc.getId();
                             }
                         } finally {
                             Utils.closeIfCloseable(itr);
                             phases.stats.oldestModifiedDocTimeStamp = 
fromModified;
-                            if (fromModified > (oldestModifiedDocTimeStamp + 
1)) {
+                            if (fromModified > (oldModifiedMs + 1)) {
                                 // we have moved ahead, now we can reset 
oldestModifiedId to min value
+                                fromId = MIN_ID_VALUE;
                                 phases.stats.oldestModifiedDocId = 
MIN_ID_VALUE;
                             } else {
                                 // there are still documents pending at oldest 
Modified timestamp,
                                 // save the last _id traversed to avoid 
re-fetching of ids
                                 phases.stats.oldestModifiedDocId = fromId;
                             }
+                            oldModifiedMs = fromModified - 1;
                         }
 
                         // if we are already at last document of current 
timeStamp,
-                        // we need to reset fromId and check again
+                        // we need to reset fromId & increment fromModified 
and check again
                         if (!foundDoc && !Objects.equals(fromId, 
MIN_ID_VALUE)) {
                             fromId = MIN_ID_VALUE;
+                            fromModified = fromModified + SECONDS.toMillis(5);
                             foundDoc = true; // to run while loop again
                         }
                     }
@@ -806,14 +815,14 @@ public class VersionGarbageCollector {
             this.timer = Stopwatch.createUnstarted();
         }
 
-        public void collectGarbage(final NodeDocument doc, final GCPhases 
phases) {
+        public void collectGarbage(final NodeDocument doc, final GCPhases 
phases, final Revision revision) {
 
             monitor.info("Collecting Detailed Garbage for doc [{}]", 
doc.getId());
 
             final UpdateOp op = new UpdateOp(requireNonNull(doc.getId()), 
false);
             op.equals(MODIFIED_IN_SECS, doc.getModified());
 
-            collectDeletedProperties(doc, phases, op);
+            collectDeletedProperties(doc, phases, op, revision);
             collectUnmergedBranchCommitDocument(doc, phases, op);
             collectOldRevisions(doc, phases, op);
             // only add if there are changes for this doc
@@ -836,7 +845,7 @@ public class VersionGarbageCollector {
 
         }
 
-        private void collectDeletedProperties(final NodeDocument doc, final 
GCPhases phases, final UpdateOp updateOp) {
+        private void collectDeletedProperties(final NodeDocument doc, final 
GCPhases phases, final UpdateOp updateOp, final Revision revision) {
 
             // get Map of all properties along with their values
             if (phases.start(GCPhase.COLLECT_PROPS)) {
@@ -855,6 +864,8 @@ public class VersionGarbageCollector {
                         .filter(p -> !retainPropSet.contains(p))
                         .mapToInt(x -> {
                             updateOp.remove(x);
+                            setModified(updateOp,revision);
+                            setDeleted(updateOp, revision, false);
                             return 1;})
                         .sum();
 
diff --git 
a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
 
b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
index 6ba8dd6f81..6d7382bac5 100644
--- 
a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
+++ 
b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
@@ -44,6 +44,7 @@ import static java.util.concurrent.TimeUnit.HOURS;
 import static java.util.concurrent.TimeUnit.MINUTES;
 import static org.apache.jackrabbit.oak.api.Type.STRING;
 import static org.apache.jackrabbit.oak.plugins.document.Collection.NODES;
+import static org.apache.jackrabbit.oak.plugins.document.Collection.SETTINGS;
 import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.MIN_ID_VALUE;
 import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.NUM_REVS_THRESHOLD;
 import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.PREV_SPLIT_FACTOR;
@@ -51,6 +52,9 @@ import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.SplitDocTy
 import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.setModified;
 import static org.apache.jackrabbit.oak.plugins.document.Revision.newRevision;
 import static org.apache.jackrabbit.oak.plugins.document.TestUtils.NO_BINARY;
+import static 
org.apache.jackrabbit.oak.plugins.document.VersionGarbageCollector.SETTINGS_COLLECTION_DETAILED_GC_DOCUMENT_ID_PROP;
+import static 
org.apache.jackrabbit.oak.plugins.document.VersionGarbageCollector.SETTINGS_COLLECTION_DETAILED_GC_TIMESTAMP_PROP;
+import static 
org.apache.jackrabbit.oak.plugins.document.VersionGarbageCollector.SETTINGS_COLLECTION_ID;
 import static 
org.apache.jackrabbit.oak.plugins.document.VersionGarbageCollector.VersionGCStats;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -288,7 +292,7 @@ public class VersionGarbageCollectorIT {
         stats = gc.gc(maxAge*2, HOURS);
         assertEquals(1, stats.deletedPropsGCCount);
         assertEquals(1, stats.updatedDetailedGCDocsCount);
-        assertEquals("1:/z", stats.oldestModifiedDocId);
+        assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId);
 
         //4. Check that a revived property (deleted and created again) does 
not get gc
         NodeBuilder b3 = store.getRoot().builder();
@@ -341,7 +345,7 @@ public class VersionGarbageCollectorIT {
         VersionGCStats stats = gc.gc(maxAge*2, HOURS);
         assertEquals(50_000, stats.deletedPropsGCCount);
         assertEquals(5_000, stats.updatedDetailedGCDocsCount);
-        assertNotEquals(MIN_ID_VALUE, stats.oldestModifiedDocId);
+        assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId);
     }
 
     @Test
@@ -387,6 +391,46 @@ public class VersionGarbageCollectorIT {
         assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId);
     }
 
+    @Test
+    public void 
testGC_WithNoDeletedProps_And_MoreThan_10_000_DocWithDifferentRevision() throws 
Exception {
+        //1. Create nodes with properties
+        NodeBuilder b1 = store.getRoot().builder();
+        for (int k = 0; k < 50; k ++) {
+            b1 = store.getRoot().builder();
+            // Add property to node & save
+            for (int i = 0; i < 500; i++) {
+                for (int j = 0; j < 10; j++) {
+                    b1.child(k + "z" + i).setProperty("prop" + j, "foo", 
STRING);
+                }
+            }
+            store.merge(b1, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+            // increase the clock to create new revision for next batch
+            clock.waitUntil(Revision.getCurrentTimestamp() + (k * 5));
+        }
+        store.merge(b1, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+
+        // enable the detailed gc flag
+        writeField(gc, "detailedGCEnabled", true, true);
+        long maxAge = 1; //hours
+        long delta = TimeUnit.MINUTES.toMillis(20);
+
+
+        store.runBackgroundOperations();
+        //3. Check that deleted property does get collected post maxAge
+        clock.waitUntil(clock.getTime() + HOURS.toMillis(maxAge*2) + delta);
+
+        for (int i = 0; i < 3 ; i++) {
+
+            VersionGCStats stats = gc.gc(maxAge, HOURS);
+            String oldestModifiedDocId = stats.oldestModifiedDocId;
+            long oldestModifiedDocTimeStamp = stats.oldestModifiedDocTimeStamp;
+
+            Document document = store.getDocumentStore().find(SETTINGS, 
SETTINGS_COLLECTION_ID);
+            
assertEquals(document.get(SETTINGS_COLLECTION_DETAILED_GC_TIMESTAMP_PROP), 
oldestModifiedDocTimeStamp);
+            
assertEquals(document.get(SETTINGS_COLLECTION_DETAILED_GC_DOCUMENT_ID_PROP), 
oldestModifiedDocId);
+        }
+    }
+
     @Test
     public void testGCDeletedPropsAlreadyGCed() throws Exception {
         //1. Create nodes with properties
@@ -420,7 +464,7 @@ public class VersionGarbageCollectorIT {
         stats = gc.gc(maxAge*2, HOURS);
         assertEquals(10, stats.deletedPropsGCCount);
         assertEquals(10, stats.updatedDetailedGCDocsCount);
-        assertNotEquals(MIN_ID_VALUE, stats.oldestModifiedDocId);
+        assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId);
 
         //3. now reCreate those properties again
         NodeBuilder b3 = store.getRoot().builder();
@@ -635,7 +679,7 @@ public class VersionGarbageCollectorIT {
         stats = gc.gc(maxAge*2, HOURS);
         assertEquals(0, stats.updatedDetailedGCDocsCount);
         assertEquals(0, stats.deletedPropsGCCount);
-        assertNotEquals(MIN_ID_VALUE, stats.oldestModifiedDocId);
+        assertEquals(MIN_ID_VALUE, stats.oldestModifiedDocId);
     }
 
     @Test

Reply via email to