(jackrabbit-oak) branch trunk updated: OAK-10742 : add support for defining include/exclude paths for fullGC (#1547)

stefanegli Thu, 20 Jun 2024 08:57:51 -0700

This is an automated email from the ASF dual-hosted git repository.

stefanegli pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git



The following commit(s) were added to refs/heads/trunk by this push:
     new fd8b860ea6 OAK-10742 : add support for defining include/exclude paths for fullGC (#1547)
fd8b860ea6 is described below

commit fd8b860ea69adb0d5cdaaef15141563cf8bf3f01
Author: stefan-egli <stefane...@apache.org>
AuthorDate: Thu Jun 20 17:57:34 2024 +0200

    OAK-10742 : add support for defining include/exclude paths for fullGC (#1547)
---
 .../oak/plugins/document/VersionGCSupport.java     |  13 ++-
 .../plugins/document/VersionGarbageCollector.java  |  27 ++++-
 .../document/mongo/MongoVersionGCSupport.java      |  60 ++++++++++-
 .../oak/plugins/document/util/Utils.java           | 111 ++++++++++++++++++++-
 .../oak/plugins/document/VersionGCSupportTest.java |  18 ++--
 .../document/VersionGarbageCollectorIT.java        | 111 ++++++++++++++++++++-
 6 files changed, 319 insertions(+), 21 deletions(-)

diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java
index b938da536a..0c320e0e54 100644
--- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java
+++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java
@@ -34,6 +34,7 @@ import static 
org.apache.jackrabbit.oak.plugins.document.NodeDocument.getModifie
 import static 
org.apache.jackrabbit.oak.plugins.document.util.Utils.getAllDocuments;
 import static 
org.apache.jackrabbit.oak.plugins.document.util.Utils.getSelectedDocuments;
 
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
@@ -98,12 +99,16 @@ public class VersionGCSupport {
      * @return matching documents.
      */
     public Iterable<NodeDocument> getModifiedDocs(final long fromModified, 
final long toModified, final int limit,
-                                                  @NotNull final String 
fromId) {
+                                                  @NotNull final String fromId,
+                                                  @NotNull final Set<String> 
includePaths,
+                                                  @NotNull final Set<String> 
excludePaths) {
         // (_modified = fromModified && _id > fromId || _modified > 
fromModified && _modified < toModified)
-        final Stream<NodeDocument> s1 = stream(getSelectedDocuments(store, 
MODIFIED_IN_SECS, 1, fromId).spliterator(), false)
+        final Stream<NodeDocument> s1 = stream(getSelectedDocuments(store,
+                MODIFIED_IN_SECS, 1, fromId, includePaths, 
excludePaths).spliterator(), false)
                 .filter(input -> modifiedEqualsTo(input, fromModified));
 
-        final Stream<NodeDocument> s2 = stream(getSelectedDocuments(store, 
MODIFIED_IN_SECS, 1).spliterator(), false)
+        final Stream<NodeDocument> s2 = stream(getSelectedDocuments(store,
+                MODIFIED_IN_SECS, 1, includePaths, 
excludePaths).spliterator(), false)
                 .filter(input -> modifiedGreaterThan(input, fromModified) && 
modifiedLessThan(input, toModified));
 
         return concat(s1, s2)
@@ -222,7 +227,7 @@ public class VersionGCSupport {
         long now = clock.getTime();
         Iterable<NodeDocument> docs = null;
         try {
-            docs = getModifiedDocs(0, now, 1, MIN_ID_VALUE);
+            docs = getModifiedDocs(0, now, 1, MIN_ID_VALUE, 
Collections.emptySet(), Collections.emptySet());
             if (docs.iterator().hasNext()) {
                 final NodeDocument oldestModifiedDoc = docs.iterator().next();
                 LOG.info("Oldest modified document is {}", oldestModifiedDoc);
diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
index c26e433c0e..33ebdb2ed2 100644
--- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
+++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
@@ -216,6 +216,8 @@ public class VersionGarbageCollector {
     private final boolean fullGCEnabled;
     private final boolean isFullGCDryRun;
     private final boolean embeddedVerification;
+    private Set<String> fullGCIncludePaths = Collections.emptySet();
+    private Set<String> fullGCExcludePaths = Collections.emptySet();
     private final VersionGCSupport versionStore;
     private final AtomicReference<GCJob> collector = newReference();
     private VersionGCOptions options;
@@ -238,6 +240,20 @@ public class VersionGarbageCollector {
         AUDIT_LOG.info("<init> VersionGarbageCollector created with fullGcMode 
= {}", fullGcMode);
     }
 
+    /**
+     * Sets the path prefixes that fullGC should include and exclude.
+     * Please note that at the moment the includes do not take long paths
+     * into account. That is, if a long path was supposed to be included
+     * via an include, it is not. Reason for this is that long paths would
+     * require the mongo query to include a '_path' condition - which
+     * disallows mongo from using the '_modified_id' index. IOW long paths
+     * would result in full scans - which results in bad performance.
+     */
+    void setFullGCPaths(@NotNull Set<String> includes, @NotNull Set<String> 
excludes) {
+        this.fullGCIncludePaths = requireNonNull(includes);
+        this.fullGCExcludePaths = requireNonNull(excludes);
+    }
+
     void setStatisticsProvider(StatisticsProvider provider) {
         this.gcStats = new RevisionGCStats(provider);
         this.fullGCStats = new FullGCStatsCollectorImpl(provider);
@@ -833,7 +849,7 @@ public class VersionGarbageCollector {
                         if (log.isDebugEnabled()) {
                             log.debug("Fetching docs from [{}] to [{}] with Id 
starting from [{}]", timestampToString(fromModifiedMs), 
timestampToString(toModifiedMs), fromId);
                         }
-                        Iterable<NodeDocument> itr = 
versionStore.getModifiedDocs(fromModifiedMs, toModifiedMs, FULL_GC_BATCH_SIZE, 
fromId);
+                        Iterable<NodeDocument> itr = 
versionStore.getModifiedDocs(fromModifiedMs, toModifiedMs, FULL_GC_BATCH_SIZE, 
fromId, fullGCIncludePaths, fullGCExcludePaths);
                         try {
                             for (NodeDocument doc : itr) {
                                 foundDoc = true;
@@ -851,7 +867,14 @@ public class VersionGarbageCollector {
                                 lastDoc = doc;
                                 // collect the data to delete in next step
                                 if 
(phases.start(GCPhase.FULL_GC_COLLECT_GARBAGE)) {
-                                    gc.collectGarbage(doc, phases);
+                                    if (Utils.isIncluded(doc.getPath(), 
Collections.emptySet(), fullGCExcludePaths)) {
+                                        gc.collectGarbage(doc, phases);
+                                    } else {
+                                        // MongoVersionGCSupport takes long paths into consideration
+                                        // for neither includes nor excludes. If isIncluded returns false here,
+                                        // that can only be due to an excluded long path,
+                                        // in which case we can actually honor that and skip this document.
+                                    }
                                     
phases.stop(GCPhase.FULL_GC_COLLECT_GARBAGE);
                                 }
 
diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java
index 67751d912a..060c105689 100644
--- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java
+++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java
@@ -22,6 +22,7 @@ package org.apache.jackrabbit.oak.plugins.document.mongo;
 import static com.mongodb.client.model.Filters.eq;
 import static com.mongodb.client.model.Filters.exists;
 import static com.mongodb.client.model.Filters.gt;
+import static com.mongodb.client.model.Filters.not;
 import static com.mongodb.client.model.Filters.or;
 import static com.mongodb.client.model.Projections.include;
 import static java.util.Optional.empty;
@@ -132,6 +133,56 @@ public class MongoVersionGCSupport extends 
VersionGCSupport {
                 input -> store.convertFromDBObject(NODES, input)));
     }
 
+    /**
+     * Calculate the bson representing including only the provided
+     * include path prefixes and/or excluding the provided
+     * exclude path prefixes - if any are provided - AND the provided
+     * query.
+     * Please note that at the moment the includes do not
+     * take long paths into account. That is, if a long path was
+     * supposed to be included via an include, it is not.
+     * Reason for this is that long paths would require
+     * the mongo query to include a '_path' condition - which disallows
+     * mongo from using the '_modified_id' index. IOW long paths
+     * would result in full scans - which results in bad performance.
+     * @param includes set of path prefixes which should only be considered
+     * @param excludes set of path prefixes which should be excluded.
+     * if these overlap with includes, then exclude has precedence.
+     * @param query the query with which to do an AND
+     * @return the combined bson with include/exclude path prefixes
+     * AND the provided query
+     */
+    private Bson withIncludeExcludes(Set<String> includes, Set<String> 
excludes, Bson query) {
+        Bson inclExcl = null;
+        if (includes != null && !includes.isEmpty()) {
+            final List<Bson> ors = new ArrayList<>(includes.size());
+            for (String incl : includes) {
+                ors.add(Filters.regex(ID, ":" + incl));
+            }
+            inclExcl = or(ors);
+        }
+        if (excludes != null && !excludes.isEmpty()) {
+            final List<Bson> ands = new ArrayList<>(excludes.size());
+            for (String excl : excludes) {
+                ands.add(not(Filters.regex(ID, ":" + excl)));
+            }
+            if (inclExcl != null) {
+                ands.add(inclExcl);
+            }
+            inclExcl = and(ands);
+        }
+        if (inclExcl == null) {
+            // if no include or exclude path prefixes are defined,
+            // then everything is included - i.e. we fall back to
+            // just the provided query
+            return query;
+        } else {
+            // if there are include or exclude path prefixes,
+            // then add them via AND
+            return and(inclExcl, query);
+        }
+    }
+
     /**
      * Returns documents that have a {@link NodeDocument#MODIFIED_IN_SECS} 
value
      * within the given range and are greater than given @{@link 
NodeDocument#ID}.
@@ -150,11 +201,14 @@ public class MongoVersionGCSupport extends 
VersionGCSupport {
      */
     @Override
     public Iterable<NodeDocument> getModifiedDocs(final long fromModified, 
final long toModified, final int limit,
-                                                  @NotNull final String 
fromId) {
+                                                  @NotNull final String fromId,
+                                                  Set<String> 
includedPathPrefixes, Set<String> excludedPathPrefixes) {
         // (_modified = fromModified && _id > fromId || _modified > 
fromModified && _modified < toModified)
         final Bson query = or(
-                and(eq(MODIFIED_IN_SECS, getModifiedInSecs(fromModified)), 
gt(ID, fromId)),
-                and(gt(MODIFIED_IN_SECS, getModifiedInSecs(fromModified)), 
lt(MODIFIED_IN_SECS, getModifiedInSecs(toModified))));
+                withIncludeExcludes(includedPathPrefixes, excludedPathPrefixes,
+                        and(eq(MODIFIED_IN_SECS, 
getModifiedInSecs(fromModified)), gt(ID, fromId))),
+                withIncludeExcludes(includedPathPrefixes, excludedPathPrefixes,
+                        and(gt(MODIFIED_IN_SECS, 
getModifiedInSecs(fromModified)), lt(MODIFIED_IN_SECS, 
getModifiedInSecs(toModified)))));
 
         // first sort by _modified and then by _id
         final Bson sort = and(eq(MODIFIED_IN_SECS, 1), eq(ID, 1));
diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/Utils.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/Utils.java
index f4ba7c245c..53ab10cc25 100644
--- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/Utils.java
+++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/Utils.java
@@ -26,6 +26,7 @@ import java.sql.Timestamp;
 import java.time.Instant;
 import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.Date;
 import java.util.Iterator;
@@ -54,6 +55,7 @@ import org.apache.jackrabbit.oak.plugins.document.Path;
 import org.apache.jackrabbit.oak.plugins.document.Revision;
 import org.apache.jackrabbit.oak.plugins.document.RevisionVector;
 import org.apache.jackrabbit.oak.plugins.document.StableRevisionComparator;
+import org.apache.jackrabbit.oak.plugins.document.mongo.MongoDocumentStore;
 import org.apache.jackrabbit.oak.spi.toggle.Feature;
 import org.apache.jackrabbit.oak.stats.Clock;
 import org.jetbrains.annotations.NotNull;
@@ -806,6 +808,17 @@ public class Utils {
         return internalGetSelectedDocuments(store, indexedProperty, 
startValue, MIN_ID_VALUE, DEFAULT_BATCH_SIZE);
     }
 
+    /**
+     * Like {@link #getSelectedDocuments(DocumentStore, String, long, int)} with a default
+     * {@code batchSize}, returning only documents whose path passes the include/exclude path prefixes.
+     */
+    public static Iterable<NodeDocument> getSelectedDocuments(
+            DocumentStore store, String indexedProperty, long startValue,
+            @NotNull final Set<String> includePaths, @NotNull final 
Set<String> excludePaths) {
+        return internalGetSelectedDocuments(store, indexedProperty, startValue,
+                MIN_ID_VALUE, includePaths, excludePaths, DEFAULT_BATCH_SIZE);
+    }
+
     /**
      * Like {@link #getSelectedDocuments(DocumentStore, String, long, int)} 
with
      * a default {@code batchSize}.
@@ -815,12 +828,89 @@ public class Utils {
         return internalGetSelectedDocuments(store, indexedProperty, 
startValue, fromId, DEFAULT_BATCH_SIZE);
     }
 
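+    /**
+     * Like {@link #getSelectedDocuments(DocumentStore, String, long, int)} with a default
+     * {@code batchSize}, taking a {@code fromId} and returning only documents whose path passes the include/exclude path prefixes.
+     */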
+    public static Iterable<NodeDocument> getSelectedDocuments(
+            DocumentStore store, String indexedProperty, long startValue, 
String fromId,
+            @NotNull final Set<String> includePaths, @NotNull final 
Set<String> excludePaths) {
+        return internalGetSelectedDocuments(store, indexedProperty, 
startValue, fromId,
+                includePaths, excludePaths, DEFAULT_BATCH_SIZE);
+    }
+
+    /**
+     * Default implementation for applying include/exclude path prefixes
+     * client-side, meaning the query to the DocumentStore searches for all
+     * documents and include/excludes are then filtered after receiving that
+     * query.
+     * This variant is obviously not intended for production use, as client
+     * side filtering is slow. Hence this is only used for testing for
+     * any non-MongoDocumentStore. It should not be enabled in production,
+     * unless this performance hit here is understood and accepted.
+     * @param path the path for which to evaluate the include/excludes
+     * @param includes set of path prefixes which should only be considered
+     * @param excludes set of path prefixes which should be excluded.
+     * if these overlap with includes, then exclude has precedence.
+     * @return whether the provided path is included or not
+     */
+    public static boolean isIncluded(Path path, @NotNull Set<String> includes,
+            @NotNull Set<String> excludes) {
+        // check first if includes/excludes are empty
+        if (includes.isEmpty() && excludes.isEmpty()) {
+            return true;
+        }
+        String s = path.toString();
+        // then check excludes first
+        for (String anExclude : excludes) {
+            if (s.startsWith(anExclude)) {
+                // if there is an exclude matching the path
+                // we need to definitely exclude it,
+                // no matter whether there is even an include
+                // for it or not
+                return false;
+            }
+        }
+        if (includes.isEmpty()) {
+            // if we have no includes defined at all, and it
+            // was not excluded, then it is an include
+            return true;
+        }
+        // then the includes
+        for (String anInclude : includes) {
+            if (s.startsWith(anInclude)) {
+                // if we have a matching include, and given
+                // it was not excluded above, then this is
+                // an include
+                return true;
+            }
+        }
+        // if we have any includes defined, but none of
+        // them matched so far, then this is an exclude
+        return false;
+    }
+
+    private static Iterable<NodeDocument> internalGetSelectedDocuments(
+            final DocumentStore store, final String indexedProperty,
+            final long startValue, String fromId,
+            final int batchSize) {
+        return internalGetSelectedDocuments(store, indexedProperty, 
startValue, fromId,
+                Collections.emptySet(), Collections.emptySet(), batchSize);
+    }
+
     private static Iterable<NodeDocument> internalGetSelectedDocuments(
             final DocumentStore store, final String indexedProperty,
-            final long startValue, String fromId, final int batchSize) {
+            final long startValue, String fromId,
+            @NotNull final Set<String> includePaths,
+            @NotNull final Set<String> excludePaths,
+            final int batchSize) {
         if (batchSize < 2) {
             throw new IllegalArgumentException("batchSize must be > 1");
         }
+        if ((store instanceof MongoDocumentStore)
+                && (!includePaths.isEmpty() || !excludePaths.isEmpty())) {
+            throw new IllegalArgumentException("cannot use with 
MongoDocumentStore");
+        }
         return new Iterable<NodeDocument>() {
             @Override
             public Iterator<NodeDocument> iterator() {
@@ -832,6 +922,25 @@ public class Utils {
 
                     @Override
                     protected NodeDocument computeNext() {
+                        do {
+                            final NodeDocument n = doComputeNext();
+                            if (n == null) {
+                                return null;
+                            }
+                            if (isIncluded(n.getPath(), includePaths, 
excludePaths)) {
+                                return n;
+                            }
+                            // else repeat
+                            // note that this loop is potentially dangerous,
+                            // depending on the include/exclude definition.
+                            // that's why currently this variant is not 
supported
+                            // against MongoDocumentStore. I.e. it is only used
+                            // for unit testing. FullGC for RDBDocumentStore
+                            // is not supported at all.
+                        } while(true);
+                    }
+
+                    private NodeDocument doComputeNext() {
                         // read next batch if necessary
                         if (!batch.hasNext()) {
                             batch = nextBatch();
diff --git a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java
index eb848fe4cc..26f5759854 100644
--- a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java
+++ b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java
@@ -19,7 +19,9 @@
 package org.apache.jackrabbit.oak.plugins.document;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.Set;
 
 import com.mongodb.ReadPreference;
 
@@ -59,6 +61,8 @@ import static org.junit.Assert.assertTrue;
 @RunWith(Parameterized.class)
 public class VersionGCSupportTest {
 
+    private static final Set<String> EMPTY_STRING_SET = Collections.emptySet();
+
     private final DocumentStoreFixture fixture;
     private final DocumentStore store;
     private final VersionGCSupport gcSupport;
@@ -278,7 +282,7 @@ public class VersionGCSupportTest {
         oldestModifiedDocId = MIN_ID_VALUE;
 
         for(int i = 0; i < 5; i++) {
-            Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId);
+            Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId, EMPTY_STRING_SET, EMPTY_STRING_SET);
             assertTrue(isInOrder(modifiedDocs, (o1, o2) -> 
comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, 
o2)));
             long count = stream(modifiedDocs.spliterator(), false).count();
             assertEquals(1000, count);
@@ -313,7 +317,7 @@ public class VersionGCSupportTest {
         oldestModifiedDocId = MIN_ID_VALUE;
 
         for(int i = 0; i < 5; i++) {
-            Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId);
+            Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId, EMPTY_STRING_SET, EMPTY_STRING_SET);
             assertTrue(isInOrder(modifiedDocs, (o1, o2) -> 
comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, 
o2)));
             long count = stream(modifiedDocs.spliterator(), false).count();
             assertEquals(1000, count);
@@ -325,7 +329,7 @@ public class VersionGCSupportTest {
         }
 
         // fetch last remaining document now
-        Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId);
+        Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId, EMPTY_STRING_SET, EMPTY_STRING_SET);
         assertEquals(1, stream(modifiedDocs.spliterator(), false).count());
         assertTrue(isInOrder(modifiedDocs, (o1, o2) -> 
comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, 
o2)));
         oldestModifiedDoc = modifiedDocs.iterator().next();
@@ -333,7 +337,7 @@ public class VersionGCSupportTest {
         oldestModifiedDocTs = 
ofNullable(oldestModifiedDoc.getModified()).orElse(0L);
 
         // all documents had been fetched, now we won't get any document
-        modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId);
+        modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId, EMPTY_STRING_SET, EMPTY_STRING_SET);
         assertEquals(0, stream(modifiedDocs.spliterator(), false).count());
 
     }
@@ -360,7 +364,7 @@ public class VersionGCSupportTest {
         store.create(NODES, updateOps);
 
         for(int i = 0; i < 5; i++) {
-            Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId);
+            Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId, EMPTY_STRING_SET, EMPTY_STRING_SET);
             assertTrue(isInOrder(modifiedDocs, (o1, o2) -> 
comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, 
o2)));
             long count = stream(modifiedDocs.spliterator(), false).count();
             assertEquals(1000, count);
@@ -372,7 +376,7 @@ public class VersionGCSupportTest {
         }
 
         // all documents had been fetched, now we won't get any document
-        Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId);
+        Iterable<NodeDocument> modifiedDocs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 
1000, oldestModifiedDocId, EMPTY_STRING_SET, EMPTY_STRING_SET);
         assertEquals(0, stream(modifiedDocs.spliterator(), false).count());
     }
 
@@ -382,7 +386,7 @@ public class VersionGCSupportTest {
     }
 
     private void assertModified(long fromSeconds, long toSeconds, long num) {
-        Iterable<NodeDocument> docs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(fromSeconds), 
SECONDS.toMillis(toSeconds), 10, MIN_ID_VALUE);
+        Iterable<NodeDocument> docs = 
gcSupport.getModifiedDocs(SECONDS.toMillis(fromSeconds), 
SECONDS.toMillis(toSeconds), 10, MIN_ID_VALUE, EMPTY_STRING_SET, 
EMPTY_STRING_SET);
         assertEquals(num, stream(docs.spliterator(), false).count());
         assertTrue(isInOrder(docs, (o1, o2) -> 
comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, 
o2)));
     }
diff --git a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
index 2dfbbab2f4..3566925c0d 100644
--- a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
+++ b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
@@ -21,6 +21,7 @@ package org.apache.jackrabbit.oak.plugins.document;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
@@ -536,6 +537,103 @@ public class VersionGarbageCollectorIT {
         assertTrue(stats.canceled);
     }
 
+    @Test
+    public void testGCDeletedLongPathPropsInclExcl_excludes() throws Exception 
{
+        String longName = repeat("p", PATH_LONG + 1);
+        createEmptyProps("/a/b/" + longName + "/x", "/b/c/" + longName + "/x",
+                "/c/d/" + longName + "/x");
+        setGCIncludeExcludes(Sets.newHashSet(), Sets.newHashSet("/b/c", "/c"));
+        doTestDeletedPropsGC(1, 1);
+    }
+
+    @Test
+    public void testGCDeletedPropsInclExcl_oneInclude() throws Exception {
+        createEmptyProps("/a/b/c", "/b/c/d", "/c/d/e");
+        setGCIncludeExcludes(Sets.newHashSet("/a"), Sets.newHashSet());
+        doTestDeletedPropsGC(1, 1);
+    }
+
+    @Test
+    public void testGCDeletedPropsInclExcl_twoIncludes() throws Exception {
+        createEmptyProps("/a/b/c", "/b/c/d", "/c/d/e");
+        setGCIncludeExcludes(Sets.newHashSet("/a", "/c"), Sets.newHashSet());
+        doTestDeletedPropsGC(2, 2);
+    }
+
+    @Test
+    public void testGCDeletedPropsInclExcl_inclAndExcl() throws Exception {
+        createEmptyProps("/a/b/c", "/b/c/d", "/c/d/e");
+        setGCIncludeExcludes(Sets.newHashSet("/a", "/c"), 
Sets.newHashSet("/c/d"));
+        doTestDeletedPropsGC(1, 1);
+    }
+
+    @Test
+    public void testGCDeletedPropsInclExcl_excludes() throws Exception {
+        createEmptyProps("/a/b/c", "/b/c/d", "/c/d/e");
+        setGCIncludeExcludes(Sets.newHashSet(), Sets.newHashSet("/b", "/c"));
+        doTestDeletedPropsGC(1, 1);
+    }
+
+    @Test
+    public void testGCDeletedPropsInclExcl_emptyEmpty() throws Exception {
+        createEmptyProps("/a/b/c", "/b/c/d", "/c/d/e");
+        setGCIncludeExcludes(Collections.emptySet(), Collections.emptySet());
+        doTestDeletedPropsGC(3, 3);
+    }
+
+    private void setGCIncludeExcludes(Set<String> includes, Set<String> 
excludes) {
+        gc.setFullGCPaths(requireNonNull(includes), requireNonNull(excludes));
+    }
+
+    private void doTestDeletedPropsGC(int deletedPropsCount, int 
updatedDocsCount)
+            throws Exception {
+        // enable the full gc flag
+        writeField(gc, "fullGCEnabled", true, true);
+        long maxAge = 1; //hours
+        clock.waitUntil(getCurrentTimestamp() + 
TimeUnit.HOURS.toMillis(maxAge));
+        VersionGCStats stats = gc(gc, maxAge, HOURS);
+        assertStatsCountsEqual(stats,
+                gapOrphOnly(),
+                gapOrphProp(0, deletedPropsCount, 0, 0, 0, 0, 
updatedDocsCount),
+                allOrphProp(0, deletedPropsCount, 0, 0, 0, 0, 
updatedDocsCount),
+                keepOneFull(0, deletedPropsCount, 0, 0, 0, 0, 
updatedDocsCount),
+                keepOneUser(0, deletedPropsCount, 0, 0, 0, 0, 
updatedDocsCount),
+                unmergedBcs(0, deletedPropsCount, 0, 0, 0, 0, 
updatedDocsCount),
+                betweenChkp(0, deletedPropsCount, 0, 0, 3, 0, 
updatedDocsCount),
+                btwnChkpUBC(0, deletedPropsCount, 0, 0, 3, 0, 
updatedDocsCount));
+    }
+
+    /**
+     * Utility method to create empty properties, meaning they
+     * are created, then removed again. That leaves them
+     * as not existing properties, except they still exist
+     * in the document. FullGC can then remove them after
+     * a certain amount of time
+     */
+    private void createEmptyProps(String... paths) throws 
CommitFailedException {
+        // 1. create nodes with properties
+        NodeBuilder rb1 = store1.getRoot().builder();
+        for (String path : paths) {
+            NodeBuilder b1 = rb1;
+            for (String name : Path.fromString(path).elements()) {
+                b1 = b1.child(name);
+            }
+            b1.setProperty("foo", "bar", STRING);
+        }
+        store1.merge(rb1, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+        // 2. refresh the builder
+        rb1 = store1.getRoot().builder();
+        // 3. empty the properties
+        for (String path : paths) {
+            NodeBuilder b1 = rb1;
+            for (String name : Path.fromString(path).elements()) {
+                b1 = b1.child(name);
+            }
+            b1.removeProperty("foo");
+        }
+        store1.merge(rb1, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+    }
+
     @Test
     public void testGCDeletedProps() throws Exception {
         //1. Create nodes with properties
@@ -1276,8 +1374,11 @@ public class VersionGarbageCollectorIT {
         VersionGCSupport gcSupport = new 
VersionGCSupport(store1.getDocumentStore()) {
 
             @Override
-            public Iterable<NodeDocument> getModifiedDocs(long fromModified, 
long toModified, int limit, @NotNull String fromId) {
-                Iterable<NodeDocument> modifiedDocs = 
super.getModifiedDocs(fromModified, toModified, limit, fromId);
+            public Iterable<NodeDocument> getModifiedDocs(long fromModified,
+                    long toModified, int limit, @NotNull String fromId,
+                    @NotNull final Set<String> includePaths, @NotNull final 
Set<String> excludePaths) {
+                Iterable<NodeDocument> modifiedDocs = 
super.getModifiedDocs(fromModified,
+                        toModified, limit, fromId, includePaths, excludePaths);
                 List<NodeDocument> result = stream(modifiedDocs.spliterator(), 
false).collect(toList());
                 final Revision updateRev = newRevision(1);
                 store1.getDocumentStore().findAndUpdate(NODES, 
stream(modifiedDocs.spliterator(), false)
@@ -1336,7 +1437,8 @@ public class VersionGarbageCollectorIT {
         final VersionGCSupport gcSupport = new 
VersionGCSupport(store1.getDocumentStore()) {
 
             @Override
-            public Iterable<NodeDocument> getModifiedDocs(long fromModified, 
long toModified, int limit, @NotNull String fromId) {
+            public Iterable<NodeDocument> getModifiedDocs(long fromModified, 
long toModified, int limit, @NotNull String fromId, final Set<String> 
includePaths,
+                    final Set<String> excludePaths) {
                 return () -> new AbstractIterator<>() {
                     private final Iterator<NodeDocument> it = 
candidates(fromModified, toModified, limit, fromId);
 
@@ -1353,7 +1455,8 @@ public class VersionGarbageCollectorIT {
             }
 
             private Iterator<NodeDocument> candidates(long fromModified, long 
toModified, int limit, @NotNull String fromId) {
-                return super.getModifiedDocs(fromModified, toModified, limit, 
fromId).iterator();
+                return super.getModifiedDocs(fromModified, toModified, limit, 
fromId,
+                        Collections.emptySet(), 
Collections.emptySet()).iterator();
             }
         };

(jackrabbit-oak) branch trunk updated: OAK-10742 : add support for defining include/exclude paths for fullGC (#1547)

Reply via email to