stefan-egli commented on code in PR #1372:
URL: https://github.com/apache/jackrabbit-oak/pull/1372#discussion_r1537690222


##########
oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java:
##########
@@ -1246,6 +1349,256 @@ private void 
collectRevisionsOlderThan24hAndBetweenCheckpoints(final NodeDocumen
             }
         }
 
+        /**
+         * Remove all property revisions in the local document that are no 
longer used.
+         * This includes bundled properties. It also includes related entries 
that
+         * become obsolete as a result - i.e. _commitRoot and _bc (though the 
latter
+         * is never removed on root)
+         */
+        private void collectUnusedPropertyRevisions(final NodeDocument doc,
+                final GCPhases phases, final UpdateOp updateOp,
+                final DocumentNodeState traversedMainNode,
+                final boolean ignoreInternalProperties) {
+
+            if (!phases.start(GCPhase.DETAILED_GC_COLLECT_OLD_REVS)){
+                // cancelled
+                return;
+            }
+            final Set<Revision> allKeepRevs = new HashSet<>();
+            // phase A : collectUnusedUserPropertyRevisions
+            int deletedTotalRevsCount = 
collectUnusedUserPropertyRevisions(doc, phases, updateOp, traversedMainNode, 
allKeepRevs);
+            int deletedUserRevsCount = deletedTotalRevsCount;
+            // phase B : collectUnusedInternalPropertyRevisions
+            if (!ignoreInternalProperties) {
+                deletedTotalRevsCount = 
collectUnusedInternalPropertyRevisions(doc, phases, updateOp, 
traversedMainNode, allKeepRevs, deletedTotalRevsCount);
+            }
+
+            // then some accounting...
+            int deletedInternalRevsCount = deletedTotalRevsCount - 
deletedUserRevsCount;
+            if (deletedUserRevsCount != 0) {
+                deletedPropRevsCountMap.merge(doc.getId(), 
deletedUserRevsCount, Integer::sum);
+            }
+            if (deletedInternalRevsCount != 0) {
+                deletedInternalPropRevsCountMap.merge(doc.getId(), 
deletedInternalRevsCount, Integer::sum);
+            }
+            phases.stop(GCPhase.DETAILED_GC_COLLECT_OLD_REVS);
+        }
+
+        private int collectUnusedUserPropertyRevisions(final NodeDocument doc,
+                final GCPhases phases, final UpdateOp updateOp,
+                final DocumentNodeState traversedMainNode,
+                final Set<Revision> allKeepRevs) {
+            int deletedRevsCount = 0;
+
+            // phase 1 : non bundled nodes only
+            for (PropertyState prop : traversedMainNode.getProperties()) {
+                final String escapedPropName = 
Utils.escapePropertyName(prop.getName());
+                deletedRevsCount += removeUnusedPropertyEntries(doc, 
traversedMainNode, updateOp, escapedPropName,
+                        (r) -> { updateOp.removeMapEntry(escapedPropName, r); 
return null;},
+                        allKeepRevs);
+            }
+
+            // phase 2 : bundled nodes only
+            final Map<Path, DocumentNodeState> bundledNodeStates = new 
HashMap<>();
+            for (DocumentNodeState dns : 
traversedMainNode.getAllBundledNodesStates()) {
+                bundledNodeStates.put(dns.getPath(), dns);
+            }
+            // remember that getAllBundledProperties returns unescaped keys
+            for (String propName : 
traversedMainNode.getAllBundledProperties().keySet()) {
+                final int lastSlash = propName.lastIndexOf("/");
+                if (lastSlash == -1) {
+                    // then it is an unbundled property which was already 
handled in phase 1
+                    continue;
+                }
+                final String escapedPropName = 
Utils.escapePropertyName(propName);
+                // bundled values are of format sub/tree/propertyKey
+                // to extract this we need the last index of "/"
+                final String unbundledSubtreeName = propName.substring(0, 
lastSlash);
+                final String unbundledPropName = propName.substring(lastSlash 
+ 1);
+                final String unbundledPath = 
traversedMainNode.getPath().toString() + "/" + unbundledSubtreeName;
+                final DocumentNodeState traversedNode = 
bundledNodeStates.get(Path.fromString(unbundledPath));
+                if (traversedNode == null) {
+                    log.error("collectUnusedPropertyRevisions : could not find 
traversed node for bundled key {} unbundledPath {} in doc {}",
+                            propName, unbundledPath, doc.getId());
+                    continue;
+                }
+                final PropertyState traversedProperty = 
traversedNode.getProperty(unbundledPropName);
+                if (traversedProperty == null) {
+                    log.error("collectUnusedPropertyRevisions : could not get 
property {} from traversed node {}",
+                            unbundledPropName, traversedNode.getPath());
+                    continue;
+                }
+                deletedRevsCount += removeUnusedPropertyEntries(doc, 
traversedNode, updateOp, escapedPropName,
+                        (r) -> { updateOp.removeMapEntry(escapedPropName, r); 
return null;},
+                        allKeepRevs);
+            }
+
+            // phase 3 : "_deleted"
+            int numDeleted = removeUnusedPropertyEntries(doc, 
traversedMainNode,
+                    updateOp, NodeDocument.DELETED,
+                    (r) -> { NodeDocument.removeDeleted(updateOp, r); return 
null;} ,
+                    allKeepRevs);
+            deletedRevsCount += numDeleted;
+            return deletedRevsCount;
+        }
+
+        private int collectUnusedInternalPropertyRevisions(final NodeDocument 
doc,
+                final GCPhases phases, final UpdateOp updateOp,
+                final DocumentNodeState traversedMainNode,
+                final Set<Revision> toKeepUserPropRevs,
+                int deletedRevsCount) {
+            boolean hasUnmergedBranchCommits = false;
+            for (Revision localBc : doc.getLocalBranchCommits()) {
+                if (!isCommitted(nodeStore.getCommitValue(localBc, doc))) {
+                    hasUnmergedBranchCommits = true;
+                }
+            }
+            if (deletedRevsCount == 0 && !hasUnmergedBranchCommits) {
+                return deletedRevsCount;
+            }
+            // if we did some rev deletion OR there are unmerged BCs, then 
let's deep-dive
+            Set<Revision> allRequiredRevs = new HashSet<>(toKeepUserPropRevs);
+            for (Revision revision : 
doc.getLocalMap(NodeDocument.COLLISIONS).keySet()) {
+                if (!allRequiredRevs.contains(revision)) {
+                    Operation has = updateOp.getChanges().get(new 
Key(NodeDocument.COLLISIONS, revision));
+                    if (has != null) {
+                        // then skip
+                        continue;
+                    }
+                    NodeDocument.removeCollision(updateOp, revision);
+                    deletedRevsCount++;
+                }
+            }
+            // "_revisions"
+            for (Entry<Revision, String> e : 
doc.getLocalRevisions().entrySet()) {
+                Revision revision = e.getKey();
+                if (allRequiredRevs.contains(revision)) {
+                    // if it is still referenced locally, keep it
+                    continue;
+                }
+                final boolean isRoot = 
doc.getId().equals(Utils.getIdFromPath(Path.ROOT));
+                // local bcs only considered for removal
+                final boolean isBC = 
!doc.getLocalBranchCommits().contains(revision);
+                final boolean newerThanSweep = 
nodeStore.getSweepRevisions().isRevisionNewer(revision);
+                if (newerThanSweep) {
+                    //TODO wondering if we should at all do any DGC on 
documents newer than sweep
+                    if (isBC) {
+                        // analysed further down, we can remove them if 
unmerged
+                    } else {
+                        // for normal commits, we need to keep them -> also 
add to allRequiredRevs
+                        allRequiredRevs.add(revision);
+                        continue;
+                    }
+                }
+                // committed revisions are hard to delete, as determining
+                // whether or not they are orphaned is difficult.
+                // child nodes could be having _commitRoots pointing to this 
doc
+                // and this revision - in which case it wouldn't be orphaned.
+                // without scanning the children (or having knowledge about
+                // which revisions are referenced by children) it is unsafe
+                // to delete committed revisions.
+                // if they are uncommitted, then they are garbage, as then they
+                // are either normal commits (thus a not finished commit)
+                // or an unmerged branch commit. In both cases they are 
garbage.
+                boolean isCommitted = 
isCommitted(nodeStore.getCommitValue(revision, doc));
+                if (isCommitted) {
+                    if (isRoot) {
+                        // root : cannot remove since it could be referenced 
and required by a child
+                        // also add to allRequiredRevs
+                        allRequiredRevs.add(revision);
+                        continue;
+                    } else if (!isBC) {
+                        // non root and normal : cannot remove, same as above,
+                        // it could be referenced and required by a child 
+                        // also add to allRequiredRevs
+                        allRequiredRevs.add(revision);
+                        continue;
+                    } else {
+                        // non root and bc : can remove, as non root bc cannot 
be referenced by child
+                    }
+                }
+                Operation has = updateOp.getChanges().get(new 
Key(NodeDocument.REVISIONS, revision));
+                if (has != null) {
+                    // then skip
+                    continue;
+                }
+                NodeDocument.removeRevision(updateOp, revision);
+                deletedRevsCount++;
+            }
+            // "_commitRoot"
+            for (Revision revision : doc.getLocalCommitRoot().keySet()) {
+                if (!allRequiredRevs.contains(revision)) {
+                    Operation has = updateOp.getChanges().get(new 
Key(NodeDocument.COMMIT_ROOT, revision));
+                    if (has != null) {
+                        // then skip
+                        continue;
+                    }
+                    NodeDocument.removeCommitRoot(updateOp, revision);
+                    deletedRevsCount++;
+                }
+            }
+            // "_bc"
+            for (Revision revision : doc.getLocalBranchCommits()) {
+                if (!allRequiredRevs.contains(revision)) {
+                    Operation has = updateOp.getChanges().get(new 
Key(NodeDocument.BRANCH_COMMITS, revision));
+                    if (has != null) {
+                        // then skip
+                        continue;
+                    }
+
+                    NodeDocument.removeBranchCommit(updateOp, revision);
+                    deletedRevsCount++;
+                }
+            }
+            return deletedRevsCount;
+        }
+
+        private int removeUnusedPropertyEntries(NodeDocument doc,
+                DocumentNodeState traversedMainNode, UpdateOp updateOp,
+                String propertyKey, Function<Revision, Void> removeRevision,

Review Comment:
   +1 indeed more appropriate, done now in 
https://github.com/apache/jackrabbit-oak/pull/1372/commits/4506faa4764d833bb87f798ae182f248608dcf1f



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to