stefan-egli commented on code in PR #1372: URL: https://github.com/apache/jackrabbit-oak/pull/1372#discussion_r1537690222
########## oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java: ########## @@ -1246,6 +1349,256 @@ private void collectRevisionsOlderThan24hAndBetweenCheckpoints(final NodeDocumen } } + /** + * Remove all property revisions in the local document that are no longer used. + * This includes bundled properties. It also includes related entries that + * become obsolete as a result - i.e. _commitRoot and _bc (though the latter + * is never removed on root) + */ + private void collectUnusedPropertyRevisions(final NodeDocument doc, + final GCPhases phases, final UpdateOp updateOp, + final DocumentNodeState traversedMainNode, + final boolean ignoreInternalProperties) { + + if (!phases.start(GCPhase.DETAILED_GC_COLLECT_OLD_REVS)){ + // cancelled + return; + } + final Set<Revision> allKeepRevs = new HashSet<>(); + // phase A : collectUnusedUserPropertyRevisions + int deletedTotalRevsCount = collectUnusedUserPropertyRevisions(doc, phases, updateOp, traversedMainNode, allKeepRevs); + int deletedUserRevsCount = deletedTotalRevsCount; + // phase B : collectUnusedInternalPropertyRevisions + if (!ignoreInternalProperties) { + deletedTotalRevsCount = collectUnusedInternalPropertyRevisions(doc, phases, updateOp, traversedMainNode, allKeepRevs, deletedTotalRevsCount); + } + + // then some accounting... + int deletedInternalRevsCount = deletedTotalRevsCount - deletedUserRevsCount; + if (deletedUserRevsCount != 0) { + deletedPropRevsCountMap.merge(doc.getId(), deletedUserRevsCount, Integer::sum); + } + if (deletedInternalRevsCount != 0) { + deletedInternalPropRevsCountMap.merge(doc.getId(), deletedInternalRevsCount, Integer::sum); + } + phases.stop(GCPhase.DETAILED_GC_COLLECT_OLD_REVS); + } + + private int collectUnusedUserPropertyRevisions(final NodeDocument doc, + final GCPhases phases, final UpdateOp updateOp, + final DocumentNodeState traversedMainNode, + final Set<Revision> allKeepRevs) { + int deletedRevsCount = 0; + + // phase 1 : non bundled nodes only + for (PropertyState prop : traversedMainNode.getProperties()) { + final String escapedPropName = Utils.escapePropertyName(prop.getName()); + deletedRevsCount += removeUnusedPropertyEntries(doc, traversedMainNode, updateOp, escapedPropName, + (r) -> { updateOp.removeMapEntry(escapedPropName, r); return null;}, + allKeepRevs); + } + + // phase 2 : bundled nodes only + final Map<Path, DocumentNodeState> bundledNodeStates = new HashMap<>(); + for (DocumentNodeState dns : traversedMainNode.getAllBundledNodesStates()) { + bundledNodeStates.put(dns.getPath(), dns); + } + // remember that getAllBundledProperties returns unescaped keys + for (String propName : traversedMainNode.getAllBundledProperties().keySet()) { + final int lastSlash = propName.lastIndexOf("/"); + if (lastSlash == -1) { + // then it is an unbundled property which was already handled in phase 1 + continue; + } + final String escapedPropName = Utils.escapePropertyName(propName); + // bundled values are of format sub/tree/propertyKey + // to extract this we need the last index of "/" + final String unbundledSubtreeName = propName.substring(0, lastSlash); + final String unbundledPropName = propName.substring(lastSlash + 1); + final String unbundledPath = traversedMainNode.getPath().toString() + "/" + unbundledSubtreeName; + final DocumentNodeState traversedNode = bundledNodeStates.get(Path.fromString(unbundledPath)); + if (traversedNode == null) { + log.error("collectUnusedPropertyRevisions : could not find traversed node for bundled key {} unbundledPath {} in doc {}", + propName, unbundledPath, doc.getId()); + continue; + } + final PropertyState traversedProperty = traversedNode.getProperty(unbundledPropName); + if (traversedProperty == null) { + log.error("collectUnusedPropertyRevisions : could not get property {} from traversed node {}", + unbundledPropName, traversedNode.getPath()); + continue; + } + deletedRevsCount += removeUnusedPropertyEntries(doc, traversedNode, updateOp, escapedPropName, + (r) -> { updateOp.removeMapEntry(escapedPropName, r); return null;}, + allKeepRevs); + } + + // phase 3 : "_deleted" + int numDeleted = removeUnusedPropertyEntries(doc, traversedMainNode, + updateOp, NodeDocument.DELETED, + (r) -> { NodeDocument.removeDeleted(updateOp, r); return null;} , + allKeepRevs); + deletedRevsCount += numDeleted; + return deletedRevsCount; + } + + private int collectUnusedInternalPropertyRevisions(final NodeDocument doc, + final GCPhases phases, final UpdateOp updateOp, + final DocumentNodeState traversedMainNode, + final Set<Revision> toKeepUserPropRevs, + int deletedRevsCount) { + boolean hasUnmergedBranchCommits = false; + for (Revision localBc : doc.getLocalBranchCommits()) { + if (!isCommitted(nodeStore.getCommitValue(localBc, doc))) { + hasUnmergedBranchCommits = true; + } + } + if (deletedRevsCount == 0 && !hasUnmergedBranchCommits) { + return deletedRevsCount; + } + // if we did some rev deletion OR there are unmerged BCs, then let's deep-dive + Set<Revision> allRequiredRevs = new HashSet<>(toKeepUserPropRevs); + for (Revision revision : doc.getLocalMap(NodeDocument.COLLISIONS).keySet()) { + if (!allRequiredRevs.contains(revision)) { + Operation has = updateOp.getChanges().get(new Key(NodeDocument.COLLISIONS, revision)); + if (has != null) { + // then skip + continue; + } + NodeDocument.removeCollision(updateOp, revision); + deletedRevsCount++; + } + } + // "_revisions" + for (Entry<Revision, String> e : doc.getLocalRevisions().entrySet()) { + Revision revision = e.getKey(); + if (allRequiredRevs.contains(revision)) { + // if it is still referenced locally, keep it + continue; + } + final boolean isRoot = doc.getId().equals(Utils.getIdFromPath(Path.ROOT)); + // local bcs only considered for removal + final boolean isBC = !doc.getLocalBranchCommits().contains(revision); + final boolean newerThanSweep = nodeStore.getSweepRevisions().isRevisionNewer(revision); + if (newerThanSweep) { + //TODO wondering if we should at all do any DGC on documents newer than sweep + if (isBC) { + // analysed further down, we can remove them if unmerged + } else { + // for normal commits, we need to keep them -> also add to allRequiredRevs + allRequiredRevs.add(revision); + continue; + } + } + // committed revisions are hard to delete, as determining + // whether or not they are orphaned is difficult. + // child nodes could be having _commitRoots pointing to this doc + // and this revision - in which case it wouldn't be orphaned. + // without scanning the children (or having knowledge about + // which revisions are referenced by children) it is unsafe + // to delete committed revisions. + // if they are uncommitted, then they are garbage, as then they + // are either normal commits (thus a not finished commit) + // or an unmerged branch commit. In both cases they are garbage. + boolean isCommitted = isCommitted(nodeStore.getCommitValue(revision, doc)); + if (isCommitted) { + if (isRoot) { + // root : cannot remove since it could be referenced and required by a child + // also add to allRequiredRevs + allRequiredRevs.add(revision); + continue; + } else if (!isBC) { + // non root and normal : cannot remove, same as above, + // it could be referenced and required by a child + // also add to allRequiredRevs + allRequiredRevs.add(revision); + continue; + } else { + // non root and bc : can remove, as non root bc cannot be referenced by child + } + } + Operation has = updateOp.getChanges().get(new Key(NodeDocument.REVISIONS, revision)); + if (has != null) { + // then skip + continue; + } + NodeDocument.removeRevision(updateOp, revision); + deletedRevsCount++; + } + // "_commitRoot" + for (Revision revision : doc.getLocalCommitRoot().keySet()) { + if (!allRequiredRevs.contains(revision)) { + Operation has = updateOp.getChanges().get(new Key(NodeDocument.COMMIT_ROOT, revision)); + if (has != null) { + // then skip + continue; + } + NodeDocument.removeCommitRoot(updateOp, revision); + deletedRevsCount++; + } + } + // "_bc" + for (Revision revision : doc.getLocalBranchCommits()) { + if (!allRequiredRevs.contains(revision)) { + Operation has = updateOp.getChanges().get(new Key(NodeDocument.BRANCH_COMMITS, revision)); + if (has != null) { + // then skip + continue; + } + + NodeDocument.removeBranchCommit(updateOp, revision); + deletedRevsCount++; + } + } + return deletedRevsCount; + } + + private int removeUnusedPropertyEntries(NodeDocument doc, + DocumentNodeState traversedMainNode, UpdateOp updateOp, + String propertyKey, Function<Revision, Void> removeRevision, Review Comment: +1 indeed more appropriate, done now in https://github.com/apache/jackrabbit-oak/pull/1372/commits/4506faa4764d833bb87f798ae182f248608dcf1f -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org