fabriziofortino commented on code in PR #1331: URL: https://github.com/apache/jackrabbit-oak/pull/1331#discussion_r1505500001
########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java: ########## @@ -184,65 +181,70 @@ static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths mongoFilterPaths, } } - static Bson createCustomExcludedEntriesFilter(String customRegexPattern, boolean queryUsesIndexTraversal) { - if (customRegexPattern == null || customRegexPattern.trim().isEmpty()) { - LOG.info("Mongo custom regex is disabled"); - return null; - } else { - LOG.info("Excluding nodes with paths matching regex: {}", customRegexPattern); - var pattern = Pattern.compile(customRegexPattern); - Bson pathFilter = createPathFilter(List.of(pattern), queryUsesIndexTraversal); - return Filters.nor(Filters.regex(NodeDocument.ID, pattern), pathFilter); - } - } - - private static Bson descendantsFilter(List<String> paths, boolean queryUsesIndexTraversal) { + private static List<Pattern> descendantsIncludedPatterns(List<String> paths) { if (paths.isEmpty()) { - return null; + return List.of(); } if (paths.size() == 1 && paths.get(0).equals("/")) { - return null; + return List.of(); } - // The filter for descendants of a list of paths is a series of or conditions. For each path, we have to build - // two conditions in two different fields of the documents: - // _ _id - for non-long paths - In this case, the _id is of the form "2:/foo/bar" - // _ _path - for long paths - In this case, the _id is a hash and the document contains an additional _path - // field with the path of the document. + // The filter for descendants of a list of paths is a series of or conditions, each a regex filter on the _id + // field. // We use the $in operator with a regular expression to match the paths. // https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression - ArrayList<Pattern> pathPatterns = new ArrayList<>(); - ArrayList<Pattern> idPatterns = new ArrayList<>(); + ArrayList<Pattern> patterns = new ArrayList<>(); for (String path : paths) { if (!path.endsWith("/")) { path = path + "/"; } String quotedPath = Pattern.quote(path); - idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); - pathPatterns.add(Pattern.compile("^" + quotedPath + ".*$")); + patterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); } + // The conditions above on the _id field is not enough to match all JCR nodes in the given paths because nodes + // with paths longer than a certain threshold, are represented by Mongo documents where the _id field is replaced + // by a hash and the full path is stored in an additional field _path. To retrieve these long path documents, + // we could add a condition on the _path field, but this would slow down substantially scanning the DB, because + // the _path field is not part of the index used by this query (it's an index on _modified, _id). Therefore, + // Mongo would have to retrieve every document from the column store to evaluate the filter condition. So instead + // we add below a condition to download all the long path documents. These documents can be identified by the + // format of the _id field (<n>:h<hash>), so it is possible to identify them using only the index. + // This might download documents for nodes that are not in the included paths, but those documents will anyway + // be filtered in the transform stage. And in most repositories, the number of long path documents is very small, + // often there are none, so the extra documents downloaded will not slow down by much the download. However, the + // performance gains of evaluating the filter of the query using only the index are very significant, especially + // when the index requieres only a small number of nodes. Review Comment: nitpick: typo ```suggestion // when the index requires only a small number of nodes. ``` ########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java: ########## @@ -184,65 +181,70 @@ static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths mongoFilterPaths, } } - static Bson createCustomExcludedEntriesFilter(String customRegexPattern, boolean queryUsesIndexTraversal) { - if (customRegexPattern == null || customRegexPattern.trim().isEmpty()) { - LOG.info("Mongo custom regex is disabled"); - return null; - } else { - LOG.info("Excluding nodes with paths matching regex: {}", customRegexPattern); - var pattern = Pattern.compile(customRegexPattern); - Bson pathFilter = createPathFilter(List.of(pattern), queryUsesIndexTraversal); - return Filters.nor(Filters.regex(NodeDocument.ID, pattern), pathFilter); - } - } - - private static Bson descendantsFilter(List<String> paths, boolean queryUsesIndexTraversal) { + private static List<Pattern> descendantsIncludedPatterns(List<String> paths) { if (paths.isEmpty()) { - return null; + return List.of(); } if (paths.size() == 1 && paths.get(0).equals("/")) { - return null; + return List.of(); } - // The filter for descendants of a list of paths is a series of or conditions. For each path, we have to build - // two conditions in two different fields of the documents: - // _ _id - for non-long paths - In this case, the _id is of the form "2:/foo/bar" - // _ _path - for long paths - In this case, the _id is a hash and the document contains an additional _path - // field with the path of the document. + // The filter for descendants of a list of paths is a series of or conditions, each a regex filter on the _id + // field. // We use the $in operator with a regular expression to match the paths. // https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression - ArrayList<Pattern> pathPatterns = new ArrayList<>(); - ArrayList<Pattern> idPatterns = new ArrayList<>(); + ArrayList<Pattern> patterns = new ArrayList<>(); for (String path : paths) { if (!path.endsWith("/")) { path = path + "/"; } String quotedPath = Pattern.quote(path); - idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); - pathPatterns.add(Pattern.compile("^" + quotedPath + ".*$")); + patterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); } + // The conditions above on the _id field is not enough to match all JCR nodes in the given paths because nodes + // with paths longer than a certain threshold, are represented by Mongo documents where the _id field is replaced + // by a hash and the full path is stored in an additional field _path. To retrieve these long path documents, + // we could add a condition on the _path field, but this would slow down substantially scanning the DB, because + // the _path field is not part of the index used by this query (it's an index on _modified, _id). Therefore, + // Mongo would have to retrieve every document from the column store to evaluate the filter condition. So instead + // we add below a condition to download all the long path documents. These documents can be identified by the + // format of the _id field (<n>:h<hash>), so it is possible to identify them using only the index. + // This might download documents for nodes that are not in the included paths, but those documents will anyway + // be filtered in the transform stage. And in most repositories, the number of long path documents is very small, + // often there are none, so the extra documents downloaded will not slow down by much the download. However, the + // performance gains of evaluating the filter of the query using only the index are very significant, especially + // when the index requieres only a small number of nodes. + patterns.add(LONG_PATH_ID_PATTERN); + + return patterns; + } - Bson pathFilter = createPathFilter(pathPatterns, queryUsesIndexTraversal); - return Filters.or(Filters.in(NodeDocument.ID, idPatterns), pathFilter); + private static List<Pattern> descendantsExcludedPatterns(List<String> paths) { + if (paths.isEmpty()) { + return List.of(); + } + if (paths.size() == 1 && paths.get(0).equals("/")) { + return List.of(); + } + ArrayList<Pattern> patterns = new ArrayList<>(); + for (String path : paths) { + if (!path.endsWith("/")) { + path = path + "/"; + } + String quotedPath = Pattern.quote(path); + patterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); + } + return patterns; Review Comment: just noticed that `descendantsIncludedPatterns` and `descendantsExcludedPatterns` are almost the same. The former only adds an extra condition. I would consider reusing the logic. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org