Yuti-G commented on a change in pull request #747: URL: https://github.com/apache/lucene/pull/747#discussion_r829320811
########## File path: lucene/facet/src/java/org/apache/lucene/facet/Facets.java ########## @@ -48,4 +48,13 @@ public abstract FacetResult getTopChildren(int topN, String dim, String... path) * indexed, for example depending on the type of document. */ public abstract List<FacetResult> getAllDims(int topN) throws IOException; + + /** + * Returns labels for topN dimensions and their topNChildren sorted by the number of hits that + * dimension matched + */ + public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException { Review comment: Thank you so much! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -366,33 +435,55 @@ public Number getSpecificValue(String dim, String... path) throws IOException { return counts[ord]; } + /** Returns FacetResult for a dimension. */ + private FacetResult getFacetResultForDim( + String dim, + int topNChildren, + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult) + throws IOException { + FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); + if (dimConfig.hierarchical) { + DimTree dimTree = state.getDimTree(dim); + int dimOrd = dimTree.dimStartOrd; + FacetResult fr = + getPathResult( + dimConfig, + dim, + emptyPath, + dimOrd, + dimTree.iterator(), + topNChildren, + cacheChildOrdsResult); + if (fr != null) { + return fr; + } + } else { + OrdRange ordRange = state.getOrdRange(dim); + int dimOrd = ordRange.start; + PrimitiveIterator.OfInt childIt = ordRange.iterator(); + if (dimConfig.multiValued && dimConfig.requireDimCount) { + // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed + // the dimension and we need to skip past it so the iterator is positioned on the first + // child: + childIt.next(); + } + FacetResult fr = + getPathResult( + dimConfig, dim, emptyPath, dimOrd, childIt, topNChildren, cacheChildOrdsResult); + if (fr != null) { + return fr; + } + } + return 
null; + } Review comment: Thanks for cleaning up code! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -143,9 +146,49 @@ private FacetResult getPathResult( String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, - int topN) + int topN, + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult) Review comment: Thanks for the great suggestion! This is definitely more readable. ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -190,20 +234,45 @@ private FacetResult getPathResult( String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); } + return labelValues; + } - if (dimConfig.hierarchical == false) { + /** Returns value/count of a dimension. */ + private int getDimValue( + FacetsConfig.DimConfig dimConfig, + String dim, + int pathOrd, + PrimitiveIterator.OfInt childOrds, + int topN, + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult) { + + // if dimConfig.hierarchical == true, return dimCount directly + if (dimConfig.hierarchical == true && pathOrd >= 0) { + return counts[pathOrd]; + } + + // if dimConfig.hierarchical == false + if (dimConfig.multiValued) { // see if dimCount is actually reliable or needs to be reset - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - dimCount = counts[pathOrd]; - } else { - dimCount = -1; // dimCount is in accurate at this point, so set it to -1 - } + if (dimConfig.requireDimCount && pathOrd >= 0) { + return counts[pathOrd]; + } else { + return -1; // dimCount is inaccurate at this point, so set it to -1 } - return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount); - } else { - return new FacetResult(dim, path, counts[pathOrd], labelValues, childCount); } + + // if dimCount was not aggregated at indexing time, 
iterate over childOrds to get dimCount + SortedSetDocValuesChildOrdsResult childOrdsResult = getChildOrdsResult(childOrds, topN); + if (childOrdsResult.q == null) { + return 0; + } Review comment: Yes, sorry for the negligence. Thanks! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -190,20 +234,45 @@ private FacetResult getPathResult( String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); } + return labelValues; + } - if (dimConfig.hierarchical == false) { + /** Returns value/count of a dimension. */ + private int getDimValue( + FacetsConfig.DimConfig dimConfig, + String dim, + int pathOrd, + PrimitiveIterator.OfInt childOrds, + int topN, + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult) { + + // if dimConfig.hierarchical == true, return dimCount directly + if (dimConfig.hierarchical == true && pathOrd >= 0) { Review comment: I came across a `pathOrd` check under `getTopChildren` and thought it would be safer to ensure `pathOrd` >= 0 when getting `counts[pathOrd]`, but it could be redundant. I just removed it in `getDimValue` and all the tests still passed. I will update it in my next PR :) Thanks! <img width="815" alt="Screen Shot 2022-03-17 at 10 39 54 AM" src="https://user-images.githubusercontent.com/44444710/158862245-eda0f0a3-a1fc-427d-8924-ba076976f1c8.png"> ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) { return results; } + + @Override + public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException { + // Creates priority queue to store top dimensions and sort by their aggregated values/hits and + // string values. 
+ PriorityQueue<SortedSetDocValuesDimValueResult> pq = + new PriorityQueue<>(topNDims) { + @Override + protected boolean lessThan( + SortedSetDocValuesDimValueResult a, SortedSetDocValuesDimValueResult b) { + if (a.value.intValue() > b.value.intValue()) { + return false; + } else if (a.value.intValue() < b.value.intValue()) { + return true; + } else { + return a.dim.compareTo(b.dim) > 0; + } + } + }; + + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = new HashMap<>(); + + for (String dim : state.getDims()) { + FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); + if (dimConfig.hierarchical) { + DimTree dimTree = state.getDimTree(dim); + int dimOrd = dimTree.dimStartOrd; + // get dim value + int dimCount = + getDimValue( + dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, cacheChildOrdsResult); + if (dimCount != 0) { + // use priority queue to store SortedSetDocValuesDimValueResult for topNDims + pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, dimCount)); + } + } else { + OrdRange ordRange = state.getOrdRange(dim); + int dimOrd = ordRange.start; + PrimitiveIterator.OfInt childIt = ordRange.iterator(); + if (dimConfig.multiValued && dimConfig.requireDimCount) { + // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed + // the dimension and we need to skip past it so the iterator is positioned on the first + // child: + childIt.next(); + } + int dimCount = + getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, cacheChildOrdsResult); + if (dimCount != 0) { + pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, dimCount)); + } + } + } + + // get FacetResult for topNDims + List<FacetResult> results = new LinkedList<>(); + while (pq.size() > 0) { + SortedSetDocValuesDimValueResult dimValueResult = pq.pop(); + if (dimValueResult != null) { + FacetResult factResult = + getFacetResultForDim(dimValueResult.dim, topNChildren, cacheChildOrdsResult); + if (factResult 
!= null) { + results.add(0, factResult); + } + } + } + return results; + } + + /** + * Creates SortedSetDocValuesChildOrdsResult to store dimCount, childCount, and TopOrdAndIntQueue + * q for getPathResult. + */ + private class SortedSetDocValuesChildOrdsResult { Review comment: Sure! Great suggestion. Thanks! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) { return results; } + + @Override + public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException { + // Creates priority queue to store top dimensions and sort by their aggregated values/hits and + // string values. + PriorityQueue<SortedSetDocValuesDimValueResult> pq = + new PriorityQueue<>(topNDims) { + @Override + protected boolean lessThan( + SortedSetDocValuesDimValueResult a, SortedSetDocValuesDimValueResult b) { + if (a.value.intValue() > b.value.intValue()) { + return false; + } else if (a.value.intValue() < b.value.intValue()) { + return true; + } else { + return a.dim.compareTo(b.dim) > 0; + } Review comment: Thanks :) ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -143,9 +146,49 @@ private FacetResult getPathResult( String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, - int topN) + int topN, + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult) throws IOException { + SortedSetDocValuesChildOrdsResult childOrdsResult; + + // if getTopDims is called, get results from cacheChildOrdsResult, otherwise call + // getChildOrdsResult to get dimCount, childCount and TopOrdAndIntQueue q + if (cacheChildOrdsResult != null && cacheChildOrdsResult.containsKey(dim)) { + childOrdsResult = cacheChildOrdsResult.get(dim); + } else { + childOrdsResult = getChildOrdsResult(childOrds, topN); + } + + if (childOrdsResult.q == null) { + return 
null; + } + + LabelAndValue[] labelValues = getLabelValuesFromTopOrdAndIntQueue(childOrdsResult.q); + + int dimCount = childOrdsResult.dimCount; Review comment: Will do. Thanks! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) { return results; } + + @Override + public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException { + // Creates priority queue to store top dimensions and sort by their aggregated values/hits and + // string values. + PriorityQueue<SortedSetDocValuesDimValueResult> pq = + new PriorityQueue<>(topNDims) { + @Override + protected boolean lessThan( + SortedSetDocValuesDimValueResult a, SortedSetDocValuesDimValueResult b) { + if (a.value.intValue() > b.value.intValue()) { + return false; + } else if (a.value.intValue() < b.value.intValue()) { + return true; + } else { + return a.dim.compareTo(b.dim) > 0; + } + } + }; + + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = new HashMap<>(); + + for (String dim : state.getDims()) { + FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); + if (dimConfig.hierarchical) { + DimTree dimTree = state.getDimTree(dim); + int dimOrd = dimTree.dimStartOrd; + // get dim value + int dimCount = + getDimValue( + dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, cacheChildOrdsResult); + if (dimCount != 0) { + // use priority queue to store SortedSetDocValuesDimValueResult for topNDims + pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, dimCount)); + } + } else { + OrdRange ordRange = state.getOrdRange(dim); + int dimOrd = ordRange.start; + PrimitiveIterator.OfInt childIt = ordRange.iterator(); + if (dimConfig.multiValued && dimConfig.requireDimCount) { + // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed + // the dimension and we need to skip past it so 
the iterator is positioned on the first + // child: + childIt.next(); + } + int dimCount = + getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, cacheChildOrdsResult); + if (dimCount != 0) { + pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, dimCount)); + } + } + } + + // get FacetResult for topNDims + List<FacetResult> results = new LinkedList<>(); Review comment: Sure, fixed size array is definitely less costly. Thanks! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) { return results; } + + @Override + public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException { + // Creates priority queue to store top dimensions and sort by their aggregated values/hits and + // string values. + PriorityQueue<SortedSetDocValuesDimValueResult> pq = + new PriorityQueue<>(topNDims) { + @Override + protected boolean lessThan( + SortedSetDocValuesDimValueResult a, SortedSetDocValuesDimValueResult b) { + if (a.value.intValue() > b.value.intValue()) { + return false; + } else if (a.value.intValue() < b.value.intValue()) { + return true; + } else { + return a.dim.compareTo(b.dim) > 0; + } + } + }; + + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = new HashMap<>(); + + for (String dim : state.getDims()) { + FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); + if (dimConfig.hierarchical) { + DimTree dimTree = state.getDimTree(dim); + int dimOrd = dimTree.dimStartOrd; + // get dim value + int dimCount = + getDimValue( + dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, cacheChildOrdsResult); + if (dimCount != 0) { + // use priority queue to store SortedSetDocValuesDimValueResult for topNDims + pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, dimCount)); + } + } else { + OrdRange ordRange = state.getOrdRange(dim); + int dimOrd = 
ordRange.start; + PrimitiveIterator.OfInt childIt = ordRange.iterator(); + if (dimConfig.multiValued && dimConfig.requireDimCount) { + // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed + // the dimension and we need to skip past it so the iterator is positioned on the first + // child: + childIt.next(); + } + int dimCount = + getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, cacheChildOrdsResult); + if (dimCount != 0) { + pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, dimCount)); + } + } + } + + // get FacetResult for topNDims + List<FacetResult> results = new LinkedList<>(); + while (pq.size() > 0) { + SortedSetDocValuesDimValueResult dimValueResult = pq.pop(); + if (dimValueResult != null) { + FacetResult factResult = + getFacetResultForDim(dimValueResult.dim, topNChildren, cacheChildOrdsResult); + if (factResult != null) { Review comment: Ah.. sorry, I forgot to remove the null checks after changing the return type from `Number` to `int` in getDimValue. I will address this issue in the new PR. Thank you! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) { return results; } + + @Override + public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException { + // Creates priority queue to store top dimensions and sort by their aggregated values/hits and + // string values. 
+ PriorityQueue<SortedSetDocValuesDimValueResult> pq = + new PriorityQueue<>(topNDims) { + @Override + protected boolean lessThan( + SortedSetDocValuesDimValueResult a, SortedSetDocValuesDimValueResult b) { + if (a.value.intValue() > b.value.intValue()) { + return false; + } else if (a.value.intValue() < b.value.intValue()) { + return true; + } else { + return a.dim.compareTo(b.dim) > 0; + } + } + }; + + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = new HashMap<>(); + + for (String dim : state.getDims()) { + FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); + if (dimConfig.hierarchical) { + DimTree dimTree = state.getDimTree(dim); + int dimOrd = dimTree.dimStartOrd; + // get dim value + int dimCount = + getDimValue( + dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, cacheChildOrdsResult); + if (dimCount != 0) { + // use priority queue to store SortedSetDocValuesDimValueResult for topNDims + pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, dimCount)); Review comment: I will look into that. Thanks! ########## File path: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java ########## @@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) { return results; } + + @Override + public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException { + // Creates priority queue to store top dimensions and sort by their aggregated values/hits and + // string values. 
+ PriorityQueue<SortedSetDocValuesDimValueResult> pq = + new PriorityQueue<>(topNDims) { + @Override + protected boolean lessThan( + SortedSetDocValuesDimValueResult a, SortedSetDocValuesDimValueResult b) { + if (a.value.intValue() > b.value.intValue()) { + return false; + } else if (a.value.intValue() < b.value.intValue()) { + return true; + } else { + return a.dim.compareTo(b.dim) > 0; + } + } + }; + + HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = new HashMap<>(); Review comment: I haven't spent enough time on ordinals yet, and will look into this. Thanks! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org