Yuti-G commented on a change in pull request #747:
URL: https://github.com/apache/lucene/pull/747#discussion_r829320811



##########
File path: lucene/facet/src/java/org/apache/lucene/facet/Facets.java
##########
@@ -48,4 +48,13 @@ public abstract FacetResult getTopChildren(int topN, String 
dim, String... path)
    * indexed, for example depending on the type of document.
    */
   public abstract List<FacetResult> getAllDims(int topN) throws IOException;
+
+  /**
+   * Returns labels for topN dimensions and their topNChildren sorted by the 
number of hits that
+   * dimension matched
+   */
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws 
IOException {

Review comment:
       Thank you so much! 

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -366,33 +435,55 @@ public Number getSpecificValue(String dim, String... 
path) throws IOException {
     return counts[ord];
   }
 
+  /** Returns FacetResult for a dimension. */
+  private FacetResult getFacetResultForDim(
+      String dim,
+      int topNChildren,
+      HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult)
+      throws IOException {
+    FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
+    if (dimConfig.hierarchical) {
+      DimTree dimTree = state.getDimTree(dim);
+      int dimOrd = dimTree.dimStartOrd;
+      FacetResult fr =
+          getPathResult(
+              dimConfig,
+              dim,
+              emptyPath,
+              dimOrd,
+              dimTree.iterator(),
+              topNChildren,
+              cacheChildOrdsResult);
+      if (fr != null) {
+        return fr;
+      }
+    } else {
+      OrdRange ordRange = state.getOrdRange(dim);
+      int dimOrd = ordRange.start;
+      PrimitiveIterator.OfInt childIt = ordRange.iterator();
+      if (dimConfig.multiValued && dimConfig.requireDimCount) {
+        // If the dim is multi-valued and requires dim counts, we know we've 
explicitly indexed
+        // the dimension and we need to skip past it so the iterator is 
positioned on the first
+        // child:
+        childIt.next();
+      }
+      FacetResult fr =
+          getPathResult(
+              dimConfig, dim, emptyPath, dimOrd, childIt, topNChildren, 
cacheChildOrdsResult);
+      if (fr != null) {
+        return fr;
+      }
+    }
+    return null;
+  }

Review comment:
       Thanks for cleaning up the code!

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -143,9 +146,49 @@ private FacetResult getPathResult(
       String[] path,
       int pathOrd,
       PrimitiveIterator.OfInt childOrds,
-      int topN)
+      int topN,
+      HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult)

Review comment:
       Thanks for the great suggestion! This is definitely more readable.

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -190,20 +234,45 @@ private FacetResult getPathResult(
       String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
       labelValues[i] = new LabelAndValue(parts[parts.length - 1], 
ordAndValue.value);
     }
+    return labelValues;
+  }
 
-    if (dimConfig.hierarchical == false) {
+  /** Returns value/count of a dimension. */
+  private int getDimValue(
+      FacetsConfig.DimConfig dimConfig,
+      String dim,
+      int pathOrd,
+      PrimitiveIterator.OfInt childOrds,
+      int topN,
+      HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult) 
{
+
+    // if dimConfig.hierarchical == true, return dimCount directly
+    if (dimConfig.hierarchical == true && pathOrd >= 0) {
+      return counts[pathOrd];
+    }
+
+    // if dimConfig.hierarchical == false
+    if (dimConfig.multiValued) {
       // see if dimCount is actually reliable or needs to be reset
-      if (dimConfig.multiValued) {
-        if (dimConfig.requireDimCount) {
-          dimCount = counts[pathOrd];
-        } else {
-          dimCount = -1; // dimCount is in accurate at this point, so set it 
to -1
-        }
+      if (dimConfig.requireDimCount && pathOrd >= 0) {
+        return counts[pathOrd];
+      } else {
+        return -1; // dimCount is inaccurate at this point, so set it to -1
       }
-      return new FacetResult(dim, emptyPath, dimCount, labelValues, 
childCount);
-    } else {
-      return new FacetResult(dim, path, counts[pathOrd], labelValues, 
childCount);
     }
+
+    // if dimCount was not aggregated at indexing time, iterate over childOrds 
to get dimCount
+    SortedSetDocValuesChildOrdsResult childOrdsResult = 
getChildOrdsResult(childOrds, topN);
+    if (childOrdsResult.q == null) {
+      return 0;
+    }

Review comment:
       Yes, sorry for the negligence. Thanks!

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -190,20 +234,45 @@ private FacetResult getPathResult(
       String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
       labelValues[i] = new LabelAndValue(parts[parts.length - 1], 
ordAndValue.value);
     }
+    return labelValues;
+  }
 
-    if (dimConfig.hierarchical == false) {
+  /** Returns value/count of a dimension. */
+  private int getDimValue(
+      FacetsConfig.DimConfig dimConfig,
+      String dim,
+      int pathOrd,
+      PrimitiveIterator.OfInt childOrds,
+      int topN,
+      HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult) 
{
+
+    // if dimConfig.hierarchical == true, return dimCount directly
+    if (dimConfig.hierarchical == true && pathOrd >= 0) {

Review comment:
       I came across a `pathOrd` check under `getTopChildren` and thought it 
would be safer to ensure `pathOrd` >= 0 when getting `counts[pathOrd]`, but 
it could be redundant. I just removed it in `getDimValue` and all the tests 
still pass. I will update it in my next PR :) Thanks!
   
   <img width="815" alt="Screen Shot 2022-03-17 at 10 39 54 AM" 
src="https://user-images.githubusercontent.com/44444710/158862245-eda0f0a3-a1fc-427d-8924-ba076976f1c8.png">
   

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) {
 
     return results;
   }
+
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws 
IOException {
+    // Creates priority queue to store top dimensions and sort by their 
aggregated values/hits and
+    // string values.
+    PriorityQueue<SortedSetDocValuesDimValueResult> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(
+              SortedSetDocValuesDimValueResult a, 
SortedSetDocValuesDimValueResult b) {
+            if (a.value.intValue() > b.value.intValue()) {
+              return false;
+            } else if (a.value.intValue() < b.value.intValue()) {
+              return true;
+            } else {
+              return a.dim.compareTo(b.dim) > 0;
+            }
+          }
+        };
+
+    HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = 
new HashMap<>();
+
+    for (String dim : state.getDims()) {
+      FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
+      if (dimConfig.hierarchical) {
+        DimTree dimTree = state.getDimTree(dim);
+        int dimOrd = dimTree.dimStartOrd;
+        // get dim value
+        int dimCount =
+            getDimValue(
+                dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, 
cacheChildOrdsResult);
+        if (dimCount != 0) {
+          // use priority queue to store SortedSetDocValuesDimValueResult for 
topNDims
+          pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, 
dimCount));
+        }
+      } else {
+        OrdRange ordRange = state.getOrdRange(dim);
+        int dimOrd = ordRange.start;
+        PrimitiveIterator.OfInt childIt = ordRange.iterator();
+        if (dimConfig.multiValued && dimConfig.requireDimCount) {
+          // If the dim is multi-valued and requires dim counts, we know we've 
explicitly indexed
+          // the dimension and we need to skip past it so the iterator is 
positioned on the first
+          // child:
+          childIt.next();
+        }
+        int dimCount =
+            getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, 
cacheChildOrdsResult);
+        if (dimCount != 0) {
+          pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, 
dimCount));
+        }
+      }
+    }
+
+    // get FacetResult for topNDims
+    List<FacetResult> results = new LinkedList<>();
+    while (pq.size() > 0) {
+      SortedSetDocValuesDimValueResult dimValueResult = pq.pop();
+      if (dimValueResult != null) {
+        FacetResult factResult =
+            getFacetResultForDim(dimValueResult.dim, topNChildren, 
cacheChildOrdsResult);
+        if (factResult != null) {
+          results.add(0, factResult);
+        }
+      }
+    }
+    return results;
+  }
+
+  /**
+   * Creates SortedSetDocValuesChildOrdsResult to store dimCount, childCount, 
and TopOrdAndIntQueue
+   * q for getPathResult.
+   */
+  private class SortedSetDocValuesChildOrdsResult {

Review comment:
       Sure! Great suggestion. Thanks!

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) {
 
     return results;
   }
+
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws 
IOException {
+    // Creates priority queue to store top dimensions and sort by their 
aggregated values/hits and
+    // string values.
+    PriorityQueue<SortedSetDocValuesDimValueResult> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(
+              SortedSetDocValuesDimValueResult a, 
SortedSetDocValuesDimValueResult b) {
+            if (a.value.intValue() > b.value.intValue()) {
+              return false;
+            } else if (a.value.intValue() < b.value.intValue()) {
+              return true;
+            } else {
+              return a.dim.compareTo(b.dim) > 0;
+            }

Review comment:
       Thanks :)

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -143,9 +146,49 @@ private FacetResult getPathResult(
       String[] path,
       int pathOrd,
       PrimitiveIterator.OfInt childOrds,
-      int topN)
+      int topN,
+      HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult)
       throws IOException {
 
+    SortedSetDocValuesChildOrdsResult childOrdsResult;
+
+    // if getTopDims is called, get results from cacheChildOrdsResult, 
otherwise call
+    // getChildOrdsResult to get dimCount, childCount and TopOrdAndIntQueue q
+    if (cacheChildOrdsResult != null && cacheChildOrdsResult.containsKey(dim)) 
{
+      childOrdsResult = cacheChildOrdsResult.get(dim);
+    } else {
+      childOrdsResult = getChildOrdsResult(childOrds, topN);
+    }
+
+    if (childOrdsResult.q == null) {
+      return null;
+    }
+
+    LabelAndValue[] labelValues = 
getLabelValuesFromTopOrdAndIntQueue(childOrdsResult.q);
+
+    int dimCount = childOrdsResult.dimCount;

Review comment:
       Will do. Thanks!

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) {
 
     return results;
   }
+
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws 
IOException {
+    // Creates priority queue to store top dimensions and sort by their 
aggregated values/hits and
+    // string values.
+    PriorityQueue<SortedSetDocValuesDimValueResult> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(
+              SortedSetDocValuesDimValueResult a, 
SortedSetDocValuesDimValueResult b) {
+            if (a.value.intValue() > b.value.intValue()) {
+              return false;
+            } else if (a.value.intValue() < b.value.intValue()) {
+              return true;
+            } else {
+              return a.dim.compareTo(b.dim) > 0;
+            }
+          }
+        };
+
+    HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = 
new HashMap<>();
+
+    for (String dim : state.getDims()) {
+      FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
+      if (dimConfig.hierarchical) {
+        DimTree dimTree = state.getDimTree(dim);
+        int dimOrd = dimTree.dimStartOrd;
+        // get dim value
+        int dimCount =
+            getDimValue(
+                dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, 
cacheChildOrdsResult);
+        if (dimCount != 0) {
+          // use priority queue to store SortedSetDocValuesDimValueResult for 
topNDims
+          pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, 
dimCount));
+        }
+      } else {
+        OrdRange ordRange = state.getOrdRange(dim);
+        int dimOrd = ordRange.start;
+        PrimitiveIterator.OfInt childIt = ordRange.iterator();
+        if (dimConfig.multiValued && dimConfig.requireDimCount) {
+          // If the dim is multi-valued and requires dim counts, we know we've 
explicitly indexed
+          // the dimension and we need to skip past it so the iterator is 
positioned on the first
+          // child:
+          childIt.next();
+        }
+        int dimCount =
+            getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, 
cacheChildOrdsResult);
+        if (dimCount != 0) {
+          pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, 
dimCount));
+        }
+      }
+    }
+
+    // get FacetResult for topNDims
+    List<FacetResult> results = new LinkedList<>();

Review comment:
       Sure, a fixed-size array is definitely less costly. Thanks!

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) {
 
     return results;
   }
+
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws 
IOException {
+    // Creates priority queue to store top dimensions and sort by their 
aggregated values/hits and
+    // string values.
+    PriorityQueue<SortedSetDocValuesDimValueResult> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(
+              SortedSetDocValuesDimValueResult a, 
SortedSetDocValuesDimValueResult b) {
+            if (a.value.intValue() > b.value.intValue()) {
+              return false;
+            } else if (a.value.intValue() < b.value.intValue()) {
+              return true;
+            } else {
+              return a.dim.compareTo(b.dim) > 0;
+            }
+          }
+        };
+
+    HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = 
new HashMap<>();
+
+    for (String dim : state.getDims()) {
+      FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
+      if (dimConfig.hierarchical) {
+        DimTree dimTree = state.getDimTree(dim);
+        int dimOrd = dimTree.dimStartOrd;
+        // get dim value
+        int dimCount =
+            getDimValue(
+                dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, 
cacheChildOrdsResult);
+        if (dimCount != 0) {
+          // use priority queue to store SortedSetDocValuesDimValueResult for 
topNDims
+          pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, 
dimCount));
+        }
+      } else {
+        OrdRange ordRange = state.getOrdRange(dim);
+        int dimOrd = ordRange.start;
+        PrimitiveIterator.OfInt childIt = ordRange.iterator();
+        if (dimConfig.multiValued && dimConfig.requireDimCount) {
+          // If the dim is multi-valued and requires dim counts, we know we've 
explicitly indexed
+          // the dimension and we need to skip past it so the iterator is 
positioned on the first
+          // child:
+          childIt.next();
+        }
+        int dimCount =
+            getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, 
cacheChildOrdsResult);
+        if (dimCount != 0) {
+          pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, 
dimCount));
+        }
+      }
+    }
+
+    // get FacetResult for topNDims
+    List<FacetResult> results = new LinkedList<>();
+    while (pq.size() > 0) {
+      SortedSetDocValuesDimValueResult dimValueResult = pq.pop();
+      if (dimValueResult != null) {
+        FacetResult factResult =
+            getFacetResultForDim(dimValueResult.dim, topNChildren, 
cacheChildOrdsResult);
+        if (factResult != null) {

Review comment:
       Ah.. sorry, I forgot to remove the null checks after changing the return 
type from `Number` to `int` in getDimValue. I will address this issue in the 
new PR. Thank you! 

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) {
 
     return results;
   }
+
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws 
IOException {
+    // Creates priority queue to store top dimensions and sort by their 
aggregated values/hits and
+    // string values.
+    PriorityQueue<SortedSetDocValuesDimValueResult> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(
+              SortedSetDocValuesDimValueResult a, 
SortedSetDocValuesDimValueResult b) {
+            if (a.value.intValue() > b.value.intValue()) {
+              return false;
+            } else if (a.value.intValue() < b.value.intValue()) {
+              return true;
+            } else {
+              return a.dim.compareTo(b.dim) > 0;
+            }
+          }
+        };
+
+    HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = 
new HashMap<>();
+
+    for (String dim : state.getDims()) {
+      FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
+      if (dimConfig.hierarchical) {
+        DimTree dimTree = state.getDimTree(dim);
+        int dimOrd = dimTree.dimStartOrd;
+        // get dim value
+        int dimCount =
+            getDimValue(
+                dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, 
cacheChildOrdsResult);
+        if (dimCount != 0) {
+          // use priority queue to store SortedSetDocValuesDimValueResult for 
topNDims
+          pq.insertWithOverflow(new SortedSetDocValuesDimValueResult(dim, 
dimCount));

Review comment:
       I will look into that. Thanks!

##########
File path: 
lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
##########
@@ -414,4 +505,101 @@ public int compare(FacetResult a, FacetResult b) {
 
     return results;
   }
+
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws 
IOException {
+    // Creates priority queue to store top dimensions and sort by their 
aggregated values/hits and
+    // string values.
+    PriorityQueue<SortedSetDocValuesDimValueResult> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(
+              SortedSetDocValuesDimValueResult a, 
SortedSetDocValuesDimValueResult b) {
+            if (a.value.intValue() > b.value.intValue()) {
+              return false;
+            } else if (a.value.intValue() < b.value.intValue()) {
+              return true;
+            } else {
+              return a.dim.compareTo(b.dim) > 0;
+            }
+          }
+        };
+
+    HashMap<String, SortedSetDocValuesChildOrdsResult> cacheChildOrdsResult = 
new HashMap<>();

Review comment:
       I haven't spent enough time on ordinals yet, and will look into this. Thanks!




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to