dsmiley commented on code in PR #4277:
URL: https://github.com/apache/solr/pull/4277#discussion_r3090778383


##########
solr/core/src/java/org/apache/solr/handler/component/CombinedQueryComponent.java:
##########
@@ -296,6 +317,10 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest 
sreq) {
     long approximateTotalHits = 0;
     Map<String, List<ShardDoc>> shardDocMap = new HashMap<>();
     String[] queriesToCombineKeys = 
rb.req.getParams().getParams(CombinerParams.COMBINER_QUERY);
+    // Build per-shard set of doc IDs retained after collapse in simpleCombine.
+    // Used to filter per-query docs so that RRF doesn't reintroduce docs
+    // excluded by collapse at the shard level.
+    Map<String, Set<Object>> combinedDocIds = new HashMap<>();

Review Comment:
   again; the map should clarify what the key is.



##########
solr/core/src/test/org/apache/solr/handler/component/DistributedCombinedQueryComponentTest.java:
##########
@@ -290,4 +290,114 @@ public void testForcedDistrib() throws Exception {
     QueryResponse rsp = query("qt", "/forcedDistribTest", "q", "*:*", "rows", 
"0");
     // ForcedDistribSearchHandler would trigger a failure if this didn't work
   }
+
+  /**
+   * Reproduces duplicate documents in combined query results when using 
collapse. Two different
+   * sub-queries may select different group heads for the same collapse field 
value, and
+   * simpleCombine() merges by Lucene doc ID — not by collapse field value — 
so both survive.
+   *
+   * <p>Data setup (mod3_idv is the collapse field, values cycle 
1,2,0,1,2,0,...):
+   *
+   * <pre>
+   *   id=1  mod3_idv=1  text="alpha bravo"     title="alpha bravo"
+   *   id=2  mod3_idv=2  text="alpha charlie"    title="alpha charlie"
+   *   id=3  mod3_idv=0  text="bravo delta"      title="bravo delta"
+   *   id=4  mod3_idv=1  text="charlie delta"    title="charlie delta"
+   *   id=5  mod3_idv=2  text="alpha delta"      title="alpha delta"
+   *   id=6  mod3_idv=0  text="bravo charlie"    title="bravo charlie"
+   * </pre>
+   *
+   * <p>Query 1: "alpha bravo" → matches docs 1,2,3,5,6 with varying scores. 
After collapse on
+   * mod3_idv, picks one group head per value (0,1,2).
+   *
+   * <p>Query 2: "charlie delta" → matches docs 2,3,4,5,6 with varying scores. 
After collapse on
+   * mod3_idv, picks one group head per value (0,1,2).
+   *
+   * <p>Because the queries score differently, they may pick DIFFERENT group 
heads for the same
+   * mod3_idv value. After simpleCombine() merges by doc ID, both heads appear 
→ duplicates on the
+   * collapse field.
+   */
+  @Test
+  public void testCollapseWithCombinedQueryProducesDuplicates() throws 
Exception {
+    del("*:*");
+
+    // Index 6 docs where mod3_idv groups docs: {3,6}→0, {1,4}→1, {2,5}→2
+    // All docs indexed to the same shard (first client) to ensure co-location,
+    // which is a requirement for collapse in SolrCloud.
+    List<SolrInputDocument> docs = new ArrayList<>();
+    String[][] data = {
+      {"1", "alpha bravo", "alpha bravo"},
+      {"2", "alpha charlie", "alpha charlie"},
+      {"3", "bravo delta", "bravo delta"},
+      {"4", "charlie delta", "charlie delta"},
+      {"5", "alpha delta", "alpha delta"},
+      {"6", "bravo charlie", "bravo charlie"},
+    };
+    for (String[] row : data) {
+      SolrInputDocument doc = new SolrInputDocument();
+      doc.addField("id", row[0]);
+      doc.addField("text", row[1]);
+      doc.addField("title", row[2]);
+      int idVal = Integer.parseInt(row[0]);
+      doc.addField("mod3_idv", idVal % 3);
+      doc.addField("mod3_sdv", String.valueOf(idVal % 3));
+      docs.add(doc);
+    }
+    // Index all docs to the first shard (co-location for collapse).
+    // indexDoc(client, doc) also indexes to controlClient automatically.
+    for (SolrInputDocument doc : docs) {
+      indexDoc(clients.getFirst(), doc);
+    }
+    commit();
+
+    // Two queries that score docs differently, both collapsed on mod3_idv.
+    // Query 1 "alpha bravo" favours doc 1 (mod3_idv=1) and doc 3 (mod3_idv=0).
+    // Query 2 "charlie delta" favours doc 4 (mod3_idv=1) and doc 3 or 6 
(mod3_idv=0).
+    // After collapse, each query returns 3 docs (one per mod3_idv value 
0,1,2).
+    // But the group heads for mod3_idv=1 differ: query1 picks id=1, query2 
picks id=4.
+    // simpleCombine() merges by doc ID, so both id=1 and id=4 survive → 
duplicate on mod3_idv=1.
+    // q1 -> 1,2,5,3,6    q2 -> 2,4,6,3,5
+    // q1 -> 1,2,2,0,0    q2 -> 2,1,0,0,2
+    // q1 -> 1 for sure, 2,3  => 4 for sure, 3 and 2
+    String jsonQuery =
+        "{\"queries\":"

Review Comment:
   we are on Java 21 now; this really calls for a multi-line string.



##########
solr/core/src/test/org/apache/solr/handler/component/CombinedQuerySolrCloudTest.java:
##########
@@ -249,6 +269,41 @@ public void testQueriesWithFacetAndHighlights() throws 
Exception {
         rsp.getHighlighting().get("5").get("title").getFirst());
   }
 
+  /**
+   * Tests the combined query feature with faceting, highlighting and collapse.
+   *
+   * @throws Exception if any unexpected error occurs during the test 
execution.
+   */
+  @Test
+  public void testQueriesWithFacetAndHighlightsCollapse() throws Exception {
+    // Re-index all docs to the first shard for co-location (required for 
collapse in SolrCloud).

Review Comment:
   again; needless SolrCloud reference



##########
solr/core/src/test/org/apache/solr/handler/component/CombinedQuerySolrCloudTest.java:
##########
@@ -249,6 +269,41 @@ public void testQueriesWithFacetAndHighlights() throws 
Exception {
         rsp.getHighlighting().get("5").get("title").getFirst());
   }
 
+  /**
+   * Tests the combined query feature with faceting, highlighting and collapse.
+   *
+   * @throws Exception if any unexpected error occurs during the test 
execution.
+   */
+  @Test
+  public void testQueriesWithFacetAndHighlightsCollapse() throws Exception {
+    // Re-index all docs to the first shard for co-location (required for 
collapse in SolrCloud).
+    prepareIndexDocsColocated();
+    String jsonQuery =
+        "{\"queries\":"

Review Comment:
   multi-line string please



##########
solr/core/src/java/org/apache/solr/handler/component/CombinedQueryComponent.java:
##########
@@ -296,6 +317,10 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest 
sreq) {
     long approximateTotalHits = 0;
     Map<String, List<ShardDoc>> shardDocMap = new HashMap<>();
     String[] queriesToCombineKeys = 
rb.req.getParams().getParams(CombinerParams.COMBINER_QUERY);
+    // Build per-shard set of doc IDs retained after collapse in simpleCombine.

Review Comment:
   Can you elaborate on this please?  I'm lost.  Is this theoretical?  Tested?



##########
solr/core/src/test/org/apache/solr/handler/component/CombinedQuerySolrCloudTest.java:
##########
@@ -58,6 +60,28 @@ protected String getCloudSolrConfig() {
   }
 
   private synchronized void prepareIndexDocs() throws Exception {
+    List<SolrInputDocument> docs = getSolrDocuments();
+    del("*:*");
+    for (SolrInputDocument doc : docs) {
+      indexDoc(doc);
+    }
+    commit();
+  }
+
+  /** Index all docs to the same shard for co-location (limitation for 
collapse in SolrCloud). */

Review Comment:
   no need to reference SolrCloud, it's a limitation of collapse itself.



##########
solr/core/src/java/org/apache/solr/handler/component/CombinedQueryComponent.java:
##########
@@ -258,6 +266,19 @@ private void prepareCombinedResponseBuilder(
     }
   }
 
+  /** Extracts the CollapsingPostFilter from the filter list, if present. */
+  private static CollapsingQParserPlugin.CollapsingPostFilter 
getCollapseFilter(

Review Comment:
   I suppose we could have more than one?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to