Re: [PR] SOLR-18189: Improve tracking of duplicate Solr docs with content hash [solr]

via GitHub Fri, 03 Apr 2026 10:00:54 -0700


dsmiley commented on code in PR #4263:
URL: https://github.com/apache/solr/pull/4263#discussion_r3033423428



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {
+      return;
+    }
+
+    while (true) {
+      logOverlyFailedRetries(i, cmd);
+      try {
+        super.processAdd(cmd);
+        return;
+      } catch (SolrException e) {
+        if (e.code() != 409) {
+          throw e;
+        }
+        ++i;
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    try {
+      super.finish();
+    } finally {
+      rsp.addToLog("numAddsExisting", sameCount + differentCount + 
unknownCount);
+      rsp.addToLog("numAddsExistingWithIdentical", sameCount);
+      rsp.addToLog("numAddsExistingUnknown", unknownCount);
+    }
+  }
+
+  private static void logOverlyFailedRetries(int i, UpdateCommand cmd) {
+    if ((i & 255) == 255) {
+      log.warn("Unusual number of optimistic concurrency retries: retries={} 
cmd={}", i, cmd);
+    }
+  }
+
+  void setOldDocProvider(OldDocProvider oldDocProvider) {
+    this.oldDocProvider = oldDocProvider;
+  }
+
+  void setDiscardSameDocuments(boolean discardSameDocuments) {
+    this.discardSameDocuments = discardSameDocuments;
+  }
+
+  private boolean validateHash(BytesRef indexedDocId, String newHash) throws 
IOException {
+    assert null != indexedDocId;
+
+    var docFoundAndOldUserVersions = 
getOldUserVersionsFromStored(indexedDocId);
+    if (docFoundAndOldUserVersions.found) {
+      String oldHash =
+          docFoundAndOldUserVersions.oldHash; // No hash: might want to keep 
track of these too
+      if (oldHash == null) {
+        unknownCount++;
+        return true;
+      } else if (Objects.equals(newHash, oldHash)) {
+        sameCount++;
+        return !discardSameDocuments;
+      } else {
+        differentCount++;
+        return true;
+      }
+    }
+    return true; // Doc not found
+  }
+
+  private DocFoundAndOldUserAndSolrVersions 
getOldUserVersionsFromStored(BytesRef indexedDocId)
+      throws IOException {
+    SolrInputDocument oldDoc = oldDocProvider.getDocument(core, 
hashField.getName(), indexedDocId);
+    return null == oldDoc
+        ? DocFoundAndOldUserAndSolrVersions.NOT_FOUND
+        : getUserVersionAndSolrVersionFromDocument(oldDoc);
+  }
+
+  private DocFoundAndOldUserAndSolrVersions 
getUserVersionAndSolrVersionFromDocument(
+      SolrInputDocument oldDoc) {
+    Object o = oldDoc.getFieldValue(hashField.getName());
+    if (o != null) {
+      return new DocFoundAndOldUserAndSolrVersions(o.toString());
+    }
+    return new DocFoundAndOldUserAndSolrVersions();
+  }
+
+  public String computeDocHash(SolrInputDocument doc) {
+    List<String> docIncludedFieldNames =
+        doc.getFieldNames().stream()
+            .filter(includedFields) // Keep fields that match 'included 
fields' matcher...
+            .filter(
+                excludedFields
+                    .negate()) // ...and exclude fields that match 'excluded 
fields' matcher
+            .sorted() // Sort to ensure consistent field order across 
different doc field orders
+            .toList();
+
+    final Signature sig = new Lookup3Signature();
+    for (String fieldName : docIncludedFieldNames) {
+      sig.add(fieldName);
+      Object o = doc.getFieldValue(fieldName);
+      if (o instanceof Collection) {
+        for (Object oo : (Collection<?>) o) {
+          sig.add(String.valueOf(oo));
+        }
+      } else {
+        sig.add(String.valueOf(o));
+      }
+    }
+
+    // Signature, depending on implementation, may return 8-byte or 16-byte 
value
+    byte[] signature = sig.getSignature();
+    return Base64.getEncoder()
+        .encodeToString(signature); // Makes a base64 hash out of signature 
value
+  }
+
+  interface OldDocProvider {

Review Comment:
   realistically, what impls are we going to have?



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {
+      return;
+    }
+
+    while (true) {
+      logOverlyFailedRetries(i, cmd);
+      try {
+        super.processAdd(cmd);
+        return;
+      } catch (SolrException e) {
+        if (e.code() != 409) {
+          throw e;
+        }
+        ++i;
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    try {
+      super.finish();
+    } finally {
+      rsp.addToLog("numAddsExisting", sameCount + differentCount + 
unknownCount);
+      rsp.addToLog("numAddsExistingWithIdentical", sameCount);
+      rsp.addToLog("numAddsExistingUnknown", unknownCount);
+    }
+  }
+
+  private static void logOverlyFailedRetries(int i, UpdateCommand cmd) {
+    if ((i & 255) == 255) {
+      log.warn("Unusual number of optimistic concurrency retries: retries={} 
cmd={}", i, cmd);
+    }
+  }
+
+  void setOldDocProvider(OldDocProvider oldDocProvider) {
+    this.oldDocProvider = oldDocProvider;
+  }
+
+  void setDiscardSameDocuments(boolean discardSameDocuments) {
+    this.discardSameDocuments = discardSameDocuments;
+  }
+
+  private boolean validateHash(BytesRef indexedDocId, String newHash) throws 
IOException {
+    assert null != indexedDocId;
+
+    var docFoundAndOldUserVersions = 
getOldUserVersionsFromStored(indexedDocId);
+    if (docFoundAndOldUserVersions.found) {
+      String oldHash =
+          docFoundAndOldUserVersions.oldHash; // No hash: might want to keep 
track of these too
+      if (oldHash == null) {
+        unknownCount++;
+        return true;
+      } else if (Objects.equals(newHash, oldHash)) {
+        sameCount++;
+        return !discardSameDocuments;
+      } else {
+        differentCount++;
+        return true;
+      }
+    }
+    return true; // Doc not found
+  }
+
+  private DocFoundAndOldUserAndSolrVersions 
getOldUserVersionsFromStored(BytesRef indexedDocId)
+      throws IOException {
+    SolrInputDocument oldDoc = oldDocProvider.getDocument(core, 
hashField.getName(), indexedDocId);
+    return null == oldDoc
+        ? DocFoundAndOldUserAndSolrVersions.NOT_FOUND
+        : getUserVersionAndSolrVersionFromDocument(oldDoc);
+  }
+
+  private DocFoundAndOldUserAndSolrVersions 
getUserVersionAndSolrVersionFromDocument(
+      SolrInputDocument oldDoc) {
+    Object o = oldDoc.getFieldValue(hashField.getName());
+    if (o != null) {
+      return new DocFoundAndOldUserAndSolrVersions(o.toString());
+    }
+    return new DocFoundAndOldUserAndSolrVersions();
+  }
+
+  public String computeDocHash(SolrInputDocument doc) {
+    List<String> docIncludedFieldNames =
+        doc.getFieldNames().stream()
+            .filter(includedFields) // Keep fields that match 'included 
fields' matcher...
+            .filter(
+                excludedFields
+                    .negate()) // ...and exclude fields that match 'excluded 
fields' matcher
+            .sorted() // Sort to ensure consistent field order across 
different doc field orders
+            .toList();
+
+    final Signature sig = new Lookup3Signature();
+    for (String fieldName : docIncludedFieldNames) {
+      sig.add(fieldName);
+      Object o = doc.getFieldValue(fieldName);
+      if (o instanceof Collection) {
+        for (Object oo : (Collection<?>) o) {
+          sig.add(String.valueOf(oo));
+        }
+      } else {
+        sig.add(String.valueOf(o));
+      }
+    }
+
+    // Signature, depending on implementation, may return 8-byte or 16-byte 
value
+    byte[] signature = sig.getSignature();
+    return Base64.getEncoder()
+        .encodeToString(signature); // Makes a base64 hash out of signature 
value
+  }
+
+  interface OldDocProvider {
+    SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef 
indexedDocId)
+        throws IOException;
+  }
+
+  private static class DefaultDocProvider implements OldDocProvider {
+    @Override
+    public SolrInputDocument getDocument(SolrCore core, String hashField, 
BytesRef indexedDocId)
+        throws IOException {
+      return RealTimeGetComponent.getInputDocument(
+          core, indexedDocId, indexedDocId, null, Set.of(hashField), 
Resolution.PARTIAL);
+    }
+  }
+
+  private static class DocFoundAndOldUserAndSolrVersions {

Review Comment:
   should this be a record?  Or simply an Optional of the oldHash?



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;

Review Comment:
   shouldn't this be redone as a for-i loop?



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java:
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.util.plugin.SolrCoreAware;
+
+/** Factory for {@link ContentHashVersionProcessor} instances. */
+public class ContentHashVersionProcessorFactory extends 
UpdateRequestProcessorFactory
+    implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways {
+  private static final char SEPARATOR = ','; // Separator for 
included/excluded fields
+  static final String CONTENT_HASH_ENABLED_PARAM = "contentHashEnabled";
+  private List<String> includeFields =
+      Collections.singletonList("*"); // Included fields defaults to 'all'

Review Comment:
   modern Java prefers `List.of`



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {
+      return;
+    }
+
+    while (true) {
+      logOverlyFailedRetries(i, cmd);
+      try {
+        super.processAdd(cmd);
+        return;
+      } catch (SolrException e) {
+        if (e.code() != 409) {
+          throw e;
+        }
+        ++i;
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    try {
+      super.finish();
+    } finally {
+      rsp.addToLog("numAddsExisting", sameCount + differentCount + 
unknownCount);
+      rsp.addToLog("numAddsExistingWithIdentical", sameCount);
+      rsp.addToLog("numAddsExistingUnknown", unknownCount);

Review Comment:
   RE "numAdds" -- lets use "contentHash" prefix so as to associate these with 
this URP, and not confusing it with DocBasedVersionConstraintsProcessor which 
ought to include similar logs but doesn't yet do so.
   
   IMO adding a new doc shouldn't add any log.  I would expect one of these to 
incorporate the word "drop" or "discard" to reflect an action taken.  I 
understand this URP can be configured to *not* take that action, albeit that 
would be in an exploratory / trial situation that ought to be temporary (I 
imagine).



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {

Review Comment:
   it's not at all obvious to me that the name "validateHash" does more than 
hash validation, whatever that is.  Yet it seems the return value is whether we 
should "discard".   (FWIW DocBasedVersionConstraintsProcessor uses the word 
"Dropping" or simply "drop").



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java:
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.util.plugin.SolrCoreAware;
+
+/** Factory for {@link ContentHashVersionProcessor} instances. */
+public class ContentHashVersionProcessorFactory extends 
UpdateRequestProcessorFactory
+    implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways {
+  private static final char SEPARATOR = ','; // Separator for 
included/excluded fields
+  static final String CONTENT_HASH_ENABLED_PARAM = "contentHashEnabled";
+  private List<String> includeFields =
+      Collections.singletonList("*"); // Included fields defaults to 'all'
+  private List<String> excludeFields = new ArrayList<>();
+  // No excluded field by default, yet hashFieldName is excluded by default
+  private String hashFieldName =

Review Comment:
   arguably we should insist this be provided rather than some auto-magic name



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {
+      return;
+    }
+
+    while (true) {
+      logOverlyFailedRetries(i, cmd);
+      try {
+        super.processAdd(cmd);
+        return;
+      } catch (SolrException e) {
+        if (e.code() != 409) {
+          throw e;
+        }
+        ++i;
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    try {
+      super.finish();
+    } finally {
+      rsp.addToLog("numAddsExisting", sameCount + differentCount + 
unknownCount);
+      rsp.addToLog("numAddsExistingWithIdentical", sameCount);
+      rsp.addToLog("numAddsExistingUnknown", unknownCount);
+    }
+  }
+
+  private static void logOverlyFailedRetries(int i, UpdateCommand cmd) {
+    if ((i & 255) == 255) {
+      log.warn("Unusual number of optimistic concurrency retries: retries={} 
cmd={}", i, cmd);
+    }
+  }
+
+  void setOldDocProvider(OldDocProvider oldDocProvider) {
+    this.oldDocProvider = oldDocProvider;
+  }
+
+  void setDiscardSameDocuments(boolean discardSameDocuments) {
+    this.discardSameDocuments = discardSameDocuments;
+  }
+
+  private boolean validateHash(BytesRef indexedDocId, String newHash) throws 
IOException {
+    assert null != indexedDocId;
+
+    var docFoundAndOldUserVersions = 
getOldUserVersionsFromStored(indexedDocId);

Review Comment:
   Oh wait... docs that have been but not yet visible via the IndexSearcher 
will not have their content hashes searchable.  Albeit in such a situation, the 
"worst" thing that would happen is we index a doc that could have been 
dropped/discarded -- a missed optimization opportunity.
   
   A terms lookup on the hash should be much faster than the RTG + 
double-docValues lookups though.



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {
+      return;
+    }
+
+    while (true) {
+      logOverlyFailedRetries(i, cmd);
+      try {
+        super.processAdd(cmd);
+        return;
+      } catch (SolrException e) {
+        if (e.code() != 409) {
+          throw e;
+        }
+        ++i;
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    try {
+      super.finish();
+    } finally {
+      rsp.addToLog("numAddsExisting", sameCount + differentCount + 
unknownCount);
+      rsp.addToLog("numAddsExistingWithIdentical", sameCount);
+      rsp.addToLog("numAddsExistingUnknown", unknownCount);
+    }
+  }
+
+  private static void logOverlyFailedRetries(int i, UpdateCommand cmd) {
+    if ((i & 255) == 255) {
+      log.warn("Unusual number of optimistic concurrency retries: retries={} 
cmd={}", i, cmd);
+    }
+  }
+
+  void setOldDocProvider(OldDocProvider oldDocProvider) {
+    this.oldDocProvider = oldDocProvider;
+  }
+
+  void setDiscardSameDocuments(boolean discardSameDocuments) {
+    this.discardSameDocuments = discardSameDocuments;
+  }
+
+  private boolean validateHash(BytesRef indexedDocId, String newHash) throws 
IOException {

Review Comment:
   Perhaps we should align with a similar name in 
DocBasedVersionConstraintsProcessor, and rename to `hashIsAcceptable` ?



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;

Review Comment:
   See DocBasedVersionConstraintsProcessor.  It's worth a comment here that you 
are emulating similar logic there.



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {
+      return;
+    }
+
+    while (true) {
+      logOverlyFailedRetries(i, cmd);
+      try {
+        super.processAdd(cmd);
+        return;
+      } catch (SolrException e) {
+        if (e.code() != 409) {
+          throw e;
+        }
+        ++i;
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    try {
+      super.finish();
+    } finally {
+      rsp.addToLog("numAddsExisting", sameCount + differentCount + 
unknownCount);
+      rsp.addToLog("numAddsExistingWithIdentical", sameCount);
+      rsp.addToLog("numAddsExistingUnknown", unknownCount);
+    }
+  }
+
+  private static void logOverlyFailedRetries(int i, UpdateCommand cmd) {
+    if ((i & 255) == 255) {
+      log.warn("Unusual number of optimistic concurrency retries: retries={} 
cmd={}", i, cmd);
+    }
+  }
+
+  void setOldDocProvider(OldDocProvider oldDocProvider) {
+    this.oldDocProvider = oldDocProvider;
+  }
+
+  void setDiscardSameDocuments(boolean discardSameDocuments) {
+    this.discardSameDocuments = discardSameDocuments;
+  }
+
+  private boolean validateHash(BytesRef indexedDocId, String newHash) throws 
IOException {
+    assert null != indexedDocId;
+
+    var docFoundAndOldUserVersions = 
getOldUserVersionsFromStored(indexedDocId);

Review Comment:
   This is one possible algorithm -- lookup  hash from docValues of a doc that 
we got by first looking up the current doc by ID (a terms index lookup).  But 
this is ~twice as expensive as doing a terms index lookup of the new hash, 
assuming we find no doc with that hash.  If we do find one, then it's similar, 
as we then need to fetch the ID via docValues to see if it's the same doc.  But 
we're about to drop the doc then, which isn't the common path.  Even "need" 
there is debatable; it'd be extremely unlikely to find a match by content hash 
to a different doc ID.
   
   Basically -- does the content hash field have indexed=true XOR 
docValues=true.  We don't need to support both (IMO) -- we can choose what's 
most efficient for this use-case.
   I understand you may not have time to implement another algorithm.
   
   Just an FYI: the postingsFormat (of the terms index) is configurable and 
there are a number of lovely implementations, incuding a bloom filter one, 
which would make a lot of sense for "lookup" / exact match only use-case.  Such 
a choice is up to the user to configure in the schema if they wish.



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {
+      return;
+    }
+
+    while (true) {
+      logOverlyFailedRetries(i, cmd);
+      try {
+        super.processAdd(cmd);
+        return;
+      } catch (SolrException e) {
+        if (e.code() != 409) {
+          throw e;
+        }
+        ++i;
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    try {
+      super.finish();
+    } finally {
+      rsp.addToLog("numAddsExisting", sameCount + differentCount + 
unknownCount);
+      rsp.addToLog("numAddsExistingWithIdentical", sameCount);
+      rsp.addToLog("numAddsExistingUnknown", unknownCount);
+    }
+  }
+
+  private static void logOverlyFailedRetries(int i, UpdateCommand cmd) {
+    if ((i & 255) == 255) {
+      log.warn("Unusual number of optimistic concurrency retries: retries={} 
cmd={}", i, cmd);
+    }
+  }
+
+  void setOldDocProvider(OldDocProvider oldDocProvider) {
+    this.oldDocProvider = oldDocProvider;
+  }
+
+  void setDiscardSameDocuments(boolean discardSameDocuments) {
+    this.discardSameDocuments = discardSameDocuments;
+  }
+
+  private boolean validateHash(BytesRef indexedDocId, String newHash) throws 
IOException {
+    assert null != indexedDocId;
+
+    var docFoundAndOldUserVersions = 
getOldUserVersionsFromStored(indexedDocId);
+    if (docFoundAndOldUserVersions.found) {
+      String oldHash =
+          docFoundAndOldUserVersions.oldHash; // No hash: might want to keep 
track of these too
+      if (oldHash == null) {
+        unknownCount++;
+        return true;
+      } else if (Objects.equals(newHash, oldHash)) {
+        sameCount++;
+        return !discardSameDocuments;
+      } else {
+        differentCount++;
+        return true;
+      }
+    }
+    return true; // Doc not found
+  }
+
+  private DocFoundAndOldUserAndSolrVersions 
getOldUserVersionsFromStored(BytesRef indexedDocId)

Review Comment:
   IMO we should not support this.  It's not unreasonable to demand that 
someone using this feature enable docValues on the hash field.



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,196 @@
+package org.apache.solr.update.processor;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values, and uses this hash

Review Comment:
   "doc values" here could be confusing... as Lucene/Solr has "docValues" and 
this URP in fact uses that technical mechanism.



##########
solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java:
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Base64;
+import java.util.Collection;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Predicate;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.RealTimeGetComponent;
+import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.UpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An implementation of {@link UpdateRequestProcessor} which computes a hash 
of selected doc values,
+ * and uses this hash value to reject/accept doc updates.
+ *
+ * <ul>
+ *   <li>When no corresponding doc with same id exists (create), computed hash 
is added to the
+ *       document.
+ *   <li>When a previous doc exists (update), a new hash is computed using new 
version values and
+ *       compared with old hash.
+ * </ul>
+ *
+ * Depending on {#discardSameDocuments} value, this processor may reject or 
accept doc update. This
+ * implementation can be used for monitoring or rejecting no-op updates 
(updates that do not change
+ * Solr document).
+ *
+ * <p>Note: hash is computed using {@link Lookup3Signature}.
+ *
+ * @see Lookup3Signature
+ */
+public class ContentHashVersionProcessor extends UpdateRequestProcessor {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private final SchemaField hashField;
+  private final SolrQueryResponse rsp;
+  private final SolrCore core;
+  private final Predicate<String> includedFields; // Matcher for included 
fields in hash
+  private final Predicate<String> excludedFields; // Matcher for excluded 
fields from hash
+  private OldDocProvider oldDocProvider = new DefaultDocProvider();
+  private boolean discardSameDocuments;
+  private int sameCount = 0;
+  private int differentCount = 0;
+  private int unknownCount = 0;
+
+  public ContentHashVersionProcessor(
+      Predicate<String> hashIncludedFields,
+      Predicate<String> hashExcludedFields,
+      String hashFieldName,
+      SolrQueryRequest req,
+      SolrQueryResponse rsp,
+      UpdateRequestProcessor next) {
+    super(next);
+    this.core = req.getCore();
+    this.hashField = new SchemaField(hashFieldName, new TextField());
+    this.rsp = rsp;
+    this.includedFields = hashIncludedFields;
+    this.excludedFields = hashExcludedFields;
+  }
+
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    SolrInputDocument newDoc = cmd.getSolrInputDocument();
+    String newHash = computeDocHash(newDoc);
+    newDoc.setField(hashField.getName(), newHash);
+    int i = 0;
+
+    if (!validateHash(cmd.getIndexedId(), newHash)) {
+      return;
+    }
+
+    while (true) {
+      logOverlyFailedRetries(i, cmd);
+      try {
+        super.processAdd(cmd);
+        return;
+      } catch (SolrException e) {
+        if (e.code() != 409) {
+          throw e;
+        }
+        ++i;
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    try {
+      super.finish();
+    } finally {
+      rsp.addToLog("numAddsExisting", sameCount + differentCount + 
unknownCount);
+      rsp.addToLog("numAddsExistingWithIdentical", sameCount);
+      rsp.addToLog("numAddsExistingUnknown", unknownCount);
+    }
+  }
+
+  private static void logOverlyFailedRetries(int i, UpdateCommand cmd) {
+    if ((i & 255) == 255) {
+      log.warn("Unusual number of optimistic concurrency retries: retries={} 
cmd={}", i, cmd);
+    }
+  }
+
+  void setOldDocProvider(OldDocProvider oldDocProvider) {
+    this.oldDocProvider = oldDocProvider;
+  }
+
+  void setDiscardSameDocuments(boolean discardSameDocuments) {
+    this.discardSameDocuments = discardSameDocuments;
+  }
+
+  private boolean validateHash(BytesRef indexedDocId, String newHash) throws 
IOException {
+    assert null != indexedDocId;
+
+    var docFoundAndOldUserVersions = 
getOldUserVersionsFromStored(indexedDocId);
+    if (docFoundAndOldUserVersions.found) {
+      String oldHash =
+          docFoundAndOldUserVersions.oldHash; // No hash: might want to keep 
track of these too
+      if (oldHash == null) {
+        unknownCount++;
+        return true;
+      } else if (Objects.equals(newHash, oldHash)) {
+        sameCount++;
+        return !discardSameDocuments;
+      } else {
+        differentCount++;
+        return true;
+      }
+    }
+    return true; // Doc not found
+  }
+
+  private DocFoundAndOldUserAndSolrVersions 
getOldUserVersionsFromStored(BytesRef indexedDocId)
+      throws IOException {
+    SolrInputDocument oldDoc = oldDocProvider.getDocument(core, 
hashField.getName(), indexedDocId);
+    return null == oldDoc
+        ? DocFoundAndOldUserAndSolrVersions.NOT_FOUND
+        : getUserVersionAndSolrVersionFromDocument(oldDoc);
+  }
+
+  private DocFoundAndOldUserAndSolrVersions 
getUserVersionAndSolrVersionFromDocument(
+      SolrInputDocument oldDoc) {
+    Object o = oldDoc.getFieldValue(hashField.getName());
+    if (o != null) {
+      return new DocFoundAndOldUserAndSolrVersions(o.toString());
+    }
+    return new DocFoundAndOldUserAndSolrVersions();
+  }
+
+  public String computeDocHash(SolrInputDocument doc) {
+    List<String> docIncludedFieldNames =

Review Comment:
   This results in a double-navigation of the fields of a document in order to 
have the hash be insensitive to the order of fields added.  But in practice, a 
given end-to-end pipeline is going to be deterministic.  Hmm.  I suppose its no 
big deal.  To improve, you cold instead stream SolrInputField, sorting by name, 
and get the value from them without another lookup.  You could also do the 
whole thing as a stream pipeline without needlessing producing a List.  It'd 
end in a forEach to process the value.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] SOLR-18189: Improve tracking of duplicate Solr docs with content hash [solr]

Reply via email to