Copilot commented on code in PR #13398:
URL: https://github.com/apache/lucene/pull/13398#discussion_r3122628297
##########
lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java:
##########
@@ -202,6 +203,96 @@ public boolean keepFullyDeletedSegment(
dir.close();
}
+ public void testDeleteByQueries() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ for (int i = 0; i < 10; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 10; i < 20; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 20; i < 30; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+ // Match all docs in 2nd segment.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 10, 19));
+ // This query can not be applied on 2nd segment, since it is fully deleted
by prior query.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 12, 15));
+
+ DirectoryReader reader = DirectoryReader.open(writer);
+ IndexSearcher searcher = newSearcher(reader);
+ assertEquals(
+ 0, searcher.search(LongPoint.newRangeQuery("content", 10, 19),
100).totalHits.value());
+ assertEquals(
+ 20, searcher.search(LongPoint.newRangeQuery("content", 0, 30),
100).totalHits.value());
+
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+
+ public void testDeleteByTerms() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ doc = new Document();
+ doc.add(new TextField("f", "foo bar tea", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ doc = new Document();
+ doc.add(new TextField("f", "foo bar", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ doc = new Document();
+ doc.add(new TextField("f", "foo tea", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ Term[] delTerms = new Term[3];
+ // This query only can be applied on 3rd segment, since segment 1, 2 are
fully deleted by prior
+ // query.
+ delTerms[0] = new Term("f", "foo");
+ // This query can be applied on every segment, since it is executed
firstly.
+ delTerms[1] = new Term("f", "bar");
+ // This query can not be applied, since all segments are fully deleted by
prior query.
Review Comment:
These comments refer to “query”, but this test is deleting by `Term` (and
term delete order is based on term sort order, not the array index). Please
update the wording to avoid confusion (e.g., refer to “delete term” and clarify
ordering if relevant).
##########
lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java:
##########
@@ -202,6 +203,96 @@ public boolean keepFullyDeletedSegment(
dir.close();
}
+ public void testDeleteByQueries() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ for (int i = 0; i < 10; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 10; i < 20; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 20; i < 30; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+ // Match all docs in 2nd segment.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 10, 19));
+ // This query can not be applied on 2nd segment, since it is fully deleted
by prior query.
Review Comment:
Spelling/grammar in these new comments: use “cannot” instead of “can not”.
##########
lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java:
##########
@@ -202,6 +203,96 @@ public boolean keepFullyDeletedSegment(
dir.close();
}
+ public void testDeleteByQueries() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ for (int i = 0; i < 10; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 10; i < 20; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 20; i < 30; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+ // Match all docs in 2nd segment.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 10, 19));
+ // This query can not be applied on 2nd segment, since it is fully deleted
by prior query.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 12, 15));
+
+ DirectoryReader reader = DirectoryReader.open(writer);
+ IndexSearcher searcher = newSearcher(reader);
+ assertEquals(
+ 0, searcher.search(LongPoint.newRangeQuery("content", 10, 19),
100).totalHits.value());
+ assertEquals(
+ 20, searcher.search(LongPoint.newRangeQuery("content", 0, 30),
100).totalHits.value());
+
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+
+ public void testDeleteByTerms() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ doc = new Document();
+ doc.add(new TextField("f", "foo bar tea", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ doc = new Document();
+ doc.add(new TextField("f", "foo bar", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ doc = new Document();
+ doc.add(new TextField("f", "foo tea", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ Term[] delTerms = new Term[3];
+ // This query only can be applied on 3rd segment, since segment 1, 2 are
fully deleted by prior
+ // query.
+ delTerms[0] = new Term("f", "foo");
+ // This query can be applied on every segment, since it is executed
firstly.
+ delTerms[1] = new Term("f", "bar");
+ // This query can not be applied, since all segments are fully deleted by
prior query.
Review Comment:
Spelling/grammar: use “cannot” instead of “can not”.
##########
lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java:
##########
@@ -202,6 +203,96 @@ public boolean keepFullyDeletedSegment(
dir.close();
}
+ public void testDeleteByQueries() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ for (int i = 0; i < 10; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 10; i < 20; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 20; i < 30; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+ // Match all docs in 2nd segment.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 10, 19));
+ // This query can not be applied on 2nd segment, since it is fully deleted
by prior query.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 12, 15));
+
+ DirectoryReader reader = DirectoryReader.open(writer);
+ IndexSearcher searcher = newSearcher(reader);
+ assertEquals(
+ 0, searcher.search(LongPoint.newRangeQuery("content", 10, 19),
100).totalHits.value());
+ assertEquals(
+ 20, searcher.search(LongPoint.newRangeQuery("content", 0, 30),
100).totalHits.value());
+
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+
+ public void testDeleteByTerms() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ doc = new Document();
+ doc.add(new TextField("f", "foo bar tea", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ doc = new Document();
+ doc.add(new TextField("f", "foo bar", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ doc = new Document();
+ doc.add(new TextField("f", "foo tea", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ Term[] delTerms = new Term[3];
+ // This query only can be applied on 3rd segment, since segment 1, 2 are
fully deleted by prior
+ // query.
+ delTerms[0] = new Term("f", "foo");
+ // This query can be applied on every segment, since it is executed
firstly.
Review Comment:
Grammar: “executed first” reads better than “executed firstly” in this
comment.
##########
lucene/core/src/java/org/apache/lucene/index/FrozenBufferedUpdates.java:
##########
@@ -379,6 +379,10 @@ private long
applyQueryDeletes(BufferedUpdatesStream.SegmentState[] segStates)
final LeafReaderContext readerContext = segState.reader.getContext();
for (int i = 0; i < deleteQueries.length; i++) {
+ // Break this loop if this segment is fully deleted by prior
delQueries.
+ if (segState.rld.isFullyDeleted()) {
+ break;
+ }
Query query = deleteQueries[i];
Review Comment:
The new per-query `segState.rld.isFullyDeleted()` check adds a synchronized
call on every iteration, even when the segment is not fully deleted. Consider
tracking remaining live docs (or delete count vs `reader.maxDoc()`) during the
loop so you only perform the fully-deleted check when needed, to avoid overhead
on large `deleteQueries` arrays.
##########
lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java:
##########
@@ -202,6 +203,96 @@ public boolean keepFullyDeletedSegment(
dir.close();
}
+ public void testDeleteByQueries() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ for (int i = 0; i < 10; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 10; i < 20; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 20; i < 30; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+ // Match all docs in 2nd segment.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 10, 19));
+ // This query can not be applied on 2nd segment, since it is fully deleted
by prior query.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 12, 15));
+
+ DirectoryReader reader = DirectoryReader.open(writer);
+ IndexSearcher searcher = newSearcher(reader);
+ assertEquals(
+ 0, searcher.search(LongPoint.newRangeQuery("content", 10, 19),
100).totalHits.value());
+ assertEquals(
+ 20, searcher.search(LongPoint.newRangeQuery("content", 0, 30),
100).totalHits.value());
+
Review Comment:
This test only asserts final hit counts; it doesn't actually validate that
the 2nd delete query is skipped once the segment becomes fully deleted. To
cover the optimization, consider using a custom `Query` (or wrapper) that
records/throws if its weight/scorer is created for the fully-deleted segment,
and assert that it is not invoked.
##########
lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java:
##########
@@ -202,6 +203,96 @@ public boolean keepFullyDeletedSegment(
dir.close();
}
+ public void testDeleteByQueries() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ for (int i = 0; i < 10; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 10; i < 20; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+
+ for (int i = 20; i < 30; i++) {
+ doc = new Document();
+ doc.add(new LongField("content", i, Field.Store.NO));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+ // Match all docs in 2nd segment.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 10, 19));
+ // This query can not be applied on 2nd segment, since it is fully deleted
by prior query.
+ writer.deleteDocuments(LongPoint.newRangeQuery("content", 12, 15));
+
+ DirectoryReader reader = DirectoryReader.open(writer);
+ IndexSearcher searcher = newSearcher(reader);
+ assertEquals(
+ 0, searcher.search(LongPoint.newRangeQuery("content", 10, 19),
100).totalHits.value());
+ assertEquals(
+ 20, searcher.search(LongPoint.newRangeQuery("content", 0, 30),
100).totalHits.value());
+
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+
+ public void testDeleteByTerms() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+
+ Document doc;
+ doc = new Document();
+ doc.add(new TextField("f", "foo bar tea", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ doc = new Document();
+ doc.add(new TextField("f", "foo bar", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ doc = new Document();
+ doc.add(new TextField("f", "foo tea", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+
+ Term[] delTerms = new Term[3];
+ // This query only can be applied on 3rd segment, since segment 1, 2 are
fully deleted by prior
+ // query.
+ delTerms[0] = new Term("f", "foo");
+ // This query can be applied on every segment, since it is executed
firstly.
+ delTerms[1] = new Term("f", "bar");
+ // This query can not be applied, since all segments are fully deleted by
prior query.
+ delTerms[2] = new Term("f", "tea");
+
+ writer.deleteDocuments(delTerms);
+
+ DirectoryReader reader = DirectoryReader.open(writer);
+ IndexSearcher searcher = newSearcher(reader);
+ assertEquals(0, searcher.search(new MatchAllDocsQuery(),
10).totalHits.value());
+
Review Comment:
This test asserts that all docs are deleted, but it doesn't verify that
later delete terms are skipped once earlier terms fully delete the segment(s).
To make the behavior testable, consider introducing an instrumented
term/postings lookup (or similar hook) that fails if a term lookup occurs after
the segment is already fully deleted, and assert that it is not triggered.
##########
lucene/core/src/java/org/apache/lucene/index/FrozenBufferedUpdates.java:
##########
@@ -465,6 +469,10 @@ private long
applyTermDeletes(BufferedUpdatesStream.SegmentState[] segStates) th
BytesRef delTerm;
TermDocsIterator termDocsIterator = new
TermDocsIterator(segState.reader, true);
while ((delTerm = iter.next()) != null) {
+ // Break this loop if this segment is fully deleted by prior delTerms.
+ if (segState.rld.isFullyDeleted()) {
+ break;
+ }
final DocIdSetIterator iterator =
termDocsIterator.nextTerm(iter.field(), delTerm);
Review Comment:
Similar to query deletes, calling `segState.rld.isFullyDeleted()` on every
term iteration adds synchronization overhead even when the segment is not fully
deleted. Consider maintaining a local fully-deleted flag based on delete
count/maxDoc and only checking when deletions actually occur, so large delete
term sets don't pay extra per-term cost.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]