This is an automated email from the ASF dual-hosted git repository. dsmiley pushed a commit to branch branch_9x in repository https://gitbox.apache.org/repos/asf/solr.git
commit aba880e5e014c5b692110b4dc7f5bfb55aac0baf Author: Luke Kot-Zaniewski <[email protected]> AuthorDate: Sat Jan 31 14:51:21 2026 -0500 SOLR-18071: Support stored fields in ExportWriter (#4053) via a new includeStoredFields parameter (cherry picked from commit 45adb11a9eaed2456244cd03d573d8daf10d92e1) --- ...R-18071-support-stored-fields-export-writer.yml | 8 + .../solr/handler/export/DoubleFieldWriter.java | 8 +- .../apache/solr/handler/export/ExportWriter.java | 87 ++++-- .../apache/solr/handler/export/FieldWriter.java | 12 +- .../solr/handler/export/FloatFieldWriter.java | 8 +- .../apache/solr/handler/export/IntFieldWriter.java | 8 +- .../solr/handler/export/LongFieldWriter.java | 8 +- .../solr/handler/export/MultiFieldWriter.java | 9 +- .../solr/handler/export/StoredFieldsWriter.java | 143 ++++++++++ .../solr/handler/export/StringFieldWriter.java | 8 +- .../collection1/conf/schema-sortingresponse.xml | 20 +- .../solr/handler/export/TestExportWriter.java | 294 ++++++++++++++++++++- .../query-guide/pages/exporting-result-sets.adoc | 73 ++++- 13 files changed, 622 insertions(+), 64 deletions(-) diff --git a/changelog/unreleased/SOLR-18071-support-stored-fields-export-writer.yml b/changelog/unreleased/SOLR-18071-support-stored-fields-export-writer.yml new file mode 100644 index 00000000000..dbd1b8c0237 --- /dev/null +++ b/changelog/unreleased/SOLR-18071-support-stored-fields-export-writer.yml @@ -0,0 +1,8 @@ +# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc +title: Support including stored fields in Export Writer output. 
+type: added # added, changed, fixed, deprecated, removed, dependency_update, security, other +authors: + - name: Luke Kot-Zaniewski +links: + - name: SOLR-18071 + url: https://issues.apache.org/jira/browse/SOLR-18071 diff --git a/solr/core/src/java/org/apache/solr/handler/export/DoubleFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/DoubleFieldWriter.java index e439560894b..561d0336678 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/DoubleFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/DoubleFieldWriter.java @@ -34,8 +34,7 @@ class DoubleFieldWriter extends FieldWriter { } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { double val; SortValue sortValue = sortDoc.getSortValue(this.field); @@ -43,7 +42,7 @@ class DoubleFieldWriter extends FieldWriter { if (sortValue.isPresent()) { val = (double) sortValue.getCurrentValue(); } else { // empty-value - return false; + return; } } else { // field is not part of 'sort' param, but part of 'fl' param @@ -53,10 +52,9 @@ class DoubleFieldWriter extends FieldWriter { if (vals != null) { val = Double.longBitsToDouble(vals.longValue()); } else { - return false; + return; } } ew.put(this.field, val); - return true; } } diff --git a/solr/core/src/java/org/apache/solr/handler/export/ExportWriter.java b/solr/core/src/java/org/apache/solr/handler/export/ExportWriter.java index 209680aee12..9307f167b48 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/ExportWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/ExportWriter.java @@ -28,8 +28,11 @@ import java.io.PrintWriter; import java.lang.invoke.MethodHandles; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import 
java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; @@ -99,15 +102,15 @@ public class ExportWriter implements SolrCore.RawWriter, Closeable { public static final String BATCH_SIZE_PARAM = "batchSize"; public static final String QUEUE_SIZE_PARAM = "queueSize"; + public static final String INCLUDE_STORED_FIELDS_PARAM = "includeStoredFields"; public static final int DEFAULT_BATCH_SIZE = 30000; public static final int DEFAULT_QUEUE_SIZE = 150000; private static final FieldWriter EMPTY_FIELD_WRITER = new FieldWriter() { @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, EntryWriter out, int fieldIndex) { - return false; + public void write(SortDoc sortDoc, LeafReaderContext readerContext, EntryWriter out) { + // do nothing } }; @@ -482,45 +485,72 @@ public class ExportWriter implements SolrCore.RawWriter, Closeable { throws IOException { int ord = sortDoc.ord; LeafReaderContext context = leaves.get(ord); - int fieldIndex = 0; for (FieldWriter fieldWriter : writers) { - if (fieldWriter.write(sortDoc, context, ew, fieldIndex)) { - ++fieldIndex; - } + fieldWriter.write(sortDoc, context, ew); } } public List<FieldWriter> getFieldWriters(String[] fields, SolrQueryRequest req) throws IOException { DocValuesIteratorCache dvIterCache = new DocValuesIteratorCache(req.getSearcher(), false); - SolrReturnFields solrReturnFields = new SolrReturnFields(fields, req); + boolean includeStoredFields = req.getParams().getBool(INCLUDE_STORED_FIELDS_PARAM, false); List<FieldWriter> writers = new ArrayList<>(); + Set<SchemaField> docValueFields = new LinkedHashSet<>(); + Map<String, SchemaField> storedFields = new LinkedHashMap<>(); + for (String field : req.getSearcher().getFieldNames()) { if (!solrReturnFields.wantsField(field)) { continue; } SchemaField schemaField = req.getSchema().getField(field); - if 
(!schemaField.hasDocValues()) { - throw new IOException(schemaField + " must have DocValues to use this feature."); - } - boolean multiValued = schemaField.multiValued(); FieldType fieldType = schemaField.getType(); - FieldWriter writer; - if (fieldType instanceof SortableTextField && !schemaField.useDocValuesAsStored()) { - if (solrReturnFields.getRequestedFieldNames() != null - && solrReturnFields.getRequestedFieldNames().contains(field)) { - // Explicitly requested field cannot be used due to not having useDocValuesAsStored=true, - // throw exception + Set<String> requestFieldNames = + solrReturnFields.getRequestedFieldNames() == null + ? Set.of() + : solrReturnFields.getRequestedFieldNames(); + + if (canUseDocValues(schemaField, fieldType)) { + // Prefer DocValues when available + docValueFields.add(schemaField); + } else if (schemaField.stored()) { + // Field is stored-only (no usable DocValues) + if (includeStoredFields) { + storedFields.put(field, schemaField); + } else if (requestFieldNames.contains(field)) { + // Explicitly requested field without DocValues and includeStoredFields=false + throw new IOException( + schemaField + + " must have DocValues to use this feature. 
" + + "Try setting includeStoredFields=true to retrieve this field from stored values."); + } + // Else: glob matched stored-only field without includeStoredFields - silently skip + } else if (requestFieldNames.contains(field)) { + // Explicitly requested field that has neither DocValues nor stored + if (fieldType instanceof SortableTextField && !schemaField.useDocValuesAsStored()) { throw new IOException( schemaField + " Must have useDocValuesAsStored='true' to be used with export writer"); } else { - // Glob pattern matched field cannot be used due to not having useDocValuesAsStored=true - continue; + throw new IOException( + schemaField + " must have DocValues or be stored to use this feature."); } } + // Else: glob matched field with neither DocValues nor stored - silently skip + } + + for (SchemaField schemaField : docValueFields) { + String field = schemaField.getName(); + boolean multiValued = schemaField.multiValued(); + FieldType fieldType = schemaField.getType(); + FieldWriter writer; + + if (schemaField.stored() && !storedFields.isEmpty()) { + // if we're reading StoredFields *anyway*, then we might as well avoid this extra DV lookup + storedFields.put(field, schemaField); + continue; + } DocValuesIteratorCache.FieldDocValuesSupplier docValuesCache = dvIterCache.getSupplier(field); @@ -574,9 +604,24 @@ public class ExportWriter implements SolrCore.RawWriter, Closeable { } writers.add(writer); } + + if (!storedFields.isEmpty()) { + writers.add(new StoredFieldsWriter(storedFields)); + } + return writers; } + private static boolean canUseDocValues(SchemaField schemaField, FieldType fieldType) { + return schemaField.hasDocValues() + // Special handling for SortableTextField: unlike other field types, it requires + // useDocValuesAsStored=true to be included via glob patterns in /export. This + // matches the behavior of /select (which requires useDocValuesAsStored=true for + // all globbed fields) and avoids performance issues. 
The requirement cannot be + // extended to other field types in /export for backward compatibility reasons. + && (!(fieldType instanceof SortableTextField) || schemaField.useDocValuesAsStored()); + } + SortDoc getSortDoc(SolrIndexSearcher searcher, SortField[] sortFields) throws IOException { SortValue[] sortValues = new SortValue[sortFields.length]; IndexSchema schema = searcher.getSchema(); @@ -591,7 +636,7 @@ public class ExportWriter implements SolrCore.RawWriter, Closeable { throw new IOException(field + " must have DocValues to use this feature."); } - if (ft instanceof SortableTextField && schemaField.useDocValuesAsStored() == false) { + if (ft instanceof SortableTextField && !schemaField.useDocValuesAsStored()) { throw new IOException( schemaField + " Must have useDocValuesAsStored='true' to be used with export writer"); } diff --git a/solr/core/src/java/org/apache/solr/handler/export/FieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/FieldWriter.java index 1923afb410f..4b7cf7eb47b 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/FieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/FieldWriter.java @@ -22,7 +22,15 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.solr.common.MapWriter; abstract class FieldWriter { - public abstract boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter out, int fieldIndex) + /** + * Writes field values from the document to the output. 
+ * + * @param sortDoc the document being exported + * @param readerContext the leaf reader context for accessing field values + * @param out the output writer to write field values to + * @throws IOException if an I/O error occurs while reading or writing field values + */ + public abstract void write( + SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter out) throws IOException; } diff --git a/solr/core/src/java/org/apache/solr/handler/export/FloatFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/FloatFieldWriter.java index a60c14e6b0a..68a36f84b71 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/FloatFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/FloatFieldWriter.java @@ -34,8 +34,7 @@ class FloatFieldWriter extends FieldWriter { } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { float val; SortValue sortValue = sortDoc.getSortValue(this.field); @@ -43,7 +42,7 @@ class FloatFieldWriter extends FieldWriter { if (sortValue.isPresent()) { val = (float) sortValue.getCurrentValue(); } else { // empty-value - return false; + return; } } else { // field is not part of 'sort' param, but part of 'fl' param @@ -53,10 +52,9 @@ class FloatFieldWriter extends FieldWriter { if (vals != null) { val = Float.intBitsToFloat((int) vals.longValue()); } else { - return false; + return; } } ew.put(this.field, val); - return true; } } diff --git a/solr/core/src/java/org/apache/solr/handler/export/IntFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/IntFieldWriter.java index bf0396d4ab8..fc7c2d174ab 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/IntFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/IntFieldWriter.java @@ -34,8 +34,7 @@ class IntFieldWriter 
extends FieldWriter { } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { int val; SortValue sortValue = sortDoc.getSortValue(this.field); @@ -43,7 +42,7 @@ class IntFieldWriter extends FieldWriter { if (sortValue.isPresent()) { val = (int) sortValue.getCurrentValue(); } else { // empty-value - return false; + return; } } else { // field is not part of 'sort' param, but part of 'fl' param @@ -53,10 +52,9 @@ class IntFieldWriter extends FieldWriter { if (vals != null) { val = (int) vals.longValue(); } else { - return false; + return; } } ew.put(this.field, val); - return true; } } diff --git a/solr/core/src/java/org/apache/solr/handler/export/LongFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/LongFieldWriter.java index 7961549477c..38997e5a495 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/LongFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/LongFieldWriter.java @@ -35,8 +35,7 @@ class LongFieldWriter extends FieldWriter { } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { long val; SortValue sortValue = sortDoc.getSortValue(this.field); @@ -44,7 +43,7 @@ class LongFieldWriter extends FieldWriter { if (sortValue.isPresent()) { val = (long) sortValue.getCurrentValue(); } else { // empty-value - return false; + return; } } else { // field is not part of 'sort' param, but part of 'fl' param @@ -54,11 +53,10 @@ class LongFieldWriter extends FieldWriter { if (vals != null) { val = vals.longValue(); } else { - return false; + return; } } doWrite(ew, val); - return true; } protected void doWrite(MapWriter.EntryWriter ew, long 
val) throws IOException { diff --git a/solr/core/src/java/org/apache/solr/handler/export/MultiFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/MultiFieldWriter.java index 7f5bdee4899..51ea833f852 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/MultiFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/MultiFieldWriter.java @@ -61,15 +61,14 @@ class MultiFieldWriter extends FieldWriter { } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter out, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter out) throws IOException { if (this.fieldType.isPointField()) { SortedNumericDocValues vals = docValuesCache.getSortedNumericDocValues( sortDoc.docId, readerContext.reader(), readerContext.ord); if (vals == null) { - return false; + return; } final SortedNumericDocValues docVals = vals; @@ -82,13 +81,12 @@ class MultiFieldWriter extends FieldWriter { w.add(bitsToValue.apply(docVals.nextValue())); } }); - return true; } else { SortedSetDocValues vals = docValuesCache.getSortedSetDocValues( sortDoc.docId, readerContext.reader(), readerContext.ord); if (vals == null) { - return false; + return; } final SortedSetDocValues docVals = vals; @@ -105,7 +103,6 @@ class MultiFieldWriter extends FieldWriter { else w.add(fieldType.toObject(f)); } }); - return true; } } diff --git a/solr/core/src/java/org/apache/solr/handler/export/StoredFieldsWriter.java b/solr/core/src/java/org/apache/solr/handler/export/StoredFieldsWriter.java new file mode 100644 index 00000000000..58d502e2579 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/export/StoredFieldsWriter.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.export; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.WeakHashMap; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.index.StoredFields; +import org.apache.solr.common.MapWriter.EntryWriter; +import org.apache.solr.schema.BoolField; +import org.apache.solr.schema.DateValueFieldType; +import org.apache.solr.schema.SchemaField; + +class StoredFieldsWriter extends FieldWriter { + + private static final ThreadLocal<WeakHashMap<IndexReader.CacheKey, StoredFields>> + STORED_FIELDS_MAP = ThreadLocal.withInitial(WeakHashMap::new); + private final Map<String, SchemaField> schemaFields; + + public StoredFieldsWriter(Map<String, SchemaField> fieldsToRead) { + this.schemaFields = fieldsToRead; + } + + @Override + public void write(SortDoc sortDoc, LeafReaderContext readerContext, EntryWriter out) + throws IOException { + WeakHashMap<IndexReader.CacheKey, StoredFields> map = STORED_FIELDS_MAP.get(); + LeafReader reader = readerContext.reader(); + StoredFields storedFields = map.get(reader.getReaderCacheHelper().getKey()); + if (storedFields == null) { + 
storedFields = reader.storedFields(); + map.put(reader.getReaderCacheHelper().getKey(), storedFields); + } + ExportVisitor visitor = new ExportVisitor(out); + storedFields.document(sortDoc.docId, visitor); + visitor.flush(); + } + + class ExportVisitor extends StoredFieldVisitor { + + final EntryWriter out; + String lastFieldName; + List<Object> multiValue = null; + int fieldsVisited; + + public ExportVisitor(EntryWriter out) { + this.out = out; + } + + @Override + public void stringField(FieldInfo fieldInfo, String value) throws IOException { + var schemaField = schemaFields.get(fieldInfo.name); + var fieldType = schemaField == null ? null : schemaField.getType(); + if (fieldType instanceof BoolField) { + // Convert "T"/"F" stored value to boolean true/false + addField(fieldInfo.name, Boolean.valueOf(fieldType.indexedToReadable(value))); + } else { + addField(fieldInfo.name, value); + } + } + + @Override + public void intField(FieldInfo fieldInfo, int value) throws IOException { + addField(fieldInfo.name, value); + } + + @Override + public void longField(FieldInfo fieldInfo, long value) throws IOException { + var schemaField = schemaFields.get(fieldInfo.name); + var fieldType = schemaField == null ? null : schemaField.getType(); + if (fieldType instanceof DateValueFieldType) { + Date date = new Date(value); + addField(fieldInfo.name, date); + } else { + addField(fieldInfo.name, value); + } + } + + @Override + public void floatField(FieldInfo fieldInfo, float value) throws IOException { + addField(fieldInfo.name, value); + } + + @Override + public void doubleField(FieldInfo fieldInfo, double value) throws IOException { + addField(fieldInfo.name, value); + } + + @Override + public Status needsField(FieldInfo fieldInfo) { + return schemaFields.containsKey(fieldInfo.name) ? 
Status.YES : Status.NO; + } + + private <T> void addField(String fieldName, T value) throws IOException { + if (fieldName.equals(lastFieldName)) { + // assume adding another value to a multi-value field + multiValue.add(value); + return; + } + // new/different field... + flush(); // completes the previous field if there's something to do + fieldsVisited++; + lastFieldName = fieldName; + + if (schemaFields.get(fieldName).multiValued()) { + multiValue = new ArrayList<>(); + multiValue.add(value); + } else { + out.put(fieldName, value); + } + } + + private void flush() throws IOException { + if (multiValue != null) { + out.put(lastFieldName, multiValue); + multiValue = null; + } + } + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java index 2f8d0963e3a..228f3c1c743 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java @@ -59,8 +59,7 @@ class StringFieldWriter extends FieldWriter { } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { StringValue stringValue = (StringValue) sortDoc.getSortValue(this.field); BytesRef ref = null; @@ -74,7 +73,7 @@ class StringFieldWriter extends FieldWriter { if (stringValue.currentOrd == -1) { // Null sort value - return false; + return; } if (this.lastOrd == stringValue.currentOrd) { @@ -89,7 +88,7 @@ class StringFieldWriter extends FieldWriter { docValuesCache.getSortedDocValues( sortDoc.docId, readerContext.reader(), readerContext.ord); if (vals == null) { - return false; + return; } int ord = vals.ordValue(); @@ -102,7 +101,6 @@ class StringFieldWriter extends FieldWriter { } writeBytes(ew, ref, fieldType); - return true; 
} protected void writeBytes(MapWriter.EntryWriter ew, BytesRef ref, FieldType fieldType) diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-sortingresponse.xml b/solr/core/src/test-files/solr/collection1/conf/schema-sortingresponse.xml index d821c3935f2..5674b1dd7b2 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-sortingresponse.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-sortingresponse.xml @@ -33,7 +33,7 @@ <fieldType name="pdouble" class="solr.DoublePointField"/> <fieldType name="pfloat" class="solr.FloatPointField"/> <fieldType name="pdate" class="solr.DatePointField"/> - + <fieldType name="boolean" class="solr.BoolField"/> <fieldType name="string" class="solr.StrField"/> @@ -105,7 +105,7 @@ <dynamicField name="*_dts_p" type="pdate" indexed="true" stored="true" docValues="true" multiValued="true"/> <dynamicField name="*_dt_ni_p" type="pdate" indexed="false" stored="true" docValues="true" multiValued="false"/> <dynamicField name="*_dts_ni_p" type="pdate" indexed="false" stored="true" docValues="true" multiValued="true"/> - + <!-- Trie fields explicitly --> <dynamicField name="*_i_t" type="int" indexed="true" stored="true" docValues="true" multiValued="false"/> <dynamicField name="*_is_t" type="int" indexed="true" stored="true" docValues="true" multiValued="true"/> @@ -128,6 +128,22 @@ <dynamicField name="*_dt_ni_t" type="date" indexed="false" stored="true" docValues="true" multiValued="false"/> <dynamicField name="*_dts_ni_t" type="date" indexed="false" stored="true" docValues="true" multiValued="true"/> + <!-- Stored-only fields (no DocValues) --> + <dynamicField name="*_s_stored" type="string" indexed="false" stored="true" docValues="false" multiValued="false"/> + <dynamicField name="*_ss_stored" type="string" indexed="false" stored="true" docValues="false" multiValued="true"/> + <dynamicField name="*_i_stored" type="pint" indexed="true" stored="true" docValues="false" multiValued="false" 
uninvertible="true"/> + <dynamicField name="*_is_stored" type="pint" indexed="false" stored="true" docValues="false" multiValued="true"/> + <dynamicField name="*_l_stored" type="plong" indexed="false" stored="true" docValues="false" multiValued="false"/> + <dynamicField name="*_ls_stored" type="plong" indexed="false" stored="true" docValues="false" multiValued="true"/> + <dynamicField name="*_f_stored" type="pfloat" indexed="false" stored="true" docValues="false" multiValued="false"/> + <dynamicField name="*_fs_stored" type="pfloat" indexed="false" stored="true" docValues="false" multiValued="true"/> + <dynamicField name="*_d_stored" type="pdouble" indexed="false" stored="true" docValues="false" multiValued="false"/> + <dynamicField name="*_ds_stored" type="pdouble" indexed="false" stored="true" docValues="false" multiValued="true"/> + <dynamicField name="*_dt_stored" type="pdate" indexed="false" stored="true" docValues="false" multiValued="false"/> + <dynamicField name="*_dts_stored" type="pdate" indexed="false" stored="true" docValues="false" multiValued="true"/> + <dynamicField name="*_b_stored" type="boolean" indexed="false" stored="true" docValues="false" multiValued="false"/> + <dynamicField name="*_bs_stored" type="boolean" indexed="false" stored="true" docValues="false" multiValued="true"/> + <uniqueKey>id</uniqueKey> </schema> diff --git a/solr/core/src/test/org/apache/solr/handler/export/TestExportWriter.java b/solr/core/src/test/org/apache/solr/handler/export/TestExportWriter.java index f4836a93ce2..8829d7e5c60 100644 --- a/solr/core/src/test/org/apache/solr/handler/export/TestExportWriter.java +++ b/solr/core/src/test/org/apache/solr/handler/export/TestExportWriter.java @@ -17,6 +17,7 @@ package org.apache.solr.handler.export; import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; @@ -952,7 +953,7 @@ public class TestExportWriter extends 
SolrTestCaseJ4 { s.contains("\"status\":400}")); assertTrue( "Should have a cause when exporting sortabledv_m, it does not have useDocValuesAsStored='true'", - s.contains("Must have useDocValuesAsStored='true' to be used with export writer")); + s.contains("includeStoredFields=true")); s = h.query( @@ -970,7 +971,7 @@ public class TestExportWriter extends SolrTestCaseJ4 { s.contains("\"status\":400}")); assertTrue( "Should have a cause when exporting sortabledv, it does not have useDocValuesAsStored='true'", - s.contains("Must have useDocValuesAsStored='true' to be used with export writer")); + s.contains("includeStoredFields=true")); } private void assertJsonEquals(String actual, String expected) { @@ -1292,9 +1293,7 @@ public class TestExportWriter extends SolrTestCaseJ4 { assertTrue("doc doesn't have exception", doc.containsKey(StreamParams.EXCEPTION)); assertTrue( "wrong exception message", - doc.get(StreamParams.EXCEPTION) - .toString() - .contains("Must have useDocValuesAsStored='true'")); + doc.get(StreamParams.EXCEPTION).toString().contains("includeStoredFields=true")); } @Test @@ -1476,4 +1475,289 @@ public class TestExportWriter extends SolrTestCaseJ4 { doc.addField("number_" + type + (mv ? "s" : "") + "_ni_t", value); doc.addField("number_" + type + (mv ? 
"s" : "") + "_ni_p", value); } + + @Test + public void testIncludeStoredFieldsExplicitRequest() throws Exception { + // Test that stored-only fields are returned when includeStoredFields=true + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "str_s_stored", "hello", + "num_i_stored", "42", + "num_l_stored", "1234567890123", + "num_f_stored", "3.14", + "num_d_stored", "2.71828", + "date_dt_stored", "2024-01-15T10:30:00Z", + "bool_b_stored", "true")); + assertU(commit()); + + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", + "id,str_s_stored,num_i_stored,num_l_stored,num_f_stored,num_d_stored,date_dt_stored,bool_b_stored", + "sort", "intdv asc", + "includeStoredFields", "true")); + + assertJsonEquals( + resp, + "{" + + " \"responseHeader\":{\"status\":0}," + + " \"response\":{" + + " \"numFound\":1," + + " \"docs\":[{" + + " \"id\":\"1\"," + + " \"str_s_stored\":\"hello\"," + + " \"num_i_stored\":42," + + " \"num_l_stored\":1234567890123," + + " \"num_f_stored\":3.14," + + " \"num_d_stored\":2.71828," + + " \"date_dt_stored\":\"2024-01-15T10:30:00Z\"," + + " \"bool_b_stored\":true}]}}"); + } + + @Test + public void testIncludeStoredFieldsErrorWithoutParam() throws Exception { + // Test that error with hint is thrown when requesting stored-only field without + // includeStoredFields + clearIndex(); + + assertU(adoc("id", "1", "intdv", "1", "str_s_stored", "hello")); + assertU(commit()); + + // Request stored-only field without includeStoredFields=true should error + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "id,str_s_stored", + "sort", "intdv asc")); + + assertTrue( + "Expected error message to contain hint about includeStoredFields", + resp.contains("includeStoredFields=true")); + assertTrue("Expected error message to mention the field", resp.contains("str_s_stored")); + } + + @Test + public void testIncludeStoredFieldsGlobSkipsWithoutParam() throws Exception { + // Test that glob pattern 
silently skips stored-only fields when includeStoredFields=false + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "stringdv", "docvalue_string", + "str_s_stored", "stored_string")); + assertU(commit()); + + // Explicit fl with stored-only field should error + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "id,intdv,stringdv,str_s_stored", + "sort", "intdv asc")); + + // Should error because str_s_stored is explicitly requested + assertTrue( + "Expected error for explicitly requested stored-only field", resp.contains("str_s_stored")); + assertTrue( + "Expected hint about includeStoredFields", resp.contains("includeStoredFields=true")); + + // Now test with glob - should silently skip stored-only fields and succeed + resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "intdv,*", + "sort", "intdv asc")); + + assertJsonEquals( + resp, + "{" + + " \"responseHeader\":{\"status\":0}," + + " \"response\":{" + + " \"numFound\":1," + + " \"docs\":[{" + + " \"id\":\"1\"," + + " \"intdv\":1," + + " \"stringdv\":\"docvalue_string\"}]}}"); + } + + @Test + public void testIncludeStoredFieldsGlobIncludesWithParam() throws Exception { + // Test that glob pattern includes stored-only fields when includeStoredFields=true + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "stringdv", "docvalue_string", + "str_s_stored", "stored_string")); + assertU(commit()); + + // Glob fl=* with includeStoredFields=true should include stored-only fields + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "*", + "sort", "intdv asc", + "includeStoredFields", "true")); + + assertJsonEquals( + resp, + "{" + + " \"responseHeader\":{\"status\":0}," + + " \"response\":{" + + " \"numFound\":1," + + " \"docs\":[{" + + " \"intdv\":1," + + " \"stringdv\":\"docvalue_string\"," + + " \"id\":\"1\"," + + " \"str_s_stored\":\"stored_string\"}]}}"); + } + + @Test + public void testIncludeStoredFieldsMultiValued() 
throws Exception { + // Test that multi-valued stored-only fields work correctly + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "strs_ss_stored", "value1", + "strs_ss_stored", "value2", + "strs_ss_stored", "value3", + "nums_is_stored", "10", + "nums_is_stored", "20", + "nums_is_stored", "30")); + assertU(commit()); + + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "id,strs_ss_stored,nums_is_stored", + "sort", "intdv asc", + "includeStoredFields", "true")); + + assertJsonEquals( + resp, + "{" + + " \"responseHeader\":{\"status\":0}," + + " \"response\":{" + + " \"numFound\":1," + + " \"docs\":[{" + + " \"id\":\"1\"," + + " \"strs_ss_stored\":[\"value1\",\"value2\",\"value3\"]," + + " \"nums_is_stored\":[10,20,30]}]}}"); + } + + @Test + public void testIncludeStoredFieldsAllTypes() throws Exception { + // Test all supported stored field types including Date + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "str_s_stored", "test_string", + "num_i_stored", "123", + "num_l_stored", "9876543210", + "num_f_stored", "1.5", + "num_d_stored", "2.5", + "date_dt_stored", "2025-12-25T00:00:00Z", + "bool_b_stored", "false")); + assertU( + adoc( + "id", "2", + "intdv", "2", + "str_s_stored", "another_string", + "num_i_stored", "456", + "num_l_stored", "1234567890", + "num_f_stored", "2.5", + "num_d_stored", "3.5", + "date_dt_stored", "2025-06-15T12:30:00Z", + "bool_b_stored", "true")); + assertU(commit()); + + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", + "id,str_s_stored,num_i_stored,num_l_stored,num_f_stored,num_d_stored,date_dt_stored,bool_b_stored", + "sort", "intdv asc", + "includeStoredFields", "true")); + + assertJsonEquals( + resp, + "{" + + " \"responseHeader\":{\"status\":0}," + + " \"response\":{" + + " \"numFound\":2," + + " \"docs\":[{" + + " \"id\":\"1\"," + + " \"str_s_stored\":\"test_string\"," + + " \"num_i_stored\":123," + + " \"num_l_stored\":9876543210," + + " 
\"num_f_stored\":1.5," + + " \"num_d_stored\":2.5," + + " \"date_dt_stored\":\"2025-12-25T00:00:00Z\"," + + " \"bool_b_stored\":false}," + + " {" + + " \"id\":\"2\"," + + " \"str_s_stored\":\"another_string\"," + + " \"num_i_stored\":456," + + " \"num_l_stored\":1234567890," + + " \"num_f_stored\":2.5," + + " \"num_d_stored\":3.5," + + " \"date_dt_stored\":\"2025-06-15T12:30:00Z\"," + + " \"bool_b_stored\":true}]}}"); + } + + @Test + public void testSortingWithoutDocValues() throws Exception { + // Attempting to sort on a field without DocValues should fail + clearIndex(); + + assertU( + adoc( + "id", "1", + "sorted_i_stored", "0")); + assertU(commit()); + + IOException ex = + expectThrows( + IOException.class, + () -> + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "id", + "sort", "sorted_i_stored asc", + "includeStoredFields", "true"))); + + assertTrue( + "Error message should mention DocValues requirement", + ex.getMessage().contains("DocValues")); + } } diff --git a/solr/solr-ref-guide/modules/query-guide/pages/exporting-result-sets.adoc b/solr/solr-ref-guide/modules/query-guide/pages/exporting-result-sets.adoc index bbd31c7b358..fc6f4d6a7ef 100644 --- a/solr/solr-ref-guide/modules/query-guide/pages/exporting-result-sets.adoc +++ b/solr/solr-ref-guide/modules/query-guide/pages/exporting-result-sets.adoc @@ -25,7 +25,9 @@ The cases where this functionality may be useful include: session analysis, dist == Field Requirements -All the fields being sorted and exported must have docValues set to `true`. +All the fields being sorted must have docValues set to `true`. +By default, fields in the field list (`fl`) must also have docValues. +However, you can include stored-only fields (fields without docValues) by setting the `includeStoredFields` parameter to `true`. For more information, see the section on xref:indexing-guide:docvalues.adoc[]. == The /export RequestHandler @@ -44,6 +46,12 @@ Filter queries are also supported. 
An optional parameter `batchSize` determines the size of the internal buffers for partial results. The default value is `30000` but users may want to specify smaller values to limit the memory use (at the cost of degraded performance) or higher values to improve export performance (the relationship is not linear and larger values don't bring proportionally larger performance increases). +An optional parameter `includeStoredFields` (default `false`) enables exporting fields that only have stored values (no docValues). +When set to `true`, fields without docValues but with stored values can be included in the field list (`fl`). +Note that retrieving stored fields may significantly impact export performance compared to docValues fields, as stored fields require additional I/O operations. +If all requested fields are `docValues=true` then the data will only be read from docValues. +This behavior applies to fields that are also `stored=true` and does not depend on the value of the `includeStoredFields` parameter. + The supported response writers are `json` and `javabin`. For backward compatibility reasons `wt=xsort` is also supported as input, but `wt=xsort` behaves same as `wt=json`. The default output format is `json`. @@ -58,8 +66,8 @@ http://localhost:8983/solr/core_name/export?q=my-query&sort=severity+desc,timest === Specifying the Sort Criteria The `sort` property defines how documents will be sorted in the exported result set. -Results can be sorted by any field that has a field type of int,long, float, double, string. -The sort fields must be single valued fields. +Results can be sorted by any field that has a field type of int, long, float, double, string. +The sort fields must be single valued fields and must have docValues enabled. The export performance will get slower as you add more sort fields. 
If there is enough physical memory available outside of the JVM to load up the sort fields then the performance will be linearly slower with addition of sort fields. @@ -71,6 +79,10 @@ The `fl` property defines the fields that will be exported with the result set. Any of the field types that can be sorted (i.e., int, long, float, double, string, date, boolean) can be used in the field list. The fields can be single or multi-valued. +By default, fields in the field list must have docValues enabled. +However, when the `includeStoredFields` parameter is set to `true`, fields with only stored values (no docValues) can also be included. +Note that sort fields still require docValues, regardless of this setting. + Wildcard patterns can be used for the field list (e.g. `fl=*_i`) and will be expanded to the list of fields that match the pattern and are able to be exported, see <<Field Requirements>>. Returning scores is not supported at this time. @@ -105,6 +117,61 @@ http://localhost:8983/solr/core_name/export?q=my-query&sort=reporter+desc,&fl=re (Note that the `over` parameter must use one of the fields requested in the `fl` parameter). +== Comparison with Cursors + +The `/export` handler and xref:pagination-of-results.adoc#fetching-a-large-number-of-sorted-results-cursors[cursor-based pagination] offer different trade-offs for streaming large result sets. + +[cols="h,2,2"] +|=== +| |Export |Cursors + +|Advantages +a| +* Query executed once -- efficient +* Consistent snapshot (no duplicates or missing docs) +* Lower latency to the first document (typically) +* Decoupled reader and writer creates smoother flow +a| +* Intrinsic support for sharded collections +* Flexible sort criteria +* Resumable across requests and restarts +* Full `SearchHandler` features (highlighting, etc.)
+ +|Disadvantages +a| +* Requires streaming expressions for distributed queries +* Sort criteria can only be fields with docValues; no score +* Must consume in a single session +* A long session may prevent old segments from being removed in a timely manner +a| +* Query re-executed for each page -- inefficient +* Possible duplicates or missing docs with concurrent updates +* Higher latency to the first document (typically) +* Uneven flow; large batches needed for throughput +|=== + +=== Details + +With cursors, the query is re-executed for each page of results. +In contrast, `/export` runs the filter query once and the resulting segment-level bitmasks are applied once per segment, after which the documents are simply iterated over. +Additionally, the segments that existed when the stream was opened are held open for the duration of the export, eliminating the disappearing or duplicate document issues that can occur with cursors. +However, this means IndexReaders are kept around for longer periods of time, which delays cleanup of memory and disk resources until the export completes. + +The `/export` handler has significantly lower latency until the first document is returned, because the internal batch size is decoupled from the response message size. +With cursors, you typically need to set the `rows` parameter to a high value (e.g., 10k-100k depending on `fl`/document size) to achieve decent throughput, provided you have enough memory (rows * shards * `fl`-size). +However, this creates a "glugging" effect: when you request a large batch, Solr must build the entire payload and send it over the wire while your client waits (assuming a sharded collection). +Only after receiving and decoding this large payload can the client request the next batch, but in the interim Solr sits idle on this request. +With the `/export` handler, these steps are decoupled; Solr can continue sorting and decoding/encoding documents while waiting for more demand from the client.
+ +The advantage of cursors is _flexibility_. +Cursors impose no constraints on the sort criteria except that you must include a unique key, which isn't a real constraint. +Cursors work as part of `SearchHandler` and thus can include most or all of its capabilities, such as highlighting. +A `cursorMark` can be persisted and resumed later, even across restarts, or never continued if enough results were consumed to satisfy the use-case. +An `/export` stream must be consumed in a single session. +Cursors also support distributed queries by default while `/export` does not, although distributed exports can be achieved using +xref:streaming-expressions.adoc[streaming expressions] which are built on top of the `/export` handler. + == Distributed Support See the section xref:streaming-expressions.adoc[] for distributed support. +
