This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new 585b1d00a00 SOLR-17867: Export Tool should export regular docs cleanly in .json, .jsonl, and javabin (second take) (#2636)
585b1d00a00 is described below
commit 585b1d00a0053b9fadcaecd82082667a5a255169
Author: Eric Pugh <[email protected]>
AuthorDate: Sat Aug 16 08:09:04 2025 -0400
SOLR-17867: Export Tool should export regular docs cleanly in .json, .jsonl, and javabin (second take) (#2636)
* Properly handle the format parameter and the compress parameter
* Added new integration (bats) tests.
* Refactored sink code to reduce duplication.
Does NOT deal with nested documents properly. That issue remains.
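For reviewers, a quick sketch of the resulting CLI behavior (host, collection, and
paths are illustrative; `--solr-url`, `-c`, `--query`, `--output`, and `--compress`
appear verbatim in this change, while the `--format` spelling is assumed from the
ref guide option table):

# format is chosen by the format parameter (default json), no longer by the output suffix
bin/solr export --solr-url http://localhost:8983 -c techproducts --format jsonl
# an explicit extension on --output is kept as-is instead of having '.<format>' appended again
bin/solr export --solr-url http://localhost:8983 -c techproducts --query "*:*" --output /tmp/techproducts.json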
---
.../ExportTool.java => ExportTool.java.original | 0
solr/CHANGES.txt | 2 +
.../src/java/org/apache/solr/cli/ExportTool.java | 135 +++++++++------------
.../test/org/apache/solr/cli/TestExportTool.java | 75 ++++++++++--
solr/packaging/test/test_export.bats | 5 +-
.../pages/solr-control-script-reference.adoc | 28 +++--
6 files changed, 145 insertions(+), 100 deletions(-)
diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java b/ExportTool.java.original
similarity index 100%
copy from solr/core/src/java/org/apache/solr/cli/ExportTool.java
copy to ExportTool.java.original
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3baf170ec67..73dd813599c 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -51,6 +51,8 @@ Improvements
* SOLR-17852: Migrate Schema Designer to use FileStore API instead of BlobHandler for persisting working data. (Eric Pugh)
+* SOLR-17867: Export tool should properly output exported documents in json, json w/ lines, and javabin formats. (Eric Pugh)
+
Optimizations
---------------------
* SOLR-17568: The CLI bin/solr export tool now contacts the appropriate nodes directly for data instead of proxying through one.
diff --git a/solr/core/src/java/org/apache/solr/cli/ExportTool.java b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
index 050092c2017..c742c00999c 100644
--- a/solr/core/src/java/org/apache/solr/cli/ExportTool.java
+++ b/solr/core/src/java/org/apache/solr/cli/ExportTool.java
@@ -219,12 +219,20 @@ public class ExportTool extends ToolBase {
} else if (Files.isDirectory(Path.of(this.out))) {
this.out = this.out + "/" + coll;
}
- this.out = this.out + '.' + this.format;
- if (compress) {
+ if (!hasExtension(this.out)) {
+ this.out = this.out + '.' + this.format;
+ }
+ if (compress && !this.out.endsWith(".gz")) {
this.out = this.out + ".gz";
}
}
+ public static boolean hasExtension(String filename) {
+ return filename.contains(".json")
+ || filename.contains(".jsonl")
+ || filename.contains(".javabin");
+ }
+
DocsSink getSink() {
DocsSink docSink = null;
switch (format) {
@@ -311,6 +319,51 @@ public class ExportTool extends ToolBase {
Info info;
OutputStream fos;
+ /** Process a SolrDocument into a Map, handling special fields and date conversion. */
+ protected Map<String, Object> processDocument(SolrDocument doc) {
+ Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
+ doc.forEach(
+ (s, field) -> {
+ if (s.equals("_version_") || s.equals("_roor_")) return;
+ if (field instanceof List) {
+ if (((List<?>) field).size() == 1) {
+ field = ((List<?>) field).get(0);
+ }
+ }
+ field = constructDateStr(field);
+ if (field instanceof List<?> list) {
+ if (hasDate(list)) {
+ ArrayList<Object> listCopy = new ArrayList<>(list.size());
+ for (Object o : list) listCopy.add(constructDateStr(o));
+ field = listCopy;
+ }
+ }
+ m.put(s, field);
+ });
+ return m;
+ }
+
+ /** Check if a list contains any Date objects */
+ protected boolean hasDate(List<?> list) {
+ boolean hasDate = false;
+ for (Object o : list) {
+ if (o instanceof Date) {
+ hasDate = true;
+ break;
+ }
+ }
+ return hasDate;
+ }
+
+ /** Convert Date objects to ISO formatted strings */
+ protected Object constructDateStr(Object field) {
+ if (field instanceof Date) {
+ field =
+ DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) field).getTime()));
+ }
+ return field;
+ }
+
abstract void start() throws IOException;
@SuppressForbidden(reason = "Command line tool prints out to console")
@@ -356,49 +409,12 @@ public class ExportTool extends ToolBase {
@Override
public synchronized void accept(SolrDocument doc) throws IOException {
charArr.reset();
- Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
- doc.forEach(
- (s, field) -> {
- if (s.equals("_version_") || s.equals("_roor_")) return;
- if (field instanceof List) {
- if (((List<?>) field).size() == 1) {
- field = ((List<?>) field).get(0);
- }
- }
- field = constructDateStr(field);
- if (field instanceof List<?> list) {
- if (hasdate(list)) {
- ArrayList<Object> listCopy = new ArrayList<>(list.size());
- for (Object o : list) listCopy.add(constructDateStr(o));
- field = listCopy;
- }
- }
- m.put(s, field);
- });
+ Map<String, Object> m = processDocument(doc);
jsonWriter.write(m);
writer.write(charArr.getArray(), charArr.getStart(), charArr.getEnd());
writer.append('\n');
super.accept(doc);
}
-
- private boolean hasdate(List<?> list) {
- boolean hasDate = false;
- for (Object o : list) {
- if (o instanceof Date) {
- hasDate = true;
- break;
- }
- }
- return hasDate;
- }
-
- private Object constructDateStr(Object field) {
- if (field instanceof Date) {
- field =
- DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) field).getTime()));
- }
- return field;
- }
}
static class JsonSink extends DocsSink {
@@ -435,25 +451,7 @@ public class ExportTool extends ToolBase {
@Override
public synchronized void accept(SolrDocument doc) throws IOException {
charArr.reset();
- Map<String, Object> m = CollectionUtil.newLinkedHashMap(doc.size());
- doc.forEach(
- (s, field) -> {
- if (s.equals("_version_") || s.equals("_roor_")) return;
- if (field instanceof List) {
- if (((List<?>) field).size() == 1) {
- field = ((List<?>) field).get(0);
- }
- }
- field = constructDateStr(field);
- if (field instanceof List<?> list) {
- if (hasdate(list)) {
- ArrayList<Object> listCopy = new ArrayList<>(list.size());
- for (Object o : list) listCopy.add(constructDateStr(o));
- field = listCopy;
- }
- }
- m.put(s, field);
- });
+ Map<String, Object> m = processDocument(doc);
if (firstDoc) {
firstDoc = false;
} else {
@@ -464,25 +462,6 @@ public class ExportTool extends ToolBase {
writer.append('\n');
super.accept(doc);
}
-
- private boolean hasdate(List<?> list) {
- boolean hasDate = false;
- for (Object o : list) {
- if (o instanceof Date) {
- hasDate = true;
- break;
- }
- }
- return hasDate;
- }
-
- private Object constructDateStr(Object field) {
- if (field instanceof Date) {
- field =
- DateTimeFormatter.ISO_INSTANT.format(Instant.ofEpochMilli(((Date) field).getTime()));
- }
- return field;
- }
}
static class JavabinSink extends DocsSink {
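Taken together, the `setOutFormat`/`hasExtension` changes above resolve output
names roughly as follows. A hypothetical sketch (collection name, paths, and the
`--format` flag spelling are assumptions, not verbatim from this diff):

# no --output: the file is named <collection>.<format> (see testOutputFormatToFileNameMapping below)
bin/solr export --solr-url http://localhost:8983 -c films --format jsonl                       # -> films.jsonl
# --output with a recognized extension is kept verbatim
bin/solr export --solr-url http://localhost:8983 -c films --output /tmp/dump.json              # -> /tmp/dump.json
# --compress appends .gz only when the name does not already end in .gz
bin/solr export --solr-url http://localhost:8983 -c films --output /tmp/dump.json --compress   # -> /tmp/dump.json.gz

Note also that `processDocument` renders Date fields as ISO-8601 instants, e.g.
`"a_dt":"2019-09-30T05:58:03Z"` as asserted in the tests below.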
diff --git a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
index f74eedbb0d8..d356c439849 100644
--- a/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
+++ b/solr/core/src/test/org/apache/solr/cli/TestExportTool.java
@@ -19,6 +19,7 @@ package org.apache.solr.cli;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
@@ -28,6 +29,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
+import java.util.zip.GZIPInputStream;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.SolrClient;
@@ -50,6 +52,31 @@ import org.junit.Test;
@SolrTestCaseJ4.SuppressSSL
public class TestExportTool extends SolrCloudTestCase {
+ public void testOutputFormatToFileNameMapping() {
+
+ ToolRuntime runtime = new CLITestHelper.TestingRuntime(false);
+ String url = "http://example:8983/solr/mycollection";
+ ExportTool.Info info = new ExportTool.MultiThreadedRunner(runtime, url, null);
+
+ info.setOutFormat(null, "json", false);
+ assertEquals("mycollection.json", info.out);
+
+ info.setOutFormat(null, "jsonl", false);
+ assertEquals("mycollection.jsonl", info.out);
+
+ info.setOutFormat(null, "javabin", false);
+ assertEquals("mycollection.javabin", info.out);
+
+ String tempFile = createTempDir() + "/myoutput.json";
+ info.setOutFormat(tempFile, "json", false);
+ assertEquals(tempFile, info.out);
+
+ // test with compression
+ tempFile = createTempDir() + "/myoutput.myoutput.json.gz";
+ info.setOutFormat(tempFile, "json", true);
+ assertEquals(tempFile, info.out);
+ }
+
@Test
public void testBasic() throws Exception {
String COLLECTION_NAME = "globalLoaderColl";
@@ -92,7 +119,8 @@ public class TestExportTool extends SolrCloudTestCase {
info.fields = "id,desc_s,a_dt";
info.exportDocs();
- assertJsonDocsCount(info, 200, record -> "2019-09-30T05:58:03Z".equals(record.get("a_dt")));
+ assertJsonLinesDocsCount(
+ info, 200, record -> "2019-09-30T05:58:03Z".equals(record.get("a_dt")));
info = new ExportTool.MultiThreadedRunner(runtime, url, null);
absolutePath =
@@ -102,7 +130,7 @@ public class TestExportTool extends SolrCloudTestCase {
info.fields = "id,desc_s";
info.exportDocs();
- assertJsonDocsCount(info, 1000, null);
+ assertJsonLinesDocsCount(info, 1000, null);
info = new ExportTool.MultiThreadedRunner(runtime, url, null);
absolutePath =
@@ -131,7 +159,7 @@ public class TestExportTool extends SolrCloudTestCase {
info.fields = "id,desc_s";
info.exportDocs();
- assertJsonDocsCount2(info, 200);
+ assertJsonDocsCount(info, 200);
info = new ExportTool.MultiThreadedRunner(runtime, url, null);
absolutePath =
@@ -141,7 +169,7 @@ public class TestExportTool extends SolrCloudTestCase {
info.fields = "id,desc_s";
info.exportDocs();
- assertJsonDocsCount2(info, 1000);
+ assertJsonDocsCount(info, 1000);
} finally {
cluster.shutdown();
@@ -197,11 +225,9 @@ public class TestExportTool extends SolrCloudTestCase {
assertEquals(docCount, totalDocsFromCores);
ToolRuntime runtime = new CLITestHelper.TestingRuntime(false);
- ExportTool.MultiThreadedRunner info;
- String absolutePath;
- info = new ExportTool.MultiThreadedRunner(runtime, url, null);
- absolutePath =
+ ExportTool.MultiThreadedRunner info = new ExportTool.MultiThreadedRunner(runtime, url, null);
+ String absolutePath =
baseDir.resolve(COLLECTION_NAME + random().nextInt(100000) + ".javabin").toString();
info.setOutFormat(absolutePath, "javabin", false);
info.setLimit("-1");
@@ -211,6 +237,7 @@ public class TestExportTool extends SolrCloudTestCase {
assertEquals(
e.getValue().longValue(),
info.corehandlers.get(e.getKey()).receivedDocs.get());
}
+
info = new ExportTool.MultiThreadedRunner(runtime, url, null);
absolutePath =
baseDir.resolve(COLLECTION_NAME + random().nextInt(100000) + ".jsonl").toString();
@@ -280,7 +307,7 @@ public class TestExportTool extends SolrCloudTestCase {
}
}
- private void assertJsonDocsCount2(ExportTool.Info info, int expected) {
+ private void assertJsonDocsCount(ExportTool.Info info, int expected) {
assertTrue(
"" + info.docsWritten.get() + " expected " + expected,
info.docsWritten.get() >= expected);
}
@@ -310,4 +337,34 @@ public class TestExportTool extends SolrCloudTestCase {
rdr.close();
}
}
+
+ private void assertJsonLinesDocsCount(
+ ExportTool.Info info, int expected, Predicate<Map<String, Object>> predicate)
+ throws IOException {
+ assertTrue(
+ "" + info.docsWritten.get() + " expected " + expected,
info.docsWritten.get() >= expected);
+
+ JsonRecordReader jsonReader;
+ Reader rdr;
+ jsonReader = JsonRecordReader.getInst("/", List.of("$FQN:/**"));
+ InputStream is = new FileInputStream(info.out);
+ if (info.compress) {
+ is = new GZIPInputStream(is);
+ }
+ rdr = new InputStreamReader(is, StandardCharsets.UTF_8);
+ try {
+ int[] count = new int[] {0};
+ jsonReader.streamRecords(
+ rdr,
+ (record, path) -> {
+ if (predicate != null) {
+ assertTrue(predicate.test(record));
+ }
+ count[0]++;
+ });
+ assertTrue(count[0] >= expected);
+ } finally {
+ rdr.close();
+ }
+ }
}
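To run just this test class locally, standard Gradle test filtering should work
(the task path is the usual one for solr/core, not something this diff
establishes):

./gradlew :solr:core:test --tests "org.apache.solr.cli.TestExportTool"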
diff --git a/solr/packaging/test/test_export.bats b/solr/packaging/test/test_export.bats
index cb089ecd5fe..f497fde4839 100644
--- a/solr/packaging/test/test_export.bats
+++ b/solr/packaging/test/test_export.bats
@@ -45,9 +45,8 @@ teardown() {
assert [ -e techproducts.javabin ]
rm techproducts.javabin
- # old pattern of putting a suffix on the output that controlled the format no longer supported ;-).
- run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output.javabin"
- assert [ -e ${BATS_TEST_TMPDIR}/output.javabin.json ]
+ run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}/output.json"
+ assert [ -e ${BATS_TEST_TMPDIR}/output.json ]
run solr export --solr-url http://localhost:${SOLR_PORT} -c techproducts --query "*:* -id:test" --output "${BATS_TEST_TMPDIR}"
assert [ -e ${BATS_TEST_TMPDIR}/techproducts.json ]
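The bats suites under solr/packaging/test are normally exercised through the
packaging integration-test task, along these lines (task name assumed from the
project's test setup, not shown in this diff):

./gradlew :solr:packaging:integrationTests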
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
index 48fe8e18347..842d294b06e 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/solr-control-script-reference.adoc
@@ -1619,12 +1619,12 @@ Examples of this command:
=== Exporting Documents to a File
-The `export` command will allow you to export documents from a collection in JSON, JSON with Lines, or Javabin format.
-All documents can be exported, or only those that match a query.
+The `export` command will allow you to export documents from a collection in JSON, https://jsonlines.org/[JSON Lines], or Javabin format.
+All documents can be exported, or only those that match a query. You may need to wrap some parameters with quotes.
NOTE: This hasn't been tested with nested child documents and your results will vary.
-NOTE: The `export` command only works with in a Solr running in cloud mode.
+NOTE: The `export` command only works with Solr running in cloud mode.
`bin/solr export [options]`
@@ -1661,9 +1661,11 @@ Name of the collection to run an export against.
|Optional |Default: `json`
|===
+
-The file format of the export, `json`, `jsonl`, or `javabin`.
-Choosing `javabin` exports in the native Solr format, and is compact and fast to import.
-`jsonl` is the Json with Lines format, learn more at https://jsonlines.org/.
+The file format of the export: `json` (default), `jsonl`, or `javabin`. This also determines the file extension used.
+`json` and `jsonl` both export documents in the same format as using `wt=json`. The `json` output file is suitable for immediately posting back to Solr via the `/update/json` endpoint.
+`jsonl` outputs each Solr document on its own line, and is useful for parallel processing tasks. Learn more at https://jsonlines.org/.
+Choosing `javabin` exports in the native binary Solr format and is compact and faster to import.
`--output <path>`::
+
@@ -1673,6 +1675,7 @@ Choosing `javabin` exports in the native Solr format, and is compact and fast to
|===
+
Either the path to the directory for the exported data to be written to, or a specific file to be written out.
+If the file name ends with `.gz`, the output will be compressed into a .gz file.
+
If only a directory is specified then the file will be created with the name of the collection, as in `<collection>.json`.
@@ -1692,7 +1695,7 @@ If you specify `--compress` then the resulting outputting file with will be gzip
|Optional |Default: `\*:*`
|===
+
-A custom query.
+A custom query to select documents for exporting.
The default is `\*:*` which will export all documents.
`--fields <fields>`::
@@ -1727,7 +1730,7 @@ This parameter is unnecessary if `SOLR_AUTH_TYPE` is defined in `solr.in.sh` or
*Examples*
-Export all documents from a collection `gettingstarted`:
+Export all documents from a collection `gettingstarted` into a file called `gettingstarted.json`:
[source,bash]
bin/solr export --solr-url http://localhost:8983 -c gettingstarted --limit -1
@@ -1741,7 +1744,12 @@ bin/solr export --solr-url http://localhost:8983 -c gettingstarted --limit -1 --
=== Importing Documents into a Collection
-Once you have exported documents in a file, you can use the xref:indexing-guide:indexing-with-update-handlers.adoc[/update request handler] to import them to a new Solr collection.
+Once you have exported documents in a file, you can use the xref:indexing-guide:indexing-with-update-handlers.adoc#json-formatted-index-updates[/update request handler] to import them to a new Solr collection.
+Notice the different endpoints used depending on the format.
+
+*Example: import `json` files*
+
+`curl -X POST --header "Content-Type: application/json" -d @gettingstarted.json http://localhost:8983/solr/gettingstarted/update/json?commit=true`
*Example: import `json` files*
@@ -1763,7 +1771,7 @@ Now import the data with either of these methods:
[,console]
----
-$ curl -X POST -d @gettingstarted.json 'http://localhost:8983/solr/test_collection/update/json/docs?commit=true'
+$ curl -X POST --header "Content-Type: application/json" -d @gettingstarted.json 'http://localhost:8983/solr/test_collection/update/json/docs?commit=true'
----
or
[,console]
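For completeness, a javabin export can be posted back in the same spirit; a
hedged sketch, assuming the `application/javabin` content type and a
`gettingstarted.javabin` export file:

$ curl -X POST --header "Content-Type: application/javabin" --data-binary @gettingstarted.javabin 'http://localhost:8983/solr/test_collection/update?commit=true'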