This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new a2ccd460d5 GH-36421: [Java] Enable Support for reading JSON Datasets
(#36422)
a2ccd460d5 is described below
commit a2ccd460d5c161338ea0d760eb3d93f5e2a35164
Author: david dali susanibar arce <[email protected]>
AuthorDate: Mon Jul 3 18:43:38 2023 -0500
GH-36421: [Java] Enable Support for reading JSON Datasets (#36422)
### Rationale for this change
Enable Support for reading JSON Datasets
This ports the C++ JSON dataset support added in https://github.com/apache/arrow/pull/33732 to the Java side.
### What changes are included in this PR?
Added support for reading line-delimited JSON (``.json``) datasets through the Java Dataset API.
### Are these changes tested?
Yes, a unit test covering JSON dataset reads was added.
### Are there any user-facing changes?
No
* Closes: #36421
Lead-authored-by: david dali susanibar arce <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
ci/scripts/java_jni_macos_build.sh | 1 +
docs/source/java/dataset.rst | 1 +
java/dataset/src/main/cpp/jni_wrapper.cc | 4 +++
.../org/apache/arrow/dataset/file/FileFormat.java | 1 +
...riteSupport.java => TextBasedWriteSupport.java} | 11 +++----
.../arrow/dataset/file/TestFileSystemDataset.java | 34 ++++++++++++++++++++--
6 files changed, 44 insertions(+), 8 deletions(-)
diff --git a/ci/scripts/java_jni_macos_build.sh
b/ci/scripts/java_jni_macos_build.sh
index c38f072709..2b2384ab7f 100755
--- a/ci/scripts/java_jni_macos_build.sh
+++ b/ci/scripts/java_jni_macos_build.sh
@@ -76,6 +76,7 @@ cmake \
-DARROW_DEPENDENCY_USE_SHARED=OFF \
-DARROW_GANDIVA=${ARROW_GANDIVA} \
-DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \
+ -DARROW_JSON=${ARROW_DATASET} \
-DARROW_ORC=${ARROW_ORC} \
-DARROW_PARQUET=${ARROW_PARQUET} \
-DARROW_S3=${ARROW_S3} \
diff --git a/docs/source/java/dataset.rst b/docs/source/java/dataset.rst
index 6315932a57..35ffa81058 100644
--- a/docs/source/java/dataset.rst
+++ b/docs/source/java/dataset.rst
@@ -43,6 +43,7 @@ Currently supported file formats are:
- Apache ORC (``.orc``)
- Apache Parquet (``.parquet``)
- Comma-Separated Values (``.csv``)
+- Line-delimited JSON Values (``.json``)
Below shows a simplest example of using Dataset to query a Parquet file in
Java:
diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc
b/java/dataset/src/main/cpp/jni_wrapper.cc
index cba2b4a0db..871a2e95b9 100644
--- a/java/dataset/src/main/cpp/jni_wrapper.cc
+++ b/java/dataset/src/main/cpp/jni_wrapper.cc
@@ -105,6 +105,10 @@ arrow::Result<std::shared_ptr<arrow::dataset::FileFormat>>
GetFileFormat(
#ifdef ARROW_CSV
case 3:
return std::make_shared<arrow::dataset::CsvFileFormat>();
+#endif
+#ifdef ARROW_JSON
+ case 4:
+ return std::make_shared<arrow::dataset::JsonFileFormat>();
#endif
default:
std::string error_message =
diff --git
a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
index aad4fa5f2a..70a1faab79 100644
--- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
@@ -25,6 +25,7 @@ public enum FileFormat {
ARROW_IPC(1),
ORC(2),
CSV(3),
+ JSON(4),
NONE(-1);
private final int id;
diff --git
a/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java
b/java/dataset/src/test/java/org/apache/arrow/dataset/TextBasedWriteSupport.java
similarity index 74%
rename from
java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java
rename to
java/dataset/src/test/java/org/apache/arrow/dataset/TextBasedWriteSupport.java
index 9f404522db..43f96f6d4d 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java
+++
b/java/dataset/src/test/java/org/apache/arrow/dataset/TextBasedWriteSupport.java
@@ -24,17 +24,18 @@ import java.net.URI;
import java.net.URISyntaxException;
import java.util.Random;
-public class CsvWriteSupport {
+public class TextBasedWriteSupport {
private final URI uri;
private final Random random = new Random();
- public CsvWriteSupport(File outputFolder) throws URISyntaxException {
- uri = new URI("file", outputFolder.getPath() + "/" + "generated-" +
random.nextLong() + ".csv", null);
+ public TextBasedWriteSupport(File outputFolder, String fileExtension) throws
URISyntaxException {
+ uri = new URI("file", outputFolder.getPath() + File.separator +
+ "generated-" + random.nextLong() + fileExtension, null);
}
- public static CsvWriteSupport writeTempFile(File outputFolder, String...
values)
+ public static TextBasedWriteSupport writeTempFile(File outputFolder, String
fileExtension, String... values)
throws URISyntaxException, IOException {
- CsvWriteSupport writer = new CsvWriteSupport(outputFolder);
+ TextBasedWriteSupport writer = new TextBasedWriteSupport(outputFolder,
fileExtension);
try (FileWriter addValues = new FileWriter(new File(writer.uri), true)) {
for (Object value : values) {
addValues.write(value + "\n");
diff --git
a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
index 735b3ae611..6ca559ee2d 100644
---
a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
+++
b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
@@ -37,9 +37,9 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
-import org.apache.arrow.dataset.CsvWriteSupport;
import org.apache.arrow.dataset.OrcWriteSupport;
import org.apache.arrow.dataset.ParquetWriteSupport;
+import org.apache.arrow.dataset.TextBasedWriteSupport;
import org.apache.arrow.dataset.jni.NativeDataset;
import org.apache.arrow.dataset.jni.NativeInstanceReleasedException;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
@@ -407,8 +407,8 @@ public class TestFileSystemDataset extends
TestNativeDataset {
@Test
public void testBaseCsvRead() throws Exception {
- CsvWriteSupport writeSupport = CsvWriteSupport.writeTempFile(
- TMP.newFolder(), "Name,Language", "Juno,Java", "Peter,Python",
"Celin,C++");
+ TextBasedWriteSupport writeSupport = TextBasedWriteSupport.writeTempFile(
+ TMP.newFolder(), ".csv", "Name,Language", "Juno,Java",
"Peter,Python", "Celin,C++");
String expectedJsonUnordered = "[[\"Juno\", \"Java\"], [\"Peter\",
\"Python\"], [\"Celin\", \"C++\"]]";
ScanOptions options = new ScanOptions(100);
try (
@@ -429,6 +429,34 @@ public class TestFileSystemDataset extends
TestNativeDataset {
}
}
+ @Test
+ public void testBaseJsonRead() throws Exception {
+ TextBasedWriteSupport writeSupport = TextBasedWriteSupport.writeTempFile(
+ TMP.newFolder(), ".json",
+ "{\"Type\": \"Compiled\", \"Language\": \"Java\"}",
+ "{\"Type\": \"Interpreted\", \"Language\": \"Python\"}");
+ String expectedJsonUnordered = "[[\"Compiled\", \"Java\"], " +
+ "[\"Interpreted\", \"Python\"]]";
+ ScanOptions options = new ScanOptions(100);
+ try (
+ FileSystemDatasetFactory factory = new
FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
+ FileFormat.JSON, writeSupport.getOutputURI())
+ ) {
+ List<ArrowRecordBatch> datum = collectResultFromFactory(factory,
options);
+ Schema schema = inferResultSchemaFromFactory(factory, options);
+
+ assertScanBatchesProduced(factory, options);
+ assertEquals(1, datum.size());
+ assertEquals(2, schema.getFields().size());
+ assertEquals("Type", schema.getFields().get(0).getName());
+ assertEquals("Language", schema.getFields().get(1).getName());
+
+ checkParquetReadResult(schema, expectedJsonUnordered, datum);
+
+ AutoCloseables.close(datum);
+ }
+ }
+
private void checkParquetReadResult(Schema schema, String expectedJson,
List<ArrowRecordBatch> actual)
throws IOException {
final ObjectMapper json = new ObjectMapper();