davisusanibar commented on code in PR #258:
URL: https://github.com/apache/arrow-cookbook/pull/258#discussion_r995107408
##########
java/source/dataset.rst:
##########
@@ -317,5 +419,136 @@ In case we need to project only certain columns we could configure ScanOptions w
Gladis
Juan
+Query IPC File
+==============
+
+Let's query information from an IPC file.
+
+Query Data Content For File
+***************************
+
+Reading an IPC file that contains 3 record batches of 3 rows each.
+
+In this case, we configure the ScanOptions batchSize argument to 5 rows. Since that is
+greater than the 3 rows per batch in the file, each batch yields 3 rows during execution
+rather than the 5 requested.
+
+.. testcode::
+
+    import org.apache.arrow.dataset.file.FileFormat;
+    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
+    import org.apache.arrow.dataset.jni.NativeMemoryPool;
+    import org.apache.arrow.dataset.scanner.ScanOptions;
+    import org.apache.arrow.dataset.scanner.Scanner;
+    import org.apache.arrow.dataset.source.Dataset;
+    import org.apache.arrow.dataset.source.DatasetFactory;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+    import org.apache.arrow.vector.VectorSchemaRoot;
+    import org.apache.arrow.vector.ipc.ArrowReader;
+
+    import java.io.IOException;
+
+    String uri = "file:" + System.getProperty("user.dir") +
+        "/thirdpartydeps/arrowfiles/random_access.arrow";
+    ScanOptions options = new ScanOptions(/*batchSize*/ 5);
+    try (
+        BufferAllocator allocator = new RootAllocator();
+        DatasetFactory datasetFactory = new FileSystemDatasetFactory(allocator,
+            NativeMemoryPool.getDefault(), FileFormat.ARROW_IPC, uri);
+        Dataset dataset = datasetFactory.finish();
+        Scanner scanner = dataset.newScan(options)
+    ) {
+        scanner.scan().forEach(scanTask -> {
+            try (ArrowReader reader = scanTask.execute()) {
+                final int[] count = {1};
+                while (reader.loadNextBatch()) {
+                    try (VectorSchemaRoot root = reader.getVectorSchemaRoot()) {
+                        System.out.println("Number of rows per batch[" + count[0]++ + "]: " + root.getRowCount());
+                    }
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        });
+    } catch (Exception e) {
+        e.printStackTrace();
+    }
+
+.. testoutput::
+
+ Number of rows per batch[1]: 3
+ Number of rows per batch[2]: 3
+ Number of rows per batch[3]: 3
+
+Query ORC File
+==============
+
+Let's query information from an ORC file.
+
+Query Data Content For File
+***************************
+
+Reading a ZLib-compressed ORC file that contains 385 stripes of 5000 rows each.
+
+.. code-block::
+
+    $ orc-metadata demo-11-zlib.orc | more
+
+    { "name": "demo-11-zlib.orc",
+      "type": "struct<_col0:int,_col1:string,_col2:string,_col3:string,_col4:int,_col5:string,_col6:int,_col7:int,_col8:int>",
+      "stripe count": 385,
+      "compression": "zlib", "compression block": 262144,
+      "stripes": [
+        { "stripe": 0, "rows": 5000,
+          "offset": 3, "length": 1031,
+          "index": 266, "data": 636, "footer": 129
+        },
+      ...
+
+In this case, we configure the ScanOptions batchSize argument to 4000 rows. Since that is
+lower than the 5000 rows per stripe in the file, batches of at most 4000 rows are produced
+during execution.
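+
+A scan over this file follows the same pattern as the IPC example above. The sketch below
+is illustrative rather than part of this patch; the location of ``demo-11-zlib.orc`` under
+``thirdpartydeps/orc/`` is an assumed path.
+
+.. code-block:: java
+
+    import org.apache.arrow.dataset.file.FileFormat;
+    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
+    import org.apache.arrow.dataset.jni.NativeMemoryPool;
+    import org.apache.arrow.dataset.scanner.ScanOptions;
+    import org.apache.arrow.dataset.scanner.Scanner;
+    import org.apache.arrow.dataset.source.Dataset;
+    import org.apache.arrow.dataset.source.DatasetFactory;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+    import org.apache.arrow.vector.VectorSchemaRoot;
+    import org.apache.arrow.vector.ipc.ArrowReader;
+
+    import java.io.IOException;
+
+    // Assumed location of the ORC file described above.
+    String uri = "file:" + System.getProperty("user.dir") +
+        "/thirdpartydeps/orc/demo-11-zlib.orc";
+    // Cap each batch at 4000 rows, below the 5000 rows per stripe.
+    ScanOptions options = new ScanOptions(/*batchSize*/ 4000);
+    try (
+        BufferAllocator allocator = new RootAllocator();
+        DatasetFactory datasetFactory = new FileSystemDatasetFactory(allocator,
+            NativeMemoryPool.getDefault(), FileFormat.ORC, uri);
+        Dataset dataset = datasetFactory.finish();
+        Scanner scanner = dataset.newScan(options)
+    ) {
+        scanner.scan().forEach(scanTask -> {
+            try (ArrowReader reader = scanTask.execute()) {
+                while (reader.loadNextBatch()) {
+                    // Each loaded batch holds at most the configured batchSize rows.
+                    try (VectorSchemaRoot root = reader.getVectorSchemaRoot()) {
+                        System.out.println("Number of rows per batch: " + root.getRowCount());
+                    }
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        });
+    } catch (Exception e) {
+        e.printStackTrace();
+    }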
Review Comment:
Deleted