This is an automated email from the ASF dual-hosted git repository.
slawrence pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git
The following commit(s) were added to refs/heads/main by this push:
new a79f185de Only mmap regular files in the CLI
a79f185de is described below
commit a79f185dea7342b53b31260c2b13c50458e6dbc2
Author: Steve Lawrence <[email protected]>
AuthorDate: Mon Jun 9 07:22:19 2025 -0400
Only mmap regular files in the CLI
When an input file for the CLI is not stdin, the CLI currently maps the
file to a MappedByteBuffer, which gives significant performance gains,
especially for large files. However, for non-regular files such as fifo
files, devices, and sockets, the result of the map function is
undefined. In practice, we get a byte buffer, but it contains zero bytes,
so the CLI parse/unparse sees no data.
To fix this, the CLI now checks if a file is regular, and only if it is
regular will it consider mapping the file. Non-regular files use the
existing fallback streaming behavior.
DAFFODIL-3002
---
.../main/scala/org/apache/daffodil/cli/Main.scala | 28 +++++++++++++---------
.../daffodil/cli/cliTest/TestCLIParsing.scala | 27 +++++++++++++++++++++
.../org/apache/daffodil/cli/cliTest/Util.scala | 2 +-
3 files changed, 45 insertions(+), 12 deletions(-)
diff --git a/daffodil-cli/src/main/scala/org/apache/daffodil/cli/Main.scala
b/daffodil-cli/src/main/scala/org/apache/daffodil/cli/Main.scala
index 37cc20d8f..9620497a0 100644
--- a/daffodil-cli/src/main/scala/org/apache/daffodil/cli/Main.scala
+++ b/daffodil-cli/src/main/scala/org/apache/daffodil/cli/Main.scala
@@ -1272,18 +1272,24 @@ class Main(
val input = parseOpts.infile.toOption match {
case Some("-") | None => InputSourceDataInputStream(STDIN)
case Some(file) => {
- // for files <= 2GB, use a mapped byte buffer to avoid the
overhead related to
- // the BucketingInputSource. Larger files cannot be mapped so
we cannot avoid it
+ // Try to use a memory mapped byte buffer for input files
since it is
+ // significantly more efficient, especially for large files.
Files larger than
+ // 2GB and non-regular files (e.g. fifo files, devices, unix
sockets) cannot be
+ // mapped--in these cases we use a normal input stream
which is less
+ // efficient.
val path = Paths.get(file)
- val size = Files.size(path)
- if (size <= Int.MaxValue) {
- val fc = FileChannel.open(path, StandardOpenOption.READ)
- val bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, size)
- fc.close() // we no longer need the channel now that we've
mapped it
- InputSourceDataInputStream(bb)
- } else {
- val is = Files.newInputStream(path, StandardOpenOption.READ)
- InputSourceDataInputStream(is)
+ val optSize = if (Files.isRegularFile(path))
Some(Files.size(path)) else None
+ optSize match {
+ case Some(size) if size <= Int.MaxValue => {
+ val fc = FileChannel.open(path, StandardOpenOption.READ)
+ val bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, size)
+ fc.close() // we no longer need the channel now that we've
mapped it
+ InputSourceDataInputStream(bb)
+ }
+ case _ => {
+ val is = Files.newInputStream(path,
StandardOpenOption.READ)
+ InputSourceDataInputStream(is)
+ }
}
}
}
diff --git
a/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/TestCLIParsing.scala
b/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/TestCLIParsing.scala
index 052b735d7..8dad007e8 100644
---
a/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/TestCLIParsing.scala
+++
b/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/TestCLIParsing.scala
@@ -17,7 +17,10 @@
package org.apache.daffodil.cli.cliTest
+import java.io.FileOutputStream
import java.nio.charset.StandardCharsets.UTF_8
+import scala.sys.process.Process
+import scala.util.Using
import org.apache.daffodil.cli.Main.ExitCode
import org.apache.daffodil.cli.cliTest.Util._
@@ -25,6 +28,7 @@ import org.apache.daffodil.lib.Implicits._
import org.apache.commons.io.FileUtils
import org.junit.Assert._
+import org.junit.Assume.assumeTrue
import org.junit.Test
class TestCLIParsing {
@@ -304,6 +308,29 @@ class TestCLIParsing {
}(ExitCode.LeftOverData)
}
+ @Test def test_CLI_Parsing_SimpleParse_fifo(): Unit = {
+ // disable this test on windows since it requires the mkfifo command
+ assumeTrue("fifo test ignored on Windows", !isWindows)
+
+ val schema = path(
+
"daffodil-test/src/test/resources/org/apache/daffodil/section06/entities/charClassEntities.dfdl.xsd"
+ )
+
+ withTempDir { tempDir =>
+ val fifo = s"$tempDir/fifo"
+ Process("mkfifo", Seq(fifo)).!!
+
+ runCLI(args"parse -s $schema -r matrix $fifo") { cli =>
+ // Write to the fifo file. Calling Using.resource will close the fifo
file when writing
+ // is complete and trigger an EOF in the CLI to end the parse
+ Using.resource(new FileOutputStream(fifo)) { os =>
+ os.write("0,1,2,3".getBytes("UTF-8"))
+ }
+ cli.expect("<tns:cell>3</tns:cell>")
+ }(ExitCode.Success)
+ }
+ }
+
@Test def test_CLI_Parsing_SimpleParse_verboseMode(): Unit = {
val schema = path(
"daffodil-test/src/test/resources/org/apache/daffodil/section06/entities/charClassEntities.dfdl.xsd"
diff --git
a/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/Util.scala
b/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/Util.scala
index dbe0ecf89..821615052 100644
--- a/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/Util.scala
+++ b/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/Util.scala
@@ -50,7 +50,7 @@ import org.junit.Assert.assertEquals
object Util {
- private val isWindows =
System.getProperty("os.name").toLowerCase().startsWith("windows")
+ val isWindows =
System.getProperty("os.name").toLowerCase().startsWith("windows")
private val daffodilBinPath = {
val ext = if (isWindows) ".bat" else ""