This is an automated email from the ASF dual-hosted git repository.

slawrence pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git


The following commit(s) were added to refs/heads/main by this push:
     new d28fb524a Avoid large allocations related to specified length strings
d28fb524a is described below

commit d28fb524a7187a03c662381f5e222e4086461f9c
Author: Steve Lawrence <[email protected]>
AuthorDate: Fri Dec 19 07:59:39 2025 -0500

    Avoid large allocations related to specified length strings
    
    When we need to parse a specified length string, we currently allocate a
    buffer that can be reused to store the decoded string. The size of this
    buffer is based on the maximumSimpleElementSizeInCharacters tunable,
    which defaults to a fairly large size (1MB) that can be slow and put
    added pressure on the garbage collector. Fortunately, this buffer is
    allocated using a LocalBuffer so it is reused during a parse so at worst
    there is only one allocation per parse. But when parsing many small
    files that contain specified length strings, this overhead can become
    noticeable. And 1MB is likely orders of magnitude larger than the vast
    majority of data formats will need for any single string element.
    
    To address this, instead of using maximumSimpleElementSizeInCharacters,
    we calculate how many characters the string could possibly decode to
    given the current bit position, bit limit, and encoding, and use that as
    the buffer size to request. This way we only ever request and allocate a
    large buffer if one is ever needed, which should be rare.
    
    Note that this new logic requires bitLimit as part of specified string
    parsing. That isn't available in the edge case of specified length
    complex nillables. The specified length nil parser is modified to
    handle this case.
    
    This also modifies the LocalBuffer to allocate buffers of a reasonably
    large minimum size of 1K. This way we will likely only ever need to
    allocate a single buffer rather than allocating small buffers that have
    to be reallocated as larger buffers are needed.
    
    Tested with small NITF files (<4000 bytes) that contain lots of fixed
    length strings, this saw about 30%+ performance improvements. Files
    tested as large as 8000 bytes saw little or no change in performance.
    
    DAFFODIL-2851
---
 .../scala/org/apache/daffodil/io/LocalBuffer.scala |  8 +++-
 .../runtime1/processors/parsers/NilParsers.scala   | 47 +++++++++++++++-------
 .../processors/parsers/StringLengthParsers.scala   | 27 ++++++++++++-
 3 files changed, 66 insertions(+), 16 deletions(-)

diff --git 
a/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala 
b/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala
index dca66ea41..cf37e9c05 100644
--- a/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala
+++ b/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala
@@ -33,7 +33,13 @@ abstract class LocalBuffer[T <: java.nio.Buffer] {
   def getBuf(length: Long) = {
     Assert.usage(length <= Int.MaxValue)
     if (tempBuf.isEmpty || tempBuf.get.capacity < length) {
-      tempBuf = Maybe(allocate(length.toInt))
+      // allocate a buffer that can store the required length, but with a 
minimum size. The
+      // majority of LocalBuffers should be smaller than this minimum size and 
so should avoid
+      // costly reallocations, while still being small enough that the JVM 
should have no
+      // problem quickly allocating it
+      val minBufferSize = 1024
+      val allocationSize = math.max(length.toInt, minBufferSize)
+      tempBuf = Maybe(allocate(allocationSize))
     }
     val buf = tempBuf.get
     buf.clear
diff --git 
a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala
 
b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala
index af2cc7a1c..faaa285ba 100644
--- 
a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala
+++ 
b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala
@@ -40,21 +40,40 @@ abstract class LiteralNilOfSpecifiedLengthParserBase(erd: 
ElementRuntimeData)
   def isFieldNilLit(field: String): Boolean
 
   override def parse(start: PState): Unit = {
-
-    val field = parseString(start)
-
-    val isFieldEmpty = field.length() == 0
-
-    if (isFieldEmpty && isEmptyAllowed) {
-      // Valid! Success ParseResult indicates nilled
-    } else if (isFieldEmpty && !isEmptyAllowed) {
-      // Fail!
-      PE(start, "%s - Empty field found but not allowed!", eName)
-    } else if (isFieldNilLit(field)) {
-      // Contains a nilValue, Success ParseResult indicates nilled
+    if (erd.isComplexType) {
+      // nillable complex types must have a nilValue of %ES;. For a literal 
nil specified length
+      // complex to be nilled, that means either there must be a specified 
length that is zero
+      // or there isn't a specified length and we have reached the end of the 
data. If neither
+      // of these conditions are true, then there is non-empty data for this 
complex element and
+      // it cannot be nilled.
+      val bitLimit0b = start.bitLimit0b
+      val hasSpecifiedLength = bitLimit0b.isDefined
+      if (
+        (hasSpecifiedLength && (bitLimit0b.get - start.bitPos0b) > 0) ||
+        (!hasSpecifiedLength && start.dataInputStream.hasData)
+      ) {
+        // Fail!
+        PE(start, "%s - Does not contain a nil literal", eName)
+      } else {
+        // Valid! Success ParseResult indicates nilled
+      }
     } else {
-      // Fail!
-      PE(start, "%s - Does not contain a nil literal!", eName)
+      // Simple element, read a string up to the bitLimit and see if it 
matches the nilValue
+      val field = parseString(start)
+
+      val isFieldEmpty = field.length() == 0
+
+      if (isFieldEmpty && isEmptyAllowed) {
+        // Valid! Success ParseResult indicates nilled
+      } else if (isFieldEmpty && !isEmptyAllowed) {
+        // Fail!
+        PE(start, "%s - Empty field found but not allowed", eName)
+      } else if (isFieldNilLit(field)) {
+        // Contains a nilValue, Success ParseResult indicates nilled
+      } else {
+        // Fail!
+        PE(start, "%s - Does not contain a nil literal", eName)
+      }
     }
   }
 
diff --git 
a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala
 
b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala
index 16cbbd812..08e28190e 100644
--- 
a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala
+++ 
b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala
@@ -18,6 +18,7 @@
 package org.apache.daffodil.runtime1.processors.parsers
 
 import 
org.apache.daffodil.io.processors.charset.BitsCharsetDecoderUnalignedCharDecodeException
+import org.apache.daffodil.lib.exceptions.Assert
 import org.apache.daffodil.lib.util.MaybeChar
 import org.apache.daffodil.lib.util.Misc
 import org.apache.daffodil.runtime1.processors.CharsetEv
@@ -86,8 +87,32 @@ trait StringOfSpecifiedLengthMixin extends 
PaddingRuntimeMixin with CaptureParsi
 
   protected final def parseString(start: PState): String = {
     val dis = start.dataInputStream
-    val maxLen = start.tunable.maximumSimpleElementSizeInCharacters
     val startBitPos0b = dis.bitPos0b
+    val bitLimit0b = dis.bitLimit0b
+
+    // We want to limit the maximum length passed into getSomeString since 
that function can
+    // pre-allocate a buffer that size even if it won't find that many 
characters. So we
+    // calculate the maximum number of characters that we could possibly 
decode from the
+    // available bits and the character set.
+    //
+    // For fixed-width encodings, that is just the number of available bits 
divided by the
+    // fixed width of the encoding.
+    //
+    // For variable length encodings (e.g. UTF-8), the maximum number of 
characters that the
+    // available bits could possibly decode to is if every decoded character 
was the smallest
+    // possible representation. That smallest representation for 
variable-width encodings is
+    // bitWidthOfACodeUnit. So we divide the available bits by 
bitWidthOfACodeUnit.
+    //
+    // Note that the bitLimit should always be defined because bitLimit is how 
string of
+    // specified lengths limit lengths
+    Assert.invariant(bitLimit0b.isDefined)
+    val availableBits = bitLimit0b.get - startBitPos0b
+    val charset = charsetEv.evaluate(start)
+    val optWidth = charset.maybeFixedWidth
+    val bitsPerChar = if (optWidth.isDefined) optWidth.get else 
charset.bitWidthOfACodeUnit
+    // add one to allow for partial bytes at the end that could parse to a 
replacement char
+    val maxPossibleChars = (availableBits / bitsPerChar) + 1
+    val maxLen = math.min(maxPossibleChars, 
start.tunable.maximumSimpleElementSizeInCharacters)
 
     val strOpt =
       try {

Reply via email to