This is an automated email from the ASF dual-hosted git repository.
slawrence pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git
The following commit(s) were added to refs/heads/main by this push:
new d28fb524a Avoid large allocations related to specified length strings
d28fb524a is described below
commit d28fb524a7187a03c662381f5e222e4086461f9c
Author: Steve Lawrence <[email protected]>
AuthorDate: Fri Dec 19 07:59:39 2025 -0500
Avoid large allocations related to specified length strings
When we need to parse a specified length string, we currently allocate a
buffer that can be reused to store the decoded string. The size of this
buffer is based on the maximumSimpleElementSizeInCharacters tunable,
which defaults to a fairly large size (1MB) that can be slow to allocate
and puts added pressure on the garbage collector. Fortunately, this buffer is
allocated using a LocalBuffer so it is reused during a parse so at worst
there is only one allocation per parse. But when parsing many small
files that contain specified length strings, this overhead can become
noticeable. And 1MB is likely orders of magnitude larger than the vast
majority of data formats will need for any single string element.
To address this, instead of using maximumSimpleElementSizeInCharacters,
we calculate how many characters the string could possibly decode to
given the current bit position, bit limit, and encoding, and use that as
the buffer size to request. This way we only ever request and allocate a
large buffer if one is ever needed, which should be rare.
Note that this new logic requires bitLimit as part of specified string
parsing. That isn't available in the edge case of specified length
complex nillables. The specified length nil parser is modified to
handle this case.
This also modifies the LocalBuffer to allocate buffers of a reasonably
large minimum size of 1K. This way we will likely only ever need to
allocate a single buffer rather than allocating small buffers that have
to be reallocated as larger buffers are needed.
Tested with small NITF files (<4000 bytes) that contain lots of fixed
length strings, this saw about 30%+ performance improvements. Files
tested as large as 8000 bytes saw little or no change in performance.
DAFFODIL-2851
---
.../scala/org/apache/daffodil/io/LocalBuffer.scala | 8 +++-
.../runtime1/processors/parsers/NilParsers.scala | 47 +++++++++++++++-------
.../processors/parsers/StringLengthParsers.scala | 27 ++++++++++++-
3 files changed, 66 insertions(+), 16 deletions(-)
diff --git
a/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala
b/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala
index dca66ea41..cf37e9c05 100644
--- a/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala
+++ b/daffodil-core/src/main/scala/org/apache/daffodil/io/LocalBuffer.scala
@@ -33,7 +33,13 @@ abstract class LocalBuffer[T <: java.nio.Buffer] {
def getBuf(length: Long) = {
Assert.usage(length <= Int.MaxValue)
if (tempBuf.isEmpty || tempBuf.get.capacity < length) {
- tempBuf = Maybe(allocate(length.toInt))
+ // allocate a buffer that can store the required length, but with a
minimum size. The
+ // majority of LocalBuffers should be smaller than this minimum size and
so should avoid
+ // costly reallocations, while still being small enough that the JVM
should have no
+ // problem quickly allocating it
+ val minBufferSize = 1024
+ val allocationSize = math.max(length.toInt, minBufferSize)
+ tempBuf = Maybe(allocate(allocationSize))
}
val buf = tempBuf.get
buf.clear
diff --git
a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala
b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala
index af2cc7a1c..faaa285ba 100644
---
a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala
+++
b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/NilParsers.scala
@@ -40,21 +40,40 @@ abstract class LiteralNilOfSpecifiedLengthParserBase(erd:
ElementRuntimeData)
def isFieldNilLit(field: String): Boolean
override def parse(start: PState): Unit = {
-
- val field = parseString(start)
-
- val isFieldEmpty = field.length() == 0
-
- if (isFieldEmpty && isEmptyAllowed) {
- // Valid! Success ParseResult indicates nilled
- } else if (isFieldEmpty && !isEmptyAllowed) {
- // Fail!
- PE(start, "%s - Empty field found but not allowed!", eName)
- } else if (isFieldNilLit(field)) {
- // Contains a nilValue, Success ParseResult indicates nilled
+ if (erd.isComplexType) {
+ // nillable complex types must have a nilValue of %ES;. For a literal
nil specified length
+ // complex to be nilled, that means either there must be a specified
length that is zero
+ // or there isn't a specified length and we have reached the end of the
data. If neither
+ // of these conditions are true, then there is non-empty data for this
complex element and
+ // it cannot be nilled.
+ val bitLimit0b = start.bitLimit0b
+ val hasSpecifiedLength = bitLimit0b.isDefined
+ if (
+ (hasSpecifiedLength && (bitLimit0b.get - start.bitPos0b) > 0) ||
+ (!hasSpecifiedLength && start.dataInputStream.hasData)
+ ) {
+ // Fail!
+ PE(start, "%s - Does not contain a nil literal", eName)
+ } else {
+ // Valid! Success ParseResult indicates nilled
+ }
} else {
- // Fail!
- PE(start, "%s - Does not contain a nil literal!", eName)
+ // Simple element, read a string up to the bitLimit and see if it
matches the nilValue
+ val field = parseString(start)
+
+ val isFieldEmpty = field.length() == 0
+
+ if (isFieldEmpty && isEmptyAllowed) {
+ // Valid! Success ParseResult indicates nilled
+ } else if (isFieldEmpty && !isEmptyAllowed) {
+ // Fail!
+ PE(start, "%s - Empty field found but not allowed", eName)
+ } else if (isFieldNilLit(field)) {
+ // Contains a nilValue, Success ParseResult indicates nilled
+ } else {
+ // Fail!
+ PE(start, "%s - Does not contain a nil literal", eName)
+ }
}
}
diff --git
a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala
b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala
index 16cbbd812..08e28190e 100644
---
a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala
+++
b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/StringLengthParsers.scala
@@ -18,6 +18,7 @@
package org.apache.daffodil.runtime1.processors.parsers
import
org.apache.daffodil.io.processors.charset.BitsCharsetDecoderUnalignedCharDecodeException
+import org.apache.daffodil.lib.exceptions.Assert
import org.apache.daffodil.lib.util.MaybeChar
import org.apache.daffodil.lib.util.Misc
import org.apache.daffodil.runtime1.processors.CharsetEv
@@ -86,8 +87,32 @@ trait StringOfSpecifiedLengthMixin extends
PaddingRuntimeMixin with CaptureParsi
protected final def parseString(start: PState): String = {
val dis = start.dataInputStream
- val maxLen = start.tunable.maximumSimpleElementSizeInCharacters
val startBitPos0b = dis.bitPos0b
+ val bitLimit0b = dis.bitLimit0b
+
+ // We want to limit the maximum length passed into getSomeString since
that function can
+ // pre-allocate a buffer that size even if it won't find that many
characters. So we
+ // calculate the maximum number of characters that we could possibly
decode from the
+ // available bits and the character set.
+ //
+ // For fixed-width encodings, that is just the number of available bits
divided by the
+ // fixed width of the encoding.
+ //
+ // For variable length encodings (e.g. UTF-8), the maximum number of
characters that the
+ // available bits could possibly decode to is if every decoded character
was the smallest
+ // possible representation. That smallest representation for
variable-width encodings is
+ // bitWidthOfACodeUnit. So we divide the available bits by
bitWidthOfACodeUnit.
+ //
+ // Note that the bitLimit should always be defined because bitLimit is how
string of
+ // specified lengths limit lengths
+ Assert.invariant(bitLimit0b.isDefined)
+ val availableBits = bitLimit0b.get - startBitPos0b
+ val charset = charsetEv.evaluate(start)
+ val optWidth = charset.maybeFixedWidth
+ val bitsPerChar = if (optWidth.isDefined) optWidth.get else
charset.bitWidthOfACodeUnit
+ // add one to allow for partial bytes at the end that could parse to a
replacement char
+ val maxPossibleChars = (availableBits / bitsPerChar) + 1
+ val maxLen = math.min(maxPossibleChars,
start.tunable.maximumSimpleElementSizeInCharacters)
val strOpt =
try {