Author: lehmi
Date: Sun Oct 5 12:12:19 2025
New Revision: 1928958
Log:
PDFBOX-6041: limit recursion depth to avoid a stack overflow exception as
proposed by David Justamante
Added:
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestBaseParser.java
(contents, props changed)
pdfbox/branches/2.0/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/PDFBOX-5578-2023_StackOverFlowtest.pdf
(contents, props changed)
Modified:
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Modified:
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Oct 5 12:10:02 2025 (r1928957)
+++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Oct 5 12:12:19 2025 (r1928958)
@@ -60,10 +60,16 @@ public abstract class BaseParser
private static final long GENERATION_NUMBER_THRESHOLD = 65535;
- static final int MAX_LENGTH_LONG = Long.toString(Long.MAX_VALUE).length();
+ private static final int MAX_LENGTH_LONG = Long.toString(Long.MAX_VALUE).length();
private static final Charset ALTERNATIVE_CHARSET;
+ private static final int MAX_RECURSION_DEPTH = 500;
+ private static final String MAX_RECUSRION_MSG = //
+ "Reached maximum recursion depth " + Integer.toString(MAX_RECURSION_DEPTH);
+
+ private int recursionDepth = 0;
+
static
{
Charset cs;
@@ -237,50 +243,63 @@ public abstract class BaseParser
*/
protected COSDictionary parseCOSDictionary() throws IOException
{
- readExpectedChar('<');
- readExpectedChar('<');
- skipSpaces();
- COSDictionary obj = new COSDictionary();
- boolean done = false;
- while (!done)
+ try
{
- skipSpaces();
- char c = (char) seqSource.peek();
- if (c == '>')
+ recursionDepth++;
+ if (recursionDepth > MAX_RECURSION_DEPTH)
{
- done = true;
+ throw new IOException(MAX_RECUSRION_MSG);
}
- else if (c == '/')
+ readExpectedChar('<');
+ readExpectedChar('<');
+ skipSpaces();
+ COSDictionary obj = new COSDictionary();
+ boolean done = false;
+ while (!done)
{
- // something went wrong, most likely the dictionary is corrupted
- // stop immediately and return everything read so far
- if (!parseCOSDictionaryNameValuePair(obj))
+ skipSpaces();
+ char c = (char) seqSource.peek();
+ if (c == '>')
{
- return obj;
+ done = true;
}
- }
- else
- {
- // invalid dictionary, we were expecting a /Name, read until the end or until we can recover
- LOG.warn("Invalid dictionary, found: '" + c + "' but expected: '/' at offset " + seqSource.getPosition());
- if (readUntilEndOfCOSDictionary())
+ else if (c == '/')
{
- // we couldn't recover
- return obj;
+ // something went wrong, most likely the dictionary is corrupted
+ // stop immediately and return everything read so far
+ if (!parseCOSDictionaryNameValuePair(obj))
+ {
+ return obj;
+ }
+ }
+ else
+ {
+ // invalid dictionary, we were expecting a /Name, read until the end or until we can recover
+ LOG.warn("Invalid dictionary, found: '" + c + "' but expected: '/' at offset "
+ + seqSource.getPosition());
+ if (readUntilEndOfCOSDictionary())
+ {
+ // we couldn't recover
+ return obj;
+ }
}
}
+ try
+ {
+ readExpectedChar('>');
+ readExpectedChar('>');
+ }
+ catch (IOException exception)
+ {
+ LOG.warn("Invalid dictionary, can't find end of dictionary at offset "
+ + seqSource.getPosition());
+ }
+ return obj;
}
- try
- {
- readExpectedChar('>');
- readExpectedChar('>');
- }
- catch (IOException exception)
+ finally
{
- LOG.warn("Invalid dictionary, can't find end of dictionary at offset "
- + seqSource.getPosition());
+ recursionDepth--;
}
- return obj;
}
/**
@@ -712,68 +731,81 @@ public abstract class BaseParser
*/
protected COSArray parseCOSArray() throws IOException
{
- long startPosition = seqSource.getPosition();
- readExpectedChar('[');
- COSArray po = new COSArray();
- COSBase pbo;
- skipSpaces();
- int i;
- while( ((i = seqSource.peek()) > 0) && ((char)i != ']') )
+ try
{
- pbo = parseDirObject();
- if( pbo instanceof COSObject )
+ recursionDepth++;
+ if (recursionDepth > MAX_RECURSION_DEPTH)
+ {
+ throw new IOException(MAX_RECUSRION_MSG);
+ }
+ long startPosition = seqSource.getPosition();
+ readExpectedChar('[');
+ COSArray po = new COSArray();
+ COSBase pbo;
+ skipSpaces();
+ int i;
+ while (((i = seqSource.peek()) > 0) && ((char) i != ']'))
{
- // We have to check if the expected values are there or not PDFBOX-385
- if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
+ pbo = parseDirObject();
+ if (pbo instanceof COSObject)
{
- COSInteger genNumber = (COSInteger)po.remove( po.size() -1 );
+ // We have to check if the expected values are there or not PDFBOX-385
if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
{
- COSInteger number = (COSInteger)po.remove( po.size() -1 );
- COSObjectKey key = new COSObjectKey(number.longValue(), genNumber.intValue());
- pbo = getObjectFromPool(key);
+ COSInteger genNumber = (COSInteger) po.remove(po.size() - 1);
+ if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
+ {
+ COSInteger number = (COSInteger) po.remove(po.size() - 1);
+ COSObjectKey key = new COSObjectKey(number.longValue(),
+ genNumber.intValue());
+ pbo = getObjectFromPool(key);
+ }
+ else
+ {
+ // the object reference is somehow wrong
+ pbo = null;
+ }
}
else
{
- // the object reference is somehow wrong
pbo = null;
}
}
- else
+ if (pbo != null)
{
- pbo = null;
+ po.add(pbo);
}
- }
- if( pbo != null )
- {
- po.add( pbo );
- }
- else
- {
- //it could be a bad object in the array which is just skipped
- LOG.warn("Corrupt array element at offset "
- + seqSource.getPosition() + ", start offset: " + startPosition);
- String isThisTheEnd = readString();
- // return immediately if a corrupt element is followed by another array
- // to avoid a possible infinite recursion as most likely the whole array is corrupted
- if (isThisTheEnd.isEmpty() && seqSource.peek() == '[')
- {
- return po;
- }
- seqSource.unread(isThisTheEnd.getBytes(Charsets.ISO_8859_1));
- // This could also be an "endobj" or "endstream" which means we can assume that
- // the array has ended.
- if(ENDOBJ_STRING.equals(isThisTheEnd) || ENDSTREAM_STRING.equals(isThisTheEnd))
+ else
{
- return po;
+ // it could be a bad object in the array which is just skipped
+ LOG.warn("Corrupt array element at offset " + seqSource.getPosition()
+ + ", start offset: " + startPosition);
+ String isThisTheEnd = readString();
+ // return immediately if a corrupt element is followed by another array
+ // to avoid a possible infinite recursion as most likely the whole array is corrupted
+ if (isThisTheEnd.isEmpty() && seqSource.peek() == '[')
+ {
+ return po;
+ }
+ seqSource.unread(isThisTheEnd.getBytes(Charsets.ISO_8859_1));
+ // This could also be an "endobj" or "endstream" which means we can assume that
+ // the array has ended.
+ if (ENDOBJ_STRING.equals(isThisTheEnd) || ENDSTREAM_STRING.equals(isThisTheEnd))
+ {
+ return po;
+ }
}
+ skipSpaces();
}
+ // read ']'
+ seqSource.read();
skipSpaces();
+ return po;
+ }
+ finally
+ {
+ recursionDepth--;
}
- // read ']'
- seqSource.read();
- skipSpaces();
- return po;
}
/**
@@ -933,89 +965,100 @@ public abstract class BaseParser
*/
protected COSBase parseDirObject() throws IOException
{
- skipSpaces();
- char c = (char)seqSource.peek();
- switch(c)
+ try
{
- case '<':
- // pull off first left bracket
- int leftBracket = seqSource.read();
- // check for second left bracket
- c = (char) seqSource.peek();
- seqSource.unread(leftBracket);
- return c == '<' ? parseCOSDictionary() : parseCOSString();
- case '[':
- // array
- return parseCOSArray();
- case '(':
- return parseCOSString();
- case '/':
- // name
- return parseCOSName();
- case 'n':
- // null
- readExpectedString(NULL);
- return COSNull.NULL;
- case 't':
- String trueString = new String( seqSource.readFully(4), Charsets.ISO_8859_1 );
- if( trueString.equals( TRUE ) )
- {
- return COSBoolean.TRUE;
- }
- else
- {
- throw new IOException( "expected true actual='" + trueString + "' " + seqSource +
- "' at offset " + seqSource.getPosition());
- }
- case 'f':
- String falseString = new String( seqSource.readFully(5), Charsets.ISO_8859_1 );
- if( falseString.equals( FALSE ) )
- {
- return COSBoolean.FALSE;
- }
- else
+ recursionDepth++;
+ if (recursionDepth > MAX_RECURSION_DEPTH)
{
- throw new IOException( "expected false actual='" + falseString + "' " + seqSource +
- "' at offset " + seqSource.getPosition());
+ throw new IOException(MAX_RECUSRION_MSG);
}
- case 'R':
- seqSource.read();
- return new COSObject(null);
- case (char)-1:
- return null;
- default:
- if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
+ skipSpaces();
+ char c = (char) seqSource.peek();
+ switch (c)
{
- return parseCOSNumber();
- }
- // This is not suppose to happen, but we will allow for it
- // so we are more compatible with POS writers that don't
- // follow the spec
- long startOffset = seqSource.getPosition();
- String badString = readString();
- if (badString.isEmpty())
- {
- int peek = seqSource.peek();
- // we can end up in an infinite loop otherwise
- throw new IOException(
- "Unknown dir object c='" + c + "' cInt=" + (int) c + " peek='" + (char) peek
- + "' peekInt=" + peek + " at offset " + seqSource.getPosition()
- + " (start offset: " + startOffset + ")");
- }
+ case '<':
+ // pull off first left bracket
+ int leftBracket = seqSource.read();
+ // check for second left bracket
+ c = (char) seqSource.peek();
+ seqSource.unread(leftBracket);
+ return c == '<' ? parseCOSDictionary() : parseCOSString();
+ case '[':
+ // array
+ return parseCOSArray();
+ case '(':
+ return parseCOSString();
+ case '/':
+ // name
+ return parseCOSName();
+ case 'n':
+ // null
+ readExpectedString(NULL);
+ return COSNull.NULL;
+ case 't':
+ String trueString = new String(seqSource.readFully(4), Charsets.ISO_8859_1);
+ if (trueString.equals(TRUE))
+ {
+ return COSBoolean.TRUE;
+ }
+ else
+ {
+ throw new IOException("expected true actual='" + trueString + "' " + seqSource
+ + "' at offset " + seqSource.getPosition());
+ }
+ case 'f':
+ String falseString = new String(seqSource.readFully(5), Charsets.ISO_8859_1);
+ if (falseString.equals(FALSE))
+ {
+ return COSBoolean.FALSE;
+ }
+ else
+ {
+ throw new IOException("expected false actual='" + falseString + "' " + seqSource
+ + "' at offset " + seqSource.getPosition());
+ }
+ case 'R':
+ seqSource.read();
+ return new COSObject(null);
+ case (char) -1:
+ return null;
+ default:
+ if (Character.isDigit(c) || c == '-' || c == '+' || c == '.')
+ {
+ return parseCOSNumber();
+ }
+ // This is not suppose to happen, but we will allow for it
+ // so we are more compatible with POS writers that don't
+ // follow the spec
+ long startOffset = seqSource.getPosition();
+ String badString = readString();
+ if (badString.isEmpty())
+ {
+ int peek = seqSource.peek();
+ // we can end up in an infinite loop otherwise
+ throw new IOException("Unknown dir object c='" + c + "' cInt=" + (int) c
+ + " peek='" + (char) peek + "' peekInt=" + peek + " at offset "
+ + seqSource.getPosition() + " (start offset: " + startOffset + ")");
+ }
- // if it's an endstream/endobj, we want to put it back so the caller will see it
- if (ENDOBJ_STRING.equals(badString) || ENDSTREAM_STRING.equals(badString))
- {
- seqSource.unread(badString.getBytes(Charsets.ISO_8859_1));
- }
- else
- {
- LOG.warn("Skipped unexpected dir object = '" + badString + "' at offset "
- + seqSource.getPosition() + " (start offset: " + startOffset + ")");
- return this instanceof PDFStreamParser ? null : COSNull.NULL;
+ // if it's an endstream/endobj, we want to put it back so the caller will see it
+ if (ENDOBJ_STRING.equals(badString) || ENDSTREAM_STRING.equals(badString))
+ {
+ seqSource.unread(badString.getBytes(Charsets.ISO_8859_1));
+ }
+ else
+ {
+ LOG.warn("Skipped unexpected dir object = '" + badString + "' at offset "
+ + seqSource.getPosition() + " (start offset: " + startOffset + ")");
+ return this instanceof PDFStreamParser ? null : COSNull.NULL;
+ }
}
+ return null;
+ }
+ finally
+ {
+ recursionDepth--;
}
- return null;
}
private COSNumber parseCOSNumber() throws IOException
Added:
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestBaseParser.java
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestBaseParser.java Sun Oct 5 12:12:19 2025 (r1928958)
@@ -0,0 +1,57 @@
+/*****************************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+package org.apache.pdfbox.pdfparser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.junit.Test;
+
+public class TestBaseParser
+{
+
+ @Test
+ public void testBaseParserStackOverflow()
+ {
+ // PDFBOX-6041
+ try
+ {
+ // we have to use the file from PDFBOX-5578 as the file from PDFBOX-6041
+ // doesn't trigger a stack overflow exception in 2.x
+ InputStream is = TestBaseParser.class
+ .getResourceAsStream("PDFBOX-5578-2023_StackOverFlowtest.pdf");
+ PDDocument.load(is).close();
+ }
+ catch (IOException exception)
+ {
+ assertEquals("Missing root object specification in trailer.", exception.getMessage());
+ }
+ catch (Exception exception)
+ {
+ fail("Unexpected Exception");
+ }
+
+ }
+}
Added:
pdfbox/branches/2.0/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/PDFBOX-5578-2023_StackOverFlowtest.pdf
==============================================================================
Binary file. No diff available.