Author: lehmi
Date: Fri Oct 31 07:16:27 2025
New Revision: 1929433
Log:
PDFBOX-6093: move the parsing code to the class it belongs to
Added:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestCOSParser.java
(contents, props changed)
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestBaseParser.java
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Fri Oct 31 04:20:03 2025 (r1929432)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Fri Oct 31 07:16:27 2025 (r1929433)
@@ -19,31 +19,8 @@ package org.apache.pdfbox.pdfparser;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.logging.log4j.Logger;
-import org.apache.logging.log4j.LogManager;
-
-import org.apache.pdfbox.cos.COSArray;
-import org.apache.pdfbox.cos.COSBase;
-import org.apache.pdfbox.cos.COSBoolean;
-import org.apache.pdfbox.cos.COSDictionary;
-import org.apache.pdfbox.cos.COSDocument;
-import org.apache.pdfbox.cos.COSInteger;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.cos.COSNull;
-import org.apache.pdfbox.cos.COSNumber;
-import org.apache.pdfbox.cos.COSObject;
-import org.apache.pdfbox.cos.COSObjectKey;
-import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.RandomAccessRead;
/**
@@ -53,91 +30,8 @@ import org.apache.pdfbox.io.RandomAccess
*/
public abstract class BaseParser
{
- /**
- * Log instance.
- */
- private static final Logger LOG = LogManager.getLogger(BaseParser.class);
-
- private static final long OBJECT_NUMBER_THRESHOLD = 10000000000L;
-
- private static final long GENERATION_NUMBER_THRESHOLD = 65535;
-
private static final int MAX_LENGTH_LONG =
Long.toString(Long.MAX_VALUE).length();
- private static final Charset ALTERNATIVE_CHARSET;
- private static final int MAX_RECURSION_DEPTH = 500;
- private static final String MAX_RECUSRION_MSG = //
- "Reached maximum recursion depth " +
Integer.toString(MAX_RECURSION_DEPTH);
-
- private int recursionDepth = 0;
-
- private final Map<Long, COSObjectKey> keyCache = new HashMap<>();
-
- static
- {
- Charset cs;
- String charsetName = "Windows-1252";
- try
- {
- cs = Charset.forName(charsetName);
- }
- catch (IllegalArgumentException | UnsupportedOperationException e)
- {
- cs = StandardCharsets.ISO_8859_1;
- LOG.warn(() -> "Charset is not supported: " + charsetName + ",
falling back to " +
- StandardCharsets.ISO_8859_1.name(), e);
- }
- ALTERNATIVE_CHARSET = cs;
- }
-
- // CharSetDecoders are not threadsafe so not static
- private final CharsetDecoder utf8Decoder =
StandardCharsets.UTF_8.newDecoder()
- .onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
-
- protected static final int E = 'e';
- protected static final int N = 'n';
- protected static final int D = 'd';
-
- protected static final int S = 's';
- protected static final int T = 't';
- protected static final int R = 'r';
- protected static final int A = 'a';
- protected static final int M = 'm';
-
- protected static final int O = 'o';
- protected static final int B = 'b';
- protected static final int J = 'j';
-
- /**
- * This is a string constant that will be used for comparisons.
- */
- public static final String DEF = "def";
- /**
- * This is a string constant that will be used for comparisons.
- */
- protected static final String ENDOBJ_STRING = "endobj";
- /**
- * This is a string constant that will be used for comparisons.
- */
- protected static final String ENDSTREAM_STRING = "endstream";
- /**
- * This is a string constant that will be used for comparisons.
- */
- protected static final String STREAM_STRING = "stream";
- /**
- * This is a string constant that will be used for comparisons.
- */
- private static final char[] TRUE = { 't', 'r', 'u', 'e' };
- /**
- * This is a string constant that will be used for comparisons.
- */
- private static final char[] FALSE = { 'f', 'a', 'l', 's', 'e' };
- /**
- * This is a string constant that will be used for comparisons.
- */
- private static final char[] NULL = { 'n', 'u', 'l', 'l' };
-
/**
* ASCII code for Null.
*/
@@ -168,11 +62,6 @@ public abstract class BaseParser
protected final RandomAccessRead source;
/**
- * This is the document that will be parsed.
- */
- protected COSDocument document;
-
- /**
* Default constructor.
*/
BaseParser(RandomAccessRead pdfSource)
@@ -180,244 +69,6 @@ public abstract class BaseParser
this.source = pdfSource;
}
- private static boolean isHexDigit(char ch)
- {
- return isDigit(ch) ||
- (ch >= 'a' && ch <= 'f') ||
- (ch >= 'A' && ch <= 'F');
- }
-
- /**
- * Returns the object key for the given combination of object and
generation number. The object key from the cross
- * reference table/stream will be reused if available. Otherwise a newly
created object will be returned.
- *
- * @param num the given object number
- * @param gen the given generation number
- *
- * @return the COS object key
- */
- protected COSObjectKey getObjectKey(long num, int gen)
- {
- if (document == null || document.getXrefTable().isEmpty())
- {
- return new COSObjectKey(num, gen);
- }
- // use a cache to get the COSObjectKey as iterating over the
xref-table-map gets slow for big pdfs
- // in the long run we have to overhaul the object pool or even better
remove it
- Map<COSObjectKey, Long> xrefTable = document.getXrefTable();
- if (xrefTable.size() > keyCache.size())
- {
- for (COSObjectKey key : xrefTable.keySet())
- {
- keyCache.putIfAbsent(key.getInternalHash(), key);
- }
- }
- long internalHashCode = COSObjectKey.computeInternalHash(num, gen);
- COSObjectKey foundKey = keyCache.get(internalHashCode);
- return foundKey != null ? foundKey : new COSObjectKey(num, gen);
- }
-
- /**
- * This will parse a PDF dictionary value.
- *
- * @return The parsed Dictionary object.
- *
- * @throws IOException If there is an error parsing the dictionary object.
- */
- private COSBase parseCOSDictionaryValue() throws IOException
- {
- long numOffset = source.getPosition();
- COSBase value = parseDirObject();
- skipSpaces();
- // proceed if the given object is a number and the following is a
number as well
- if (!(value instanceof COSNumber) || !isDigit())
- {
- return value;
- }
- // read the remaining information of the object number
- long genOffset = source.getPosition();
- COSBase generationNumber = parseDirObject();
- skipSpaces();
- readExpectedChar('R');
- if (!(value instanceof COSInteger))
- {
- LOG.error("expected number, actual={} at offset {}", value,
numOffset);
- return COSNull.NULL;
- }
- if (!(generationNumber instanceof COSInteger))
- {
- LOG.error("expected number, actual={} at offset {}",
generationNumber, genOffset);
- return COSNull.NULL;
- }
- long objNumber = ((COSInteger) value).longValue();
- if (objNumber <= 0)
- {
- LOG.warn("invalid object number value ={} at offset {}",
objNumber, numOffset);
- return COSNull.NULL;
- }
- int genNumber = ((COSInteger) generationNumber).intValue();
- if (genNumber < 0)
- {
- LOG.error("invalid generation number value ={} at offset {}",
genNumber, numOffset);
- return COSNull.NULL;
- }
- // dereference the object
- return getObjectFromPool(getObjectKey(objNumber, genNumber));
- }
-
- private COSBase getObjectFromPool(COSObjectKey key) throws IOException
- {
- if (document == null)
- {
- throw new IOException("object reference " + key + " at offset " +
source.getPosition()
- + " in content stream");
- }
- return document.getObjectFromPool(key);
- }
-
- /**
- * This will parse a PDF dictionary.
- *
- * @param isDirect indicates whether the dictionary to be read is a direct
object
- * @return The parsed dictionary, never null.
- *
- * @throws IOException If there is an error reading the stream.
- */
- protected COSDictionary parseCOSDictionary(boolean isDirect) throws
IOException
- {
- try
- {
- recursionDepth++;
- if (recursionDepth > MAX_RECURSION_DEPTH)
- {
- throw new IOException(MAX_RECUSRION_MSG);
- }
- readExpectedChar('<');
- readExpectedChar('<');
- skipSpaces();
- COSDictionary obj = new COSDictionary();
- obj.setDirect(isDirect);
- while (true)
- {
- skipSpaces();
- char c = (char) source.peek();
- if (c == '>')
- {
- break;
- }
- else if (c == '/')
- {
- // something went wrong, most likely the dictionary is
corrupted
- // stop immediately and return everything read so far
- if (!parseCOSDictionaryNameValuePair(obj))
- {
- return obj;
- }
- }
- else
- {
- // invalid dictionary, we were expecting a /Name, read
until the end or until we can recover
- LOG.warn("Invalid dictionary, found: '{}' but expected:
'/' at offset {}", c,
- source.getPosition());
- if (readUntilEndOfCOSDictionary())
- {
- // we couldn't recover
- return obj;
- }
- }
- }
- try
- {
- readExpectedChar('>');
- readExpectedChar('>');
- }
- catch (IOException exception)
- {
- LOG.warn("Invalid dictionary, can't find end of dictionary at
offset {}",
- source.getPosition());
- }
- return obj;
- }
- finally
- {
- recursionDepth--;
- }
- }
-
- /**
- * Keep reading until the end of the dictionary object or the file has
been hit, or until a '/'
- * has been found.
- *
- * @return true if the end of the object or the file has been found, false
if not, i.e. that the
- * caller can continue to parse the dictionary at the current position.
- *
- * @throws IOException if there is a reading error.
- */
- private boolean readUntilEndOfCOSDictionary() throws IOException
- {
- int c = source.read();
- while (c != -1 && c != '/' && c != '>')
- {
- // in addition to stopping when we find / or >, we also want
- // to stop when we find endstream or endobj.
- if (c == E)
- {
- c = source.read();
- if (c == N)
- {
- c = source.read();
- if (c == D)
- {
- c = source.read();
- boolean isStream = c == S && source.read() == T &&
source.read() == R
- && source.read() == E && source.read() == A &&
source.read() == M;
- boolean isObj = !isStream && c == O && source.read()
== B
- && source.read() == J;
- if (isStream || isObj)
- {
- // we're done reading this object!
- return true;
- }
- }
- }
- }
- c = source.read();
- }
- if (c == -1)
- {
- return true;
- }
- source.rewind(1);
- return false;
- }
-
- private boolean parseCOSDictionaryNameValuePair(COSDictionary obj) throws
IOException
- {
- COSName key = parseCOSName();
- if (key == null || key.getName().isEmpty())
- {
- LOG.warn("Empty COSName at offset {}", source.getPosition());
- }
- COSBase value = parseCOSDictionaryValue();
- skipSpaces();
- if (value == null)
- {
- LOG.warn("Bad dictionary declaration at offset {}",
source.getPosition());
- return false;
- }
- else if (value instanceof COSInteger && !((COSInteger)
value).isValid())
- {
- LOG.warn("Skipped out of range number value at offset {}",
source.getPosition());
- }
- else
- {
- // label this item as direct, to avoid signature problems.
- value.setDirect(true);
- obj.setItem(key, value);
- }
- return true;
- }
-
/**
* Skip the upcoming CRLF or LF which are supposed to follow a stream.
Trailing spaces are removed as well.
*
@@ -536,27 +187,71 @@ public abstract class BaseParser
}
/**
- * This will parse a PDF string.
+ * Determine if a character terminates a PDF name.
*
- * @return The parsed PDF string.
+ * @param ch The character
+ * @return true if the character terminates a PDF name, otherwise false.
+ */
+ protected static boolean isEndOfName(int ch)
+ {
+ switch (ch)
+ {
+ case ASCII_SPACE:
+ case ASCII_CR:
+ case ASCII_LF:
+ case ASCII_TAB:
+ case '>':
+ case '<':
+ case '[':
+ case '/':
+ case ']':
+ case ')':
+ case '(':
+ case ASCII_NULL:
+ case '\f':
+ case '%':
+ case -1:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * This will read the next string from the stream.
+ *
+ * @return The string that was read from the stream, never null.
*
* @throws IOException If there is an error reading from the stream.
*/
- protected COSString parseCOSString() throws IOException
+ protected String readString() throws IOException
{
- char nextChar = (char) source.read();
- if (nextChar == '<')
+ skipSpaces();
+ StringBuilder buffer = new StringBuilder();
+ int c = source.read();
+ while (!isEndOfName(c))
{
- return parseCOSHexString();
+ buffer.append( (char)c );
+ c = source.read();
}
- else if (nextChar != '(')
+ if (c != -1)
{
- throw new IOException( "parseCOSString string should start with
'(' or '<' and not '" +
- nextChar + "' at offset " + source.getPosition());
+ source.rewind(1);
}
-
+ return buffer.toString();
+ }
+
+ /**
+ * This will read a PDF literal string, i.e. a string enclosed in parentheses.
+ *
+ * @return The bytes of the parsed literal string.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ protected byte[] readLiteralString() throws IOException
+ {
+ readExpectedChar('(');
ByteArrayOutputStream out = new ByteArrayOutputStream();
-
// This is the number of braces read
int braces = 1;
int c = source.read();
@@ -570,7 +265,7 @@ public abstract class BaseParser
braces--;
braces = checkForEndOfString(braces);
- if( braces != 0 )
+ if (braces != 0)
{
out.write(ch);
}
@@ -604,509 +299,98 @@ public abstract class BaseParser
case ')':
// PDFBox 276 /Title (c:\)
braces = checkForEndOfString(braces);
- if( braces != 0 )
- {
- out.write(next);
- }
- else
- {
- out.write('\\');
- }
- break;
- case '(':
- case '\\':
+ if (braces != 0)
+ {
out.write(next);
- break;
- case ASCII_LF:
- case ASCII_CR:
- //this is a break in the line so ignore it and the
newline and continue
+ }
+ else
+ {
+ out.write('\\');
+ }
+ break;
+ case '(':
+ case '\\':
+ out.write(next);
+ break;
+ case ASCII_LF:
+ case ASCII_CR:
+ // this is a break in the line so ignore it and the
newline and continue
+ c = source.read();
+ while (isEOL(c) && c != -1)
+ {
c = source.read();
- while( isEOL(c) && c != -1)
- {
- c = source.read();
- }
- nextc = c;
- break;
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- StringBuilder octal = new StringBuilder();
- octal.append( next );
+ }
+ nextc = c;
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ StringBuilder octal = new StringBuilder();
+ octal.append(next);
+ c = source.read();
+ char digit = (char) c;
+ if (digit >= '0' && digit <= '7')
+ {
+ octal.append(digit);
c = source.read();
- char digit = (char)c;
- if( digit >= '0' && digit <= '7' )
+ digit = (char) c;
+ if (digit >= '0' && digit <= '7')
{
- octal.append( digit );
- c = source.read();
- digit = (char)c;
- if( digit >= '0' && digit <= '7' )
- {
- octal.append( digit );
- }
- else
- {
- nextc = c;
- }
+ octal.append(digit);
}
else
{
nextc = c;
}
-
- int character = 0;
- try
- {
- character = Integer.parseInt( octal.toString(), 8
);
- }
- catch( NumberFormatException e )
- {
- throw new IOException( "Error: Expected octal
character, actual='" + octal + "'", e );
- }
- out.write(character);
- break;
- default:
- // dropping the backslash
- // see 7.3.4.2 Literal Strings for further information
- out.write(next);
- }
- }
- else
- {
- out.write(ch);
- }
- if (nextc != -2)
- {
- c = nextc;
- }
- else
- {
- c = source.read();
- }
- }
- if (c != -1)
- {
- source.rewind(1);
- }
- return new COSString(out.toByteArray());
- }
-
- /**
- * This will parse a PDF HEX string with fail fast semantic
- * meaning that we stop if a not allowed character is found.
- * This is necessary in order to detect malformed input and
- * be able to skip to next object start.
- *
- * We assume starting '<' was already read.
- *
- * @return The parsed PDF string.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- private COSString parseCOSHexString() throws IOException
- {
- final StringBuilder sBuf = new StringBuilder();
- while( true )
- {
- int c = source.read();
- if ( isHexDigit((char)c) )
- {
- sBuf.append( (char) c );
- }
- else if ( c == '>' )
- {
- break;
- }
- else if ( c < 0 )
- {
- throw new IOException( "Missing closing bracket for hex
string. Reached EOS." );
- }
- else if ( ( c == ' ' ) || ( c == '\n' ) ||
- ( c == '\t' ) || ( c == '\r' ) ||
- ( c == '\b' ) || ( c == '\f' ) )
- {
- continue;
- }
- else
- {
- // if invalid chars was found: discard last
- // hex character if it is not part of a pair
- if (sBuf.length()%2!=0)
- {
- sBuf.deleteCharAt(sBuf.length()-1);
- }
-
- // read till the closing bracket was found
- do
- {
- c = source.read();
- }
- while ( c != '>' && c >= 0 );
-
- // might have reached EOF while looking for the closing bracket
- // this can happen for malformed PDFs only. Make sure that
there is
- // no endless loop.
- if ( c < 0 )
- {
- throw new IOException( "Missing closing bracket for hex
string. Reached EOS." );
- }
-
- // exit loop
- break;
- }
- }
- return COSString.parseHex(sBuf.toString());
- }
-
- /**
- * This will parse a PDF array object.
- *
- * @return The parsed PDF array.
- *
- * @throws IOException If there is an error parsing the stream.
- */
- protected COSArray parseCOSArray() throws IOException
- {
- try
- {
- recursionDepth++;
- if (recursionDepth > MAX_RECURSION_DEPTH)
- {
- throw new IOException(MAX_RECUSRION_MSG);
- }
- long startPosition = source.getPosition();
- readExpectedChar('[');
- COSArray po = new COSArray();
- COSBase pbo;
- skipSpaces();
- int i;
- while (((i = source.peek()) > 0) && ((char) i != ']'))
- {
- pbo = parseDirObject();
- if (pbo instanceof COSObject)
- {
- // the current empty COSObject is replaced with the
correct one
- pbo = null;
- // We have to check if the expected values are there or
not PDFBOX-385
- if (po.size() > 1 && po.get(po.size() - 1) instanceof
COSInteger)
- {
- COSInteger genNumber = (COSInteger)
po.remove(po.size() - 1);
- if (po.size() > 0 && po.get(po.size() - 1) instanceof
COSInteger)
- {
- COSInteger number = (COSInteger)
po.remove(po.size() - 1);
- if (number.longValue() >= 0 &&
genNumber.intValue() >= 0)
- {
- COSObjectKey key =
getObjectKey(number.longValue(),
- genNumber.intValue());
- pbo = getObjectFromPool(key);
- }
- else
- {
- LOG.warn("Invalid value(s) for an object key
{} {}", number.longValue(),
- genNumber.intValue());
- }
- }
- }
- }
- // something went wrong
- if (pbo == null)
- {
- //it could be a bad object in the array which is just
skipped
- LOG.warn("Corrupt array element at offset {}, start
offset: {}",
- source.getPosition(), startPosition);
- String isThisTheEnd = readString();
- // return immediately if a corrupt element is followed by
another array
- // to avoid a possible infinite recursion as most likely
the whole array is corrupted
- if (isThisTheEnd.isEmpty() && source.peek() == '[')
- {
- return po;
}
-
source.rewind(isThisTheEnd.getBytes(StandardCharsets.ISO_8859_1).length);
- // This could also be an "endobj" or "endstream" which
means we can assume that
- // the array has ended.
- if (ENDOBJ_STRING.equals(isThisTheEnd) ||
ENDSTREAM_STRING.equals(isThisTheEnd))
+ else
{
- return po;
+ nextc = c;
}
- }
- else
- {
- po.add(pbo);
- }
- skipSpaces();
- }
- // read ']'
- source.read();
- skipSpaces();
- return po;
- }
- finally
- {
- recursionDepth--;
- }
- }
-
- /**
- * Determine if a character terminates a PDF name.
- *
- * @param ch The character
- * @return true if the character terminates a PDF name, otherwise false.
- */
- protected static boolean isEndOfName(int ch)
- {
- switch (ch)
- {
- case ASCII_SPACE:
- case ASCII_CR:
- case ASCII_LF:
- case ASCII_TAB:
- case '>':
- case '<':
- case '[':
- case '/':
- case ']':
- case ')':
- case '(':
- case ASCII_NULL:
- case '\f':
- case '%':
- case -1:
- return true;
- default:
- return false;
- }
- }
- /**
- * This will parse a PDF name from the stream.
- *
- * @return The parsed PDF name.
- * @throws IOException If there is an error reading from the stream.
- */
- protected COSName parseCOSName() throws IOException
- {
- readExpectedChar('/');
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- int c = source.read();
- while (!isEndOfName(c))
- {
- final int ch = c;
- if (ch == '#')
- {
- int ch1 = source.read();
- int ch2 = source.read();
- // Prior to PDF v1.2, the # was not a special character. Also,
- // it has been observed that various PDF tools do not follow
the
- // spec with respect to the # escape, even though they report
- // PDF versions of 1.2 or later. The solution here is that we
- // interpret the # as an escape only when it is followed by two
- // valid hex digits.
- if (isHexDigit((char)ch1) && isHexDigit((char)ch2))
- {
- String hex = Character.toString((char) ch1) + (char) ch2;
+ int character = 0;
try
{
- buffer.write(Integer.parseInt(hex, 16));
+ character = Integer.parseInt(octal.toString(), 8);
}
catch (NumberFormatException e)
{
- throw new IOException("Error: expected hex digit,
actual='" + hex + "'", e);
+ throw new IOException(
+ "Error: Expected octal character, actual='" +
octal + "'", e);
}
- c = source.read();
- }
- else
- {
- // check for premature EOF
- if (ch2 == -1 || ch1 == -1)
- {
- LOG.error("Premature EOF in BaseParser#parseCOSName");
- c = -1;
- break;
- }
- source.rewind(1);
- c = ch1;
- buffer.write(ch);
+ out.write(character);
+ break;
+ default:
+ // dropping the backslash
+ // see 7.3.4.2 Literal Strings for further information
+ out.write(next);
}
}
else
{
- buffer.write(ch);
- c = source.read();
+ out.write(ch);
}
- }
- if (c != -1)
- {
- source.rewind(1);
- }
-
- return COSName.getPDFName(decodeBuffer(buffer));
- }
-
- /**
- * Tries to decode the buffer content to an UTF-8 String. If that fails,
tries the alternative Encoding.
- *
- * @param buffer the {@link ByteArrayOutputStream} containing the bytes to
decode
- * @return the decoded String
- */
- private String decodeBuffer(ByteArrayOutputStream buffer)
- {
- try
- {
- return
utf8Decoder.decode(ByteBuffer.wrap(buffer.toByteArray())).toString();
- }
- catch (CharacterCodingException e)
- {
- // some malformed PDFs don't use UTF-8 see PDFBOX-3347
- LOG.debug(() -> "Buffer could not be decoded using
StandardCharsets.UTF_8 - trying " +
- ALTERNATIVE_CHARSET.name(), e);
- return buffer.toString(ALTERNATIVE_CHARSET);
- }
- }
-
- /**
- * This will parse a directory object from the stream.
- *
- * @return The parsed object.
- *
- * @throws IOException If there is an error during parsing.
- */
- protected COSBase parseDirObject() throws IOException
- {
- try
- {
- recursionDepth++;
- if (recursionDepth > MAX_RECURSION_DEPTH)
+ if (nextc != -2)
{
- throw new IOException(MAX_RECUSRION_MSG);
+ c = nextc;
}
- skipSpaces();
- char c = (char) source.peek();
- switch (c)
+ else
{
- case '<':
- // pull off first left bracket
- source.read();
- // check for second left bracket
- c = (char) source.peek();
- source.rewind(1);
- return c == '<' ? parseCOSDictionary(true) : parseCOSString();
- case '[':
- // array
- return parseCOSArray();
- case '(':
- return parseCOSString();
- case '/':
- // name
- return parseCOSName();
- case 'n':
- // null
- readExpectedString(NULL, false);
- return COSNull.NULL;
- case 't':
- readExpectedString(TRUE, false);
- return COSBoolean.TRUE;
- case 'f':
- readExpectedString(FALSE, false);
- return COSBoolean.FALSE;
- case 'R':
- source.read();
- return new COSObject(null);
- case (char) -1:
- return null;
- default:
- if (isDigit(c) || c == '-' || c == '+' || c == '.')
- {
- return parseCOSNumber();
- }
- // This is not suppose to happen, but we will allow for it
- // so we are more compatible with POS writers that don't
- // follow the spec
- long startOffset = source.getPosition();
- String badString = readString();
- if (badString.isEmpty())
- {
- int peek = source.peek();
- // we can end up in an infinite loop otherwise
- throw new IOException("Unknown dir object c='" + c + "'
cInt=" + (int) c + " peek='"
- + (char) peek + "' peekInt=" + peek + " at offset
" + source.getPosition()
- + " (start offset: " + startOffset + ")");
- }
-
- // if it's an endstream/endobj, we want to put it back so the
caller will see it
- if (ENDOBJ_STRING.equals(badString) ||
ENDSTREAM_STRING.equals(badString))
- {
-
source.rewind(badString.getBytes(StandardCharsets.ISO_8859_1).length);
- }
- else
- {
- LOG.warn("Skipped unexpected dir object = '{}' at offset
{} (start offset: {})",
- badString, source.getPosition(), startOffset);
- return this instanceof PDFStreamParser ? null :
COSNull.NULL;
- }
+ c = source.read();
}
- return null;
- }
- finally
- {
- recursionDepth--;
- }
- }
-
- private COSNumber parseCOSNumber() throws IOException
- {
- StringBuilder buf = new StringBuilder();
- int ic = source.read();
- char c = (char) ic;
- while (Character.isDigit(c) || c == '-' || c == '+' || c == '.' || c
== 'E' || c == 'e')
- {
- buf.append(c);
- ic = source.read();
- c = (char) ic;
- }
- if (ic != -1)
- {
- source.rewind(1);
- }
-
- // PDFBOX-5025: catch "74191endobj"
- char lastc = buf.charAt(buf.length() - 1);
- if (lastc == 'e' || lastc == 'E')
- {
- buf.deleteCharAt(buf.length() - 1);
- source.rewind(1);
- }
-
- return COSNumber.get(buf.toString());
- }
-
- /**
- * This will read the next string from the stream.
- *
- * @return The string that was read from the stream, never null.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected String readString() throws IOException
- {
- skipSpaces();
- StringBuilder buffer = new StringBuilder();
- int c = source.read();
- while (!isEndOfName(c))
- {
- buffer.append( (char)c );
- c = source.read();
}
if (c != -1)
{
source.rewind(1);
}
- return buffer.toString();
+ return out.toByteArray();
}
-
+
/**
* Reads given pattern from {@link #source}. Skipping whitespace at start
and end if wanted.
*
@@ -1147,43 +431,6 @@ public abstract class BaseParser
}
/**
- * This will read bytes until the first end of line marker occurs.
- * NOTE: The EOL marker may consists of 1 (CR or LF) or 2 (CR and CL) bytes
- * which is an important detail if one wants to unread the line.
- *
- * @return The characters between the current position and the end of the
line.
- *
- * @throws IOException If there is an error reading from the stream.
- */
- protected String readLine() throws IOException
- {
- if (source.isEOF())
- {
- throw new IOException( "Error: End-of-File, expected line at
offset " +
- source.getPosition());
- }
-
- StringBuilder buffer = new StringBuilder( 11 );
-
- int c;
- while ((c = source.read()) != -1)
- {
- // CR and LF are valid EOLs
- if (isEOL(c))
- {
- break;
- }
- buffer.append( (char)c );
- }
- // CR+LF is also a valid EOL
- if (isCR(c) && isLF(source.peek()))
- {
- source.read();
- }
- return buffer.toString();
- }
-
- /**
* This will tell if the end of the data is reached.
*
* @return true if the end of the data is reached.
@@ -1211,7 +458,7 @@ public abstract class BaseParser
* @param c The character to check against line feed
* @return true if the next byte is 0x0A.
*/
- private static boolean isLF(int c)
+ protected static boolean isLF(int c)
{
return ASCII_LF == c;
}
@@ -1222,7 +469,7 @@ public abstract class BaseParser
* @param c The character to check against carriage return
* @return true if the next byte is 0x0D.
*/
- private static boolean isCR(int c)
+ protected static boolean isCR(int c)
{
return ASCII_CR == c;
}
@@ -1339,41 +586,6 @@ public abstract class BaseParser
}
/**
- * This will read a long from the Stream and throw an {@link IOException}
if
- * the long value is negative or has more than 10 digits (i.e. : bigger
than
- * {@link #OBJECT_NUMBER_THRESHOLD})
- *
- * @return the object number being read.
- * @throws IOException if an I/O error occurs
- */
- protected long readObjectNumber() throws IOException
- {
- long retval = readLong();
- if (retval < 0 || retval >= OBJECT_NUMBER_THRESHOLD)
- {
- throw new IOException("Object Number '" + retval + "' has more
than 10 digits or is negative");
- }
- return retval;
- }
-
- /**
- * This will read a integer from the Stream and throw an {@link
IllegalArgumentException} if the integer value
- * has more than the maximum object revision (i.e. : bigger than {@link
#GENERATION_NUMBER_THRESHOLD})
- * @return the generation number being read.
- * @throws IOException if an I/O error occurs
- */
- protected int readGenerationNumber() throws IOException
- {
- int retval = readInt();
- if(retval < 0 || retval > GENERATION_NUMBER_THRESHOLD)
- {
- throw new IOException(
- "Generation Number '" + retval + "' has more than 5 digits
or is negative");
- }
- return retval;
- }
-
- /**
* This will read an integer from the stream.
*
* @return The integer that was read from the stream.
@@ -1400,7 +612,6 @@ public abstract class BaseParser
}
return retval;
}
-
/**
* This will read an long from the stream.
@@ -1455,4 +666,5 @@ public abstract class BaseParser
}
return buffer;
}
+
}
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
Fri Oct 31 04:20:03 2025 (r1929432)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
Fri Oct 31 07:16:27 2025 (r1929433)
@@ -16,8 +16,14 @@
*/
package org.apache.pdfbox.pdfparser;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.security.KeyStore;
@@ -31,14 +37,17 @@ import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSBoolean;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSObjectKey;
import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.cos.ICOSParser;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.RandomAccessRead;
@@ -61,20 +70,62 @@ import org.apache.pdfbox.pdmodel.encrypt
*/
public class COSParser extends BaseParser implements ICOSParser
{
+ private static final Logger LOG = LogManager.getLogger(COSParser.class);
+
private static final String PDF_HEADER = "%PDF-";
private static final String FDF_HEADER = "%FDF-";
private static final String PDF_DEFAULT_VERSION = "1.4";
private static final String FDF_DEFAULT_VERSION = "1.0";
+ private static final int E = 'e';
+ private static final int N = 'n';
+ private static final int D = 'd';
+
+ private static final int S = 's';
+ private static final int T = 't';
+ private static final int R = 'r';
+ private static final int A = 'a';
+ private static final int M = 'm';
+
+ private static final int O = 'o';
+ private static final int B = 'b';
+ private static final int J = 'j';
+
+ /**
+ * This is a string constant that will be used for comparisons.
+ */
+ private static final String ENDOBJ_STRING = "endobj";
+ /**
+ * This is a string constant that will be used for comparisons.
+ */
+ private static final String ENDSTREAM_STRING = "endstream";
+ /**
+ * This is a string constant that will be used for comparisons.
+ */
+ private static final String STREAM_STRING = "stream";
+
private static final char[] STARTXREF = {
's','t','a','r','t','x','r','e','f' };
private static final byte[] ENDSTREAM = { E, N, D, S, T, R, E, A, M };
private static final byte[] ENDOBJ = { E, N, D, O, B, J };
+ /**
+ * The keyword "true" as a character array, used for comparisons.
+ */
+ private static final char[] TRUE = { 't', 'r', 'u', 'e' };
+ /**
+ * The keyword "false" as a character array, used for comparisons.
+ */
+ private static final char[] FALSE = { 'f', 'a', 'l', 's', 'e' };
+ /**
+ * The keyword "null" as a character array, used for comparisons.
+ */
+ private static final char[] NULL = { 'n', 'u', 'l', 'l' };
+
+ private static final long OBJECT_NUMBER_THRESHOLD = 10000000000L;
+ private static final long GENERATION_NUMBER_THRESHOLD = 65535;
- protected static final long MINIMUM_SEARCH_OFFSET = 6;
-
private static final int STRMBUFLEN = 2048;
private final byte[] strmBuf = new byte[ STRMBUFLEN ];
@@ -84,6 +135,30 @@ public class COSParser extends BaseParse
private String password = "";
private String keyAlias = null;
+ private static final Charset ALTERNATIVE_CHARSET;
+
+ static
+ {
+ Charset cs;
+ String charsetName = "Windows-1252";
+ try
+ {
+ cs = Charset.forName(charsetName);
+ }
+ catch (IllegalArgumentException | UnsupportedOperationException e)
+ {
+ cs = StandardCharsets.ISO_8859_1;
+ LOG.warn(() -> "Charset is not supported: " + charsetName + ",
falling back to "
+ + StandardCharsets.ISO_8859_1.name(), e);
+ }
+ ALTERNATIVE_CHARSET = cs;
+ }
+
+ // CharSetDecoders are not threadsafe so not static
+ private final CharsetDecoder utf8Decoder =
StandardCharsets.UTF_8.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+
/**
* The range within the %%EOF marker will be searched.
* Useful if there are additional characters after %%EOF within the PDF.
@@ -114,6 +189,12 @@ public class COSParser extends BaseParse
*/
private boolean isLenient = true;
+ private static final int MAX_RECURSION_DEPTH = 500;
+ private static final String MAX_RECUSRION_MSG = //
+ "Reached maximum recursion depth " +
Integer.toString(MAX_RECURSION_DEPTH);
+
+ private int recursionDepth = 0;
+
protected boolean initialParseDone = false;
private boolean trailerWasRebuild = false;
@@ -121,7 +202,14 @@ public class COSParser extends BaseParse
private BruteForceParser bruteForceParser = null;
private PDEncryption encryption = null;
private final Map<COSObjectKey, Long> xrefTable = new HashMap<>();
-
+
+ private final Map<Long, COSObjectKey> keyCache = new HashMap<>();
+
+ /**
+ * This is the document that will be parsed.
+ */
+ protected COSDocument document;
+
/**
* Intermediate cache. Contains all objects of already read compressed
object streams. Objects are removed after
* dereferencing them.
@@ -138,8 +226,6 @@ public class COSParser extends BaseParse
*/
private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT;
- private static final Logger LOG = LogManager.getLogger(COSParser.class);
-
/**
* Default constructor.
*
@@ -697,18 +783,428 @@ public class COSParser extends BaseParse
}
/**
- * This will read a COSStream from the input stream using length attribute
within dictionary. If
- * length attribute is a indirect reference it is first resolved to get
the stream length. This
- * means we copy stream data without testing for 'endstream' or 'endobj'
and thus it is no
- * problem if these keywords occur within stream. We require 'endstream'
to be found after
- * stream data is read.
+ * This will parse a PDF array object.
+ *
+ * @return The parsed PDF array.
+ *
+ * @throws IOException If there is an error parsing the stream.
+ */
+ protected COSArray parseCOSArray() throws IOException
+ {
+ try
+ {
+ recursionDepth++;
+ if (recursionDepth > MAX_RECURSION_DEPTH)
+ {
+ throw new IOException(MAX_RECUSRION_MSG);
+ }
+ long startPosition = source.getPosition();
+ readExpectedChar('[');
+ COSArray po = new COSArray();
+ COSBase pbo;
+ skipSpaces();
+ int i;
+ while (((i = source.peek()) > 0) && ((char) i != ']'))
+ {
+ pbo = parseDirObject();
+ if (pbo instanceof COSObject)
+ {
+ // the current empty COSObject is replaced with the
correct one
+ pbo = null;
+ // We have to check if the expected values are there or
not PDFBOX-385
+ if (po.size() > 1 && po.get(po.size() - 1) instanceof
COSInteger)
+ {
+ COSInteger genNumber = (COSInteger)
po.remove(po.size() - 1);
+ if (po.size() > 0 && po.get(po.size() - 1) instanceof
COSInteger)
+ {
+ COSInteger number = (COSInteger)
po.remove(po.size() - 1);
+ if (number.longValue() >= 0 &&
genNumber.intValue() >= 0)
+ {
+ COSObjectKey key =
getObjectKey(number.longValue(),
+ genNumber.intValue());
+ pbo = getObjectFromPool(key);
+ }
+ else
+ {
+ LOG.warn("Invalid value(s) for an object key
{} {}",
+ number.longValue(),
genNumber.intValue());
+ }
+ }
+ }
+ }
+ // something went wrong
+ if (pbo == null)
+ {
+ // it could be a bad object in the array which is just
skipped
+ LOG.warn("Corrupt array element at offset {}, start
offset: {}",
+ source.getPosition(), startPosition);
+ String isThisTheEnd = readString();
+ // return immediately if a corrupt element is followed by
another array
+ // to avoid a possible infinite recursion as most likely
the whole array is corrupted
+ if (isThisTheEnd.isEmpty() && source.peek() == '[')
+ {
+ return po;
+ }
+
source.rewind(isThisTheEnd.getBytes(StandardCharsets.ISO_8859_1).length);
+ // This could also be an "endobj" or "endstream" which
means we can assume that
+ // the array has ended.
+ if (ENDOBJ_STRING.equals(isThisTheEnd) ||
ENDSTREAM_STRING.equals(isThisTheEnd))
+ {
+ return po;
+ }
+ }
+ else
+ {
+ po.add(pbo);
+ }
+ skipSpaces();
+ }
+ // read ']'
+ source.read();
+ skipSpaces();
+ return po;
+ }
+ finally
+ {
+ recursionDepth--;
+ }
+ }
+
+ /**
+ * This will parse a PDF dictionary.
+ *
+ * @param isDirect indicates whether the dictionary to be read is a direct
object
+ * @return The parsed dictionary, never null.
+ *
+ * @throws IOException If there is an error reading the stream.
+ */
+ protected COSDictionary parseCOSDictionary(boolean isDirect) throws
IOException
+ {
+ try
+ {
+ recursionDepth++;
+ if (recursionDepth > MAX_RECURSION_DEPTH)
+ {
+ throw new IOException(MAX_RECUSRION_MSG);
+ }
+ readExpectedChar('<');
+ readExpectedChar('<');
+ skipSpaces();
+ COSDictionary obj = new COSDictionary();
+ obj.setDirect(isDirect);
+ while (true)
+ {
+ skipSpaces();
+ char c = (char) source.peek();
+ if (c == '>')
+ {
+ break;
+ }
+ else if (c == '/')
+ {
+ // something went wrong, most likely the dictionary is
corrupted
+ // stop immediately and return everything read so far
+ if (!parseCOSDictionaryNameValuePair(obj))
+ {
+ return obj;
+ }
+ }
+ else
+ {
+ // invalid dictionary, we were expecting a /Name, read
until the end or until we can recover
+ LOG.warn("Invalid dictionary, found: '{}' but expected:
'/' at offset {}", c,
+ source.getPosition());
+ if (readUntilEndOfCOSDictionary())
+ {
+ // we couldn't recover
+ return obj;
+ }
+ }
+ }
+ try
+ {
+ readExpectedChar('>');
+ readExpectedChar('>');
+ }
+ catch (IOException exception)
+ {
+ LOG.warn("Invalid dictionary, can't find end of dictionary at
offset {}",
+ source.getPosition());
+ }
+ return obj;
+ }
+ finally
+ {
+ recursionDepth--;
+ }
+ }
+
+ private boolean parseCOSDictionaryNameValuePair(COSDictionary obj) throws
IOException
+ {
+ COSName key = parseCOSName();
+ if (key == null || key.getName().isEmpty())
+ {
+ LOG.warn("Empty COSName at offset {}", source.getPosition());
+ }
+ COSBase value = parseCOSDictionaryValue();
+ skipSpaces();
+ if (value == null)
+ {
+ LOG.warn("Bad dictionary declaration at offset {}",
source.getPosition());
+ return false;
+ }
+ else if (value instanceof COSInteger && !((COSInteger)
value).isValid())
+ {
+ LOG.warn("Skipped out of range number value at offset {}",
source.getPosition());
+ }
+ else
+ {
+ // label this item as direct, to avoid signature problems.
+ value.setDirect(true);
+ obj.setItem(key, value);
+ }
+ return true;
+ }
+
+ private COSNumber parseCOSNumber() throws IOException
+ {
+ StringBuilder buf = new StringBuilder();
+ int ic = source.read();
+ char c = (char) ic;
+ while (Character.isDigit(c) || c == '-' || c == '+' || c == '.' || c
== 'E' || c == 'e')
+ {
+ buf.append(c);
+ ic = source.read();
+ c = (char) ic;
+ }
+ if (ic != -1)
+ {
+ source.rewind(1);
+ }
+
+ // PDFBOX-5025: catch "74191endobj"
+ char lastc = buf.charAt(buf.length() - 1);
+ if (lastc == 'e' || lastc == 'E')
+ {
+ buf.deleteCharAt(buf.length() - 1);
+ source.rewind(1);
+ }
+
+ return COSNumber.get(buf.toString());
+ }
+
+    /**
+     * This will parse the value of a dictionary entry.
+     *
+     * One direct object is read first. If it is a number and the next character is a digit, the
+     * input is treated as an indirect reference ("objNumber genNumber R") and the referenced
+     * object is taken from the object pool.
+     *
+     * @return The parsed direct object, the referenced object, or {@link COSNull#NULL} if the
+     * parts of a reference are not valid integers.
+     *
+     * @throws IOException If there is an error parsing the dictionary object.
+     */
+    private COSBase parseCOSDictionaryValue() throws IOException
+    {
+        long numOffset = source.getPosition();
+        COSBase value = parseDirObject();
+        skipSpaces();
+        // proceed if the given object is a number and the following is a number as well
+        if (!(value instanceof COSNumber) || !isDigit())
+        {
+            return value;
+        }
+        // read the remaining information of the object number
+        long genOffset = source.getPosition();
+        COSBase generationNumber = parseDirObject();
+        skipSpaces();
+        readExpectedChar('R');
+        if (!(value instanceof COSInteger))
+        {
+            LOG.error("expected number, actual={} at offset {}", value, numOffset);
+            return COSNull.NULL;
+        }
+        if (!(generationNumber instanceof COSInteger))
+        {
+            LOG.error("expected number, actual={} at offset {}", generationNumber, genOffset);
+            return COSNull.NULL;
+        }
+        long objNumber = ((COSInteger) value).longValue();
+        if (objNumber <= 0)
+        {
+            LOG.warn("invalid object number value ={} at offset {}", objNumber, numOffset);
+            return COSNull.NULL;
+        }
+        int genNumber = ((COSInteger) generationNumber).intValue();
+        if (genNumber < 0)
+        {
+            // report the position of the generation number, not of the object number
+            LOG.error("invalid generation number value ={} at offset {}", genNumber, genOffset);
+            return COSNull.NULL;
+        }
+        // dereference the object
+        return getObjectFromPool(getObjectKey(objNumber, genNumber));
+    }
+
+ /**
+ * This will parse a direct object from the stream. The kind of object is chosen from the
+ * first character after any leading whitespace.
+ *
+ * @return The parsed object. An 'R' yields an empty COSObject placeholder. Null is returned
+ * at the end of the input and when "endobj" or "endstream" is found (the keyword is put
+ * back). Any other unexpected token yields null for a PDFStreamParser and COSNull.NULL
+ * otherwise.
+ *
+ * @throws IOException If there is an error during parsing, if the maximum recursion depth
+ * is exceeded or if an unknown token is found from which no string can be read.
+ */
+ protected COSBase parseDirObject() throws IOException
+ {
+ try
+ {
+ recursionDepth++;
+ if (recursionDepth > MAX_RECURSION_DEPTH)
+ {
+ throw new IOException(MAX_RECUSRION_MSG);
+ }
+ skipSpaces();
+ char c = (char) source.peek();
+ switch (c)
+ {
+ case '<':
+ // pull off first left bracket
+ source.read();
+ // check for second left bracket
+ c = (char) source.peek();
+ if (c == '<')
+ {
+ source.rewind(1);
+ return parseCOSDictionary(true);
+ }
+ else
+ {
+ return parseCOSHexString();
+ }
+ case '[':
+ // array
+ return parseCOSArray();
+ case '(':
+ return parseCOSLiteralString();
+ case '/':
+ // name
+ return parseCOSName();
+ case 'n':
+ // null
+ readExpectedString(NULL, false);
+ return COSNull.NULL;
+ case 't':
+ readExpectedString(TRUE, false);
+ return COSBoolean.TRUE;
+ case 'f':
+ readExpectedString(FALSE, false);
+ return COSBoolean.FALSE;
+ case 'R':
+ source.read();
+ return new COSObject(null);
+ case (char) -1:
+ return null;
+ default:
+ if (isDigit(c) || c == '-' || c == '+' || c == '.')
+ {
+ return parseCOSNumber();
+ }
+ // This is not supposed to happen, but we will allow for it
+ // so we are more compatible with PDF writers that don't
+ // follow the spec
+ long startOffset = source.getPosition();
+ String badString = readString();
+ if (badString.isEmpty())
+ {
+ int peek = source.peek();
+ // we can end up in an infinite loop otherwise
+ throw new IOException("Unknown dir object c='" + c + "'
cInt=" + (int) c
+ + " peek='" + (char) peek + "' peekInt=" + peek +
" at offset "
+ + source.getPosition() + " (start offset: " +
startOffset + ")");
+ }
+
+ // if it's an endstream/endobj, we want to put it back so the
caller will see it
+ if (ENDOBJ_STRING.equals(badString) ||
ENDSTREAM_STRING.equals(badString))
+ {
+
source.rewind(badString.getBytes(StandardCharsets.ISO_8859_1).length);
+ }
+ else
+ {
+ LOG.warn("Skipped unexpected dir object = '{}' at offset
{} (start offset: {})",
+ badString, source.getPosition(), startOffset);
+ return this instanceof PDFStreamParser ? null :
COSNull.NULL;
+ }
+ }
+ return null;
+ }
+ finally
+ {
+ recursionDepth--;
+ }
+ }
+
+ private COSBase getObjectFromPool(COSObjectKey key) throws IOException
+ {
+ if (document == null)
+ {
+ throw new IOException("object reference " + key + " at offset " +
source.getPosition()
+ + " in content stream");
+ }
+ return document.getObjectFromPool(key);
+ }
+
+ /**
+ * Keep reading until the file end, "endstream" or "endobj" has been hit, or until a '/' or
+ * a '>' has been found. A '/' or '>' that stops the loop is put back.
+ *
+ * NOTE(review): a character that is read while probing for "endstream"/"endobj" and that
+ * does not match is not compared with '/' or '>' - it is replaced by the next read at the
+ * end of the loop body. Please confirm that skipping a '/' or '>' which directly follows
+ * "e", "en" or "end" is intended.
+ *
+ * @return true if the end of the object or the file has been found, false if not, i.e. that
+ * the caller can continue to parse the dictionary at the current position.
+ *
+ * @throws IOException if there is a reading error.
+ */
+ private boolean readUntilEndOfCOSDictionary() throws IOException
+ {
+ int c = source.read();
+ while (c != -1 && c != '/' && c != '>')
+ {
+ // in addition to stopping when we find / or >, we also want
+ // to stop when we find endstream or endobj.
+ if (c == E)
+ {
+ c = source.read();
+ if (c == N)
+ {
+ c = source.read();
+ if (c == D)
+ {
+ c = source.read();
+ boolean isStream = c == S && source.read() == T &&
source.read() == R
+ && source.read() == E && source.read() == A &&
source.read() == M;
+ boolean isObj = !isStream && c == O && source.read()
== B
+ && source.read() == J;
+ if (isStream || isObj)
+ {
+ // we're done reading this object!
+ return true;
+ }
+ }
+ }
+ }
+ c = source.read();
+ }
+ if (c == -1)
+ {
+ return true;
+ }
+ source.rewind(1);
+ return false;
+ }
+
+ /**
+ * This will read a COSStream from the input stream using length attribute
within dictionary. If length attribute is
+ * a indirect reference it is first resolved to get the stream length.
This means we copy stream data without
+ * testing for 'endstream' or 'endobj' and thus it is no problem if these
keywords occur within stream. We require
+ * 'endstream' to be found after stream data is read.
*
* @param dic dictionary that goes with this stream.
*
* @return parsed pdf stream.
*
- * @throws IOException if an error occurred reading the stream, like
problems with reading
- * length attribute, stream does not end with 'endstream' after data read,
stream too short etc.
+ * @throws IOException if an error occurred reading the stream, like
problems with reading length attribute, stream
+ * does not end with 'endstream' after data read, stream too short etc.
*/
protected COSStream parseCOSStream(COSDictionary dic) throws IOException
{
@@ -1050,6 +1546,78 @@ public class COSParser extends BaseParse
}
/**
+ * This will read a long from the Stream and throw an {@link IOException}
if the long value is negative or has more
+ * than 10 digits (i.e. : bigger than {@link #OBJECT_NUMBER_THRESHOLD})
+ *
+ * @return the object number being read.
+ * @throws IOException if an I/O error occurs
+ */
+ protected long readObjectNumber() throws IOException
+ {
+ long retval = readLong();
+ if (retval < 0 || retval >= OBJECT_NUMBER_THRESHOLD)
+ {
+ throw new IOException(
+ "Object Number '" + retval + "' has more than 10 digits or
is negative");
+ }
+ return retval;
+ }
+
+ /**
+ * This will read a integer from the Stream and throw an {@link
IllegalArgumentException} if the integer value has
+ * more than the maximum object revision (i.e. : bigger than {@link
#GENERATION_NUMBER_THRESHOLD})
+ *
+ * @return the generation number being read.
+ * @throws IOException if an I/O error occurs
+ */
+ protected int readGenerationNumber() throws IOException
+ {
+ int retval = readInt();
+ if (retval < 0 || retval > GENERATION_NUMBER_THRESHOLD)
+ {
+ throw new IOException(
+ "Generation Number '" + retval + "' has more than 5 digits
or is negative");
+ }
+ return retval;
+ }
+
+ /**
+ * This will read bytes until the first end of line marker occurs. NOTE:
The EOL marker may consists of 1 (CR or LF)
+ * or 2 (CR and CL) bytes which is an important detail if one wants to
unread the line.
+ *
+ * @return The characters between the current position and the end of the
line.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ protected String readLine() throws IOException
+ {
+ if (source.isEOF())
+ {
+ throw new IOException(
+ "Error: End-of-File, expected line at offset " +
source.getPosition());
+ }
+
+ StringBuilder buffer = new StringBuilder(11);
+
+ int c;
+ while ((c = source.read()) != -1)
+ {
+ // CR and LF are valid EOLs
+ if (isEOL(c))
+ {
+ break;
+ }
+ buffer.append((char) c);
+ }
+ // CR+LF is also a valid EOL
+ if (isCR(c) && isLF(source.peek()))
+ {
+ source.read();
+ }
+ return buffer.toString();
+ }
+
+ /**
* Parse the header of a pdf.
*
* @return true if a PDF header was found
@@ -1253,4 +1821,202 @@ public class COSParser extends BaseParse
{
return securityHandler;
}
+
+    /**
+     * This will parse a PDF name from the stream.
+     *
+     * A '#' followed by two hex digits is decoded to the byte it stands for; any other '#' is kept
+     * literally. The collected bytes are then decoded to a string.
+     *
+     * @return The parsed PDF name.
+     * @throws IOException If there is an error reading from the stream.
+     */
+    protected COSName parseCOSName() throws IOException
+    {
+        readExpectedChar('/');
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        int c = source.read();
+        while (!isEndOfName(c))
+        {
+            if (c != '#')
+            {
+                buffer.write(c);
+                c = source.read();
+                continue;
+            }
+            int ch1 = source.read();
+            int ch2 = source.read();
+            // Prior to PDF v1.2, the # was not a special character. Also,
+            // it has been observed that various PDF tools do not follow the
+            // spec with respect to the # escape, even though they report
+            // PDF versions of 1.2 or later. The solution here is that we
+            // interpret the # as an escape only when it is followed by two
+            // valid hex digits.
+            if (isHexDigit((char) ch1) && isHexDigit((char) ch2))
+            {
+                String hex = Character.toString((char) ch1) + (char) ch2;
+                try
+                {
+                    buffer.write(Integer.parseInt(hex, 16));
+                }
+                catch (NumberFormatException e)
+                {
+                    throw new IOException("Error: expected hex digit, actual='" + hex + "'", e);
+                }
+                c = source.read();
+            }
+            else
+            {
+                // check for premature EOF
+                if (ch2 == -1 || ch1 == -1)
+                {
+                    // this method lives in COSParser now, name the right class in the log
+                    LOG.error("Premature EOF in COSParser#parseCOSName");
+                    c = -1;
+                    break;
+                }
+                // keep the '#' literally and continue with the character following it
+                source.rewind(1);
+                c = ch1;
+                buffer.write('#');
+            }
+        }
+        // put back the character which ended the name
+        if (c != -1)
+        {
+            source.rewind(1);
+        }
+
+        return COSName.getPDFName(decodeBuffer(buffer));
+    }
+
+ private static boolean isHexDigit(char ch)
+ {
+ return isDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <=
'F');
+ }
+
+ /**
+ * This will parse a PDF literal string. The result of readLiteralString() is wrapped in a
+ * new COSString; parseDirObject() calls this method when the next character is '('.
+ *
+ * @return The parsed PDF string.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ protected COSString parseCOSLiteralString() throws IOException
+ {
+ return new COSString(readLiteralString());
+ }
+
+    /**
+     * This will parse a PDF HEX string with fail fast semantic meaning that we stop collecting
+     * digits if a not allowed character is found. This is necessary in order to detect malformed
+     * input and be able to skip to next object start.
+     *
+     * We assume starting '&lt;' was already read.
+     *
+     * @return The parsed PDF string.
+     *
+     * @throws IOException If there is an error reading from the stream or the closing bracket is
+     * missing.
+     */
+    protected COSString parseCOSHexString() throws IOException
+    {
+        final StringBuilder hexDigits = new StringBuilder();
+        for (int c = source.read(); c != '>'; c = source.read())
+        {
+            if (isHexDigit((char) c))
+            {
+                hexDigits.append((char) c);
+                continue;
+            }
+            if (c < 0)
+            {
+                throw new IOException("Missing closing bracket for hex string. Reached EOS.");
+            }
+            switch (c)
+            {
+            case ' ':
+            case '\n':
+            case '\t':
+            case '\r':
+            case '\b':
+            case '\f':
+                // these characters are ignored between the digits
+                continue;
+            default:
+                break;
+            }
+            // an invalid char was found: discard the last
+            // hex character if it is not part of a pair
+            if (hexDigits.length() % 2 != 0)
+            {
+                hexDigits.deleteCharAt(hexDigits.length() - 1);
+            }
+
+            // read till the closing bracket was found
+            do
+            {
+                c = source.read();
+            } while (c != '>' && c >= 0);
+
+            // might have reached EOF while looking for the closing bracket
+            // this can happen for malformed PDFs only. Make sure that there is
+            // no endless loop.
+            if (c < 0)
+            {
+                throw new IOException("Missing closing bracket for hex string. Reached EOS.");
+            }
+            break;
+        }
+        return COSString.parseHex(hexDigits.toString());
+    }
+
+ /**
+ * Tries to decode the buffer content to an UTF-8 String. If that fails,
tries the alternative Encoding.
+ *
+ * @param buffer the {@link ByteArrayOutputStream} containing the bytes to
decode
+ * @return the decoded String
+ */
+ private String decodeBuffer(ByteArrayOutputStream buffer)
+ {
+ try
+ {
+ return
utf8Decoder.decode(ByteBuffer.wrap(buffer.toByteArray())).toString();
+ }
+ catch (CharacterCodingException e)
+ {
+ // some malformed PDFs don't use UTF-8 see PDFBOX-3347
+ LOG.debug(() -> "Buffer could not be decoded using
StandardCharsets.UTF_8 - trying "
+ + ALTERNATIVE_CHARSET.name(), e);
+ return buffer.toString(ALTERNATIVE_CHARSET);
+ }
+ }
+
+ /**
+ * Returns the object key for the given combination of object and
generation number. The object key from the cross
+ * reference table/stream will be reused if available. Otherwise a newly
created object will be returned.
+ *
+ * @param num the given object number
+ * @param gen the given generation number
+ *
+ * @return the COS object key
+ */
+ protected COSObjectKey getObjectKey(long num, int gen)
+ {
+ // return new COSObjectKey(num, gen);
+ if (document == null || document.getXrefTable().isEmpty())
+ {
+ return new COSObjectKey(num, gen);
+ }
+ // use a cache to get the COSObjectKey as iterating over the
xref-table-map gets slow for big pdfs
+ // in the long run we have to overhaul the object pool or even better
remove it
+ Map<COSObjectKey, Long> xrefTable = document.getXrefTable();
+ if (xrefTable.size() > keyCache.size())
+ {
+ for (COSObjectKey key : xrefTable.keySet())
+ {
+ keyCache.putIfAbsent(key.getInternalHash(), key);
+ }
+ }
+ long internalHashCode = COSObjectKey.computeInternalHash(num, gen);
+ COSObjectKey foundKey = keyCache.get(internalHashCode);
+ return foundKey != null ? foundKey : new COSObjectKey(num, gen);
+ }
+
}
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
Fri Oct 31 04:20:03 2025 (r1929432)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
Fri Oct 31 07:16:27 2025 (r1929433)
@@ -34,7 +34,7 @@ import org.apache.pdfbox.cos.COSStream;
* @author Ben Litchfield
*
*/
-public class PDFObjectStreamParser extends BaseParser
+public class PDFObjectStreamParser extends COSParser
{
private final int numberOfObjects;
private final int firstObject;
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
Fri Oct 31 04:20:03 2025 (r1929432)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
Fri Oct 31 07:16:27 2025 (r1929433)
@@ -39,7 +39,7 @@ import org.apache.pdfbox.io.RandomAccess
*
* @author Ben Litchfield
*/
-public class PDFStreamParser extends BaseParser
+public class PDFStreamParser extends COSParser
{
/**
* Log instance.
@@ -66,8 +66,9 @@ public class PDFStreamParser extends Bas
* Constructor.
*
* @param bytes the bytes to parse.
+ * @throws IOException If there is an error initializing the stream.
*/
- public PDFStreamParser(byte[] bytes)
+ public PDFStreamParser(byte[] bytes) throws IOException
{
super(new RandomAccessReadBuffer(bytes));
}
@@ -114,15 +115,13 @@ public class PDFStreamParser extends Bas
case '<':
// pull off first left bracket
source.read();
-
// check for second left bracket
c = (char) source.peek();
- // put back first bracket
- source.rewind(1);
-
if (c == '<')
{
+ // put back first bracket
+ source.rewind(1);
try
{
return parseCOSDictionary(true);
@@ -137,7 +136,7 @@ public class PDFStreamParser extends Bas
}
else
{
- return parseCOSString();
+ return parseCOSHexString();
}
case '[':
// array
@@ -154,7 +153,7 @@ public class PDFStreamParser extends Bas
}
case '(':
// string
- return parseCOSString();
+ return parseCOSLiteralString();
case '/':
// name
return parseCOSName();
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java
Fri Oct 31 04:20:03 2025 (r1929432)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java
Fri Oct 31 07:16:27 2025 (r1929433)
@@ -46,6 +46,8 @@ public class XrefParser
private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' };
private static final char[] STARTXREF = { 's', 't', 'a', 'r', 't', 'x',
'r', 'e', 'f' };
+ private static final long MINIMUM_SEARCH_OFFSET = 6;
+
/**
* Collects all Xref/trailer objects and resolves them into single
* object using startxref reference.
@@ -476,7 +478,7 @@ public class XrefParser
Map<COSObjectKey, Long> xrefOffset) throws IOException
{
// there can't be any object at the very beginning of a pdf
- if (offset < COSParser.MINIMUM_SEARCH_OFFSET)
+ if (offset < MINIMUM_SEARCH_OFFSET)
{
return null;
}
Modified:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestBaseParser.java
==============================================================================
---
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestBaseParser.java
Fri Oct 31 04:20:03 2025 (r1929432)
+++
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestBaseParser.java
Fri Oct 31 07:16:27 2025 (r1929433)
@@ -24,72 +24,11 @@ import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.junit.jupiter.api.Test;
class TestBaseParser
{
- @Test
- void testCheckForEndOfString() throws IOException
- {
- // (Test)
- byte[] inputBytes = { 40, 84, 101, 115, 116, 41 };
-
- RandomAccessReadBuffer buffer = new RandomAccessReadBuffer(inputBytes);
- BaseParser baseParser = new COSParser(buffer);
- COSString cosString = baseParser.parseCOSString();
- assertEquals("Test", cosString.getString());
-
- String output = "(Test";
- // ((Test) + LF + "/ "
- inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 10, '/',
' ' };
-
- buffer = new RandomAccessReadBuffer(inputBytes);
- baseParser = new COSParser(buffer);
- cosString = baseParser.parseCOSString();
- assertEquals(output, cosString.getString());
-
- // ((Test) + CR + "/ "
- inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 13, '/',
' ' };
-
- buffer = new RandomAccessReadBuffer(inputBytes);
- baseParser = new COSParser(buffer);
- cosString = baseParser.parseCOSString();
- assertEquals(output, cosString.getString());
-
- // ((Test) + CR + LF + "/ "
- inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 13, 10,
'/' };
-
- buffer = new RandomAccessReadBuffer(inputBytes);
- baseParser = new COSParser(buffer);
- cosString = baseParser.parseCOSString();
- assertEquals(output, cosString.getString());
-
- // ((Test) + LF + "> "
- inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 10, '>',
' ' };
-
- buffer = new RandomAccessReadBuffer(inputBytes);
- baseParser = new COSParser(buffer);
- cosString = baseParser.parseCOSString();
- assertEquals(output, cosString.getString());
-
- // ((Test) + CR + "> "
- inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 13, '>',
' ' };
-
- buffer = new RandomAccessReadBuffer(inputBytes);
- baseParser = new COSParser(buffer);
- cosString = baseParser.parseCOSString();
- assertEquals(output, cosString.getString());
-
- // ((Test) + CR + LF + "> "
- inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 13, 10,
'>' };
-
- buffer = new RandomAccessReadBuffer(inputBytes);
- baseParser = new COSParser(buffer);
- cosString = baseParser.parseCOSString();
- assertEquals(output, cosString.getString());
- }
@Test
void testBaseParserStackOverflow()
Added:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestCOSParser.java
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestCOSParser.java
Fri Oct 31 07:16:27 2025 (r1929433)
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pdfbox.pdfparser;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
+import org.junit.jupiter.api.Test;
+
+class TestCOSParser
+{
+ @Test
+ void testCheckForEndOfString() throws IOException
+ {
+ // (Test)
+ byte[] inputBytes = { 40, 84, 101, 115, 116, 41 };
+
+ RandomAccessReadBuffer buffer = new RandomAccessReadBuffer(inputBytes);
+ COSParser cosParser = new COSParser(buffer);
+ COSString cosString = cosParser.parseCOSLiteralString();
+ assertEquals("Test", cosString.getString());
+
+ String output = "(Test";
+ // ((Test) + LF + "/ "
+ inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 10, '/', ' ' };
+
+ buffer = new RandomAccessReadBuffer(inputBytes);
+ cosParser = new COSParser(buffer);
+ cosString = cosParser.parseCOSLiteralString();
+ assertEquals(output, cosString.getString());
+
+ // ((Test) + CR + "/ "
+ inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 13, '/', ' ' };
+
+ buffer = new RandomAccessReadBuffer(inputBytes);
+ cosParser = new COSParser(buffer);
+ cosString = cosParser.parseCOSLiteralString();
+ assertEquals(output, cosString.getString());
+
+ // ((Test) + CR + LF + "/ "
+ inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 13, 10, '/' };
+
+ buffer = new RandomAccessReadBuffer(inputBytes);
+ cosParser = new COSParser(buffer);
+ cosString = cosParser.parseCOSLiteralString();
+ assertEquals(output, cosString.getString());
+
+ // ((Test) + LF + "> "
+ inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 10, '>', ' ' };
+
+ buffer = new RandomAccessReadBuffer(inputBytes);
+ cosParser = new COSParser(buffer);
+ cosString = cosParser.parseCOSLiteralString();
+ assertEquals(output, cosString.getString());
+
+ // ((Test) + CR + "> "
+ inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 13, '>', ' ' };
+
+ buffer = new RandomAccessReadBuffer(inputBytes);
+ cosParser = new COSParser(buffer);
+ cosString = cosParser.parseCOSLiteralString();
+ assertEquals(output, cosString.getString());
+
+ // ((Test) + CR + LF + "> "
+ inputBytes = new byte[] { '(', '(', 'T', 'e', 's', 't', ')', 13, 10, '>' };
+
+ buffer = new RandomAccessReadBuffer(inputBytes);
+ cosParser = new COSParser(buffer);
+ cosString = cosParser.parseCOSLiteralString();
+ assertEquals(output, cosString.getString());
+ }
+
+}