Author: lehmi
Date: Sun Jun 1 11:50:43 2025
New Revision: 1926036

URL: http://svn.apache.org/viewvc?rev=1926036&view=rev
Log:
PDFBOX-5992: skip either a line break (CR, LF or CRLF) or any one-byte whitespace at the beginning of an inline image
Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1926036&r1=1926035&r2=1926036&view=diff ============================================================================== --- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original) +++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Jun 1 11:50:43 2025 @@ -356,6 +356,11 @@ public abstract class BaseParser return true; } + /** + * Skip the upcoming CRLF or LF which are supposed to follow a stream. Trailing spaces are removed as well. + * + * @throws IOException if something went wrong + */ protected void skipWhiteSpaces() throws IOException { //PDF Ref 3.2.7 A stream must be followed by either @@ -370,24 +375,56 @@ public abstract class BaseParser { whitespace = seqSource.read(); } + if (!skipLinebreak(whitespace)) + { + seqSource.unread(whitespace); + } + } - if (ASCII_CR == whitespace) + /** + * Skip one line break, such as CR, LF or CRLF. + * + * @return true if a line break was found and removed. + * + * @throws IOException if something went wrong + */ + protected boolean skipLinebreak() throws IOException + { + int whitespace = seqSource.read(); + // a line break is a CR, or LF or CRLF + if (!skipLinebreak(whitespace)) { - whitespace = seqSource.read(); - if (ASCII_LF != whitespace) + seqSource.unread(whitespace); + return false; + } + return true; + } + + /** + * Skip one line break, such as CR, LF or CRLF. + * + * @param linebreak the first character to be checked. + * + * @return true if a line break was found and removed. 
+ * + * @throws IOException if something went wrong + */ + private boolean skipLinebreak(int linebreak) throws IOException + { + // a line break is a CR, or LF or CRLF + if (isCR(linebreak)) + { + int next = seqSource.read(); + if (!isLF(next)) { - seqSource.unread(whitespace); - //The spec says this is invalid but it happens in the real - //world so we must support it. + seqSource.unread(next); } } - else if (ASCII_LF != whitespace) + else if (!isLF(linebreak)) { - //we are in an error. - //but again we will do a lenient parsing and just assume that everything - //is fine - seqSource.unread(whitespace); + return false; } + return true; } /** Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1926036&r1=1926035&r2=1926036&view=diff ============================================================================== --- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original) +++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Sun Jun 1 11:50:43 2025 @@ -313,9 +313,10 @@ public class PDFStreamParser extends Bas "' at stream offset " + currentPosition); } ByteArrayOutputStream imageData = new ByteArrayOutputStream(); - if( isWhitespace() ) + // skip one line break (CR, LF or CRLF) or any one-byte whitespace + if (!skipLinebreak() && isWhitespace()) { - //pull off the whitespace character + // pull off the whitespace character seqSource.read(); } int lastByte = seqSource.read();