Author: lehmi
Date: Wed Sep 27 17:37:31 2017
New Revision: 1809891

URL: http://svn.apache.org/viewvc?rev=1809891&view=rev
Log:
PDFBOX-3934: include compressed objects in brute force search when rebuilding 
the trailer

Modified:
    pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1809891&r1=1809890&r2=1809891&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Wed Sep 27 17:37:31 2017
@@ -38,6 +38,7 @@ import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSInputStream;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSNull;
 import org.apache.pdfbox.cos.COSNumber;
@@ -111,6 +112,11 @@ public class COSParser extends BaseParse
      */
     protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
 
+    /**
+     * ObjStream-marker: the character sequence "/ObjStm" that bfSearchForObjStreams() scans the
+     * source for in order to locate object streams.
+     */
+    private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 
'S', 't', 'm' };
+
     private long trailerOffset;
     
     /**
@@ -1510,6 +1516,7 @@ public class COSParser extends BaseParse
                 bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, 
lastGenID),
                         lastObjOffset);
             }
+            bfSearchForObjStreams();
             // reestablish origin position
             source.seek(originOffset);
         }
@@ -1644,6 +1651,119 @@ public class COSParser extends BaseParse
     }
 
     /**
+     * Brute force search for all object streams.
+     * 
+     * @throws IOException if something went wrong
+     */
+    private void bfSearchForObjStreams() throws IOException
+    {
+        HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new 
HashMap<Long, COSObjectKey>();
+        long originOffset = source.getPosition();
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        char[] string = " obj".toCharArray();
+        while (!source.isEOF())
+        {
+            // search for EOF marker
+            if (isString(OBJ_STREAM))
+            {
+                long currentPosition = source.getPosition();
+                // search backwards for the beginning of the object
+                long newOffset = -1;
+                COSObjectKey streamObjectKey = null;
+                boolean objFound = false;
+                for (int i = 1; i < 40 && !objFound; i++)
+                {
+                    long currentOffset = currentPosition - (i * 10);
+                    if (currentOffset > 0)
+                    {
+                        source.seek(currentOffset);
+                        for (int j = 0; j < 10; j++)
+                        {
+                            if (isString(string))
+                            {
+                                long tempOffset = currentOffset - 1;
+                                source.seek(tempOffset);
+                                int genID = source.peek();
+                                // is the next char a digit?
+                                if (isDigit(genID))
+                                {
+                                    tempOffset--;
+                                    source.seek(tempOffset);
+                                    if (isSpace())
+                                    {
+                                        int length = 0;
+                                        source.seek(--tempOffset);
+                                        while (tempOffset > 
MINIMUM_SEARCH_OFFSET && isDigit())
+                                        {
+                                            source.seek(--tempOffset);
+                                            length++;
+                                        }
+                                        if (length > 0)
+                                        {
+                                            source.read();
+                                            newOffset = source.getPosition();
+                                            long objNumber = 
readObjectNumber();
+                                            int genNumber = 
readGenerationNumber();
+                                            streamObjectKey = new 
COSObjectKey(objNumber,
+                                                    genNumber);
+                                            
bfSearchObjStreamsOffsets.put(newOffset,
+                                                    streamObjectKey);
+                                        }
+                                    }
+                                }
+                                LOG.debug("Dictionary start for object stream 
-> " + newOffset);
+                                objFound = true;
+                                break;
+                            }
+                            else
+                            {
+                                currentOffset++;
+                                source.read();
+                            }
+                        }
+                    }
+                }
+                source.seek(currentPosition + OBJ_STREAM.length);
+            }
+            source.read();
+        }
+        // add all found compressed objects to the brute force search result
+        for (Long offset : bfSearchObjStreamsOffsets.keySet())
+        {
+            long bfOffset = 
bfSearchCOSObjectKeyOffsets.get(bfSearchObjStreamsOffsets.get(offset));
+            // check if the object was overwritten
+            if (offset == bfOffset)
+            {
+                source.seek(offset);
+                long stmObjNumber = readObjectNumber();
+                readGenerationNumber();
+                readExpectedString(OBJ_MARKER, true);
+                COSDictionary dict = parseCOSDictionary();
+                int offsetFirstStream = dict.getInt(COSName.FIRST);
+                int nrOfObjects = dict.getInt(COSName.N);
+                COSStream stream = parseCOSStream(dict);
+                COSInputStream is = stream.createInputStream();
+                byte[] numbersStr = new byte[offsetFirstStream];
+                is.read(numbersStr);
+                is.close();
+                stream.close();
+                String[] numbers = new String(numbersStr, 
"ISO-8859-1").split(" ");
+                for (int i = 0; i < nrOfObjects; i++)
+                {
+                    long objNumber = Long.parseLong(numbers[i * 2]);
+                    COSObjectKey objKey = new COSObjectKey(objNumber, 0);
+                    Long existingOffset = 
bfSearchCOSObjectKeyOffsets.get(objKey);
+                    if (existingOffset == null || offset > existingOffset)
+                    {
+                        bfSearchCOSObjectKeyOffsets.put(objKey, -stmObjNumber);
+                    }
+                }
+            }
+        }
+        source.seek(originOffset);
+    }
+
+    /**
      * Brute force search for all xref entries (tables).
      * 
      * @throws IOException if something went wrong
@@ -1787,6 +1907,11 @@ public class COSParser extends BaseParse
             for (Entry<COSObjectKey, Long> entry : 
bfSearchCOSObjectKeyOffsets.entrySet())
             {
                 Long offset = entry.getValue();
+                // skip compressed objects
+                if (offset < 0)
+                {
+                    continue;
+                }
                 source.seek(offset);
                 readObjectNumber();
                 readGenerationNumber();


Reply via email to