Hi all,

I have attached to this e-mail the revised (and now longer) patch, a
test PDF document, the source code (using PoDoFo) to generate it, and
the source code of a kind-of "unit test" (without using libcppunit, I
don't have it installed). My rationale for the patch is as follows:

- I don't like unreachable code and I hope you don't mind it removed.
- I think that the code removed was very difficult to correct, if at
  all possible
- The comment I removed doesn't apply anymore (IMO), because my changes
  make the code now cover all the possible cases (IIUC) for pagefinding
- I changed some checks to make them catch previously-missed cases
- I added extensive PDF-document brokenness logging to tree traversal

So I hope that the patch now makes the method so robust that you'll
accept it, please review it, and if accepted, please apply (separately)
to the public repository.

Best regards, mabri

----- Original Message -----
From: zyx <z...@litepdf.cz>
To: podofo-users@lists.sourceforge.net
CC: 
Sent: 21:03, Tuesday, 1 September 2015
Subject: Re: [Podofo-users] unreachable-code and robustness fixes in 
PdfPagesTree::GetPageNode()

On Wed, 2015-08-26 at 15:59 +0000, Matthew Brincke wrote:
> What do you think?

    Hi,
I do not know the exact rationale for the code being changed by your
patch. I think the best option is to have a test .pdf file(s) for it
and kind of a unit test, proving the change being right. The mix of
numKids and numDirectKids also makes things more complicated.

> +                          ); // TODO: perhaps log reference too?

I'd prefer to not add any new TODO, just decide and do it (or don't do
it).
    Bye,
    zyx

-- 
http://www.litePDF.cz                                 i...@litepdf.cz




------------------------------------------------------------------------------
_______________________________________________
Podofo-users mailing list
Podofo-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/podofo-users
Index: src/doc/PdfPagesTree.cpp
===================================================================
--- src/doc/PdfPagesTree.cpp	(revision 1676)
+++ src/doc/PdfPagesTree.cpp	(working copy)
@@ -327,9 +327,11 @@
     const size_t numDirectKids = rKidsArray.size();
     const size_t numKids = GetChildCount(pParent);
 
-    if( static_cast<int>(numKids) < nPageNum ) 
+    // use <= since nPageNum is 0-based
+    if( static_cast<int>(numKids) <= nPageNum ) 
     {
-        PdfError::LogMessage( eLogSeverity_Critical, "Cannot retrieve page %i from a document with only %i pages.",
+        PdfError::LogMessage( eLogSeverity_Critical,
+	    "Cannot retrieve page %i from a document with only %i pages.",
                               nPageNum, static_cast<int>(numKids) );
         return NULL;
     }
@@ -342,14 +344,6 @@
         rLstParents.push_back( pParent );
         return GetPageNodeFromArray( nPageNum, rKidsArray, rLstParents );
     } 
-    else if( numDirectKids == numKids && static_cast<size_t>(nPageNum) < numDirectKids )
-    {
-        // This node has only page nodes as kids,
-        // but does not contain our page,
-        // skip it - this case should never occur because
-        // of handling of childs in the else part below.
-        return NULL;
-    }
     else
     {
         // We have to traverse the tree
@@ -356,10 +350,45 @@
         while( it != rKidsArray.end() ) 
         {
             if( (*it).IsArray() ) 
-            {
-                // Fixes some broken PDFs who have trees with 1 element kids arrays
+            { // Fixes PDFs broken by having trees with arrays nested once
+                
                 rLstParents.push_back( pParent );
-                return GetPageNodeFromArray( nPageNum, (*it).GetArray(), rLstParents ); 
+
+                // the following code is to find the reference to log this with
+                const PdfReference & rIterArrayRef = (*it).Reference();
+                PdfReference refToLog;
+                bool isDirectObject // don't worry about 0-num. indirect ones
+                    = ( !(rIterArrayRef.ObjectNumber() ) );
+                if ( isDirectObject ) 
+		{
+                    if ( !(pObj->Reference().ObjectNumber() ) ) // rKidsArray's
+		    {
+                        refToLog = pParent->Reference();
+                    }
+		    else
+                    {
+                        refToLog = pObj->Reference();
+                    }
+                }
+                else
+                {
+                    refToLog = rIterArrayRef;
+                }
+                PdfError::LogMessage( eLogSeverity_Error,
+                                    "Entry in Kids array is itself an array"
+                    "%s reference: %s\n", isDirectObject ? " (direct object)"
+                    ", in object with" : ",", refToLog.ToString().c_str() );
+
+                    const PdfArray & rIterArray = (*it).GetArray();
+
+                    // is the array large enough to potentially have the page?
+                    if( static_cast<size_t>(nPageNum) < rIterArray.GetSize() )
+                    {
+                        PdfObject* pPageNode = GetPageNodeFromArray( nPageNum,
+                                                    rIterArray, rLstParents );
+                        if ( pPageNode ) // and if not, search further
+                            return pPageNode;
+                    }
             }
             else if( (*it).IsReference() ) 
             {
@@ -386,7 +415,7 @@
                         return this->GetPageNode( nPageNum, pChild, rLstParents );
                     }
                 }
-                else // Type == Page
+                else if( this->IsTypePage(pChild) ) 
                 {
                     if( 0 == nPageNum )
                     {
@@ -398,6 +427,17 @@
                     if(nPageNum > 0 )
                         nPageNum--;
                 }
+		else
+		{
+                    const PdfReference & rLogRef = pChild->Reference();
+                    pdf_objnum nLogObjNum = rLogRef.ObjectNumber();
+                    pdf_gennum nLogGenNum = rLogRef.GenerationNumber();
+		    PdfError::LogMessage( eLogSeverity_Critical,
+                                          "Requesting page index %i. "
+                        "Invalid datatype referenced in kids array: %s\n"
+                        "Reference to invalid object: %i %i R\n", nPageNum,
+                        pChild->GetDataTypeString(), nLogObjNum, nLogGenNum);
+                }
             }
             else
             {

Attachment: test-1_elem_array.pdf
Description: Adobe PDF document

#include <podofo/podofo.h>
#include <sstream>
#include <iostream> // for exception handling

using namespace PoDoFo;

#define NUM_PAGES_FIRST 3
#define NUM_PAGES_EXTRA 2
#define TEXTPOS_X 72.0
#define TEXTPOS_Y 72.0

/** Create Pages node which will be referenced from the parent of the last page
    in place of the latter's reference which'll be new node's Kids' only entry.
*/
void CreatePagesNodePlacedWithLastKid(PdfDocument & doc)
{
    PdfPage* pLastPage = doc.GetPage( doc.GetPageCount() - 1 ); // 0-based
    PdfObject* pLastPageObj = pLastPage->GetObject();
    PdfObject* pParentObj = pLastPageObj->MustGetIndirectKey( "Parent" );
    PdfArray & rParentKidsArray = pParentObj->MustGetIndirectKey( "Kids" )
                                            ->GetArray();
    PdfObject& pLastKidRefObj = *(rParentKidsArray.rbegin()); // num's implicit
    const PdfReference & rLastKidRef = pLastKidRefObj.GetReference();
    PdfVecObjects* vecObjs = doc.GetObjects();
    PdfObject* pCreatedPagesNode = vecObjs->CreateObject( "Pages" );  // dict.
    PdfDictionary & rLastKidDict = vecObjs->GetObject( rLastKidRef )
                    ->GetDictionary();
    rLastKidDict.AddKey( "Parent", pCreatedPagesNode->Reference() );
    PdfDictionary & rCreatedPagesNodeDict = pCreatedPagesNode->GetDictionary();
    rCreatedPagesNodeDict.AddKey( "Kids", PdfArray( rLastKidRef ) ); // 
    rCreatedPagesNodeDict.AddKey( "Count", 1LL); // PdfObject(pdf_int64) called
    pLastKidRefObj = pCreatedPagesNode->Reference(); // page ref's a level down
}


void DrawSomeTextOnPage(PdfPainter & painter, PdfPage* pPage, PdfFont* pFont,
                        pdf_uint8 nPageNumber) // this parameter is 1-based
{
    painter.SetPage( pPage );
    painter.SetFont( pFont );

    // Draw on the page, e.g. "Page "+page number
    std::ostringstream textstream;
    textstream << "Page " << nPageNumber;
    painter.DrawText( TEXTPOS_X, TEXTPOS_Y, PdfString( textstream.str(),
                                                pFont->GetEncoding() ) );
    painter.FinishPage();
}

int main( int argc, const char* argv[] )
{
    if (argc != 2)
    {
        return 1;
    }
    PdfMemDocument outputDoc;
    PdfPainter painter;
    PdfVecObjects& vecObjs = outputDoc.GetObjects();
    PdfFont* pFont = PdfFontFactory::CreateBase14Font( "Helvetica", 0,
                            PdfEncodingFactory::GlobalWinAnsiEncodingInstance()
                                        , &vecObjs );
    for ( pdf_uint8 i = 0; i < NUM_PAGES_FIRST; i++ )
    {
        PdfPage* pPage = outputDoc.CreatePage( PdfPage::CreateStandardPageSize(
                                               ePdfPageSize_A4 ));
        DrawSomeTextOnPage( painter, pPage, pFont, i + 1 );
    }
    
    CreatePagesNodePlacedWithLastKid(outputDoc); // page tree 2-level ->3-level
    
    std::vector< PdfRect > vecPageSizes;
    for ( pdf_uint8 j = 0; j < NUM_PAGES_EXTRA; j++ )
    {
        vecPageSizes.push_back( PdfPage::CreateStandardPageSize(
                                ePdfPageSize_A4 ) );
    }

    outputDoc.CreatePages( vecPageSizes );

    // Manipulate the page tree here to get a middle page in a 1-element array
    const PdfObject* pDocCatalog = outputDoc.GetCatalog();
    PdfObject* pPagesObj = pDocCatalog->MustGetIndirectKey( PdfName("Pages") );
    PdfObject* pKidsObj = pPagesObj->MustGetIndirectKey( PdfName("Kids") );
    PdfArray::iterator kidsIt = pKidsObj->GetArray().begin()
                                + (NUM_PAGES_FIRST / 2);
    
    PdfReference middlePageRef = (*kidsIt).GetReference();
    *kidsIt = PdfArray(middlePageRef);
    
    outputDoc.SetWriteMode( ePdfWriteMode_Clean ); // readable in a text editor
    outputDoc.Write( argv[1] ); // Caution: if the file exists it's overwritten
    return 0;
}
#include <podofo/podofo.h>
#include <iostream>

using namespace PoDoFo;

#define TEST_FILE "test-1_elem_array.pdf"
#define PAGE_TO_TEST 3

int main()
{
	PdfMemDocument testDoc;
    testDoc.Load( TEST_FILE );
    PdfPagesTree* testPagesTree = testDoc.GetPagesTree();
    // GetPage argument is expected 0-based
    // testPagesTree->ClearCache();
    PdfPage* testPage = NULL;
    try {
        testPage= testPagesTree->GetPage( PAGE_TO_TEST - 1 );
    } catch (PdfError & e) {
        e.PrintErrorMsg();
    }
    if (testPage != NULL)
    {
        std::cerr << "SUCCESS!" << std::endl;
        return 0;
    }
    else
    {
        std::cerr << "FAILURE!" << std::endl;
        return 1;
    }
}
    
------------------------------------------------------------------------------
_______________________________________________
Podofo-users mailing list
Podofo-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/podofo-users

Reply via email to