Hi all,
I have attached to this e-mail the revised (and now longer) patch, a test PDF document, the source code (using PoDoFo) to generate it, and the source code of a kind-of "unit test" (without using libcppunit, I don't have it installed). My rationale for the patch is as follows: - I don't like unreachable code and I hope you don't mind it removed. - I think that the code removed was very difficult to correct, if at all possible - The comment I removed doesn't apply anymore (IMO), because my changes make the code now cover all the possible cases (IIUC) for pagefinding - I changed some checks to make them catch previously-missed cases - I added extensive PDF-document brokenness logging to tree traversal So I hope that the patch now makes the method so robust that you'll accept it, please review it, and if accepted, please apply (separately) to the public repository. Best regards, mabri ----- Original Message ----- From: zyx <z...@litepdf.cz> To: podofo-users@lists.sourceforge.net CC: Sent: 21:03, Tuesday, 1 September 2015 Subject: Re: [Podofo-users] unreachable-code and robustness fixes in PdfPagesTree::GetPageNode() On Wed, 2015-08-26 at 15:59 +0000, Matthew Brincke wrote: > What do you think? Hi, I do not know the exact rationale for the code being changed by your patch. I think the best option is to have a test .pdf file(s) for it and kind of a unit test, proving the change being right. The mix of numKids and numDirectKids also makes things more complicated. > + ); // TODO: perhaps log reference too? I'd prefer to not add any new TODO, just decide and do it (or don't do it). Bye, zyx -- http://www.litePDF.cz i...@litepdf.cz ------------------------------------------------------------------------------ _______________________________________________ Podofo-users mailing list Podofo-users@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/podofo-users
Index: src/doc/PdfPagesTree.cpp =================================================================== --- src/doc/PdfPagesTree.cpp (revision 1676) +++ src/doc/PdfPagesTree.cpp (working copy) @@ -327,9 +327,11 @@ const size_t numDirectKids = rKidsArray.size(); const size_t numKids = GetChildCount(pParent); - if( static_cast<int>(numKids) < nPageNum ) + // use <= since nPageNum is 0-based + if( static_cast<int>(numKids) <= nPageNum ) { - PdfError::LogMessage( eLogSeverity_Critical, "Cannot retrieve page %i from a document with only %i pages.", + PdfError::LogMessage( eLogSeverity_Critical, + "Cannot retrieve page %i from a document with only %i pages.", nPageNum, static_cast<int>(numKids) ); return NULL; } @@ -342,14 +344,6 @@ rLstParents.push_back( pParent ); return GetPageNodeFromArray( nPageNum, rKidsArray, rLstParents ); } - else if( numDirectKids == numKids && static_cast<size_t>(nPageNum) < numDirectKids ) - { - // This node has only page nodes as kids, - // but does not contain our page, - // skip it - this case should never occur because - // of handling of childs in the else part below. - return NULL; - } else { // We have to traverse the tree @@ -356,10 +350,45 @@ while( it != rKidsArray.end() ) { if( (*it).IsArray() ) - { - // Fixes some broken PDFs who have trees with 1 element kids arrays + { // Fixes PDFs broken by having trees with arrays nested once + rLstParents.push_back( pParent ); - return GetPageNodeFromArray( nPageNum, (*it).GetArray(), rLstParents ); + + // the following code is to find the reference to log this with + const PdfReference & rIterArrayRef = (*it).Reference(); + PdfReference refToLog; + bool isDirectObject // don't worry about 0-num. indirect ones + = ( !(rIterArrayRef.ObjectNumber() ) ); + if ( isDirectObject ) + { + if ( !(pObj->Reference().ObjectNumber() ) ) // rKidsArray's + { + refToLog = pParent->Reference(); + } + else + { + refToLog = pObj->Reference(); + } + } + else + { + refToLog = rIterArrayRef; + } + PdfError::LogMessage( eLogSeverity_Error, + "Entry in Kids array is itself an array" + "%s reference: %s\n", isDirectObject ? " (direct object)" + ", in object with" : ",", refToLog.ToString().c_str() ); + + const PdfArray & rIterArray = (*it).GetArray(); + + // is the array large enough to potentially have the page? + if( static_cast<size_t>(nPageNum) < rIterArray.GetSize() ) + { + PdfObject* pPageNode = GetPageNodeFromArray( nPageNum, + rIterArray, rLstParents ); + if ( pPageNode ) // and if not, search further + return pPageNode; + } } else if( (*it).IsReference() ) { @@ -386,7 +415,7 @@ return this->GetPageNode( nPageNum, pChild, rLstParents ); } } - else // Type == Page + else if( this->IsTypePage(pChild) ) { if( 0 == nPageNum ) { @@ -398,6 +427,17 @@ if(nPageNum > 0 ) nPageNum--; } + else + { + const PdfReference & rLogRef = pChild->Reference(); + pdf_objnum nLogObjNum = rLogRef.ObjectNumber(); + pdf_gennum nLogGenNum = rLogRef.GenerationNumber(); + PdfError::LogMessage( eLogSeverity_Critical, + "Requesting page index %i. " + "Invalid datatype referenced in kids array: %s\n" + "Reference to invalid object: %i %i R\n", nPageNum, + pChild->GetDataTypeString(), nLogObjNum, nLogGenNum); + } } else {
test-1_elem_array.pdf
Description: Adobe PDF document
#include <podofo/podofo.h> #include <sstream> #include <iostream> // for exception handling using namespace PoDoFo; #define NUM_PAGES_FIRST 3 #define NUM_PAGES_EXTRA 2 #define TEXTPOS_X 72.0 #define TEXTPOS_Y 72.0 /** Create Pages node which will be referenced from the parent of the last page in place of the latter's reference which'll be new node's Kids' only entry. */ void CreatePagesNodePlacedWithLastKid(PdfDocument & doc) { PdfPage* pLastPage = doc.GetPage( doc.GetPageCount() - 1 ); // 0-based PdfObject* pLastPageObj = pLastPage->GetObject(); PdfObject* pParentObj = pLastPageObj->MustGetIndirectKey( "Parent" ); PdfArray & rParentKidsArray = pParentObj->MustGetIndirectKey( "Kids" ) ->GetArray(); PdfObject& pLastKidRefObj = *(rParentKidsArray.rbegin()); // num's implicit const PdfReference & rLastKidRef = pLastKidRefObj.GetReference(); PdfVecObjects* vecObjs = doc.GetObjects(); PdfObject* pCreatedPagesNode = vecObjs->CreateObject( "Pages" ); // dict. PdfDictionary & rLastKidDict = vecObjs->GetObject( rLastKidRef ) ->GetDictionary(); rLastKidDict.AddKey( "Parent", pCreatedPagesNode->Reference() ); PdfDictionary & rCreatedPagesNodeDict = pCreatedPagesNode->GetDictionary(); rCreatedPagesNodeDict.AddKey( "Kids", PdfArray( rLastKidRef ) ); // rCreatedPagesNodeDict.AddKey( "Count", 1LL); // PdfObject(pdf_int64) called pLastKidRefObj = pCreatedPagesNode->Reference(); // page ref's a level down } void DrawSomeTextOnPage(PdfPainter & painter, PdfPage* pPage, PdfFont* pFont, pdf_uint8 nPageNumber) // this parameter is 1-based { painter.SetPage( pPage ); painter.SetFont( pFont ); // Draw on the page, e.g. "Page "+page number std::ostringstream textstream; textstream << "Page " << nPageNumber; painter.DrawText( TEXTPOS_X, TEXTPOS_Y, PdfString( textstream.str(), pFont->GetEncoding() ) ); painter.FinishPage(); } int main( int argc, const char* argv[] ) { if (argc != 2) { return 1; } PdfMemDocument outputDoc; PdfPainter painter; PdfVecObjects& vecObjs = outputDoc.GetObjects(); PdfFont* pFont = PdfFontFactory::CreateBase14Font( "Helvetica", 0, PdfEncodingFactory::GlobalWinAnsiEncodingInstance() , &vecObjs ); for ( pdf_uint8 i = 0; i < NUM_PAGES_FIRST; i++ ) { PdfPage* pPage = outputDoc.CreatePage( PdfPage::CreateStandardPageSize( ePdfPageSize_A4 )); DrawSomeTextOnPage( painter, pPage, pFont, i + 1 ); } CreatePagesNodePlacedWithLastKid(outputDoc); // page tree 2-level ->3-level std::vector< PdfRect > vecPageSizes; for ( pdf_uint8 j = 0; j < NUM_PAGES_EXTRA; j++ ) { vecPageSizes.push_back( PdfPage::CreateStandardPageSize( ePdfPageSize_A4 ) ); } outputDoc.CreatePages( vecPageSizes ); // Manipulate the page tree here to get a middle page in a 1-element array const PdfObject* pDocCatalog = outputDoc.GetCatalog(); PdfObject* pPagesObj = pDocCatalog->MustGetIndirectKey( PdfName("Pages") ); PdfObject* pKidsObj = pPagesObj->MustGetIndirectKey( PdfName("Kids") ); PdfArray::iterator kidsIt = pKidsObj->GetArray().begin() + (NUM_PAGES_FIRST / 2); PdfReference middlePageRef = (*kidsIt).GetReference(); *kidsIt = PdfArray(middlePageRef); outputDoc.SetWriteMode( ePdfWriteMode_Clean ); // readable in a text editor outputDoc.Write( argv[1] ); // Caution: if the file exists it's overwritten return 0; }
#include <podofo/podofo.h> #include <iostream> using namespace PoDoFo; #define TEST_FILE "test-1_elem_array.pdf" #define PAGE_TO_TEST 3 int main() { PdfMemDocument testDoc; testDoc.Load( TEST_FILE ); PdfPagesTree* testPagesTree = testDoc.GetPagesTree(); // GetPage argument is expected 0-based // testPagesTree->ClearCache(); PdfPage* testPage = NULL; try { testPage= testPagesTree->GetPage( PAGE_TO_TEST - 1 ); } catch (PdfError & e) { e.PrintErrorMsg(); } if (testPage != NULL) { std::cerr << "SUCCESS!" << std::endl; return 0; } else { std::cerr << "FAILURE!" << std::endl; return 1; } }
------------------------------------------------------------------------------
_______________________________________________ Podofo-users mailing list Podofo-users@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/podofo-users