On Fri, 2012-05-11 at 22:22 +0100, Michael Meeks wrote: > Argh - and it's entirely possible that this breaks the > CVE-2010-3454-1.doc test on -3-5 - but it's rather too late to double > check that now; seems to pass on master though; most odd. Will poke > Monday.
Wow - this was a -really- 'fun' problem to nail ;-) it turns out that simply walking the fat chain pollutes the state of the streams in a way that is extraordinarily hard to unwind; ie. even just calling the original makePageChainCache method (or sim.) would seek beyond the end of the stream, putting it in some un-recoverable state (or somesuch). Anyhow - after the big chunk of life working that out, it dawned on me that there is no need for the pagechaincache building to be slow, and that we should do it always, and incrementally as we read. Hopefully that'll still allow us to recover parts of word documents that are not seekable. So - I re-worked this to simplify, incrementally build the page chain cache which might help performance in nasty corner cases, and also wrote some regression tests [ which are hairy, the sot/ 'pass' documents have some nice instances of broken FAT chains ;-]. The 'slow.doc' parses in ~4 seconds for me now; though there is some hyper-long and painfully incomprehensible 15 second thrash after that (with no progress bar) still ;-) I'd like to get this reviewed, more widely tested and into -3-5 (as yet it's not in master either, this is vs. -3-5 ;-) It'd be worth testing any documents we know of, where previously we could recover some of the document content from the beginning of the stream, where the end was corrupt (I guess). Thanks ! Michael. -- michael.me...@suse.com <><, Pseudo Engineer, itinerant idiot
>From fe0e7c2664a6fc9e3fae4404d41645cecc9e2624 Mon Sep 17 00:00:00 2001 From: Michael Meeks <michael.me...@suse.com> Date: Fri, 11 May 2012 20:19:21 +0100 Subject: [PATCH] sot: re-work OLE2 offset-to-page computation with a FAT chain cache --- sot/qa/cppunit/test_sot.cxx | 74 ++++++++++++++++++++- sot/source/sdstor/stgstrms.cxx | 144 +++++++++++++++++++++++++++++----------- sot/source/sdstor/stgstrms.hxx | 3 + 3 files changed, 180 insertions(+), 41 deletions(-) diff --git a/sot/qa/cppunit/test_sot.cxx b/sot/qa/cppunit/test_sot.cxx index 36d0b02..0f91bdf 100644 --- a/sot/qa/cppunit/test_sot.cxx +++ b/sot/qa/cppunit/test_sot.cxx @@ -33,6 +33,7 @@ #include <osl/file.hxx> #include <osl/process.h> #include <sot/storage.hxx> +#include <sot/storinfo.hxx> using namespace ::com::sun::star; @@ -45,6 +46,11 @@ namespace public: SotTest() {} + bool checkStream( const SotStorageRef &xObjStor, + const String &rStreamName, + sal_uLong nSize ); + bool checkStorage( const SotStorageRef &xObjStor ); + virtual bool load(const rtl::OUString &, const rtl::OUString &rURL, const rtl::OUString &); @@ -55,12 +61,78 @@ namespace CPPUNIT_TEST_SUITE_END(); }; + bool SotTest::checkStream( const SotStorageRef &xObjStor, + const String &rStreamName, + sal_uLong nSize ) + { + unsigned char *pData = (unsigned char*)malloc( nSize ); + sal_uLong nReadableSize = 0; + if( !pData ) + return true; + + { // Read the data in one block + SotStorageStreamRef xStream( xObjStor->OpenSotStream( rStreamName ) ); + xStream->Seek(0); + sal_uLong nRemaining = xStream->GetSize() - xStream->Tell(); + + CPPUNIT_ASSERT_MESSAGE( "check size", nRemaining == nSize ); + CPPUNIT_ASSERT_MESSAGE( "check size #2", xStream->remainingSize() == nSize ); + + // Read as much as we can, a corrupted FAT chain can cause real grief here + nReadableSize = xStream->Read( (void *)pData, nSize ); +// fprintf(stderr, "readable size %d vs size %d remaining %d\n", nReadableSize, nSize, nReadableSize); + } + { // Read the data backwards as well + SotStorageStreamRef xStream( xObjStor->OpenSotStream( rStreamName ) ); + for( sal_uLong i = nReadableSize; i > 0; i-- ) + { + CPPUNIT_ASSERT_MESSAGE( "sot reading error", !xStream->GetError() ); + unsigned char c; + xStream->Seek( i - 1 ); + CPPUNIT_ASSERT_MESSAGE( "sot storage reading byte", + xStream->Read( &c, 1 ) == 1); + CPPUNIT_ASSERT_MESSAGE( "mismatching data storage reading byte", + pData[i - 1] == c ); + } + } + + return true; + } + + bool SotTest::checkStorage( const SotStorageRef &xObjStor ) + { + SvStorageInfoList aInfoList; + xObjStor->FillInfoList( &aInfoList ); + + for( SvStorageInfoList::iterator aIt = aInfoList.begin(); + aIt != aInfoList.end(); aIt++ ) + { + sal_uLong nSize = aIt->GetSize(); + fprintf( stderr, "Stream '%s' size %ld\n", + rtl::OUStringToOString( aIt->GetName(), RTL_TEXTENCODING_UTF8 ).getStr(), + (long)nSize ); + if( aIt->IsStorage() ) + { + SotStorageRef xChild( xObjStor->OpenSotStorage( aIt->GetName() ) ); + checkStorage( xChild ); + } + else if( aIt->IsStream() ) + checkStream( xObjStor, aIt->GetName(), aIt->GetSize() ); + } + + return true; + } + bool SotTest::load(const rtl::OUString &, const rtl::OUString &rURL, const rtl::OUString &) { SvFileStream aStream(rURL, STREAM_READ); SotStorageRef xObjStor = new SotStorage(aStream); - return xObjStor.Is() && !xObjStor->GetError(); + if (!xObjStor.Is() && !xObjStor->GetError()) + return false; + + CPPUNIT_ASSERT_MESSAGE("sot storage is not valid", xObjStor->Validate()); + return checkStorage (xObjStor); } void SotTest::test() diff --git a/sot/source/sdstor/stgstrms.cxx b/sot/source/sdstor/stgstrms.cxx index 5e6c1ea..22ff35f 100644 --- a/sot/source/sdstor/stgstrms.cxx +++ b/sot/source/sdstor/stgstrms.cxx @@ -26,12 +26,12 @@ * ************************************************************************/ +#include <algorithm> #include <string.h> // memcpy() - +#include <sal/log.hxx> #include <osl/file.hxx> #include <tools/tempfile.hxx> -#include <tools/debug.hxx> #include "sot/stg.hxx" #include "stgelem.hxx" @@ -334,13 +334,46 @@ void StgStrm::SetEntry( StgDirEntry& r ) r.SetDirty(); } +/* + * The page chain, is basically a singly linked list of slots each + * point to the next page. Instead of traversing the file structure + * for this each time build a simple flat in-memory vector list + * of pages. + */ +void StgStrm::scanBuildPageChainCache(sal_Int32 *pOptionalCalcSize) +{ + if (nSize > 0) + m_aPagesCache.reserve(nSize/nPageSize); + + bool bError = false; + sal_Int32 nBgn = nStart; + sal_Int32 nOldBgn = -1; + sal_Int32 nOptSize = 0; + while( nBgn >= 0 && nBgn != nOldBgn ) + { + if( nBgn >= 0 ) + m_aPagesCache.push_back(nBgn); + nOldBgn = nBgn; + nBgn = pFat->GetNextPage( nBgn ); + if( nBgn == nOldBgn ) + bError = true; + nOptSize += nPageSize; + } + if (bError) + { + if (pOptionalCalcSize) + rIo.SetError( ERRCODE_IO_WRONGFORMAT ); + m_aPagesCache.clear(); + } + if (pOptionalCalcSize) + *pOptionalCalcSize = nOptSize; +} + // Compute page number and offset for the given byte position. // If the position is behind the size, set the stream right // behind the EOF. - sal_Bool StgStrm::Pos2Page( sal_Int32 nBytePos ) { - sal_Int32 nRel, nBgn; // Values < 0 seek to the end if( nBytePos < 0 || nBytePos >= nSize ) nBytePos = nSize; @@ -353,41 +386,71 @@ sal_Bool StgStrm::Pos2Page( sal_Int32 nBytePos ) nPos = nBytePos; if( nOld == nNew ) return sal_True; - if( nNew > nOld ) - { - // the new position is behind the current, so an incremental - // positioning is OK. Set the page relative position - nRel = nNew - nOld; - nBgn = nPage; - } - else + + // See fdo#47644 for a .doc with a vast amount of pages where seeking around the + // document takes a colossal amount of time + // + // Please Note: we build the pagescache incrementally as we go if necessary, + // so that a corrupted FAT doesn't poison the stream state for earlier reads + size_t nIdx = nNew / nPageSize; + if( nIdx >= m_aPagesCache.size() ) { - // the new position is before the current, so we have to scan - // the entire chain. - nRel = nNew; - nBgn = nStart; + // Extend the FAT cache ! ... + size_t nToAdd = nIdx + 1; + + if (m_aPagesCache.empty()) + m_aPagesCache.push_back( nStart ); + + nToAdd -= m_aPagesCache.size(); + + sal_Int32 nBgn = m_aPagesCache.back(); +// fprintf (stderr, "nIdx %d to-add %d start page %d pagesize %d\n", +// (int)nIdx, (int)nToAdd, (int)nBgn, (int) nPageSize); + + // Start adding pages while we can + while( nToAdd > 0 && nBgn >= 0 ) + { + nBgn = pFat->GetNextPage( nBgn ); +// fprintf (stderr, "add page %d\n", (int)nBgn); + if( nBgn >= 0 ) + { + m_aPagesCache.push_back( nBgn ); + nToAdd--; + } + } } - // now, traverse the FAT chain. - nRel /= nPageSize; - sal_Int32 nLast = STG_EOF; - while( nRel && nBgn >= 0 ) + + if ( nIdx > m_aPagesCache.size() ) { - nLast = nBgn; - nBgn = pFat->GetNextPage( nBgn ); - nRel--; + rIo.SetError( SVSTREAM_FILEFORMAT_ERROR ); + nPage = STG_EOF; + nOffset = nPageSize; +// fprintf (stderr, "returns over-end page %d offset %d err %d ret %d\n", +// (int)nPage, (int)nOffset, (int) rIo.GetError(), false); + return sal_False; } // special case: seek to 1st byte of new, unallocated page // (in case the file size is a multiple of the page size) - if( nBytePos == nSize && nBgn == STG_EOF && !nRel && !nOffset ) - nBgn = nLast, nOffset = nPageSize; - if( nBgn < 0 && nBgn != STG_EOF ) + if( nBytePos == nSize && !nOffset && nIdx > 0 && nIdx == m_aPagesCache.size() ) { - rIo.SetError( SVSTREAM_FILEFORMAT_ERROR ); - nBgn = STG_EOF; +// fprintf (stderr, "this odd case\n"); + nIdx--; nOffset = nPageSize; } - nPage = nBgn; - return sal_Bool( nRel == 0 && nPage >= 0 ); + else if ( nIdx == m_aPagesCache.size() ) + { + nPage = STG_EOF; +// fprintf (stderr, "returns eof page %d offset %d err %d ret %d\n", +// (int)nPage, (int)nOffset, (int) rIo.GetError(), false); + return sal_False; + } + + nPage = m_aPagesCache[ nIdx ]; + +// fprintf (stderr, "returns page %d offset %d err %d ret %d\n", +// (int)nPage, (int)nOffset, (int) rIo.GetError(), nPage >= 0 ); + + return nPage >= 0; } // Retrieve the physical page for a given byte offset. @@ -404,6 +467,8 @@ StgPage* StgStrm::GetPhysPage( sal_Int32 nBytePos, sal_Bool bForce ) sal_Bool StgStrm::Copy( sal_Int32 nFrom, sal_Int32 nBytes ) { + m_aPagesCache.clear(); + sal_Int32 nTo = nStart; sal_Int32 nPgs = ( nBytes + nPageSize - 1 ) / nPageSize; while( nPgs-- ) @@ -430,6 +495,8 @@ sal_Bool StgStrm::Copy( sal_Int32 nFrom, sal_Int32 nBytes ) sal_Bool StgStrm::SetSize( sal_Int32 nBytes ) { + m_aPagesCache.clear(); + // round up to page size sal_Int32 nOld = ( ( nSize + nPageSize - 1 ) / nPageSize ) * nPageSize; sal_Int32 nNew = ( ( nBytes + nPageSize - 1 ) / nPageSize ) * nPageSize; @@ -528,6 +595,8 @@ sal_Int32 StgFATStrm::GetPage( short nOff, sal_Bool bMake, sal_uInt16 *pnMasterA { if( bMake ) { + m_aPagesCache.clear(); + // create a new master page nFAT = nMaxPage++; pMaster = rIo.Copy( nFAT, STG_FREE ); @@ -582,6 +651,8 @@ sal_Int32 StgFATStrm::GetPage( short nOff, sal_Bool bMake, sal_uInt16 *pnMasterA sal_Bool StgFATStrm::SetPage( short nOff, sal_Int32 nNewPage ) { + m_aPagesCache.clear(); + sal_Bool bRes = sal_True; if( nOff < rIo.aHdr.GetFAT1Size() ) rIo.aHdr.SetFATPage( nOff, nNewPage ); @@ -631,6 +702,8 @@ sal_Bool StgFATStrm::SetPage( short nOff, sal_Int32 nNewPage ) sal_Bool StgFATStrm::SetSize( sal_Int32 nBytes ) { + m_aPagesCache.clear(); + // Set the number of entries to a multiple of the page size short nOld = (short) ( ( nSize + ( nPageSize - 1 ) ) / nPageSize ); short nNew = (short) ( @@ -747,16 +820,7 @@ void StgDataStrm::Init( sal_Int32 nBgn, sal_Int32 nLen ) { // determine the actual size of the stream by scanning // the FAT chain and counting the # of pages allocated - nSize = 0; - sal_Int32 nOldBgn = -1; - while( nBgn >= 0 && nBgn != nOldBgn ) - { - nOldBgn = nBgn; - nBgn = pFat->GetNextPage( nBgn ); - if( nBgn == nOldBgn ) - rIo.SetError( ERRCODE_IO_WRONGFORMAT ); - nSize += nPageSize; - } + scanBuildPageChainCache( &nSize ); } } diff --git a/sot/source/sdstor/stgstrms.hxx b/sot/source/sdstor/stgstrms.hxx index a6a0ad1..5a9558b 100644 --- a/sot/source/sdstor/stgstrms.hxx +++ b/sot/source/sdstor/stgstrms.hxx @@ -30,6 +30,7 @@ #define _STGSTRMS_HXX #include <tools/stream.hxx> +#include <vector> class StgIo; class StgStrm; @@ -77,6 +78,8 @@ protected: sal_Int32 nPage; // current logical page short nOffset; // offset into current page short nPageSize; // logical page size + std::vector<sal_Int32> m_aPagesCache; + void scanBuildPageChainCache(sal_Int32 *pOptionalCalcSize = NULL); sal_Bool Copy( sal_Int32 nFrom, sal_Int32 nBytes ); StgStrm( StgIo& ); public: -- 1.7.7
_______________________________________________ LibreOffice mailing list LibreOffice@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice