Re: [Podofo-users] TextExtractor.cpp (loading stack)

Palmer Zent Thu, 22 Aug 2013 12:52:45 -0700

Hi,

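The stack is actually already loaded by the time it is popped, because in a
PDF content stream the keyword (a token of type ePdfContentsType_Keyword)
comes after its arguments (tokens of type ePdfContentsType_Variant).
Example: for "0 0 m" the tokenizer first returns the two variants 0 and 0,
each of which is pushed onto the stack; only then does it return the keyword
"m", at which point the two values are popped again.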


Regards

Palmer Zent

On Aug 22, 2013, at 3:34 PM, Domonic Tom <abdom...@hotmail.com> wrote:

> Hi there.
> 
> I understand 99 percent of the code below.  This is from TextExtractor.cpp.
> 
> I'm just not sure how the following occurs.
> 
> When does the stack have the first token pushed onto it?  I can see further
> down that if the token type is a variant then we push the variant onto the
> stack, but I'm not sure why we are establishing positions x and y before we
> load the stack with anything.  This could be a really silly question, as I
> can't see where anyone else has asked it, but I've highlighted the section
> below in bold and red.  The general area is in bold.
> 
> Thank you..
> 
> 
> 
> 
> /***************************************************************************
>  *   Copyright (C) 2008 by Dominik Seichter                                *
>  *   domseich...@web.de                                                    *
>  *                                                                         *
>  *   This program is free software; you can redistribute it and/or modify  *
>  *   it under the terms of the GNU General Public License as published by  *
>  *   the Free Software Foundation; either version 2 of the License, or     *
>  *   (at your option) any later version.                                   *
>  *                                                                         *
>  *   This program is distributed in the hope that it will be useful,       *
>  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
>  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
>  *   GNU General Public License for more details.                          *
>  *                                                                         *
>  *   You should have received a copy of the GNU General Public License     *
>  *   along with this program; if not, write to the                         *
>  *   Free Software Foundation, Inc.,                                       *
>  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
>  ***************************************************************************/
> 
> #include "TextExtractor.h"
> 
> #include <stack>
> #include <iostream>
> using namespace std;
> 
> TextExtractor::TextExtractor()
> {
>     // Intentionally empty: the constructor body performs no initialisation.
> }
> // Destructor: the body is empty, so no explicit cleanup is performed here.
> TextExtractor::~TextExtractor()
> {
> }
> 
> /**
>  * Load a document and run the text extraction over each of its pages.
>  *
>  * A PdfMemDocument is constructed from pszInput (presumably the path of a
>  * PDF file -- confirm against the PdfMemDocument constructor) and every page
>  * index from 0 up to GetPageCount()-1 is passed to ExtractText() in order.
>  *
>  * \param pszInput input handed to the PdfMemDocument constructor; a NULL
>  *                 pointer raises ePdfError_InvalidHandle
>  */
> void TextExtractor::Init( const char* pszInput )
> {
>     if( pszInput == NULL )
>     {
>         PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
>     }
> 
>     PdfMemDocument document( pszInput );
>     const int nPages = document.GetPageCount();
> 
>     // Visit the pages front to back; each page is extracted on its own.
>     int nPage = 0;
>     while( nPage < nPages )
>     {
>         PdfPage* pCurrent = document.GetPage( nPage );
>         ExtractText( &document, pCurrent );
>         ++nPage;
>     }
> }
> 
> /**
>  * Discard every operand that is still on the operand stack.
>  *
>  * \param rStack operand stack to empty
>  */
> static void ClearOperands( std::stack<PdfVariant> & rStack )
> {
>     while( !rStack.empty() )
>         rStack.pop();
> }
> 
> /**
>  * Check that enough operands were collected for an operator.
>  *
>  * Calling top()/pop() on an empty std::stack is undefined behaviour, so
>  * every operator verifies its operand count through this helper first.
>  * A warning is written to stderr when operands are missing.
>  *
>  * \param rStack    operand stack collected since the previous operator
>  * \param nRequired number of operands the operator consumes
>  * \param pszToken  operator name, used in the warning only
>  * \returns true if at least nRequired operands are available
>  */
> static bool HasOperands( const std::stack<PdfVariant> & rStack,
>                          unsigned int nRequired, const char* pszToken )
> {
>     if( rStack.size() >= nRequired )
>         return true;
> 
>     fprintf( stderr, "WARNING: Token '%s' expects %u argument(s), but %u given; ignoring\n",
>              pszToken, nRequired, static_cast<unsigned int>(rStack.size()) );
>     return false;
> }
> 
> /**
>  * Walk the content stream of one page and pass every shown string to
>  * AddTextElement().
>  *
>  * The tokenizer returns the operands of a PDF command (tokens of type
>  * ePdfContentsType_Variant) BEFORE the operator they belong to (a token of
>  * type ePdfContentsType_Keyword), e.g. "0 0 m".  Operands are therefore
>  * pushed onto a stack as they arrive and are consumed once the keyword has
>  * been read; whatever an operator does not consume is discarded afterwards.
>  *
>  * \param pDocument document used to obtain the PdfFont for a font resource
>  * \param pPage     page whose contents are tokenized and whose resources
>  *                  are searched for the fonts named by "Tf"
>  */
> void TextExtractor::ExtractText( PdfMemDocument* pDocument, PdfPage* pPage )
> {
>     const char*      pszToken = NULL;
>     PdfVariant       var;
>     EPdfContentsType eType;
> 
>     PdfContentsTokenizer tokenizer( pPage );
> 
>     double dCurPosX     = 0.0;
>     double dCurPosY     = 0.0;
>     double dCurFontSize = 0.0;
>     bool   bTextBlock   = false;
>     PdfFont* pCurFont   = NULL;
> 
>     std::stack<PdfVariant> stack;
> 
>     while( tokenizer.ReadNext( eType, pszToken, var ) )
>     {
>         if( eType == ePdfContentsType_Keyword )
>         {
>             // support 'l' and 'm' tokens
>             // 'l' token means: Append straight line segment to path
>             // 'm' token means: Begin new subpath
>             if( strcmp( pszToken, "l" ) == 0 ||
>                 strcmp( pszToken, "m" ) == 0 )
>             {
>                 // WHY ARE WE POPPING OFF THE STACK BEFORE WE LOAD IT?
>                 // (We are not: the operands "x y" were pushed by the variant
>                 // branch below before this keyword was read.  x was pushed
>                 // first, so y is on top and has to be popped first.)
>                 if( HasOperands( stack, 2, pszToken ) )
>                 {
>                     dCurPosY = stack.top().GetReal();
>                     stack.pop();
>                     dCurPosX = stack.top().GetReal();
>                     stack.pop();
>                 }
>             }
>             else if( strcmp( pszToken, "BT" ) == 0 ) // Begin text object
>             {
>                 bTextBlock   = true;
>                 // BT does not reset font
>                 // dCurFontSize = 0.0;
>                 // pCurFont     = NULL;
>             }
>             else if( strcmp( pszToken, "ET" ) == 0 ) // End text object
>             {
>                 if( !bTextBlock )
>                     fprintf( stderr, "WARNING: Found ET without BT!\n" );
> 
>                 // Leave the text object; otherwise the warning above could
>                 // never be triggered again after the first BT.
>                 bTextBlock = false;
>             }
> 
>             if( bTextBlock )
>             {
>                 if( strcmp( pszToken, "Tf" ) == 0 ) // Set text font and size
>                 {
>                     // Operands are "name size", so the size is on top.
>                     if( HasOperands( stack, 2, pszToken ) )
>                     {
>                         dCurFontSize = stack.top().GetReal();
>                         stack.pop();
>                         PdfName fontName = stack.top().GetName();
>                         stack.pop();
> 
>                         PdfObject* pFont = pPage->GetFromResources( PdfName("Font"), fontName );
>                         if( !pFont )
>                         {
>                             PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidHandle, "Cannot create font!" );
>                         }
> 
>                         pCurFont = pDocument->GetFont( pFont );
>                         if( !pCurFont )
>                         {
>                             fprintf( stderr, "WARNING: Unable to create font for object %i %i R\n",
>                                      pFont->Reference().ObjectNumber(),
>                                      pFont->Reference().GenerationNumber() );
>                         }
>                     }
>                 }
>                 else if( strcmp( pszToken, "Tj" ) == 0 || // Show text
>                          strcmp( pszToken, "'" ) == 0 )   // Move to next line and show text
>                 {
>                     if( HasOperands( stack, 1, pszToken ) )
>                     {
>                         AddTextElement( dCurPosX, dCurPosY, pCurFont, stack.top().GetString() );
>                         stack.pop();
>                     }
>                 }
>                 else if( strcmp( pszToken, "\"" ) == 0 ) // Set word and char spacing, move to next line and show text
>                 {
>                     // Operands are "wordSpacing charSpacing string", so the
>                     // string is on top of the stack.
>                     if( HasOperands( stack, 3, pszToken ) )
>                     {
>                         AddTextElement( dCurPosX, dCurPosY, pCurFont, stack.top().GetString() );
>                         stack.pop();
>                         stack.pop(); // remove char spacing from stack
>                         stack.pop(); // remove word spacing from stack
>                     }
>                 }
>                 else if( strcmp( pszToken, "TJ" ) == 0 ) // Show text, allowing individual glyph positioning
>                 {
>                     if( HasOperands( stack, 1, pszToken ) )
>                     {
>                         // Copy the array before popping; the variant on the
>                         // stack is destroyed by pop().
>                         PdfArray array = stack.top().GetArray();
>                         stack.pop();
> 
>                         // Only the string entries are text; the other
>                         // entries of the array are skipped.
>                         for( int i=0; i<static_cast<int>(array.GetSize()); i++ )
>                         {
>                             if( array[i].IsString() )
>                                 AddTextElement( dCurPosX, dCurPosY, pCurFont, array[i].GetString() );
>                         }
>                     }
>                 }
>             }
> 
>             // Operands belong to the operator that follows them only.  Drop
>             // whatever was not consumed (e.g. operands of operators that are
>             // not handled here) so the stack cannot grow without bound.
>             ClearOperands( stack );
>         }
>         else if ( eType == ePdfContentsType_Variant ) // this happens first then it loops back to the top
>         {
>             // initial push of variant onto the stack.
>             // Consecutive variant tokens are pushed to the stack here.
>             stack.push( var );
>         }
>         else
>         {
>             // Impossible; type must be keyword or variant
>             PODOFO_RAISE_ERROR( ePdfError_InternalLogic );
>         }
>     }
> }
> 
> /**
>  * Write one extracted string to the console.
>  *
>  * The string is converted to unicode through the encoding of the current
>  * font.  Its UTF-8 form is printed first as a hex dump, one "%02x" group per
>  * byte, and then as text together with the current position.  If there is
>  * no current font, or the font has no encoding, a warning is written to
>  * stderr and nothing else is printed.
>  *
>  * \param dCurPosX current x position, printed alongside the text
>  * \param dCurPosY current y position, printed alongside the text
>  * \param pCurFont font whose encoding converts rString; may be NULL
>  * \param rString  string as found in the content stream
>  */
> void TextExtractor::AddTextElement( double dCurPosX, double dCurPosY,
>                                     PdfFont* pCurFont, const PdfString & rString )
> {
>     // Without a font there is no encoding to decode the string with.
>     if( pCurFont == NULL )
>     {
>         fprintf( stderr, "WARNING: Found text but do not have a current font: %s\n", rString.GetString() );
>         return;
>     }
> 
>     if( pCurFont->GetEncoding() == NULL )
>     {
>         fprintf( stderr, "WARNING: Found text but do not have a current encoding: %s\n", rString.GetString() );
>         return;
>     }
> 
>     // For now just write to console
>     PdfString unicode = pCurFont->GetEncoding()->ConvertToUnicode( rString, pCurFont );
> 
>     // Hex dump of the UTF-8 bytes, up to the terminating NUL.
>     for( const char* pszByte = unicode.GetStringUtf8().c_str(); *pszByte != '\0'; ++pszByte )
>     {
>         printf("%02x", static_cast<unsigned char>(*pszByte) );
>     }
>     printf("\n");
> 
>     // Position followed by the readable text.
>     printf("(%.3f,%.3f) %s \n", dCurPosX, dCurPosY, unicode.GetStringUtf8().c_str() );
>     cout << "THISX:" << dCurPosX << endl;
> }
> 
> ------------------------------------------------------------------------------
> Introducing Performance Central, a new site from SourceForge and 
> AppDynamics. Performance Central is your source for news, insights, 
> analysis and resources for efficient Application Performance Management. 
> Visit us today!
> http://pubads.g.doubleclick.net/gampad/clk?id=48897511&iu=/4140/ostg.clktrk
> 
> _______________________________________________
> Podofo-users mailing list
> Podofo-users@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/podofo-users

------------------------------------------------------------------------------
Introducing Performance Central, a new site from SourceForge and 
AppDynamics. Performance Central is your source for news, insights, 
analysis and resources for efficient Application Performance Management. 
Visit us today!
http://pubads.g.doubleclick.net/gampad/clk?id=48897511&iu=/4140/ostg.clktrk

_______________________________________________
Podofo-users mailing list
Podofo-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/podofo-users

Re: [Podofo-users] TextExtractor.cpp (loading stack)

Reply via email to