Hi,
The stack has actually already been loaded by the time it is popped, because
in a PDF command the keyword (a token of type ePdfContentsType_Keyword) comes
after its arguments (tokens of type ePdfContentsType_Variant). Example: in
"0 0 m", both operands are read and pushed onto the stack before the keyword
"m" is reached.
Regards
Palmer Zent
On Aug 22, 2013, at 3:34 PM, Domonic Tom <abdom...@hotmail.com> wrote:
> Hi there.
>
> I understand 99 percent of the code below. This is from TextExtractor.cpp.
>
> I'm just not sure how the following occurs.
>
> When does the stack have the first token pushed onto it? I can see further
> down that if the token type is a variant, then we push a variant onto the
> stack, but I'm not sure why we are establishing position x and y before we
> load the stack with anything. This could be a really silly question, as I
> can't see where anyone else has asked it, but I've highlighted the section
> below in bold and red. The general area is in bold.
>
> Thank you..
>
>
>
>
> /***************************************************************************
> * Copyright (C) 2008 by Dominik Seichter *
> * domseich...@web.de *
> * *
> * This program is free software; you can redistribute it and/or modify *
> * it under the terms of the GNU General Public License as published by *
> * the Free Software Foundation; either version 2 of the License, or *
> * (at your option) any later version. *
> * *
> * This program is distributed in the hope that it will be useful, *
> * but WITHOUT ANY WARRANTY; without even the implied warranty of *
> * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
> * GNU General Public License for more details. *
> * *
> * You should have received a copy of the GNU General Public License *
> * along with this program; if not, write to the *
> * Free Software Foundation, Inc., *
> * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
> ***************************************************************************/
>
> #include "TextExtractor.h"
>
> #include <stack>
> #include <iostream>
> using namespace std;
>
> TextExtractor::TextExtractor()
> {
>
> }
>
> TextExtractor::~TextExtractor()
> {
> }
>
> void TextExtractor::Init( const char* pszInput )
> {
> if( !pszInput )
> {
> PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
> }
>
> PdfMemDocument document( pszInput );
>
> int nCount = document.GetPageCount();
> for( int i=0; i<nCount; i++ )
> {
> PdfPage* pPage = document.GetPage( i );
>
> this->ExtractText( &document, pPage );
> }
> }
>
> void TextExtractor::ExtractText( PdfMemDocument* pDocument, PdfPage* pPage )
> {
> const char* pszToken = NULL;
> PdfVariant var;
> EPdfContentsType eType;
>
> PdfContentsTokenizer tokenizer( pPage );
>
> double dCurPosX = 0.0;
> double dCurPosY = 0.0;
> double dCurFontSize = 0.0;
> bool bTextBlock = false;
> PdfFont* pCurFont = NULL;
>
> std::stack<PdfVariant> stack;
>
> while( tokenizer.ReadNext( eType, pszToken, var ) )
> {
> if( eType == ePdfContentsType_Keyword )
> {
> // support 'l' and 'm' tokens ---------------------------'l'
> token means: Append straight line segment to path
> // 'm' token means: Begin new subpath
> if( strcmp( pszToken, "l" ) == 0 ||
> strcmp( pszToken, "m" ) == 0 )
> {
> dCurPosX = stack.top().GetReal(); // WHY ARE WE POPPING OFF
> THE STACK BEFORE WE LOAD IT?
> stack.pop();
> dCurPosY = stack.top().GetReal();
> stack.pop();
> }
> else if( strcmp( pszToken, "BT" ) == 0 ) //Begin text object
> {
> bTextBlock = true;
> // BT does not reset font
> // dCurFontSize = 0.0;
> // pCurFont = NULL;
> }
> else if( strcmp( pszToken, "ET" ) == 0 ) // End text object
> {
> if( !bTextBlock )
> fprintf( stderr, "WARNING: Found ET without BT!\n" );
> }
>
> if( bTextBlock )
> {
> if( strcmp( pszToken, "Tf" ) == 0 ) // Set text font and size
> {
> dCurFontSize = stack.top().GetReal();
> stack.pop();
> PdfName fontName = stack.top().GetName();
> PdfObject* pFont = pPage->GetFromResources(
> PdfName("Font"), fontName );
> if( !pFont )
> {
> PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidHandle,
> "Cannot create font!" );
> }
>
> pCurFont = pDocument->GetFont( pFont );
> if( !pCurFont )
> {
> fprintf( stderr, "WARNING: Unable to create font for
> object %i %i R\n",
> pFont->Reference().ObjectNumber(),
> pFont->Reference().GenerationNumber() );
> }
> }
> else if( strcmp( pszToken, "Tj" ) == 0 || //Show text.. Means
> Show..
> strcmp( pszToken, "'" ) == 0 ) //Move to next line
> and show text
> {
> AddTextElement( dCurPosX, dCurPosY, pCurFont,
> stack.top().GetString() );
> stack.pop();
> }
> else if( strcmp( pszToken, "\"" ) == 0 ) // escape sequence..
> return.
> {
> AddTextElement( dCurPosX, dCurPosY, pCurFont,
> stack.top().GetString() );
> stack.pop();
> stack.pop(); // remove char spacing from stack
> stack.pop(); // remove word spacing from stack
> }
> else if( strcmp( pszToken, "TJ" ) == 0 ) //Show text,
> allowing individual glyph positioning
> {
> PdfArray array = stack.top().GetArray();
> stack.pop();
>
> for( int i=0; i<static_cast<int>(array.GetSize()); i++ )
> {
> if( array[i].IsString() )
> AddTextElement( dCurPosX, dCurPosY, pCurFont,
> array[i].GetString() );
> }
> }
> }
> }
> else if ( eType == ePdfContentsType_Variant ) // this happens first
> then it loops back to the top
> {
> stack.push( var ); // initial push of variant onto the stack.
> Consecutive variant tokens are pushed to the stack here.
> }
> else
> {
> // Impossible; type must be keyword or variant
> PODOFO_RAISE_ERROR( ePdfError_InternalLogic );
> }
> }
> }
>
> void TextExtractor::AddTextElement( double dCurPosX, double dCurPosY,
> PdfFont* pCurFont, const PdfString &
> rString )
> {
> if( !pCurFont )
> {
> fprintf( stderr, "WARNING: Found text but do not have a current font:
> %s\n", rString.GetString() );
> return;
> }
>
> if( !pCurFont->GetEncoding() )
> {
> fprintf( stderr, "WARNING: Found text but do not have a current
> encoding: %s\n", rString.GetString() );
> return;
> }
>
> // For now just write to console
> PdfString unicode = pCurFont->GetEncoding()->ConvertToUnicode( rString,
> pCurFont );
> const char* pszData = unicode.GetStringUtf8().c_str();
> while( *pszData ) {
> printf("%02x", static_cast<unsigned char>(*pszData) );
> ++pszData;
> }
>
>
> printf("\n");
>
> printf("(%.3f,%.3f) %s \n", dCurPosX, dCurPosY,
> unicode.GetStringUtf8().c_str() );
> cout << "THISX:" << dCurPosX << endl;
>
> }
>
> ------------------------------------------------------------------------------
> Introducing Performance Central, a new site from SourceForge and
> AppDynamics. Performance Central is your source for news, insights,
> analysis and resources for efficient Application Performance Management.
> Visit us today!
> http://pubads.g.doubleclick.net/gampad/clk?id=48897511&iu=/4140/ostg.clktrk
> _______________________________________________
> Podofo-users mailing list
> Podofo-users@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/podofo-users
------------------------------------------------------------------------------
Introducing Performance Central, a new site from SourceForge and
AppDynamics. Performance Central is your source for news, insights,
analysis and resources for efficient Application Performance Management.
Visit us today!
http://pubads.g.doubleclick.net/gampad/clk?id=48897511&iu=/4140/ostg.clktrk
_______________________________________________
Podofo-users mailing list
Podofo-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/podofo-users