Hi,
We’ve been using Poppler from a Python extension module to convert a PDF to text and
extract information about each token. We construct a TextOutputDev and
then build a TextWordList from it, returning the font, etc., for each token.
One thing we’d like to add is the line each token appears on and, optionally,
its index within that line. Is there an easy way to do this given a TextOutputDev?
-kim
#include "poppler.h"
#include "TextOutputDev.h"
#include <sstream>
#include <cstring>
#include "PDFDocFactory.h"
// Resolution value that Parse() passes to doc->displayPage() for both of its
// resolution arguments (presumably horizontal and vertical DPI -- confirm
// against the poppler PDFDoc::displayPage signature).
const double PopplerParser::resolution = 72.0;
/**
 * Opens the PDF named by inputFilename and caches its page count.
 *
 * No owner or user password is supplied, so password-protected documents are
 * not expected to open. When the document cannot be opened, the cached page
 * count is set to 0 instead of querying an invalid document.
 *
 * @param inputFilename path of the PDF file to open.
 */
PopplerParser::PopplerParser (const std::string inputFilename) {
    // Build the GooString straight from c_str(): this removes the non-standard
    // variable-length array + strcpy copy, and a stack object needs no delete.
    GooString fileName(inputFilename.c_str());

    // Assume the document has neither an owner nor a user password.
    GooString *ownerPW = NULL;
    GooString *userPW = NULL;

    // Create the document.
    doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW);

    // Only ask for the page count when the document was opened successfully.
    numPages = (doc != NULL && doc->isOk()) ? doc->getNumPages() : 0;
}
int PopplerParser::getPages() {
return PopplerParser::numPages;
}
/** Destroys the parser and frees the document created by the constructor. */
PopplerParser::~PopplerParser() {
    // numPages is a plain value, so the document is the only thing to free.
    delete doc;
}
std::string PopplerParser::Parse() {
GBool physLayout = gTrue;
GBool fixedPitch = gFalse;
GBool rawOrder = gFalse;
GBool htmlMeta = gTrue; // required to get the bounding box
information
int firstPage = 1;
int lastPage = PopplerParser::doc->getNumPages();
TextOutputDev *textOut;
std::string page_text;
std::string pages_text_data;
std::stringstream ss;
//Word Features
double xMinA, yMinA, xMaxA, yMaxA, r, g, b, fontSize;
TextWord *word;
GooString* fontName;
GBool underLined;
TextFontInfo *fontInfo;
GBool fixedWidth = gFalse;
GBool serif = gFalse;
GBool symbolic = gFalse;
GBool italic = gFalse;
GBool bold =gFalse;
//create our page
// read config file this is requried
globalParams = new GlobalParams();
//create a textOut
textOut = new TextOutputDev(NULL, physLayout, fixedPitch,
rawOrder, htmlMeta);
//walk over the pages
for (int page = firstPage; page <= lastPage; ++page) {
PopplerParser::doc->displayPage(textOut, page,
resolution, resolution, 0, gTrue, gFalse, gFalse);
TextWordList *wordlist = textOut->makeWordList();
const int word_length = wordlist != NULL ?
wordlist->getLength() : 0;
if (word_length > 0) {
//words on the page
for (int i = 0; i < word_length; ++i) {
word = wordlist->get(i);
//Word Features
word->getColor(&r , &g, &b);
underLined = word->isUnderlined();
fontSize = word->getFontSize();
word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
fontName = word->getFontName(0);
const std::string wordString =
word->getText()->getCString();
//fontIno
fontInfo = word->getFontInfo(0); //do this for
the first char in the word
fontName = fontInfo->getFontName();
fixedWidth = fontInfo ->isFixedWidth();
serif = fontInfo->isSerif();
symbolic = fontInfo->isSymbolic();
italic = fontInfo->isItalic();
bold = fontInfo->isBold();
// escape quotes in string
std::stringstream newStr;
for (int i = 0; i < wordString.length(); ++i) {
if (wordString[i] == '"' || wordString[i] == '\\') {
newStr << "\\";
}
newStr << wordString[i];
}
//construct our string output
ss << "{"
<< "\"xMin\":\"" << xMinA << "\",\"yMin\":\""
<< yMinA << "\",\"xMax\":\"" << xMaxA << "\",\"yMax\":\"" << yMaxA
<< "\",\"red\":\"" << r << "\",\"green\":\"" <<
g << "\",\"blue\":\""<< b
<< "\",\"fontSize\":\"" << fontSize
<< "\",\"italic\":\"" << italic
<< "\",\"serif\":\"" << serif
<< "\",\"symbolic\":\"" << symbolic
<< "\",\"fixedWidth\":\"" << fixedWidth
<< "\",\"bold\":\"" << bold
<< "\",\"fontName\":\"" <<
fontName->getCString()
<< "\",\"word\":\"" << newStr.str() <<
"\",\"page\":\""<< page
<< "\"}"
<< std::endl;
//std::cout << ss.str() << std::endl;
}
}
}
delete textOut;
delete globalParams;
//delete wordlist;
pages_text_data = ss.str();
return pages_text_data;
}
_______________________________________________
poppler mailing list
poppler@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/poppler