http://www.mediawiki.org/wiki/Special:Code/MediaWiki/56413
Revision: 56413 Author: thomasv Date: 2009-09-16 13:50:09 +0000 (Wed, 16 Sep 2009) Log Message: ----------- extract text layer from pdf Modified Paths: -------------- trunk/extensions/PdfHandler/PdfHandler.image.php trunk/extensions/PdfHandler/PdfHandler.php trunk/extensions/PdfHandler/PdfHandler_body.php Modified: trunk/extensions/PdfHandler/PdfHandler.image.php =================================================================== --- trunk/extensions/PdfHandler/PdfHandler.image.php 2009-09-16 13:28:01 UTC (rev 56412) +++ trunk/extensions/PdfHandler/PdfHandler.image.php 2009-09-16 13:50:09 UTC (rev 56413) @@ -79,7 +79,7 @@ } public function retrieveMetaData() { - global $wgPdfInfo; + global $wgPdfInfo, $wgPdftoText; if ( $wgPdfInfo ) { wfProfileIn( 'pdfinfo' ); @@ -93,6 +93,25 @@ } else { $data = null; } + + # Read text layer + if ( isset( $wgPdftoText ) ) { + wfProfileIn( 'pdftotext' ); + $cmd = wfEscapeShellArg( $wgPdftoText ) . ' '. wfEscapeShellArg( $this->mFilename ) . ' - '; + wfDebug( __METHOD__.": $cmd\n" ); + $txt = wfShellExec( $cmd, $retval ); + wfProfileOut( 'pdftotext' ); + if( $retval == 0) { + # Get rid of invalid UTF-8, strip control characters + wfSuppressWarnings(); + $txt = iconv( "UTF-8","UTF-8//IGNORE", $txt ); + wfRestoreWarnings(); + $txt = preg_replace( "/[\013\035\037]/", "", $txt ); + $txt = htmlspecialchars($txt); + $pages = preg_split("/\f/s", $txt ); + $data['text'] = $pages; + } + } return $data; } Modified: trunk/extensions/PdfHandler/PdfHandler.php =================================================================== --- trunk/extensions/PdfHandler/PdfHandler.php 2009-09-16 13:28:01 UTC (rev 56412) +++ trunk/extensions/PdfHandler/PdfHandler.php 2009-09-16 13:50:09 UTC (rev 56413) @@ -39,6 +39,7 @@ $wgPdfProcessor = 'gs'; $wgPdfPostProcessor = 'convert'; $wgPdfInfo = 'pdfinfo'; +$wgPdftoText = 'pdftotext'; $wgPdfOutputExtension = "jpg"; $wgPdfHandlerDpi = 150; Modified: trunk/extensions/PdfHandler/PdfHandler_body.php =================================================================== --- trunk/extensions/PdfHandler/PdfHandler_body.php 2009-09-16 13:28:01 UTC (rev 56412) +++ trunk/extensions/PdfHandler/PdfHandler_body.php 2009-09-16 13:50:09 UTC (rev 56413) @@ -202,4 +202,19 @@ $data = $this->getMetaArray( $image ); return PdfImage::getPageSize( $data, $page ); } + + function getPageText( $image, $page ){ + $data = $this->getMetaArray( $image, true ); + if ( !$data ) { + return false; + } + if( ! isset( $data['text'] ) ) { + return false; + } + if( ! isset( $data['text'][$page-1] ) ) { + return false; + } + return $data['text'][$page-1]; + } + } _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs