http://www.mediawiki.org/wiki/Special:Code/MediaWiki/56413

Revision: 56413
Author:   thomasv
Date:     2009-09-16 13:50:09 +0000 (Wed, 16 Sep 2009)

Log Message:
-----------
extract text layer from pdf

Modified Paths:
--------------
    trunk/extensions/PdfHandler/PdfHandler.image.php
    trunk/extensions/PdfHandler/PdfHandler.php
    trunk/extensions/PdfHandler/PdfHandler_body.php

Modified: trunk/extensions/PdfHandler/PdfHandler.image.php
===================================================================
--- trunk/extensions/PdfHandler/PdfHandler.image.php    2009-09-16 13:28:01 UTC (rev 56412)
+++ trunk/extensions/PdfHandler/PdfHandler.image.php    2009-09-16 13:50:09 UTC (rev 56413)
@@ -79,7 +79,7 @@
        }
 
        public function retrieveMetaData() {
-               global $wgPdfInfo;
+               global $wgPdfInfo, $wgPdftoText;
 
                if ( $wgPdfInfo ) {
                        wfProfileIn( 'pdfinfo' );
@@ -93,6 +93,25 @@
                } else {
                        $data = null;
                }
+
+               # Read text layer
+               if ( isset( $wgPdftoText ) ) { 
+                       wfProfileIn( 'pdftotext' );
+                       $cmd = wfEscapeShellArg( $wgPdftoText ) . ' '. wfEscapeShellArg( $this->mFilename ) . ' - ';
+                       wfDebug( __METHOD__.": $cmd\n" );
+                       $txt = wfShellExec( $cmd, $retval );
+                       wfProfileOut( 'pdftotext' );
+                       if( $retval == 0) {
+                               # Get rid of invalid UTF-8, strip control characters
+                               wfSuppressWarnings();
+                               $txt = iconv( "UTF-8","UTF-8//IGNORE", $txt );
+                               wfRestoreWarnings();
+                               $txt = preg_replace( "/[\013\035\037]/", "", $txt );
+                               $txt = htmlspecialchars($txt);
+                               $pages = preg_split("/\f/s", $txt  );
+                               $data['text'] = $pages;
+                       }
+               }
                return $data;
        }
 

Modified: trunk/extensions/PdfHandler/PdfHandler.php
===================================================================
--- trunk/extensions/PdfHandler/PdfHandler.php  2009-09-16 13:28:01 UTC (rev 56412)
+++ trunk/extensions/PdfHandler/PdfHandler.php  2009-09-16 13:50:09 UTC (rev 56413)
@@ -39,6 +39,7 @@
 $wgPdfProcessor     = 'gs';
 $wgPdfPostProcessor = 'convert';
 $wgPdfInfo          = 'pdfinfo';
+$wgPdftoText        = 'pdftotext';
 
 $wgPdfOutputExtension = "jpg";
 $wgPdfHandlerDpi = 150;

Modified: trunk/extensions/PdfHandler/PdfHandler_body.php
===================================================================
--- trunk/extensions/PdfHandler/PdfHandler_body.php     2009-09-16 13:28:01 UTC (rev 56412)
+++ trunk/extensions/PdfHandler/PdfHandler_body.php     2009-09-16 13:50:09 UTC (rev 56413)
@@ -202,4 +202,19 @@
                $data = $this->getMetaArray( $image );
                return PdfImage::getPageSize( $data, $page );
        }
+
+       function getPageText( $image, $page ){
+               $data = $this->getMetaArray( $image, true );
+               if ( !$data ) {
+                       return false;
+               }
+               if( ! isset( $data['text'] ) ) {
+                       return false;
+               }
+               if( ! isset( $data['text'][$page-1] ) ) {
+                       return false;
+               }
+               return $data['text'][$page-1];
+       }
+
 }



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to