pdfextracttextbatch

Kai Dietrich Sun, 01 Mar 2009 12:06:18 -0800

Hello list,

first of all, thank you all for pdfbox, I'm currently using it to extract the 
text from a huge (40.000+) collection of PDF files and it works pretty well 
(besides failing on some strange encodings, broken headers and the like). 
Then again, here comes the problem: My poor little box is running at 100% CPU 
all night with a "find . -name '*.pdf' | xargs pdfextracttext" and I'm not 
even at 1 file/second. Most of the CPU load probably comes from starting and 
stopping the Java VM - which is a huge waste of time and energy. So what 
would be quite helpful is a tool which has the "for file in files"-loop 
inside the VM. So, here comes the tool :) It's an addon to 
org.apache.pdfbox.ExtractText -- it just removes the output-file parameter 
and handles all given files as input pdf files.


Problem is, I can't get pdfbox 0.8.0 to work well, because of some 
incompatibility with the fontbox package from my distro (Gentoo). And I think 
I'll just torture my little box a bit longer until the find-xargs-extracttext 
job is done. But I don't want to waste the idea and the code, so maybe 
someone who has a working developer setup could test and improve on exception 
handling.

Greetings

Kai

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox;

import org.apache.pdfbox.ExtractText;

/**
 * This is the main program that simply parses the pdf documents and transforms them
 * into text (batch version of ExtractText).
 *
 * @author <a href="mailto:[email protected]">Ben Litchfield</a>
 * @author <a href="mailto:[email protected]">Kai Dietrich</a>
 * @version $Revision$
 */
public class ExtractTextBatch
{
    private static final String PASSWORD = "-password";
    private static final String ENCODING = "-encoding";
    private static final String CONSOLE = "-console";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String SORT = "-sort";
    private static final String HTML = "-html";

    /**
     * private constructor.
     */
    private ExtractTextBatch()
    {
        //static class
    }

    /**
     * Batch entry point: runs {@link ExtractText#main(String[])} once per PDF file
     * inside a single VM, re-using the leading options for every file.
     *
     * The argument list is split at the first argument that is not a known option
     * (or the value of a known option). Everything before that position is passed
     * unchanged to every ExtractText invocation; every argument from that position
     * onwards is treated as a PDF file name, even if it looks like an option.
     * No output file argument is ever passed to ExtractText.
     *
     * A failure on one file is reported on standard error and does not stop the
     * processing of the remaining files. If no PDF file is given at all, the usage
     * message is printed and the VM exits.
     *
     * @param args Command line arguments: zero or more options followed by one or
     *             more PDF files.
     *
     * @throws Exception Declared for compatibility; exceptions thrown while
     *                   processing a single file are caught and reported.
     */
    public static void main( String[] args ) throws Exception
    {
        int startPDFs = findFirstFileIndex( args );
        if( startPDFs < 0 )
        {
            // Nothing to process: previously the options themselves were
            // mistaken for file names (or nothing happened at all).
            usage();
            return;
        }

        // Slot layout: [ option arguments ... , <current PDF file> ]
        String[] params = new String[startPDFs + 1];
        System.arraycopy( args, 0, params, 0, startPDFs );

        for( int i = startPDFs; i < args.length; i++ )
        {
            params[startPDFs] = args[i];
            try
            {
                org.apache.pdfbox.ExtractText.main( params );
            }
            catch( Exception e )
            {
                // Report on stderr so the message does not end up in the middle of
                // text sent to stdout via -console, and keep the reason visible.
                System.err.println( "Oooops. Exception occured with file " + args[i] +
                    " (" + e + "), continuing..." );
            }
        }
    }

    /**
     * Finds the position of the first argument that is not an option.
     *
     * @param args The raw command line arguments.
     *
     * @return The index of the first PDF file argument, or -1 if the arguments
     *         contain only options (including the case of a value-taking option
     *         that is the last argument) or are empty.
     */
    private static int findFirstFileIndex( String[] args )
    {
        int i = 0;
        while( i < args.length )
        {
            if( isValueOption( args[i] ) )
            {
                // skip the option and its value
                i += 2;
            }
            else if( isFlagOption( args[i] ) )
            {
                i++;
            }
            else
            {
                return i;
            }
        }
        return -1;
    }

    /**
     * Tells whether the argument is an option that is followed by a value.
     *
     * @param arg A single command line argument.
     *
     * @return true for -password, -encoding, -startPage and -endPage.
     */
    private static boolean isValueOption( String arg )
    {
        return arg.equals( PASSWORD ) ||
               arg.equals( ENCODING ) ||
               arg.equals( START_PAGE ) ||
               arg.equals( END_PAGE );
    }

    /**
     * Tells whether the argument is an option that takes no value.
     *
     * @param arg A single command line argument.
     *
     * @return true for -console, -sort and -html.
     */
    private static boolean isFlagOption( String arg )
    {
        return arg.equals( CONSOLE ) ||
               arg.equals( SORT ) ||
               arg.equals( HTML );
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.ExtractTextBatch [OPTIONS] <PDF file>...\n" +
            "  -password  <password>        Password to decrypt documents\n" +
            "  -encoding  <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
            "  -console                     Send text to console instead of file\n" +
            "  -html                        Output in HTML format instead of raw text\n" +
            "  -sort                        Sort the text before writing\n" +
            "  -startPage <number>          The first page to start extraction(1 based)\n" +
            "  -endPage <number>            The last page to extract(inclusive)\n" +
            "  <PDF file>...                The PDF document(s) to use\n"
            );
        System.exit( 1 );
    }
}

pdfextracttextbatch

Reply via email to