Hi
I get pdfbox and use the jar file in myproject but...
when i use the this source to extract text from a pdf It take error in this
line in netbeans 6.5
"parsedText = pdfStripper.getText(pdDoc);"

This Is my source that i use it:


"
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;

import java.io.File;
import java.io.FileInputStream;
import java.io.PrintWriter;

public class PDFTextParser {

    PDFParser parser;
    String parsedText;
    PDFTextStripper pdfStripper;
    PDDocument pdDoc;
    COSDocument cosDoc;
    PDDocumentInformation pdDocInfo;

    // PDFTextParser Constructor
    public PDFTextParser() {
    }

    // Extract text from PDF Document
    String pdftoText(String fileName) {

        System.out.println("Parsing text from PDF file " + fileName +
"....");
        File f = new File(fileName);

        if (!f.isFile()) {
            System.out.println("File " + fileName + " does not exist.");
            return null;
        }

        try {
            parser = new PDFParser(new FileInputStream(f));
        } catch (Exception e) {
            System.out.println("Unable to open PDF Parser.");
            return null;
        }

        try {
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(5);

            parsedText = pdfStripper.getText(pdDoc);
        } catch (Exception e) {
            System.out.println("An exception occured in parsing the PDF
Document.");
            e.printStackTrace();
            try {
                   if (cosDoc != null) cosDoc.close();
                   if (pdDoc != null) pdDoc.close();
               } catch (Exception e1) {
               e.printStackTrace();
            }
            return null;
        }
        System.out.println("Done.");
        return parsedText;
    }

    // Write the parsed text from PDF to a file
    void writeTexttoFile(String pdfText, String fileName) {

        System.out.println("\nWriting PDF text to output text file " +
fileName + "....");
        try {
            PrintWriter pw = new PrintWriter(fileName);
            pw.print(pdfText);
            pw.close();
        } catch (Exception e) {
            System.out.println("An exception occured in writing the pdf text
to file.");
            e.printStackTrace();
        }
        System.out.println("Done.");
    }

    //Extracts text from a PDF Document and writes it to a text file
    public static void main(String args[]) {

        if (args.length != 2) {
            System.out.println("Usage: java PDFTextParser <InputPDFFilename>
<OutputTextFile>");
            //System.exit(1);
        }

        PDFTextParser pdfTextParserObj = new PDFTextParser();
        String pdfToText =
pdfTextParserObj.pdftoText("/home/mnm/index/test.pdf");

        if (pdfToText == null) {
            System.out.println("PDF to Text Conversion failed.");
        }
        else {
            System.out.println("\nThe text parsed from the PDF
Document....\n" + pdfToText);
            pdfTextParserObj.writeTexttoFile(pdfToText, "
/home/mnm/index/m.txt");
        }
    }
}
"


thanks very much

-- 
GNU-LINUX IS THE BEST O.S.

Reply via email to