Here is my source code where I convert pdf files to text for indexing, I
got this source code from lucene in action examples and adapted it for my
convenience, I hop you could help me to fix this problem, anyway if you know
another more efficient way to do it please tell me how to:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.encryption.DecryptDocument;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
import cu.co.cenatav.kernel.parser.DocumentHandler;
import cu.co.cenatav.kernel.parser.DocumentHandlerException;
import cu.co.cenatav.kernel.parser.schema.SchemaExtractor;
public class PDFBoxPDFHandler implements DocumentHandler {
public static String password = "-password";
public Document getDocument(InputStream is)
throws DocumentHandlerException {
COSDocument cosDoc = null;
try {
cosDoc = parseDocument(is);
}
catch (IOException e) {
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot parse PDF document", e);
}
// decrypt the PDF document, if it is encrypted
try {
if (cosDoc.isEncrypted()) {
DecryptDocument decryptor = new DecryptDocument(cosDoc);
decryptor.decryptDocument(password);
}
}
catch (CryptographyException e) {
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
catch (InvalidPasswordException e) {
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
catch (IOException e) {
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
// extract PDF document's textual content
String bodyText = null;
try {
PDFTextStripper stripper = new PDFTextStripper();
bodyText = stripper.getText(new PDDocument(cosDoc));
}
catch (IOException e) {
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot parse PDF document", e);
// String errS = e.toString();
// if (errS.toLowerCase().indexOf("font") != -1) {
// }
}
Document doc = new Document();
if (bodyText != null) {
PDDocument pdDoc = null;
PDDocumentInformation docInfo = null;
try {
pdDoc = new PDDocument(cosDoc);
docInfo = pdDoc.getDocumentInformation();
}
catch (Exception e) {
closeCOSDocument(cosDoc);
closePDDocument(pdDoc);
System.err.println("Cannot extraxt metadata from PDF: " +
e.getMessage());
}
SchemaExtractor schemaExtractor = new SchemaExtractor(bodyText);
String author = null;
if (docInfo != null)
author = docInfo.getAuthor();
if (author == null || author.equals("")){
//TODO Hacer el componente schemaExtractor
List Authors = schemaExtractor.getAuthor();
Iterator I = Authors.iterator();
while (I.hasNext()){
String Author = (String)I.next();
doc.add(new Field("author", Author, Field.Store.YES ,
Field.Index.TOKENIZED, Field.TermVector.YES));
}
}else{
doc.add(new Field("author", author, Field.Store.YES ,
Field.Index.TOKENIZED, Field.TermVector.YES));
}
String title = null;
if (docInfo != null)
title = docInfo.getTitle();
if (title == null || title.equals("")){
title = schemaExtractor.getTitle();
}
String keywords = null;
if (docInfo != null)
keywords = docInfo.getKeywords();
if (keywords == null)
keywords = "";
String summary = null;
if (docInfo != null)
summary = docInfo.getProducer() + " " +
docInfo.getCreator() + " " + docInfo.getSubject();
if (summary == null || summary.equals("")){
summary = schemaExtractor.getAbstract();
}
String content = schemaExtractor.getContent();
Field fieldTitle = new Field("title", title, Field.Store.YES ,
Field.Index.TOKENIZED,Field.TermVector.YES);
//fieldTitle.setBoost(new Float(1.5));
doc.add(fieldTitle);
Field fieldSumary = new Field("sumary", summary, Field.Store.YES ,
Field.Index.TOKENIZED,Field.TermVector.YES);
//fieldSumary.setBoost(new Float(1.3));
doc.add(fieldSumary);
doc.add(new Field("content", content, Field.Store.YES ,
Field.Index.TOKENIZED,Field.TermVector.YES));
doc.add(new Field("keywords", keywords, Field.Store.YES ,
Field.Index.UN_TOKENIZED,Field.TermVector.YES));
closePDDocument(pdDoc);
}
// extract PDF document's meta-data
closeCOSDocument(cosDoc);
return doc;
}
private static COSDocument parseDocument(InputStream is)
throws IOException {
PDFParser parser = new PDFParser(is);
parser.parse();
return parser.getDocument();
}
private void closeCOSDocument(COSDocument cosDoc) {
if (cosDoc != null) {
try {
cosDoc.close();
}
catch (IOException e) {
// eat it, what else can we do?
}
}
}
private void closePDDocument(PDDocument pdDoc) {
if (pdDoc != null) {
try {
pdDoc.close();
}
catch (IOException e) {
// eat it, what else can we do?
}
}
}
public static void main(String[] args) throws Exception
{
PDFBoxPDFHandler handler = new PDFBoxPDFHandler();
Document doc = handler.getDocument(new FileInputStream(new
File(args[0])));
System.out.println(doc);
}
}
Could you help me please.